This commit is contained in:
Filip Gralinski 2019-03-10 20:30:05 +01:00
commit 08868f9023
6 changed files with 621 additions and 0 deletions

72
.gitignore vendored Normal file
View File

@ -0,0 +1,72 @@
*~
*.hi
*.o
zrzut*
czest
\#*
.\#*
ekczp
przegladpowszechny
thecompany
monitorpolski
mim
archiveorg
bydg
ustronpl
kfkpolski
znak
urania
filatelista
tysol
morze
meteoryt
brzostek
mediator
muzeumlotnictwa
rybackie
touring
straz
ryglice
mediator
raciborz
zhpkanada
mikroklan
przeroslak
raszyn
gloszabek
cieszanow
onb
glosswietojanski
praktycznyelektronik
willawlochy
azymut
dolice
notatkiplockie
echogabina
hel
taterniczek
malapolska
glosgarbowa
vonscz
rjp
kolyaska
watchtower
archiwumrembertowa
mazowsze
gliwice
zwir
gpw
almanachmuszyny
gorlice
tpn
tygillustr
harnas
audiofonologia
wolnomularstwo
niklot
ziemiagrybowska
elektrit
indeks
zinelibrary
archiveorg2
.stack-work

1
LICENSE Normal file
View File

@ -0,0 +1 @@
Prioprietary.

429
ShadowLibrary/Core.hs Normal file
View File

@ -0,0 +1,429 @@
{-# LANGUAGE QuasiQuotes, TemplateHaskell, TypeFamilies, OverloadedStrings #-}
{-# LANGUAGE GADTs, FlexibleContexts #-}
{-# LANGUAGE Arrows, NoMonomorphismRestriction #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
module ShadowLibrary.Core where
import Database.Persist
import Database.Persist.TH
import Database.Persist.Postgresql
import Control.Monad.IO.Class (liftIO)
import Control.Monad.Logger (runNoLoggingT)
import Control.Monad.Trans.Resource (runResourceT)
import Control.Monad.Reader
import Data.Time
import Data.Time.Calendar
import Data.Time.LocalTime
import Data.Time.Zones
import Data.Time.Zones.All
import Data.String.Utils (strip)
import Data.List.Utils as DLU
import Text.XML.HXT.Core
import Network.HTTP
import Network.URI
import Control.Monad.Trans.Maybe
import Control.Monad
import Data.Tree.NTree.TypeDefs
import Data.Maybe
import Control.Monad.Trans
import Text.XML.HXT.XPath
import Text.XML.HXT.Curl
import Text.XML.HXT.HTTP
import Text.Regex.TDFA
import Data.List (isInfixOf, intercalate)
import Data.List.Utils (replace)
import Network.Curl.Opts
polishTimeZone = TimeZone {
timeZoneMinutes = 120,
timeZoneSummerOnly = True,
timeZoneName = ""}
openUrl :: String -> MaybeT IO String
openUrl url = case parseURI url of
Nothing -> fail ""
Just u -> liftIO (getResponseBody =<< simpleHTTP (mkRequest GET u))
getWebPage :: String -> IO (IOSArrow XmlTree (NTree XNode))
getWebPage url = do
contents <- runMaybeT $ openUrl url
return $ readString [withParseHTML yes, withWarnings no] (fromMaybe "" contents)
downloadDocument = readFromDocument [withParseHTML yes,
withWarnings no,
withEncodingErrors no,
withPreserveComment yes,
withStrictInput yes,
-- withHTTP []
withCurl [("curl--user-agent","AMU Digital Libraries Indexing Agent")]
]
downloadDocumentWithEncoding enc = readFromDocument [withParseHTML yes,
withWarnings no,
withEncodingErrors no,
withPreserveComment yes,
withInputEncoding enc,
withCurl []]
downloadXmlDocument = readFromDocument [withWarnings no,
withEncodingErrors no,
withCurl [] ]
data ShadowLibrary = ShadowLibrary { logoUrl :: Maybe String,
lname :: String,
abbrev :: String,
webpage :: String,
lLevel :: Int }
data ShadowItem = ShadowItem {
url :: Maybe String,
title :: String,
itype :: String,
originalDate :: Maybe String,
creator :: Maybe String,
format :: Maybe String,
lang :: Maybe String,
finalUrl :: String,
description :: Maybe String
} deriving (Show)
defaultShadowItem url title = ShadowItem {
url = Just url,
title = title,
itype = "periodical",
originalDate = Nothing,
creator = Nothing,
format = Just "pdf",
lang = Just "pol",
finalUrl = url,
description = Nothing }
share [mkPersist sqlSettings, mkMigrate "migrateAll"] [persistLowerCase|
Library sql=libraries
logoUrl String Maybe sql=logo_url
lname String
abbrev String
UniqueLname lname
UniqueAbbrev abbrev
webpage String Maybe
numberOfInstitutions Int Maybe sql=number_of_institutions
availablePublications Int Maybe sql=available_publications
plannedPublication Int Maybe sql=planned_publication
oaiAvailable String Maybe sql=oai_available
oaiPlanned String Maybe sql=oai_planned
lLevel Int Maybe sql=llevel
deriving Show
Item sql=items
url String Maybe
wid String Maybe
title String Maybe
body String Maybe
address String Maybe
itype String
published UTCTime Maybe
modified UTCTime Maybe
durationStart UTCTime Maybe sql=duration_start
durationEnd UTCTime Maybe sql=duration_end
-- added UTCTime default=now()
parent Int Maybe
arenaDir String Maybe sql=arena_dir
creator String Maybe
subject String Maybe
description String Maybe
publisher String Maybe
contributor String Maybe
originalDate String Maybe sql=original_date
originalType String Maybe sql=original_type
format String Maybe
originalIdentifier String Maybe sql=original_identifier
originalSource String Maybe sql=original_source
lang String Maybe
relation String Maybe
coverage String Maybe
rights String Maybe
library LibraryId
finalUrl String Maybe sql=final_url
downloaded Bool
totbibbed Bool
oaiIdentifier String Maybe sql=oai_identifier
deriving Show
|]
defaultItem libraryId = Item {
itemUrl = Nothing,
itemWid = Nothing,
itemTitle = Nothing,
itemBody = Nothing,
itemAddress = Nothing,
itemItype = "periodical",
itemPublished = Nothing,
itemModified = Nothing,
itemDurationStart = Nothing,
itemDurationEnd = Nothing,
-- itemAdded = Nothing,
itemParent = Nothing,
itemArenaDir = Nothing,
itemCreator = Nothing,
itemSubject = Nothing,
itemDescription = Nothing,
itemPublisher = Nothing,
itemContributor = Nothing,
itemOriginalDate = Nothing,
itemOriginalType = Nothing,
itemFormat = Just "pdf",
itemOriginalIdentifier = Nothing,
itemOriginalSource = Nothing,
itemLang = Just "pol",
itemRelation = Nothing,
itemCoverage = Nothing,
itemRights = Nothing,
itemLibrary = libraryId,
itemFinalUrl = Nothing,
itemDownloaded = False,
itemTotbibbed = False,
itemOaiIdentifier = Nothing}
shadowToOAI :: ShadowLibrary -> Library
shadowToOAI shadowLibrary = Library (logoUrl shadowLibrary) (lname shadowLibrary) (abbrev shadowLibrary) (Just $ webpage shadowLibrary) Nothing Nothing Nothing Nothing Nothing (Just $ lLevel shadowLibrary)
shadowItemToOAI libraryId shadowItem = (defaultItem libraryId) {
itemUrl = url shadowItem,
itemTitle = Just $ strip $ title shadowItem,
itemItype = itype shadowItem,
itemOriginalDate = originalDate shadowItem,
itemCreator = creator shadowItem,
itemFormat = format shadowItem,
itemLang = lang shadowItem,
itemFinalUrl = Just $ finalUrl shadowItem,
itemDurationStart = durationStart,
itemDurationEnd = durationEnd,
itemDescription = description shadowItem
}
where
(durationStart, durationEnd) = getDuration $ originalDate shadowItem
getDuration :: Maybe String -> (Maybe UTCTime, Maybe UTCTime)
getDuration Nothing = (Nothing, Nothing)
getDuration (Just date) =
case date =~~ ("^(1[6789]|20)[0-9][0-9]$" :: String) of
Just year -> (Just (yearStart year), Just (yearEnd year))
otherwise ->
case date =~~ ("^((1[789]|20)[0-9][0-9])-(0[1-9]|1[0-2])$" :: String) :: Maybe [[String]] of
Just [[_, year, _, month]] -> (Just (monthStart year month), Just (monthEnd year month))
otherwise ->
case date =~~ ("^((1[789]|20)[0-9][0-9])-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])$" :: String) :: Maybe [[String]] of
Just [[_, year, _, month, day]] -> (Just (dayStart year month day), Just (dayEnd year month day))
otherwise ->
case date =~~ ("^((1[789]|20)[0-9][0-9])-((1[789]|20)[0-9][0-9])$" :: String) :: Maybe [[String]] of
Just [[_, yearb, _, yeare, _]] -> (Just (yearStart yearb), Just (yearEnd yeare))
otherwise -> (Nothing, Nothing)
where
yearAsInteger year = (read year) :: Integer
monthAsInt month = (read month) :: Int
dayAsInt day = (read day) :: Int
yearStart year = localTimeToTimeStamp $ LocalTime {
localDay = fromGregorian (yearAsInteger year) 1 1,
localTimeOfDay = midnight }
yearEnd year = localTimeToTimeStamp $ LocalTime {
localDay = fromGregorian (yearAsInteger year) 12 31,
localTimeOfDay = lastSecond }
monthStart year month = localTimeToTimeStamp $ LocalTime {
localDay = fromGregorian (yearAsInteger year) (monthAsInt month) 1,
localTimeOfDay = midnight }
monthEnd year month = localTimeToTimeStamp $ LocalTime {
localDay = fromGregorian (yearAsInteger year) (monthAsInt month) 31,
localTimeOfDay = lastSecond }
dayStart year month day = localTimeToTimeStamp $ LocalTime {
localDay = fromGregorian (yearAsInteger year) (monthAsInt month) (dayAsInt day),
localTimeOfDay = midnight }
dayEnd year month day = localTimeToTimeStamp $ LocalTime {
localDay = fromGregorian (yearAsInteger year) (monthAsInt month) (dayAsInt day),
localTimeOfDay = lastSecond }
lastSecond = TimeOfDay {todHour = 23, todMin = 59, todSec = 59 }
localTimeToTimeStamp ltime = localTimeToUTCTZ (tzByLabel Europe__Warsaw) ltime
-- zonedTimeToUTC $ ZonedTime {
-- zonedTimeToLocalTime = ltime,
-- zonedTimeZone = timeZoneForUTCTime Europe__Warsaw }
getLibraryId library = do
ent <- getBy (UniqueLname $ libraryLname library)
libraryId <- case ent of
Just (Entity key _) -> return key
Nothing -> insert library
return libraryId
extractLinks xpathCondition = (downloadDocument &&& this)
>>> first (getXPathTrees xpathCondition
>>> getAttrValue "href")
>>> expandURIFixed
extractLinksGeneralized xpathCondition attr = (downloadDocument &&& this)
>>> first (getXPathTrees xpathCondition
>>> getAttrValue attr)
>>> expandURIFixed
rotateSecTh ((a, b), c) = ((a, c), b)
extractLinksWithText xpathCondition = (downloadDocument &&& this)
>>> first (getXPathTrees xpathCondition
>>> (getAttrValue "href"
&&& (listA (deep isText >>> getText)
>>> arr (intercalate " "))))
>>> arr rotateSecTh
>>> first expandURIFixed
extractLinksWithTitle xpathCondition = (downloadDocument &&& this)
>>> first (getXPathTrees xpathCondition
>>> (getAttrValue "href"
&&& getAttrValue "title"))
>>> arr rotateSecTh
>>> first expandURIFixed
urlPreFixer = arr (Data.List.Utils.replace "[" stupidLeftBracketMarker . Data.List.Utils.replace "]" stupidRightBracketMarker)
urlPostFixer = arr (Data.List.Utils.replace stupidLeftBracketMarker "%5B" . Data.List.Utils.replace stupidRightBracketMarker "%5D")
stupidLeftBracketMarker = "ddsfdfdfdfgfgfrrtrtrrtr"
stupidRightBracketMarker = "wqweweerererrtrtrtrtrtr"
expandURIFixed = (urlPreFixer *** urlPreFixer) >>> expandURI >>> urlPostFixer
extractText = (listA (deep isText >>> getText)
>>> arr (intercalate " "))
loopNext extract xpathConditionForNext = initialStep
>>> loopNextCore extract xpathConditionForNext
>>> arr fst
>>> unlistA
initialList :: [(String, a)]
initialList = []
initialStep = arr (const initialList) &&& this
loopNextCore extract xpathConditionForNext = second
(listA extract &&& extractNext xpathConditionForNext)
>>> arr expandList
>>> ifP (nextFound)
(second (arr fromJust >>> (downloadDocument &&& this))
>>> loopNextCore extract xpathConditionForNext)
(this)
nextFound :: ([(String,a)], Maybe String) -> Bool
nextFound (_, Just _) = True
nextFound (_, Nothing) = False
expandList (l, (e, n)) = (l ++ e, n)
extractNext xpathConditionForNext = listA
(first (getXPathTrees ("(" ++ xpathConditionForNext ++ ")[1]")
>>> getAttrValue "href")
>>> expandURIFixed)
>>> arr listToMaybe
dbSpecification = "host=localhost dbname=oai"
dbConnection = withPostgresqlConn dbSpecification
runOnDb = runNoLoggingT . runResourceT . dbConnection . runSqlConn
insertIntoDatabase shadowLibrary items = runOnDb $ insertIntoDatabaseCore shadowLibrary items
insertIntoDatabaseCore :: MonadIO m => ShadowLibrary
-> [ShadowItem]
-> ReaderT SqlBackend m ()
insertIntoDatabaseCore shadowLibrary items = do
libraryId <- getLibraryId $ shadowToOAI shadowLibrary
let ritems = map (shadowItemToOAI libraryId) items
mapM_ insert ritems
extractFormat :: String -> Maybe String
extractFormat finalUrl
| ".gif" `isInfixOf` finalUrl = Just "gif"
| ".jpg" `isInfixOf` finalUrl = Just "jpg"
| ".djvu" `isInfixOf` finalUrl = Just "djvu"
| ".pdf" `isInfixOf` finalUrl = Just "pdf"
| ".doc" `isInfixOf` finalUrl = Just "doc"
| otherwise = Nothing
baseMonthNameToNumber :: String -> Maybe String
baseMonthNameToNumber "styczeń" = Just "01"
baseMonthNameToNumber "styczen" = Just "01"
baseMonthNameToNumber "stycznia" = Just "01"
baseMonthNameToNumber "luty" = Just "02"
baseMonthNameToNumber "lutego" = Just "02"
baseMonthNameToNumber "marzec" = Just "03"
baseMonthNameToNumber "marca" = Just "03"
baseMonthNameToNumber "kwiecień" = Just "04"
baseMonthNameToNumber "kwiecien" = Just "04"
baseMonthNameToNumber "kwietnia" = Just "04"
baseMonthNameToNumber "maj" = Just "05"
baseMonthNameToNumber "maja" = Just "05"
baseMonthNameToNumber "czerwiec" = Just "06"
baseMonthNameToNumber "czeerwiec" = Just "06"
baseMonthNameToNumber "czerwca" = Just "06"
baseMonthNameToNumber "lipiec" = Just "07"
baseMonthNameToNumber "lipca" = Just "07"
baseMonthNameToNumber "sierpień" = Just "08"
baseMonthNameToNumber "sierpien" = Just "08"
baseMonthNameToNumber "sierpnia" = Just "08"
baseMonthNameToNumber "wrzesień" = Just "09"
baseMonthNameToNumber "wrzesien" = Just "09"
baseMonthNameToNumber "września" = Just "09"
baseMonthNameToNumber "wrzesnia" = Just "09"
baseMonthNameToNumber "październik" = Just "10"
baseMonthNameToNumber "pażdziernik" = Just "10"
baseMonthNameToNumber "pazdziernik" = Just "10"
baseMonthNameToNumber "października" = Just "10"
baseMonthNameToNumber "pazdziernika" = Just "10"
baseMonthNameToNumber "listopad" = Just "11"
baseMonthNameToNumber "listopada" = Just "11"
baseMonthNameToNumber "grudzień" = Just "12"
baseMonthNameToNumber "grudzien" = Just "12"
baseMonthNameToNumber "grudnia" = Just "12"
baseMonthNameToNumber "jesien" = Just "10"
baseMonthNameToNumber _ = Nothing
extractYear :: String -> Maybe String
extractYear n =
case n =~~ ("(1[6789]|20)[0-9][0-9]" :: String) of
Just year -> Just year
otherwise -> Nothing
joinAlts :: [String] -> String
joinAlts = intercalate " // "
clean = arr (DLU.replace "\n" "")
>>> arr strip
extractItems shadowLibrary start extractor = do
page <- getWebPage start
items <- runX $ extractor start page
-- insertIntoDatabase shadowLibrary items
putStrLn (show items)
extractItemsStartingFromUrl shadowLibrary start extractor = do
items <- runX $ (arr (const start) >>> setTraceLevel 1 >>> extractor)
-- insertIntoDatabase shadowLibrary items
mapM_ (putStrLn . show) items

43
app/almanachmuszyny.hs Normal file
View File

@ -0,0 +1,43 @@
{-# LANGUAGE Arrows, NoMonomorphismRestriction #-}
import ShadowLibrary.Core
import Text.XML.HXT.Core
import Text.XML.HXT.XPath
import Text.XML.HXT.Curl
import Data.List
import Data.List.Utils (replace)
import Text.Regex.Posix
import Text.Printf
extractRecords = extractLinksWithText "//a[@class='roczniki']"
>>> second (arr $ replace "\r\n " "")
>>> first (arr ((++"tr") . init))
>>> first (extractLinksWithText "//li/a[contains(@href,'.pdf')]")
toShadowItem :: ((String, String), String) -> ShadowItem
toShadowItem ((url, articleTitle), yearlyTitle) =
(defaultShadowItem url title) {
originalDate = Just date,
itype = "periodical",
format = Just "pdf",
finalUrl = url
}
where title = "Almanach Muszyny " ++ yearlyTitle ++ " " ++ (replace "\r\n" "" (replace "\r\n " "" articleTitle))
date = getDate url
getDate url =
case url =~~ "/(19[0-9][0-9]|20[0-9][0-9])/" :: Maybe [[String]] of
Just [[_, year]] -> year
otherwise -> error $ "unexpected url: " ++ url
main = do
let start = "http://www.almanachmuszyny.pl/"
let shadowLibrary = ShadowLibrary {logoUrl=Nothing,
lname="Almanach Muszyny",
abbrev="AlmMusz",
lLevel=0,
webpage=start}
extractItemsStartingFromUrl shadowLibrary start (extractRecords >>> arr toShadowItem)

71
shadow-library.cabal Normal file
View File

@ -0,0 +1,71 @@
name: shadow-library
version: 0.1.0.0
synopsis: Initial project template from stack
description: Please see README.md
homepage: http://github.com/name/project
license: Proprietary
license-file: LICENSE
author: Your name here
maintainer: your.address@example.com
-- copyright:
category: Web
build-type: Simple
-- extra-source-files:
cabal-version: >=1.10
library
hs-source-dirs: .
exposed-modules: ShadowLibrary.Core
build-depends: base >= 4.7 && < 5
, curl
, HTTP
, hxt
, hxt-curl
, hxt-http
, hxt-xpath
, MissingH
, monad-logger
, mtl
, network-uri
, persistent
, persistent-postgresql
, persistent-template
, regex-pcre
, regex-tdfa
, resourcet
, text
, time
, transformers
, tz
default-language: Haskell2010
-- executable maly-modelarz-exe
-- hs-source-dirs: app
-- main-is: malymodelarz.hs
-- ghc-options: -threaded -rtsopts -with-rtsopts=-N
-- build-depends: base
-- , hxt
-- , hxt-curl
-- , hxt-xpath
-- , MissingH
-- , regex-posix
-- , shadow-library
-- default-language: Haskell2010
executable almanachmuszyny
hs-source-dirs: app
main-is: almanachmuszyny.hs
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends: base
, hxt
, hxt-curl
, hxt-xpath
, MissingH
, regex-posix
, shadow-library
default-language: Haskell2010
source-repository head
type: git
location: https://github.com/name/project

5
stack.yaml Normal file
View File

@ -0,0 +1,5 @@
flags: {}
packages:
- '.'
extra-deps: [hxt-xpath-9.1.2.2]
resolver: lts-11.9