commit 08868f90239a3f95c9f1d05abf538b1ac912c17e Author: Filip Gralinski Date: Sun Mar 10 20:30:05 2019 +0100 Init diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..dbc8eb0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,72 @@ +*~ +*.hi +*.o +zrzut* +czest +\#* +.\#* +ekczp +przegladpowszechny +thecompany +monitorpolski +mim +archiveorg +bydg +ustronpl +kfkpolski +znak +urania +filatelista +tysol +morze +meteoryt +brzostek +mediator +muzeumlotnictwa +rybackie +touring +straz +ryglice +mediator +raciborz +zhpkanada +mikroklan +przeroslak +raszyn +gloszabek +cieszanow +onb +glosswietojanski +praktycznyelektronik +willawlochy +azymut +dolice +notatkiplockie +echogabina +hel +taterniczek +malapolska +glosgarbowa +vonscz +rjp +kolyaska +watchtower +archiwumrembertowa +mazowsze +gliwice +zwir +gpw +almanachmuszyny +gorlice +tpn +tygillustr +harnas +audiofonologia +wolnomularstwo +niklot +ziemiagrybowska +elektrit +indeks +zinelibrary +archiveorg2 +.stack-work diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..ce328d6 --- /dev/null +++ b/LICENSE @@ -0,0 +1 @@ +Prioprietary. \ No newline at end of file diff --git a/ShadowLibrary/Core.hs b/ShadowLibrary/Core.hs new file mode 100644 index 0000000..a84de07 --- /dev/null +++ b/ShadowLibrary/Core.hs @@ -0,0 +1,429 @@ +{-# LANGUAGE QuasiQuotes, TemplateHaskell, TypeFamilies, OverloadedStrings #-} +{-# LANGUAGE GADTs, FlexibleContexts #-} +{-# LANGUAGE Arrows, NoMonomorphismRestriction #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} + +module ShadowLibrary.Core where + +import Database.Persist +import Database.Persist.TH +import Database.Persist.Postgresql +import Control.Monad.IO.Class (liftIO) +import Control.Monad.Logger (runNoLoggingT) +import Control.Monad.Trans.Resource (runResourceT) + +import Control.Monad.Reader + +import Data.Time +import Data.Time.Calendar +import Data.Time.LocalTime +import Data.Time.Zones +import Data.Time.Zones.All + +import Data.String.Utils (strip) +import Data.List.Utils as DLU + +import Text.XML.HXT.Core +import Network.HTTP +import Network.URI +import Control.Monad.Trans.Maybe +import Control.Monad +import Data.Tree.NTree.TypeDefs +import Data.Maybe +import Control.Monad.Trans +import Text.XML.HXT.XPath +import Text.XML.HXT.Curl +import Text.XML.HXT.HTTP + +import Text.Regex.TDFA + +import Data.List (isInfixOf, intercalate) + +import Data.List.Utils (replace) + +import Network.Curl.Opts + +polishTimeZone = TimeZone { + timeZoneMinutes = 120, + timeZoneSummerOnly = True, + timeZoneName = ""} + +openUrl :: String -> MaybeT IO String +openUrl url = case parseURI url of + Nothing -> fail "" + Just u -> liftIO (getResponseBody =<< simpleHTTP (mkRequest GET u)) + +getWebPage :: String -> IO (IOSArrow XmlTree (NTree XNode)) +getWebPage url = do + contents <- runMaybeT $ openUrl url + return $ readString [withParseHTML yes, withWarnings no] (fromMaybe "" contents) + +downloadDocument = readFromDocument [withParseHTML yes, + withWarnings no, + withEncodingErrors no, + withPreserveComment yes, + withStrictInput yes, +-- withHTTP [] + withCurl [("curl--user-agent","AMU Digital Libraries Indexing Agent")] + ] + +downloadDocumentWithEncoding enc = readFromDocument [withParseHTML yes, + withWarnings no, + withEncodingErrors no, + withPreserveComment yes, + withInputEncoding enc, + withCurl []] + +downloadXmlDocument = readFromDocument [withWarnings no, + withEncodingErrors no, + withCurl [] ] + + +data ShadowLibrary = ShadowLibrary { logoUrl :: Maybe String, + lname :: String, + abbrev :: String, + webpage :: String, + lLevel :: Int } + + +data ShadowItem = ShadowItem { + url :: Maybe String, + title :: String, + itype :: String, + originalDate :: Maybe String, + creator :: Maybe String, + format :: Maybe String, + lang :: Maybe String, + finalUrl :: String, + description :: Maybe String + } deriving (Show) + +defaultShadowItem url title = ShadowItem { + url = Just url, + title = title, + itype = "periodical", + originalDate = Nothing, + creator = Nothing, + format = Just "pdf", + lang = Just "pol", + finalUrl = url, + description = Nothing } + + +share [mkPersist sqlSettings, mkMigrate "migrateAll"] [persistLowerCase| +Library sql=libraries + logoUrl String Maybe sql=logo_url + lname String + abbrev String + UniqueLname lname + UniqueAbbrev abbrev + webpage String Maybe + numberOfInstitutions Int Maybe sql=number_of_institutions + availablePublications Int Maybe sql=available_publications + plannedPublication Int Maybe sql=planned_publication + oaiAvailable String Maybe sql=oai_available + oaiPlanned String Maybe sql=oai_planned + lLevel Int Maybe sql=llevel + deriving Show +Item sql=items + url String Maybe + wid String Maybe + title String Maybe + body String Maybe + address String Maybe + itype String + published UTCTime Maybe + modified UTCTime Maybe + durationStart UTCTime Maybe sql=duration_start + durationEnd UTCTime Maybe sql=duration_end +-- added UTCTime default=now() + parent Int Maybe + arenaDir String Maybe sql=arena_dir + creator String Maybe + subject String Maybe + description String Maybe + publisher String Maybe + contributor String Maybe + originalDate String Maybe sql=original_date + originalType String Maybe sql=original_type + format String Maybe + originalIdentifier String Maybe sql=original_identifier + originalSource String Maybe sql=original_source + lang String Maybe + relation String Maybe + coverage String Maybe + rights String Maybe + library LibraryId + finalUrl String Maybe sql=final_url + downloaded Bool + totbibbed Bool + oaiIdentifier String Maybe sql=oai_identifier + deriving Show + |] + +defaultItem libraryId = Item { + itemUrl = Nothing, + itemWid = Nothing, + itemTitle = Nothing, + itemBody = Nothing, + itemAddress = Nothing, + itemItype = "periodical", + itemPublished = Nothing, + itemModified = Nothing, + itemDurationStart = Nothing, + itemDurationEnd = Nothing, +-- itemAdded = Nothing, + itemParent = Nothing, + itemArenaDir = Nothing, + itemCreator = Nothing, + itemSubject = Nothing, + itemDescription = Nothing, + itemPublisher = Nothing, + itemContributor = Nothing, + itemOriginalDate = Nothing, + itemOriginalType = Nothing, + itemFormat = Just "pdf", + itemOriginalIdentifier = Nothing, + itemOriginalSource = Nothing, + itemLang = Just "pol", + itemRelation = Nothing, + itemCoverage = Nothing, + itemRights = Nothing, + itemLibrary = libraryId, + itemFinalUrl = Nothing, + itemDownloaded = False, + itemTotbibbed = False, + itemOaiIdentifier = Nothing} + +shadowToOAI :: ShadowLibrary -> Library +shadowToOAI shadowLibrary = Library (logoUrl shadowLibrary) (lname shadowLibrary) (abbrev shadowLibrary) (Just $ webpage shadowLibrary) Nothing Nothing Nothing Nothing Nothing (Just $ lLevel shadowLibrary) + +shadowItemToOAI libraryId shadowItem = (defaultItem libraryId) { + itemUrl = url shadowItem, + itemTitle = Just $ strip $ title shadowItem, + itemItype = itype shadowItem, + itemOriginalDate = originalDate shadowItem, + itemCreator = creator shadowItem, + itemFormat = format shadowItem, + itemLang = lang shadowItem, + itemFinalUrl = Just $ finalUrl shadowItem, + itemDurationStart = durationStart, + itemDurationEnd = durationEnd, + itemDescription = description shadowItem + } + where + (durationStart, durationEnd) = getDuration $ originalDate shadowItem + +getDuration :: Maybe String -> (Maybe UTCTime, Maybe UTCTime) +getDuration Nothing = (Nothing, Nothing) +getDuration (Just date) = + case date =~~ ("^(1[6789]|20)[0-9][0-9]$" :: String) of + Just year -> (Just (yearStart year), Just (yearEnd year)) + otherwise -> + case date =~~ ("^((1[789]|20)[0-9][0-9])-(0[1-9]|1[0-2])$" :: String) :: Maybe [[String]] of + Just [[_, year, _, month]] -> (Just (monthStart year month), Just (monthEnd year month)) + otherwise -> + case date =~~ ("^((1[789]|20)[0-9][0-9])-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])$" :: String) :: Maybe [[String]] of + Just [[_, year, _, month, day]] -> (Just (dayStart year month day), Just (dayEnd year month day)) + otherwise -> + case date =~~ ("^((1[789]|20)[0-9][0-9])-((1[789]|20)[0-9][0-9])$" :: String) :: Maybe [[String]] of + Just [[_, yearb, _, yeare, _]] -> (Just (yearStart yearb), Just (yearEnd yeare)) + otherwise -> (Nothing, Nothing) + where + yearAsInteger year = (read year) :: Integer + monthAsInt month = (read month) :: Int + dayAsInt day = (read day) :: Int + yearStart year = localTimeToTimeStamp $ LocalTime { + localDay = fromGregorian (yearAsInteger year) 1 1, + localTimeOfDay = midnight } + yearEnd year = localTimeToTimeStamp $ LocalTime { + localDay = fromGregorian (yearAsInteger year) 12 31, + localTimeOfDay = lastSecond } + monthStart year month = localTimeToTimeStamp $ LocalTime { + localDay = fromGregorian (yearAsInteger year) (monthAsInt month) 1, + localTimeOfDay = midnight } + monthEnd year month = localTimeToTimeStamp $ LocalTime { + localDay = fromGregorian (yearAsInteger year) (monthAsInt month) 31, + localTimeOfDay = lastSecond } + dayStart year month day = localTimeToTimeStamp $ LocalTime { + localDay = fromGregorian (yearAsInteger year) (monthAsInt month) (dayAsInt day), + localTimeOfDay = midnight } + dayEnd year month day = localTimeToTimeStamp $ LocalTime { + localDay = fromGregorian (yearAsInteger year) (monthAsInt month) (dayAsInt day), + localTimeOfDay = lastSecond } + lastSecond = TimeOfDay {todHour = 23, todMin = 59, todSec = 59 } + +localTimeToTimeStamp ltime = localTimeToUTCTZ (tzByLabel Europe__Warsaw) ltime + +-- zonedTimeToUTC $ ZonedTime { +-- zonedTimeToLocalTime = ltime, +-- zonedTimeZone = timeZoneForUTCTime Europe__Warsaw } + +getLibraryId library = do + ent <- getBy (UniqueLname $ libraryLname library) + libraryId <- case ent of + Just (Entity key _) -> return key + Nothing -> insert library + return libraryId + +extractLinks xpathCondition = (downloadDocument &&& this) + >>> first (getXPathTrees xpathCondition + >>> getAttrValue "href") + >>> expandURIFixed + +extractLinksGeneralized xpathCondition attr = (downloadDocument &&& this) + >>> first (getXPathTrees xpathCondition + >>> getAttrValue attr) + >>> expandURIFixed + +rotateSecTh ((a, b), c) = ((a, c), b) + +extractLinksWithText xpathCondition = (downloadDocument &&& this) + >>> first (getXPathTrees xpathCondition + >>> (getAttrValue "href" + &&& (listA (deep isText >>> getText) + >>> arr (intercalate " ")))) + >>> arr rotateSecTh + >>> first expandURIFixed + +extractLinksWithTitle xpathCondition = (downloadDocument &&& this) + >>> first (getXPathTrees xpathCondition + >>> (getAttrValue "href" + &&& getAttrValue "title")) + >>> arr rotateSecTh + >>> first expandURIFixed + +urlPreFixer = arr (Data.List.Utils.replace "[" stupidLeftBracketMarker . Data.List.Utils.replace "]" stupidRightBracketMarker) +urlPostFixer = arr (Data.List.Utils.replace stupidLeftBracketMarker "%5B" . Data.List.Utils.replace stupidRightBracketMarker "%5D") + +stupidLeftBracketMarker = "ddsfdfdfdfgfgfrrtrtrrtr" +stupidRightBracketMarker = "wqweweerererrtrtrtrtrtr" + +expandURIFixed = (urlPreFixer *** urlPreFixer) >>> expandURI >>> urlPostFixer + + +extractText = (listA (deep isText >>> getText) + >>> arr (intercalate " ")) + +loopNext extract xpathConditionForNext = initialStep + >>> loopNextCore extract xpathConditionForNext + >>> arr fst + >>> unlistA + +initialList :: [(String, a)] +initialList = [] + +initialStep = arr (const initialList) &&& this + + +loopNextCore extract xpathConditionForNext = second + (listA extract &&& extractNext xpathConditionForNext) + >>> arr expandList + >>> ifP (nextFound) + (second (arr fromJust >>> (downloadDocument &&& this)) + >>> loopNextCore extract xpathConditionForNext) + (this) + +nextFound :: ([(String,a)], Maybe String) -> Bool +nextFound (_, Just _) = True +nextFound (_, Nothing) = False + + +expandList (l, (e, n)) = (l ++ e, n) + +extractNext xpathConditionForNext = listA + (first (getXPathTrees ("(" ++ xpathConditionForNext ++ ")[1]") + >>> getAttrValue "href") + >>> expandURIFixed) + >>> arr listToMaybe + + +dbSpecification = "host=localhost dbname=oai" + +dbConnection = withPostgresqlConn dbSpecification + +runOnDb = runNoLoggingT . runResourceT . dbConnection . runSqlConn + +insertIntoDatabase shadowLibrary items = runOnDb $ insertIntoDatabaseCore shadowLibrary items + +insertIntoDatabaseCore :: MonadIO m => ShadowLibrary + -> [ShadowItem] + -> ReaderT SqlBackend m () +insertIntoDatabaseCore shadowLibrary items = do + libraryId <- getLibraryId $ shadowToOAI shadowLibrary + let ritems = map (shadowItemToOAI libraryId) items + mapM_ insert ritems + +extractFormat :: String -> Maybe String +extractFormat finalUrl + | ".gif" `isInfixOf` finalUrl = Just "gif" + | ".jpg" `isInfixOf` finalUrl = Just "jpg" + | ".djvu" `isInfixOf` finalUrl = Just "djvu" + | ".pdf" `isInfixOf` finalUrl = Just "pdf" + | ".doc" `isInfixOf` finalUrl = Just "doc" + | otherwise = Nothing + + + +baseMonthNameToNumber :: String -> Maybe String +baseMonthNameToNumber "styczeń" = Just "01" +baseMonthNameToNumber "styczen" = Just "01" +baseMonthNameToNumber "stycznia" = Just "01" +baseMonthNameToNumber "luty" = Just "02" +baseMonthNameToNumber "lutego" = Just "02" +baseMonthNameToNumber "marzec" = Just "03" +baseMonthNameToNumber "marca" = Just "03" +baseMonthNameToNumber "kwiecień" = Just "04" +baseMonthNameToNumber "kwiecien" = Just "04" +baseMonthNameToNumber "kwietnia" = Just "04" +baseMonthNameToNumber "maj" = Just "05" +baseMonthNameToNumber "maja" = Just "05" +baseMonthNameToNumber "czerwiec" = Just "06" +baseMonthNameToNumber "czeerwiec" = Just "06" +baseMonthNameToNumber "czerwca" = Just "06" +baseMonthNameToNumber "lipiec" = Just "07" +baseMonthNameToNumber "lipca" = Just "07" +baseMonthNameToNumber "sierpień" = Just "08" +baseMonthNameToNumber "sierpien" = Just "08" +baseMonthNameToNumber "sierpnia" = Just "08" +baseMonthNameToNumber "wrzesień" = Just "09" +baseMonthNameToNumber "wrzesien" = Just "09" +baseMonthNameToNumber "września" = Just "09" +baseMonthNameToNumber "wrzesnia" = Just "09" +baseMonthNameToNumber "październik" = Just "10" +baseMonthNameToNumber "pażdziernik" = Just "10" +baseMonthNameToNumber "pazdziernik" = Just "10" +baseMonthNameToNumber "października" = Just "10" +baseMonthNameToNumber "pazdziernika" = Just "10" +baseMonthNameToNumber "listopad" = Just "11" +baseMonthNameToNumber "listopada" = Just "11" +baseMonthNameToNumber "grudzień" = Just "12" +baseMonthNameToNumber "grudzien" = Just "12" +baseMonthNameToNumber "grudnia" = Just "12" +baseMonthNameToNumber "jesien" = Just "10" +baseMonthNameToNumber _ = Nothing + +extractYear :: String -> Maybe String +extractYear n = + case n =~~ ("(1[6789]|20)[0-9][0-9]" :: String) of + Just year -> Just year + otherwise -> Nothing + + +joinAlts :: [String] -> String +joinAlts = intercalate " // " + +clean = arr (DLU.replace "\n" "") + >>> arr strip + +extractItems shadowLibrary start extractor = do + page <- getWebPage start + items <- runX $ extractor start page +-- insertIntoDatabase shadowLibrary items + putStrLn (show items) + +extractItemsStartingFromUrl shadowLibrary start extractor = do + items <- runX $ (arr (const start) >>> setTraceLevel 1 >>> extractor) +-- insertIntoDatabase shadowLibrary items + mapM_ (putStrLn . show) items diff --git a/app/almanachmuszyny.hs b/app/almanachmuszyny.hs new file mode 100644 index 0000000..2aab5fb --- /dev/null +++ b/app/almanachmuszyny.hs @@ -0,0 +1,43 @@ + +{-# LANGUAGE Arrows, NoMonomorphismRestriction #-} +import ShadowLibrary.Core + +import Text.XML.HXT.Core +import Text.XML.HXT.XPath +import Text.XML.HXT.Curl +import Data.List +import Data.List.Utils (replace) + +import Text.Regex.Posix +import Text.Printf + +extractRecords = extractLinksWithText "//a[@class='roczniki']" + >>> second (arr $ replace "\r\n " "") + >>> first (arr ((++"tr") . init)) + >>> first (extractLinksWithText "//li/a[contains(@href,'.pdf')]") + +toShadowItem :: ((String, String), String) -> ShadowItem +toShadowItem ((url, articleTitle), yearlyTitle) = + (defaultShadowItem url title) { + originalDate = Just date, + itype = "periodical", + format = Just "pdf", + finalUrl = url + } + where title = "Almanach Muszyny " ++ yearlyTitle ++ " " ++ (replace "\r\n" "" (replace "\r\n " "" articleTitle)) + date = getDate url + +getDate url = + case url =~~ "/(19[0-9][0-9]|20[0-9][0-9])/" :: Maybe [[String]] of + Just [[_, year]] -> year + otherwise -> error $ "unexpected url: " ++ url + + +main = do + let start = "http://www.almanachmuszyny.pl/" + let shadowLibrary = ShadowLibrary {logoUrl=Nothing, + lname="Almanach Muszyny", + abbrev="AlmMusz", + lLevel=0, + webpage=start} + extractItemsStartingFromUrl shadowLibrary start (extractRecords >>> arr toShadowItem) diff --git a/shadow-library.cabal b/shadow-library.cabal new file mode 100644 index 0000000..d917bc9 --- /dev/null +++ b/shadow-library.cabal @@ -0,0 +1,71 @@ +name: shadow-library +version: 0.1.0.0 +synopsis: Initial project template from stack +description: Please see README.md +homepage: http://github.com/name/project +license: Proprietary +license-file: LICENSE +author: Your name here +maintainer: your.address@example.com +-- copyright: +category: Web +build-type: Simple +-- extra-source-files: +cabal-version: >=1.10 + +library + hs-source-dirs: . + exposed-modules: ShadowLibrary.Core + build-depends: base >= 4.7 && < 5 + , curl + , HTTP + , hxt + , hxt-curl + , hxt-http + , hxt-xpath + , MissingH + , monad-logger + , mtl + , network-uri + , persistent + , persistent-postgresql + , persistent-template + , regex-pcre + , regex-tdfa + , resourcet + , text + , time + , transformers + , tz + default-language: Haskell2010 + +-- executable maly-modelarz-exe +-- hs-source-dirs: app +-- main-is: malymodelarz.hs +-- ghc-options: -threaded -rtsopts -with-rtsopts=-N +-- build-depends: base +-- , hxt +-- , hxt-curl +-- , hxt-xpath +-- , MissingH +-- , regex-posix +-- , shadow-library +-- default-language: Haskell2010 + +executable almanachmuszyny + hs-source-dirs: app + main-is: almanachmuszyny.hs + ghc-options: -threaded -rtsopts -with-rtsopts=-N + build-depends: base + , hxt + , hxt-curl + , hxt-xpath + , MissingH + , regex-posix + , shadow-library + default-language: Haskell2010 + + +source-repository head + type: git + location: https://github.com/name/project diff --git a/stack.yaml b/stack.yaml new file mode 100644 index 0000000..6c16955 --- /dev/null +++ b/stack.yaml @@ -0,0 +1,5 @@ +flags: {} +packages: +- '.' +extra-deps: [hxt-xpath-9.1.2.2] +resolver: lts-11.9