From d901389e049f0067b19371716d7fd096186bc778 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81uaksz=20G=C3=B3reczny?= Date: Fri, 2 Apr 2021 12:42:32 +0200 Subject: [PATCH] init --- LICENSE | 1 + ShadowLibrary/Core.hs | 304 +++++++++++++++++++++++++++++++++++++++++ app/almanachmuszyny.hs | 46 +++++++ shadow-library.cabal | 65 +++++++++ stack.yaml | 5 + stack.yaml.lock | 19 +++ 6 files changed, 440 insertions(+) create mode 100644 LICENSE create mode 100644 ShadowLibrary/Core.hs create mode 100644 app/almanachmuszyny.hs create mode 100644 shadow-library.cabal create mode 100644 stack.yaml create mode 100644 stack.yaml.lock diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..ce328d6 --- /dev/null +++ b/LICENSE @@ -0,0 +1 @@ +Prioprietary. \ No newline at end of file diff --git a/ShadowLibrary/Core.hs b/ShadowLibrary/Core.hs new file mode 100644 index 0000000..678df33 --- /dev/null +++ b/ShadowLibrary/Core.hs @@ -0,0 +1,304 @@ +{-# LANGUAGE QuasiQuotes, TemplateHaskell, TypeFamilies, OverloadedStrings #-} +{-# LANGUAGE GADTs, FlexibleContexts #-} +{-# LANGUAGE Arrows, NoMonomorphismRestriction #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} + +module ShadowLibrary.Core where + +-- import Database.Persist +-- import Database.Persist.TH +-- import Database.Persist.Postgresql +import Control.Monad.IO.Class (liftIO) +import Control.Monad.Logger (runNoLoggingT) +import Control.Monad.Trans.Resource (runResourceT) + +import Control.Monad.Reader + +import Data.Time +import Data.Time.Calendar +import Data.Time.LocalTime +import Data.Time.Zones +import Data.Time.Zones.All + +import Data.String.Utils (strip) +import Data.List.Utils as DLU + +import Text.XML.HXT.Core +import Network.HTTP +import Network.URI +import Control.Monad.Trans.Maybe +import Control.Monad +import Data.Tree.NTree.TypeDefs +import Data.Maybe +import Control.Monad.Trans +import Text.XML.HXT.XPath +-- import Text.XML.HXT.Curl +import Text.XML.HXT.HTTP + +import Text.Regex.TDFA + +import Data.List (isInfixOf, intercalate) + +import Data.List.Utils (replace) + +-- import Network.Curl.Opts + +polishTimeZone = TimeZone { + timeZoneMinutes = 120, + timeZoneSummerOnly = True, + timeZoneName = ""} + +openUrl :: String -> MaybeT IO String +openUrl url = case parseURI url of + Nothing -> fail "" + Just u -> liftIO (getResponseBody =<< simpleHTTP (mkRequest GET u)) + +getWebPage :: String -> IO (IOSArrow XmlTree (NTree XNode)) +getWebPage url = do + contents <- runMaybeT $ openUrl url + return $ readString [withParseHTML yes, withWarnings no] (fromMaybe "" contents) + +downloadDocument = readFromDocument [withParseHTML yes, + withWarnings no, + withEncodingErrors no, + withPreserveComment yes, + withStrictInput yes, + withHTTP [] +-- withCurl [("curl--user-agent","AMU Digital Libraries Indexing Agent")] + ] + +downloadDocumentWithEncoding enc = readFromDocument [withParseHTML yes, + withWarnings no, + withEncodingErrors no, + withPreserveComment yes, + withInputEncoding enc, + withHTTP []] +-- withCurl []] + +downloadXmlDocument = readFromDocument [withWarnings no, + withEncodingErrors no, + withHTTP []] +-- withCurl [] ] + + +data ShadowLibrary = ShadowLibrary { logoUrl :: Maybe String, + lname :: String, + abbrev :: String, + webpage :: String, + lLevel :: Int } + + +data ShadowItem = ShadowItem { + url :: Maybe String, + title :: String, + itype :: String, + originalDate :: Maybe String, + creator :: Maybe String, + format :: Maybe String, + lang :: Maybe String, + finalUrl :: String, + description :: Maybe String + } deriving (Show) + +defaultShadowItem url title = ShadowItem { + url = Just url, + title = title, + itype = "periodical", + originalDate = Nothing, + creator = Nothing, + format = Just "pdf", + lang = Just "pol", + finalUrl = url, + description = Nothing } + + +getDuration :: Maybe String -> (Maybe UTCTime, Maybe UTCTime) +getDuration Nothing = (Nothing, Nothing) +getDuration (Just date) = + case date =~~ ("^(1[6789]|20)[0-9][0-9]$" :: String) of + Just year -> (Just (yearStart year), Just (yearEnd year)) + otherwise -> + case date =~~ ("^((1[789]|20)[0-9][0-9])-(0[1-9]|1[0-2])$" :: String) :: Maybe [[String]] of + Just [[_, year, _, month]] -> (Just (monthStart year month), Just (monthEnd year month)) + otherwise -> + case date =~~ ("^((1[789]|20)[0-9][0-9])-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])$" :: String) :: Maybe [[String]] of + Just [[_, year, _, month, day]] -> (Just (dayStart year month day), Just (dayEnd year month day)) + otherwise -> + case date =~~ ("^((1[789]|20)[0-9][0-9])-((1[789]|20)[0-9][0-9])$" :: String) :: Maybe [[String]] of + Just [[_, yearb, _, yeare, _]] -> (Just (yearStart yearb), Just (yearEnd yeare)) + otherwise -> (Nothing, Nothing) + where + yearAsInteger year = (read year) :: Integer + monthAsInt month = (read month) :: Int + dayAsInt day = (read day) :: Int + yearStart year = localTimeToTimeStamp $ LocalTime { + localDay = fromGregorian (yearAsInteger year) 1 1, + localTimeOfDay = midnight } + yearEnd year = localTimeToTimeStamp $ LocalTime { + localDay = fromGregorian (yearAsInteger year) 12 31, + localTimeOfDay = lastSecond } + monthStart year month = localTimeToTimeStamp $ LocalTime { + localDay = fromGregorian (yearAsInteger year) (monthAsInt month) 1, + localTimeOfDay = midnight } + monthEnd year month = localTimeToTimeStamp $ LocalTime { + localDay = fromGregorian (yearAsInteger year) (monthAsInt month) 31, + localTimeOfDay = lastSecond } + dayStart year month day = localTimeToTimeStamp $ LocalTime { + localDay = fromGregorian (yearAsInteger year) (monthAsInt month) (dayAsInt day), + localTimeOfDay = midnight } + dayEnd year month day = localTimeToTimeStamp $ LocalTime { + localDay = fromGregorian (yearAsInteger year) (monthAsInt month) (dayAsInt day), + localTimeOfDay = lastSecond } + lastSecond = TimeOfDay {todHour = 23, todMin = 59, todSec = 59 } + +localTimeToTimeStamp ltime = localTimeToUTCTZ (tzByLabel Europe__Warsaw) ltime + +-- zonedTimeToUTC $ ZonedTime { +-- zonedTimeToLocalTime = ltime, +-- zonedTimeZone = timeZoneForUTCTime Europe__Warsaw } + +extractLinks xpathCondition = (downloadDocument &&& this) + >>> first (getXPathTrees xpathCondition + >>> getAttrValue "href") + >>> expandURIFixed + +extractLinksGeneralized xpathCondition attr = (downloadDocument &&& this) + >>> first (getXPathTrees xpathCondition + >>> getAttrValue attr) + >>> expandURIFixed + +rotateSecTh ((a, b), c) = ((a, c), b) + +extractLinksWithText xpathCondition = (downloadDocument &&& this) + >>> first (getXPathTrees xpathCondition + >>> (getAttrValue "href" + &&& (listA (deep isText >>> getText) + >>> arr (intercalate " ")))) + >>> arr rotateSecTh + >>> first expandURIFixed + +extractLinksWithTitle xpathCondition = (downloadDocument &&& this) + >>> first (getXPathTrees xpathCondition + >>> (getAttrValue "href" + &&& getAttrValue "title")) + >>> arr rotateSecTh + >>> first expandURIFixed + +urlPreFixer = arr (Data.List.Utils.replace "[" stupidLeftBracketMarker . Data.List.Utils.replace "]" stupidRightBracketMarker) +urlPostFixer = arr (Data.List.Utils.replace stupidLeftBracketMarker "%5B" . Data.List.Utils.replace stupidRightBracketMarker "%5D") + +stupidLeftBracketMarker = "ddsfdfdfdfgfgfrrtrtrrtr" +stupidRightBracketMarker = "wqweweerererrtrtrtrtrtr" + +expandURIFixed = (urlPreFixer *** urlPreFixer) >>> expandURI >>> urlPostFixer + + +extractText = (listA (deep isText >>> getText) + >>> arr (intercalate " ")) + +loopNext extract xpathConditionForNext = initialStep + >>> loopNextCore extract xpathConditionForNext + >>> arr fst + >>> unlistA + +initialList :: [(String, a)] +initialList = [] + +initialStep = arr (const initialList) &&& this + + +loopNextCore extract xpathConditionForNext = second + (listA extract &&& extractNext xpathConditionForNext) + >>> arr expandList + >>> ifP (nextFound) + (second (arr fromJust >>> (downloadDocument &&& this)) + >>> loopNextCore extract xpathConditionForNext) + (this) + +nextFound :: ([(String,a)], Maybe String) -> Bool +nextFound (_, Just _) = True +nextFound (_, Nothing) = False + + +expandList (l, (e, n)) = (l ++ e, n) + +extractNext xpathConditionForNext = listA + (first (getXPathTrees ("(" ++ xpathConditionForNext ++ ")[1]") + >>> getAttrValue "href") + >>> expandURIFixed) + >>> arr listToMaybe + + +extractFormat :: String -> Maybe String +extractFormat finalUrl + | ".gif" `isInfixOf` finalUrl = Just "gif" + | ".jpg" `isInfixOf` finalUrl = Just "jpg" + | ".djvu" `isInfixOf` finalUrl = Just "djvu" + | ".pdf" `isInfixOf` finalUrl = Just "pdf" + | ".doc" `isInfixOf` finalUrl = Just "doc" + | otherwise = Nothing + + + +baseMonthNameToNumber :: String -> Maybe String +baseMonthNameToNumber "styczeń" = Just "01" +baseMonthNameToNumber "styczen" = Just "01" +baseMonthNameToNumber "stycznia" = Just "01" +baseMonthNameToNumber "luty" = Just "02" +baseMonthNameToNumber "lutego" = Just "02" +baseMonthNameToNumber "marzec" = Just "03" +baseMonthNameToNumber "marca" = Just "03" +baseMonthNameToNumber "kwiecień" = Just "04" +baseMonthNameToNumber "kwiecien" = Just "04" +baseMonthNameToNumber "kwietnia" = Just "04" +baseMonthNameToNumber "maj" = Just "05" +baseMonthNameToNumber "maja" = Just "05" +baseMonthNameToNumber "czerwiec" = Just "06" +baseMonthNameToNumber "czeerwiec" = Just "06" +baseMonthNameToNumber "czerwca" = Just "06" +baseMonthNameToNumber "lipiec" = Just "07" +baseMonthNameToNumber "lipca" = Just "07" +baseMonthNameToNumber "sierpień" = Just "08" +baseMonthNameToNumber "sierpien" = Just "08" +baseMonthNameToNumber "sierpnia" = Just "08" +baseMonthNameToNumber "wrzesień" = Just "09" +baseMonthNameToNumber "wrzesien" = Just "09" +baseMonthNameToNumber "września" = Just "09" +baseMonthNameToNumber "wrzesnia" = Just "09" +baseMonthNameToNumber "październik" = Just "10" +baseMonthNameToNumber "pażdziernik" = Just "10" +baseMonthNameToNumber "pazdziernik" = Just "10" +baseMonthNameToNumber "października" = Just "10" +baseMonthNameToNumber "pazdziernika" = Just "10" +baseMonthNameToNumber "listopad" = Just "11" +baseMonthNameToNumber "listopada" = Just "11" +baseMonthNameToNumber "grudzień" = Just "12" +baseMonthNameToNumber "grudzien" = Just "12" +baseMonthNameToNumber "grudnia" = Just "12" +baseMonthNameToNumber "jesien" = Just "10" +baseMonthNameToNumber _ = Nothing + +extractYear :: String -> Maybe String +extractYear n = + case n =~~ ("(1[6789]|20)[0-9][0-9]" :: String) of + Just year -> Just year + otherwise -> Nothing + + +joinAlts :: [String] -> String +joinAlts = intercalate " // " + +clean = arr (DLU.replace "\n" "") + >>> arr strip + +extractItems shadowLibrary start extractor = do + page <- getWebPage start + items <- runX $ extractor start page +-- insertIntoDatabase shadowLibrary items + putStrLn (show items) + +extractItemsStartingFromUrl shadowLibrary start extractor = do + items <- runX $ (arr (const start) >>> setTraceLevel 1 >>> extractor) +-- insertIntoDatabase shadowLibrary items + mapM_ (putStrLn . show) items diff --git a/app/almanachmuszyny.hs b/app/almanachmuszyny.hs new file mode 100644 index 0000000..ebc4f5c --- /dev/null +++ b/app/almanachmuszyny.hs @@ -0,0 +1,46 @@ + +{-# LANGUAGE Arrows, NoMonomorphismRestriction #-} +import ShadowLibrary.Core + +import Text.XML.HXT.Core +import Text.XML.HXT.XPath +-- import Text.XML.HXT.Curl +import Data.List +import Data.List.Utils (replace) + +import Text.Regex.Posix +import Text.Printf + + +extractRecords = extractLinksWithText "//a[@class='roczniki']" -- pary adres-tytuł + >>> second (arr $ replace "\r\n " " ") -- czyścimy drugi element pary, czyli tytuł z niepotrzebnych białych znaków + >>> first (arr ((++"tr") . init)) -- modyfikujemy pierwszy element pary, czyli adres URL + >>> first (extractLinksWithText "//li/a[contains(@href,'.pdf')]") -- pobieramy stronę z adresu URL i wyciągamy linki z tej strony pasujące do wyrażenia XPathowego + -- ostatecznie wyjdą trójki ((adres URL, tytuł artykułu), tytuł rocznika) + +-- ... a tutaj te trójki przerabiamy do docelowej struktury ShadowItem +toShadowItem :: ((String, String), String) -> ShadowItem +toShadowItem ((url, articleTitle), yearlyTitle) = + (defaultShadowItem url title) { + originalDate = Just date, + itype = "periodical", + format = Just "pdf", + finalUrl = url + } + where title = "Almanach Muszyny " ++ yearlyTitle ++ " " ++ (replace "\r\n" "" (replace "\r\n " "" articleTitle)) + date = getDate url + +getDate url = + case url =~~ "/(19[0-9][0-9]|20[0-9][0-9])/" :: Maybe [[String]] of + Just [[_, year]] -> year + otherwise -> error $ "unexpected url: " ++ url + + +main = do + let start = "http://www.almanachmuszyny.pl/" + let shadowLibrary = ShadowLibrary {logoUrl=Nothing, + lname="Almanach Muszyny", + abbrev="AlmMusz", + lLevel=0, + webpage=start} + extractItemsStartingFromUrl shadowLibrary start (extractRecords >>> arr toShadowItem) diff --git a/shadow-library.cabal b/shadow-library.cabal new file mode 100644 index 0000000..cd77195 --- /dev/null +++ b/shadow-library.cabal @@ -0,0 +1,65 @@ +name: shadow-library +version: 0.1.0.0 +synopsis: Initial project template from stack +description: Please see README.md +homepage: http://github.com/name/project +license: Proprietary +license-file: LICENSE +author: Your name here +maintainer: your.address@example.com +-- copyright: +category: Web +build-type: Simple +-- extra-source-files: +cabal-version: >=1.10 + +library + hs-source-dirs: . + exposed-modules: ShadowLibrary.Core + build-depends: base >= 4.7 && < 5 + , HTTP + , hxt + , hxt-http + , hxt-xpath + , MissingH + , monad-logger + , mtl + , network-uri + , regex-pcre + , regex-tdfa + , resourcet + , text + , time + , transformers + , tz + default-language: Haskell2010 + +-- executable maly-modelarz-exe +-- hs-source-dirs: app +-- main-is: malymodelarz.hs +-- ghc-options: -threaded -rtsopts -with-rtsopts=-N +-- build-depends: base +-- , hxt +-- , hxt-curl +-- , hxt-xpath +-- , MissingH +-- , regex-posix +-- , shadow-library +-- default-language: Haskell2010 + +executable almanachmuszyny + hs-source-dirs: app + main-is: almanachmuszyny.hs + ghc-options: -threaded -rtsopts -with-rtsopts=-N + build-depends: base + , hxt + , hxt-xpath + , MissingH + , regex-posix + , shadow-library + default-language: Haskell2010 + + +source-repository head + type: git + location: https://github.com/name/project diff --git a/stack.yaml b/stack.yaml new file mode 100644 index 0000000..6c16955 --- /dev/null +++ b/stack.yaml @@ -0,0 +1,5 @@ +flags: {} +packages: +- '.' +extra-deps: [hxt-xpath-9.1.2.2] +resolver: lts-11.9 diff --git a/stack.yaml.lock b/stack.yaml.lock new file mode 100644 index 0000000..cc80ef3 --- /dev/null +++ b/stack.yaml.lock @@ -0,0 +1,19 @@ +# This file was autogenerated by Stack. +# You should not edit this file by hand. +# For more information, please see the documentation at: +# https://docs.haskellstack.org/en/stable/lock_files + +packages: +- completed: + hackage: hxt-xpath-9.1.2.2@sha256:9cd590ae93a04573db8f90fa4094625ebd97dded45da7667c577ce6b38a42900,1999 + pantry-tree: + size: 2225 + sha256: aee2f75974e868ff429b8ff349a29667536c60397098f5dfedc968d1951511bb + original: + hackage: hxt-xpath-9.1.2.2 +snapshots: +- completed: + size: 507596 + url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/11/9.yaml + sha256: 42f472dbf06482da1b3319241f3e3b3593a45bd7d4f537d2789f21386b9b2ad3 + original: lts-11.9