{-# LANGUAGE QuasiQuotes, TemplateHaskell, TypeFamilies, OverloadedStrings #-} {-# LANGUAGE GADTs, FlexibleContexts #-} {-# LANGUAGE Arrows, NoMonomorphismRestriction #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} module ShadowLibrary.Core where -- import Database.Persist -- import Database.Persist.TH -- import Database.Persist.Postgresql import Control.Monad.IO.Class (liftIO) import Control.Monad.Logger (runNoLoggingT) import Control.Monad.Trans.Resource (runResourceT) import Control.Monad.Reader import Data.Time import Data.Time.Calendar import Data.Time.LocalTime import Data.Time.Zones import Data.Time.Zones.All import Data.String.Utils (strip) import Data.List.Utils as DLU import Text.XML.HXT.Core import Network.HTTP import Network.URI import Control.Monad.Trans.Maybe import Control.Monad import Data.Tree.NTree.TypeDefs import Data.Maybe import Control.Monad.Trans import Text.XML.HXT.XPath -- import Text.XML.HXT.Curl import Text.XML.HXT.HTTP import Text.Regex.TDFA import Data.List (isInfixOf, intercalate) import Data.List.Utils (replace) -- import Network.Curl.Opts polishTimeZone = TimeZone { timeZoneMinutes = 120, timeZoneSummerOnly = True, timeZoneName = ""} openUrl :: String -> MaybeT IO String openUrl url = case parseURI url of Nothing -> fail "" Just u -> liftIO (getResponseBody =<< simpleHTTP (mkRequest GET u)) getWebPage :: String -> IO (IOSArrow XmlTree (NTree XNode)) getWebPage url = do contents <- runMaybeT $ openUrl url return $ readString [withParseHTML yes, withWarnings no] (fromMaybe "" contents) downloadDocument = readFromDocument [withParseHTML yes, withWarnings no, withEncodingErrors no, withPreserveComment yes, withStrictInput yes, withHTTP [] -- withCurl [("curl--user-agent","AMU Digital Libraries Indexing Agent")] ] downloadDocumentWithEncoding enc = readFromDocument [withParseHTML yes, withWarnings no, withEncodingErrors no, withPreserveComment yes, withInputEncoding enc, withHTTP []] -- withCurl []] downloadXmlDocument = readFromDocument [withWarnings no, withEncodingErrors no, withHTTP []] -- withCurl [] ] data ShadowLibrary = ShadowLibrary { logoUrl :: Maybe String, lname :: String, abbrev :: String, webpage :: String, lLevel :: Int } data ShadowItem = ShadowItem { url :: Maybe String, title :: String, itype :: String, originalDate :: Maybe String, creator :: Maybe String, format :: Maybe String, lang :: Maybe String, finalUrl :: String, description :: Maybe String } deriving (Show) defaultShadowItem url title = ShadowItem { url = Just url, title = title, itype = "periodical", originalDate = Nothing, creator = Nothing, format = Just "pdf", lang = Just "pol", finalUrl = url, description = Nothing } getDuration :: Maybe String -> (Maybe UTCTime, Maybe UTCTime) getDuration Nothing = (Nothing, Nothing) getDuration (Just date) = case date =~~ ("^(1[6789]|20)[0-9][0-9]$" :: String) of Just year -> (Just (yearStart year), Just (yearEnd year)) otherwise -> case date =~~ ("^((1[789]|20)[0-9][0-9])-(0[1-9]|1[0-2])$" :: String) :: Maybe [[String]] of Just [[_, year, _, month]] -> (Just (monthStart year month), Just (monthEnd year month)) otherwise -> case date =~~ ("^((1[789]|20)[0-9][0-9])-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])$" :: String) :: Maybe [[String]] of Just [[_, year, _, month, day]] -> (Just (dayStart year month day), Just (dayEnd year month day)) otherwise -> case date =~~ ("^((1[789]|20)[0-9][0-9])-((1[789]|20)[0-9][0-9])$" :: String) :: Maybe [[String]] of Just [[_, yearb, _, yeare, _]] -> (Just (yearStart yearb), Just (yearEnd yeare)) otherwise -> (Nothing, Nothing) where yearAsInteger year = (read year) :: Integer monthAsInt month = (read month) :: Int dayAsInt day = (read day) :: Int yearStart year = localTimeToTimeStamp $ LocalTime { localDay = fromGregorian (yearAsInteger year) 1 1, localTimeOfDay = midnight } yearEnd year = localTimeToTimeStamp $ LocalTime { localDay = fromGregorian (yearAsInteger year) 12 31, localTimeOfDay = lastSecond } monthStart year month = localTimeToTimeStamp $ LocalTime { localDay = fromGregorian (yearAsInteger year) (monthAsInt month) 1, localTimeOfDay = midnight } monthEnd year month = localTimeToTimeStamp $ LocalTime { localDay = fromGregorian (yearAsInteger year) (monthAsInt month) 31, localTimeOfDay = lastSecond } dayStart year month day = localTimeToTimeStamp $ LocalTime { localDay = fromGregorian (yearAsInteger year) (monthAsInt month) (dayAsInt day), localTimeOfDay = midnight } dayEnd year month day = localTimeToTimeStamp $ LocalTime { localDay = fromGregorian (yearAsInteger year) (monthAsInt month) (dayAsInt day), localTimeOfDay = lastSecond } lastSecond = TimeOfDay {todHour = 23, todMin = 59, todSec = 59 } localTimeToTimeStamp ltime = localTimeToUTCTZ (tzByLabel Europe__Warsaw) ltime -- zonedTimeToUTC $ ZonedTime { -- zonedTimeToLocalTime = ltime, -- zonedTimeZone = timeZoneForUTCTime Europe__Warsaw } extractLinks xpathCondition = (downloadDocument &&& this) >>> first (getXPathTrees xpathCondition >>> getAttrValue "href") >>> expandURIFixed extractLinksGeneralized xpathCondition attr = (downloadDocument &&& this) >>> first (getXPathTrees xpathCondition >>> getAttrValue attr) >>> expandURIFixed rotateSecTh ((a, b), c) = ((a, c), b) extractLinksWithText xpathCondition = (downloadDocument &&& this) >>> first (getXPathTrees xpathCondition >>> (getAttrValue "href" &&& (listA (deep isText >>> getText) >>> arr (intercalate " ")))) >>> arr rotateSecTh >>> first expandURIFixed extractLinksWithTitle xpathCondition = (downloadDocument &&& this) >>> first (getXPathTrees xpathCondition >>> (getAttrValue "href" &&& getAttrValue "title")) >>> arr rotateSecTh >>> first expandURIFixed urlPreFixer = arr (Data.List.Utils.replace "[" stupidLeftBracketMarker . Data.List.Utils.replace "]" stupidRightBracketMarker) urlPostFixer = arr (Data.List.Utils.replace stupidLeftBracketMarker "%5B" . Data.List.Utils.replace stupidRightBracketMarker "%5D") stupidLeftBracketMarker = "ddsfdfdfdfgfgfrrtrtrrtr" stupidRightBracketMarker = "wqweweerererrtrtrtrtrtr" expandURIFixed = (urlPreFixer *** urlPreFixer) >>> expandURI >>> urlPostFixer extractText = (listA (deep isText >>> getText) >>> arr (intercalate " ")) loopNext extract xpathConditionForNext = initialStep >>> loopNextCore extract xpathConditionForNext >>> arr fst >>> unlistA initialList :: [(String, a)] initialList = [] initialStep = arr (const initialList) &&& this loopNextCore extract xpathConditionForNext = second (listA extract &&& extractNext xpathConditionForNext) >>> arr expandList >>> ifP (nextFound) (second (arr fromJust >>> (downloadDocument &&& this)) >>> loopNextCore extract xpathConditionForNext) (this) nextFound :: ([(String,a)], Maybe String) -> Bool nextFound (_, Just _) = True nextFound (_, Nothing) = False expandList (l, (e, n)) = (l ++ e, n) extractNext xpathConditionForNext = listA (first (getXPathTrees ("(" ++ xpathConditionForNext ++ ")[1]") >>> getAttrValue "href") >>> expandURIFixed) >>> arr listToMaybe extractFormat :: String -> Maybe String extractFormat finalUrl | ".gif" `isInfixOf` finalUrl = Just "gif" | ".jpg" `isInfixOf` finalUrl = Just "jpg" | ".djvu" `isInfixOf` finalUrl = Just "djvu" | ".pdf" `isInfixOf` finalUrl = Just "pdf" | ".doc" `isInfixOf` finalUrl = Just "doc" | otherwise = Nothing baseMonthNameToNumber :: String -> Maybe String baseMonthNameToNumber "styczeń" = Just "01" baseMonthNameToNumber "styczen" = Just "01" baseMonthNameToNumber "stycznia" = Just "01" baseMonthNameToNumber "luty" = Just "02" baseMonthNameToNumber "lutego" = Just "02" baseMonthNameToNumber "marzec" = Just "03" baseMonthNameToNumber "marca" = Just "03" baseMonthNameToNumber "kwiecień" = Just "04" baseMonthNameToNumber "kwiecien" = Just "04" baseMonthNameToNumber "kwietnia" = Just "04" baseMonthNameToNumber "maj" = Just "05" baseMonthNameToNumber "maja" = Just "05" baseMonthNameToNumber "czerwiec" = Just "06" baseMonthNameToNumber "czeerwiec" = Just "06" baseMonthNameToNumber "czerwca" = Just "06" baseMonthNameToNumber "lipiec" = Just "07" baseMonthNameToNumber "lipca" = Just "07" baseMonthNameToNumber "sierpień" = Just "08" baseMonthNameToNumber "sierpien" = Just "08" baseMonthNameToNumber "sierpnia" = Just "08" baseMonthNameToNumber "wrzesień" = Just "09" baseMonthNameToNumber "wrzesien" = Just "09" baseMonthNameToNumber "września" = Just "09" baseMonthNameToNumber "wrzesnia" = Just "09" baseMonthNameToNumber "październik" = Just "10" baseMonthNameToNumber "pażdziernik" = Just "10" baseMonthNameToNumber "pazdziernik" = Just "10" baseMonthNameToNumber "października" = Just "10" baseMonthNameToNumber "pazdziernika" = Just "10" baseMonthNameToNumber "listopad" = Just "11" baseMonthNameToNumber "listopada" = Just "11" baseMonthNameToNumber "grudzień" = Just "12" baseMonthNameToNumber "grudzien" = Just "12" baseMonthNameToNumber "grudnia" = Just "12" baseMonthNameToNumber "jesien" = Just "10" baseMonthNameToNumber _ = Nothing extractYear :: String -> Maybe String extractYear n = case n =~~ ("(1[6789]|20)[0-9][0-9]" :: String) of Just year -> Just year otherwise -> Nothing joinAlts :: [String] -> String joinAlts = intercalate " // " clean = arr (DLU.replace "\n" "") >>> arr strip extractItems shadowLibrary start extractor = do page <- getWebPage start items <- runX $ extractor start page -- insertIntoDatabase shadowLibrary items putStrLn (show items) getTitle :: ShadowItem -> String getTitle (ShadowItem _ title _ _ _ _ _ _ _) = title isEmpty :: String -> Bool isEmpty str = length str == 0 removeTitleless :: [ShadowItem] -> [ShadowItem] removeTitleless = filter (\si -> not .isEmpty . getTitle $ si) extractItemsStartingFromUrl shadowLibrary start extractor = do items <- runX $ (arr (const start) >>> setTraceLevel 1 >>> extractor) mapM_ (putStrLn . show) (removeTitleless items)