isi-2021-lab3-twilight-library/ShadowLibrary/Core.hs

370 lines
14 KiB
Haskell

{-# LANGUAGE QuasiQuotes, TemplateHaskell, TypeFamilies, OverloadedStrings #-}
{-# LANGUAGE GADTs, FlexibleContexts #-}
{-# LANGUAGE Arrows, NoMonomorphismRestriction #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
module ShadowLibrary.Core where
-- import Database.Persist
-- import Database.Persist.TH
-- import Database.Persist.Postgresql
import Control.Monad.IO.Class (liftIO)
import Control.Monad.Logger (runNoLoggingT)
import Control.Monad.Trans.Resource (runResourceT)
import Control.Monad.Reader
import Data.Time
import Data.Time.Calendar
import Data.Time.LocalTime
import Data.Time.Zones
import Data.Time.Zones.All
import Data.String.Utils (strip)
import Data.List.Utils as DLU
import Data.List (groupBy)
import Text.XML.HXT.Core
import Network.HTTP
import Network.URI
import Control.Monad.Trans.Maybe
import Control.Monad
import Data.Tree.NTree.TypeDefs
import Data.Maybe
import Control.Monad.Trans
import Text.XML.HXT.XPath
-- import Text.XML.HXT.Curl
import Text.XML.HXT.HTTP
import Text.Regex.TDFA
import Data.List (isInfixOf, intercalate)
import Data.List.Utils (replace)
-- import Network.Curl.Opts
polishTimeZone = TimeZone {
timeZoneMinutes = 120,
timeZoneSummerOnly = True,
timeZoneName = ""}
openUrl :: String -> MaybeT IO String
openUrl url = case parseURI url of
Nothing -> fail ""
Just u -> liftIO (getResponseBody =<< simpleHTTP (mkRequest GET u))
getWebPage :: String -> IO (IOSArrow XmlTree (NTree XNode))
getWebPage url = do
contents <- runMaybeT $ openUrl url
return $ readString [withParseHTML yes, withWarnings no] (fromMaybe "" contents)
downloadDocument = readFromDocument [withParseHTML yes,
withWarnings no,
withEncodingErrors no,
withPreserveComment yes,
withStrictInput yes,
withHTTP []
-- withCurl [("curl--user-agent","AMU Digital Libraries Indexing Agent")]
]
downloadDocumentWithEncoding enc = readFromDocument [withParseHTML yes,
withWarnings no,
withEncodingErrors no,
withPreserveComment yes,
withInputEncoding enc,
withHTTP []]
-- withCurl []]
downloadXmlDocument = readFromDocument [withWarnings no,
withEncodingErrors no,
withHTTP []]
-- withCurl [] ]
data ShadowLibrary = ShadowLibrary { logoUrl :: Maybe String,
lname :: String,
abbrev :: String,
webpage :: String,
lLevel :: Int }
data ShadowItem = ShadowItem {
url :: Maybe String,
title :: String,
itype :: String,
originalDate :: Maybe String,
creator :: Maybe String,
format :: Maybe String,
lang :: Maybe String,
finalUrl :: String,
description :: Maybe String,
fileSize :: Maybe String
} deriving (Show)
defaultShadowItem url title = ShadowItem {
url = Just url,
title = title,
itype = "periodical",
originalDate = Nothing,
creator = Nothing,
format = Just "pdf",
lang = Just "pol",
finalUrl = url,
description = Nothing,
fileSize = Nothing}
getDuration :: Maybe String -> (Maybe UTCTime, Maybe UTCTime)
getDuration Nothing = (Nothing, Nothing)
getDuration (Just date) =
case date =~~ ("^(1[6789]|20)[0-9][0-9]$" :: String) of
Just year -> (Just (yearStart year), Just (yearEnd year))
otherwise ->
case date =~~ ("^((1[789]|20)[0-9][0-9])-(0[1-9]|1[0-2])$" :: String) :: Maybe [[String]] of
Just [[_, year, _, month]] -> (Just (monthStart year month), Just (monthEnd year month))
otherwise ->
case date =~~ ("^((1[789]|20)[0-9][0-9])-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])$" :: String) :: Maybe [[String]] of
Just [[_, year, _, month, day]] -> (Just (dayStart year month day), Just (dayEnd year month day))
otherwise ->
case date =~~ ("^((1[789]|20)[0-9][0-9])-((1[789]|20)[0-9][0-9])$" :: String) :: Maybe [[String]] of
Just [[_, yearb, _, yeare, _]] -> (Just (yearStart yearb), Just (yearEnd yeare))
otherwise -> (Nothing, Nothing)
where
yearAsInteger year = (read year) :: Integer
monthAsInt month = (read month) :: Int
dayAsInt day = (read day) :: Int
yearStart year = localTimeToTimeStamp $ LocalTime {
localDay = fromGregorian (yearAsInteger year) 1 1,
localTimeOfDay = midnight }
yearEnd year = localTimeToTimeStamp $ LocalTime {
localDay = fromGregorian (yearAsInteger year) 12 31,
localTimeOfDay = lastSecond }
monthStart year month = localTimeToTimeStamp $ LocalTime {
localDay = fromGregorian (yearAsInteger year) (monthAsInt month) 1,
localTimeOfDay = midnight }
monthEnd year month = localTimeToTimeStamp $ LocalTime {
localDay = fromGregorian (yearAsInteger year) (monthAsInt month) 31,
localTimeOfDay = lastSecond }
dayStart year month day = localTimeToTimeStamp $ LocalTime {
localDay = fromGregorian (yearAsInteger year) (monthAsInt month) (dayAsInt day),
localTimeOfDay = midnight }
dayEnd year month day = localTimeToTimeStamp $ LocalTime {
localDay = fromGregorian (yearAsInteger year) (monthAsInt month) (dayAsInt day),
localTimeOfDay = lastSecond }
lastSecond = TimeOfDay {todHour = 23, todMin = 59, todSec = 59 }
localTimeToTimeStamp ltime = localTimeToUTCTZ (tzByLabel Europe__Warsaw) ltime
-- zonedTimeToUTC $ ZonedTime {
-- zonedTimeToLocalTime = ltime,
-- zonedTimeZone = timeZoneForUTCTime Europe__Warsaw }
extractLinks xpathCondition = (downloadDocument &&& this)
>>> first (getXPathTrees xpathCondition
>>> getAttrValue "href")
>>> expandURIFixed
extractLinksGeneralized xpathCondition attr = (downloadDocument &&& this)
>>> first (getXPathTrees xpathCondition
>>> getAttrValue attr)
>>> expandURIFixed
rotateSecTh ((a, b), c) = ((a, c), b)
extractLinksWithText xpathCondition = (downloadDocument &&& this)
>>> first (getXPathTrees xpathCondition
>>> (getAttrValue "href"
&&& (listA (deep isText >>> getText)
>>> arr (intercalate " "))))
>>> arr rotateSecTh
>>> first expandURIFixed
extractLinksWithTitle xpathCondition = (downloadDocument &&& this)
>>> first (getXPathTrees xpathCondition
>>> (getAttrValue "href"
&&& getAttrValue "title"))
>>> arr rotateSecTh
>>> first expandURIFixed
urlPreFixer = arr (Data.List.Utils.replace "[" stupidLeftBracketMarker . Data.List.Utils.replace "]" stupidRightBracketMarker)
urlPostFixer = arr (Data.List.Utils.replace stupidLeftBracketMarker "%5B" . Data.List.Utils.replace stupidRightBracketMarker "%5D")
stupidLeftBracketMarker = "ddsfdfdfdfgfgfrrtrtrrtr"
stupidRightBracketMarker = "wqweweerererrtrtrtrtrtr"
expandURIFixed = (urlPreFixer *** urlPreFixer) >>> expandURI >>> urlPostFixer
extractText = (listA (deep isText >>> getText)
>>> arr (intercalate " "))
loopNext extract xpathConditionForNext = initialStep
>>> loopNextCore extract xpathConditionForNext
>>> arr fst
>>> unlistA
initialList :: [(String, a)]
initialList = []
initialStep = arr (const initialList) &&& this
loopNextCore extract xpathConditionForNext = second
(listA extract &&& extractNext xpathConditionForNext)
>>> arr expandList
>>> ifP (nextFound)
(second (arr fromJust >>> (downloadDocument &&& this))
>>> loopNextCore extract xpathConditionForNext)
(this)
nextFound :: ([(String,a)], Maybe String) -> Bool
nextFound (_, Just _) = True
nextFound (_, Nothing) = False
expandList (l, (e, n)) = (l ++ e, n)
extractNext xpathConditionForNext = listA
(first (getXPathTrees ("(" ++ xpathConditionForNext ++ ")[1]")
>>> getAttrValue "href")
>>> expandURIFixed)
>>> arr listToMaybe
extractFormat :: String -> Maybe String
extractFormat finalUrl
| ".gif" `isInfixOf` finalUrl = Just "gif"
| ".jpg" `isInfixOf` finalUrl = Just "jpg"
| ".djvu" `isInfixOf` finalUrl = Just "djvu"
| ".pdf" `isInfixOf` finalUrl = Just "pdf"
| ".doc" `isInfixOf` finalUrl = Just "doc"
| otherwise = Nothing
baseMonthNameToNumber :: String -> Maybe String
baseMonthNameToNumber "STYCZEŃ" = Just "01"
baseMonthNameToNumber "STYCZEN" = Just "01"
baseMonthNameToNumber "STYCZNIA" = Just "01"
baseMonthNameToNumber "LUTY" = Just "02"
baseMonthNameToNumber "LUTEGO" = Just "02"
baseMonthNameToNumber "MARZEC" = Just "03"
baseMonthNameToNumber "MARCA" = Just "03"
baseMonthNameToNumber "KWIECIEŃ" = Just "04"
baseMonthNameToNumber "KWIECIEN" = Just "04"
baseMonthNameToNumber "KWIETNIA" = Just "04"
baseMonthNameToNumber "MAJ" = Just "05"
baseMonthNameToNumber "MAJA" = Just "05"
baseMonthNameToNumber "CZERWIEC" = Just "06"
baseMonthNameToNumber "CZEERWIEC" = Just "06"
baseMonthNameToNumber "CZERWCA" = Just "06"
baseMonthNameToNumber "LIPIEC" = Just "07"
baseMonthNameToNumber "LIPCA" = Just "07"
baseMonthNameToNumber "SIERPIEŃ" = Just "08"
baseMonthNameToNumber "SIERPIEN" = Just "08"
baseMonthNameToNumber "SIERPNIA" = Just "08"
baseMonthNameToNumber "WRZESIEŃ" = Just "09"
baseMonthNameToNumber "WRZESIEN" = Just "09"
baseMonthNameToNumber "WRZEŚNIA" = Just "09"
baseMonthNameToNumber "WRZESNIA" = Just "09"
baseMonthNameToNumber "PAŹDZIERNIK" = Just "10"
baseMonthNameToNumber "PAŻDZIERNIK" = Just "10"
baseMonthNameToNumber "PAZDZIERNIK" = Just "10"
baseMonthNameToNumber "PAŹDZIERNIKA" = Just "10"
baseMonthNameToNumber "PAZDZIERNIKA" = Just "10"
baseMonthNameToNumber "LISTOPAD" = Just "11"
baseMonthNameToNumber "LISTOPADA" = Just "11"
baseMonthNameToNumber "GRUDZIEŃ" = Just "12"
baseMonthNameToNumber "GRUDZIEN" = Just "12"
baseMonthNameToNumber "GRUDNIA" = Just "12"
baseMonthNameToNumber "GRUDZEN" = Just "12"
baseMonthNameToNumber "JESIEN" = Just "10"
baseMonthNameToNumber _ = Nothing
extractYear :: String -> Maybe String
extractYear n =
case n =~~ ("(1[6789]|20)[0-9][0-9]" :: String) of
Just year -> Just year
otherwise -> Nothing
joinAlts :: [String] -> String
joinAlts = intercalate " // "
clean = arr (DLU.replace "\n" "")
>>> arr strip
extractItems shadowLibrary start extractor = do
page <- getWebPage start
items <- runX $ extractor start page
-- insertIntoDatabase shadowLibrary items
putStrLn (show items)
getTitle :: ShadowItem -> String
getTitle (ShadowItem _ title _ _ _ _ _ _ _ _) = title
isEmpty :: String -> Bool
isEmpty str = length str == 0
removeTitleless :: [ShadowItem] -> [ShadowItem]
removeTitleless = filter (\si -> not .isEmpty . getTitle $ si)
shadowItemWithDate url title date desc = ShadowItem {
url = Just url,
title = title,
itype = "periodical",
originalDate = extractMonthAndYear date,
creator = Nothing,
format = Just "pdf",
lang = Just "pol",
finalUrl = url,
description = Nothing,
fileSize = extractFileSize desc}
unpackMonth :: Maybe String -> String
unpackMonth month = case month of
Just m -> m
Nothing -> ""
extractMonthAndYear :: String -> Maybe String
extractMonthAndYear n =
case n =~~ ("([A-Z]*) ((19|20)[0-9][0-9])" :: String) :: Maybe [[String]] of
Just [[_, month, year, _]] -> Just (year ++ "-" ++ unpackMonth (baseMonthNameToNumber month))
otherwise ->
case n =~~ ("((19|20)[0-9][0-9])" :: String) of
Just year -> Just year
otherwise -> Nothing
extractFileSize :: String -> Maybe String
extractFileSize n =
case n =~~ ("([0-9]*)\\,([0-9]*) MB" :: String) of
Just fileSize -> Just fileSize
otherwise -> Nothing
getUrl :: ShadowItem -> String
getUrl (ShadowItem (Just url) _ _ _ _ _ _ _ _ _) = url
getUrl (ShadowItem Nothing _ _ _ _ _ _ _ _ _) = ""
groupByUrl :: [ShadowItem] -> [[ShadowItem]]
groupByUrl = groupBy (\si1 si2 -> (getUrl si1) == (getUrl si2))
fromSimilar :: (ShadowItem, ShadowItem, ShadowItem) -> ShadowItem
fromSimilar (s1, s2, s3) = shadowItemWithDate (getUrl s1) (getTitle s1) (getTitle s2) (getTitle s3)
toTuple :: [ShadowItem] -> (ShadowItem, ShadowItem, ShadowItem)
toTuple (x1:x2:x3:xs) = (x1, x2, x3)
mergeSimilar :: [ShadowItem] -> [ShadowItem]
mergeSimilar shadowItems =
let grouped = groupByUrl shadowItems
in map (\shadowItems -> if length shadowItems == 3
then
fromSimilar . toTuple $ shadowItems
else shadowItems !! 0) grouped
extractItemsStartingFromUrl shadowLibrary start extractor = do
items <- runX $ (arr (const start) >>> setTraceLevel 1 >>> extractor)
mapM_ (putStrLn . show) (mergeSimilar . removeTitleless $ items)