import ShadowLibrary.Core
import Text.XML.HXT.Core
import Text.XML.HXT.XPath
import Text.XML.HXT.Curl
import Data.List
import Data.List.Utils (replace)
import Text.Regex.Posix
import Text.Printf

-- | Download the document and, for every node selected by @xpathCondition@,
-- collect its @href@ attribute, its @title@ attribute and the text of all
-- descendant text nodes joined with single spaces.
--
-- The collected triple is paired with the input of the arrow, rearranged by
-- 'rotateSecTh' and then 'expandURIFixed' is applied to the first component.
-- NOTE(review): both helpers are defined outside this file; presumably the
-- result is @(absoluteUrl, (titleAttribute, linkText))@ -- confirm against
-- ShadowLibrary.Core.
extractLinksWithTitleAndText xpathCondition =
  (downloadDocument &&& this)
    >>> first
          ( getXPathTrees xpathCondition
              >>> ( getAttrValue "href"
                      &&& getAttrValue "title"
                      &&& (listA (deep isText >>> getText) >>> arr (intercalate " "))
                  )
          )
    >>> arr rotateSecTh
    >>> first expandURIFixed

--extractRecords = extractCustomLinks "//tr[@class=''] | //tr[@class='_a']" "//td[@cell-header='Nazwa pliku']/a" "//td[@class='size']"

-- | Select the PDF download links: anchors whose @href@ contains @.pdf@ and
-- whose @target@ is @DownloadWin@, located in table cells whose
-- @cell-header@ attribute does not contain @Nazwa pliku@.
extractRecords =
  extractLinksWithTitleAndText
    "//td[not(contains(@cell-header,'Nazwa pliku'))]/a[contains(@href,'.pdf') and @target='DownloadWin']"

-- | Build a 'ShadowItem' from one extracted record.
--
-- The input is @(url, (info, articleTitle))@: @info@ is searched for the
-- file size and @articleTitle@ (after stripping the leading "Plik: " prefix)
-- is used as the item title and as the source of the year and article number.
--
-- NOTE(review): when no year is found, 'originalDate' still receives the
-- "No date for: ..." fallback text produced by 'getDate'; this is the
-- pre-existing behaviour and is kept as is.
toShadowItem :: (String, (String, String)) -> ShadowItem
toShadowItem (url, (info, articleTitle)) =
  (defaultShadowItem url cleanTitle)
    { originalDate = Just year
    , itype = "periodical"
    , finalUrl = url
    , format = fileFormat
    , description = Just summary
    , lang = Just "pol"
    }
  where
    cleanTitle = replace "\r\n \r\n Plik: " "" articleTitle
    summary = "Article nr: " ++ getArticleNr cleanTitle ++ " Size: " ++ getFileSize info
    year = getDate cleanTitle
    -- Named differently from the record field so that the update above does
    -- not read as the self-referential @format = format@.
    fileFormat = extractFormat url

-- | Return the text captured by the first parenthesised group of the first
-- occurrence of @regex@ in @input@, or 'Nothing' when the regex does not
-- match (or has no capture group).
--
-- Only the first occurrence is inspected, so inputs containing the pattern
-- several times still yield a result instead of being treated as a miss.
firstCaptureGroup :: String -> String -> Maybe String
firstCaptureGroup regex input =
  case input =~~ regex :: Maybe [[String]] of
    Just ((_ : captured : _) : _) -> Just captured
    _ -> Nothing

-- | Extract the first four-digit year (19xx or 20xx) from the title.
-- Falls back to a "No date for: ..." message when none is present.
getDate :: String -> String
getDate title =
  maybe
    ("No date for: " ++ title)
    id
    (firstCaptureGroup "(19[0-9][0-9]|20[0-9][0-9])" title)

-- | Extract the first two-digit number that is directly followed by a slash
-- and return it without the slash.
-- Falls back to a "No article nr for: ..." message when none is present.
getArticleNr :: String -> String
getArticleNr title =
  maybe
    ("No article nr for: " ++ title)
    (replace "/" "")
    (firstCaptureGroup "([0-9][0-9]/)" title)

-- | Extract the file size that follows the "Rozmiar: " label and return it
-- without the label.
-- Falls back to a "No file size for: ..." message when none is present.
getFileSize :: String -> String
getFileSize info =
  maybe
    ("No file size for: " ++ info)
    (replace "Rozmiar: " "")
    -- The unescaped '.' matches any single character between the digit runs;
    -- kept unchanged so that the set of accepted inputs stays the same.
    (firstCaptureGroup "(Rozmiar: [0-9]+.[0-9]+[A-Z]+)" info)

-- | Entry point: extract the records starting from the archive page and
-- turn each one into a 'ShadowItem'.
main = do
  let start = "https://www.bierun.pl/mieszkancy/archiwum"
      shadowLibrary =
        ShadowLibrary
          { logoUrl = Nothing
          , lname = "Miasto Bieruń"
          , abbrev = "Bieruń"
          , lLevel = 0
          , webpage = start
          }
  extractItemsStartingFromUrl shadowLibrary start (extractRecords >>> arr toShadowItem)