import ShadowLibrary.Core

import Text.XML.HXT.Core
import Text.XML.HXT.XPath
import Text.XML.HXT.Curl
import Data.List
import Data.List.Utils (replace)

import Text.Regex.Posix
import Text.Printf

-- Single-XPath variant of 'extractCustomLinks', kept for reference (disabled).
--
--extractCustomLinks xpathCondition1 = (downloadDocument &&& this)
--                       >>> first (getXPathTrees xpathCondition1
--                       >>> (getAttrValue "href"
--                       &&& getAttrValue "title"
--                       ))
--                       >>> arr rotateSecTh
--                       >>> first expandURIFixed

-- | Arrow that downloads the document for the current input and extracts
-- link data from it using three XPath expressions.
--
-- The input is paired with itself via @downloadDocument &&& this@; the
-- downloaded side is narrowed with @xpathCondition1@, after which
-- @xpathCondition2@ feeds the @href@/@title@ attribute pair and
-- @xpathCondition3@ feeds the @cell-header@ attribute. The result is passed
-- through 'rotateSecThX' and then 'expandURIFixed' is applied to both
-- components.
--
-- NOTE(review): 'rotateSecThX' and 'expandURIFixed' are not defined in this
-- file (presumably they come from "ShadowLibrary.Core"); the exact tuple
-- shape they expect could not be confirmed here, so no type signature is
-- given. The arrow expression is kept exactly as it was.
extractCustomLinks xpathCondition1 xpathCondition2 xpathCondition3 =
  (downloadDocument &&& this)
    >>> first
      ( getXPathTrees xpathCondition1
          >>> first (getXPathTrees xpathCondition2 >>> (getAttrValue "href" &&& getAttrValue "title"))
          >>> second (getXPathTrees xpathCondition3 >>> getAttrValue "cell-header")
      )
    >>> arr rotateSecThX
    >>> first expandURIFixed
    >>> second expandURIFixed

-- | 'extractCustomLinks' specialised to the archive page: table rows with
-- class @''@ or @'_a'@, the anchor inside the @'Nazwa pliku'@ ("file name")
-- cell, and the cell with class @'size'@.
extractRecords =
  extractCustomLinks
    "//tr[@class=''] | //tr[@class='_a']"
    "//td[@cell-header='Nazwa pliku']/a"
    "//td[@class='size']"
--extractRecords = extractCustomLinks "//td[@cell-header='Nazwa pliku']/a"

-- | Build a 'ShadowItem' for one extracted @(url, title)@ pair.
--
-- Starts from @defaultShadowItem url title@ and overrides the date, item
-- type (@"periodical"@), format (@"pdf"@), final URL and description.
--
-- NOTE(review): when no year is found, 'getDate' returns its
-- @"No date for: ..."@ message and that message is stored in 'originalDate'.
-- The description also always ends in @" size: "@ with nothing appended.
-- Both look unintended -- confirm before relying on these fields.
toShadowItem :: (String, String) -> ShadowItem
toShadowItem (url, articleTitle) =
  (defaultShadowItem url title)
    { originalDate = Just date
    , itype = "periodical"
    , format = Just "pdf"
    , finalUrl = url
    , description = Just desc
    }
  where
    title = "Miasto Bierun: " ++ " url: " ++ url ++ " title: " ++ articleTitle
    date = getDate articleTitle
    desc = getArticleNr articleTitle ++ " size: "

-- | Extract a four-digit year (19xx or 20xx) from a title.
--
-- The year is returned only when the regex result has the shape of a single
-- match with one capture group; for any other result the string
-- @"No date for: " ++ title@ is returned instead.
getDate :: String -> String
getDate title =
  case title =~~ "(19[0-9][0-9]|20[0-9][0-9])" :: Maybe [[String]] of
    Just [[_, year]] -> year
    -- Wildcard instead of @otherwise@: in a case alternative @otherwise@ is
    -- just a fresh variable that shadows the Prelude guard helper.
    _ -> "No date for: " ++ title

-- | Extract a two-digit issue number that is followed by a slash.
--
-- On a single match with one capture group the result is
-- @"Article nr: "@ followed by the captured text with every @"/"@ removed;
-- for any other result @"No article nr for: " ++ title@ is returned.
getArticleNr :: String -> String
getArticleNr title =
  case title =~~ "([0-9][0-9]/)" :: Maybe [[String]] of
    Just [[_, nr]] -> "Article nr: " ++ replace "/" "" nr
    -- Wildcard rather than a shadowing @otherwise@ binder (see 'getDate').
    _ -> "No article nr for: " ++ title

-- | Entry point: describe the library and run the extraction from the
-- archive start page, converting every extracted record with 'toShadowItem'.
main = do
  let start = "https://www.bierun.pl/mieszkancy/archiwum"
  let shadowLibrary =
        ShadowLibrary
          { logoUrl = Nothing
          , lname = "Miasto Bieruń"
          , abbrev = "Bieruń"
          , lLevel = 0
          , webpage = start
          }
  extractItemsStartingFromUrl shadowLibrary start (extractRecords >>> arr toShadowItem)