{-# LANGUAGE Arrows, NoMonomorphismRestriction #-}

-- Scraper that turns the links found under https://www.ptd.pl/?page_id=7
-- (Polskie Towarzystwo Dendrologiczne) into 'ShadowItem' records.

import ShadowLibrary.Core

import Text.XML.HXT.Core
import Text.XML.HXT.XPath
import Data.List
import Data.List.Utils (replace)
import Text.Regex.Posix
import Text.Printf

-- | Arrow that follows its input, selects every tree matching
-- @xpathCondition@ and, for each such tree, pairs the @href@ of the anchors
-- nested in it with all text nodes of that tree joined by single spaces.
--
-- The original input is kept alongside via 'this' and handed, together with
-- the extracted pair, to 'rotateSecTh' and then 'expandURIFixed'.
-- NOTE(review): presumably the input is a page URL, 'rotateSecTh' regroups
-- @((href, text), input)@ into @((href, input), text)@ and 'expandURIFixed'
-- resolves @href@ against @input@ -- confirm against ShadowLibrary.Core.
--
-- No type signature on purpose: the type is inferred from the library
-- arrows (the file enables NoMonomorphismRestriction for that reason).
extractNestedLinksWithText xpathCondition =
  (downloadDocument &&& this)
    >>> first
          ( getXPathTrees xpathCondition
              >>> ( (getXPathTrees "//a" >>> getAttrValue "href")
                      &&& (listA (deep isText >>> getText) >>> arr (intercalate " "))
                  )
          )
    >>> arr rotateSecTh
    >>> first expandURIFixed

-- | Arrow producing the raw records consumed by 'toShadowItem'.
--
-- Step 1 collects links (with their text) whose @href@ contains @id@ from the
-- entry content; step 2 rewrites @http:@ to @https:@ in the link; step 3
-- replaces the link by the nested links/texts found on the page it points to.
extractRecords =
  extractLinksWithText "//div[@class='entry-content']/p/a[contains(@href, 'id')]"
    >>> first (arr $ replace "http:" "https:")
    >>> first
          ( extractNestedLinksWithText
              "//div[@class='entry-content']/p[strong[a]] | //div[@class='entry-content']/p[a]"
          )

-- | Build a 'ShadowItem' from @((url, articleTitle), magazineTitle)@.
--
-- The item title is @magazineTitle ++ " - " ++ articleTitle@ with the
-- @" – pdf"@ marker (en dash, U+2013) removed from the article title.
-- 'originalDate' is always a 'Just'; it holds the empty string when
-- 'getYear' cannot find a year in the URL.
toShadowItem :: ((String, String), String) -> ShadowItem
toShadowItem ((url, articleTitle), magazineTitle) =
  (defaultShadowItem url title)
    { originalDate = Just date
    , itype = "periodical"
    , format = Just "pdf"
    , finalUrl = url
    }
  where
    title = magazineTitle ++ " - " ++ replace " \8211 pdf" "" articleTitle
    date = getYear url

-- | Derive a publication year from a URL.
--
-- * A path segment @/rocznikNN/@ yields @"19NN"@.
-- * Otherwise a path segment @/19NN/@ or @/20NN/@ yields that year.
-- * Otherwise the result is @""@.
--
-- The @[[String]]@ regex context returns every match in the URL; only the
-- first one is used, so a URL containing several matching segments still
-- yields a year instead of falling through.
getYear :: String -> String
getYear url =
  case url =~~ "/(rocznik[0-9][0-9])/" :: Maybe [[String]] of
    -- Each match is [whole match, capture group 1].
    Just ((_ : rawYear : _) : _) -> "19" ++ replace "rocznik" "" rawYear
    _ ->
      case url =~~ "/(19[0-9][0-9]|20[0-9][0-9])/" :: Maybe [[String]] of
        Just ((_ : year : _) : _) -> year
        _ -> ""

-- | Entry point: describe the library and run the extraction from its
-- base URL, converting every extracted record with 'toShadowItem'.
main = do
  let baseUrl = "https://www.ptd.pl/?page_id=7"
  let shadowLibrary =
        ShadowLibrary
          { logoUrl = Nothing
          , lname = "Polskie Towarzystwo Dendrologiczne"
          , abbrev = "ptd"
          , lLevel = 0
          , webpage = baseUrl
          }
  extractItemsStartingFromUrl shadowLibrary baseUrl (extractRecords >>> arr toShadowItem)