{-# LANGUAGE Arrows, NoMonomorphismRestriction #-}

import ShadowLibrary.Core

import Text.XML.HXT.Core
import Text.XML.HXT.XPath
-- import Text.XML.HXT.Curl
import Data.List
import Data.List.Utils (replace)
import Text.Regex.Posix
import Text.Printf

-- | Record extractor: hands 'extractLinks' an XPath expression that selects
-- every anchor whose @href@ attribute contains ".pdf".
-- NOTE(review): what 'extractLinks' emits is defined in "ShadowLibrary.Core";
-- 'main' composes it with @arr toShadowItem@, so it is presumably the link
-- URL as a 'String' -- confirm.
extractRecords = extractLinks "//a[contains(@href, '.pdf')]"

-- | Build a 'ShadowItem' for a PDF link, deriving the metadata from the URL.
--
-- * 'originalDate' is the last four-digit year (19xx or 20xx) found in the
--   URL, or 'Nothing' when the URL contains no such year.
-- * The title is the run of letters and spaces taken from the last URL
--   fragment of the form @/letters-@ or @/letters_@, or the empty string
--   when no such fragment exists.
--
-- Earlier revisions applied 'last' directly to the match lists, which raised
-- an empty-list exception for URLs lacking either component.
toShadowItem :: String -> ShadowItem
toShadowItem url =
  (defaultShadowItem url title)
    { originalDate = lastMatchOf "(19[0-9][0-9]|20[0-9][0-9])"
    , itype = "periodical"
    , format = Just "pdf"
    , finalUrl = url
    }
  where
    -- Last (right-most) match of a POSIX regex in the URL, if there is one.
    lastMatchOf :: String -> Maybe String
    lastMatchOf regex =
      case reverse (getAllTextMatches (url =~ regex) :: [String]) of
        (found : _) -> Just found
        []          -> Nothing

    -- Strip the leading '/' and the trailing '-' / '_' from the matched
    -- fragment by re-matching only its letters and spaces.
    title :: String
    title =
      case lastMatchOf "/[a-zA-Z ]+[-_]" of
        Just fragment -> fragment =~ "[a-zA-Z ]+"
        Nothing       -> ""

-- | Entry point: describes the "Teatr Lalek" library and passes it, the start
-- URL and the extraction arrow to 'extractItemsStartingFromUrl'.
main = do
  let start = "http://polunima.pl/teatr-lalek/"
  let shadowLibrary =
        ShadowLibrary
          { logoUrl = Nothing
          , lname = "Teatr Lalek"
          , abbrev = "Teatr"
          , lLevel = 0
          , webpage = start
          }
  extractItemsStartingFromUrl shadowLibrary start (extractRecords >>> arr toShadowItem)