{-# LANGUAGE Arrows #-}
{-# LANGUAGE NoMonomorphismRestriction #-}

-- Scraper for the "Kosmos" periodical PDF index at kosmos.icm.edu.pl.
-- It follows the links found on the start page, collects the PDF links found
-- behind each of them, and turns every PDF link into a 'ShadowItem'.

import ShadowLibrary.Core

import Text.XML.HXT.Core
import Text.XML.HXT.XPath

import Data.List
import Data.List.Utils (replace)

import Text.Regex.Posix
import Text.Printf

-- | Arrow producing one @((url, articleTitle), yearlyTitle)@ record per PDF
-- link (the shape is fixed by how 'toShadowItem' destructures it).
--
-- The first stage extracts every @\<a\>@ link together with its text; the
-- second stage is applied to the link component only (via 'first') and
-- extracts the links whose @href@ contains @.pdf@.
--
-- NOTE(review): 'extractLinksWithText' comes from "ShadowLibrary.Core";
-- presumably it downloads the page at the given URL and yields
-- @(href, linkText)@ pairs -- confirm against that module.
--
-- No type signature is given on purpose: the type is inferred from
-- 'extractLinksWithText' (hence @NoMonomorphismRestriction@).
extractRecords =
  extractLinksWithText "//a"
    >>> first (extractLinksWithText "//a[contains(@href,'.pdf')]")

-- | Build a 'ShadowItem' for a single PDF link.
--
-- The argument is @((url, articleTitle), yearlyTitle)@ as produced by
-- 'extractRecords'. The item title is @"Kosmos "@ followed by the yearly
-- title and the article title with every @".pdf"@ occurrence removed; the
-- original date is whatever 'extractMonthAndYear' finds in that title.
toShadowItem :: ((String, String), String) -> ShadowItem
toShadowItem ((url, articleTitle), yearlyTitle) =
  (defaultShadowItem url title)
    { originalDate = extractMonthAndYear title
    , itype = "periodical"
    , format = Just "pdf"
    , finalUrl = url
    }
  where
    -- The two title parts are concatenated without a separator, exactly as
    -- the scraped texts come in.
    title = "Kosmos " ++ yearlyTitle ++ replace ".pdf" "" articleTitle

-- | Return the first four-digit year (16xx through 20xx) occurring in the
-- given string, or 'Nothing' when there is none.
--
-- Despite the name, only a year is matched by the pattern -- no month.
extractMonthAndYear :: String -> Maybe String
extractMonthAndYear n =
  -- '=~~' in the 'Maybe' monad already yields @Just match@ / @Nothing@, so
  -- the result is returned as-is instead of being re-wrapped by a @case@.
  n =~~ ("(1[6789]|20)[0-9][0-9]" :: String)

-- | Entry point: describe the library and run the extraction starting from
-- the PDF index page.
--
-- NOTE(review): the result type is determined by
-- 'extractItemsStartingFromUrl' in "ShadowLibrary.Core" (presumably
-- @IO ()@) -- add an explicit signature once confirmed.
main = do
  let start = "http://kosmos.icm.edu.pl/PDF/"
  let shadowLibrary =
        ShadowLibrary
          { logoUrl = Nothing
          , lname = "kosmos.icm.edu"
          , abbrev = "Kos"
          , lLevel = 0
          , webpage = start
          }
  extractItemsStartingFromUrl
    shadowLibrary
    start
    (extractRecords >>> arr toShadowItem)