{-# LANGUAGE Arrows #-}
{-# LANGUAGE NoMonomorphismRestriction #-}

import ShadowLibrary.Core

import Text.XML.HXT.Core
import Text.XML.HXT.XPath
-- import Text.XML.HXT.Curl
import Data.List
import Data.List.Utils (replace)
import Text.Regex.Posix
import Text.Printf

-- | Arrow yielding (address, title) pairs for every anchor whose @href@
-- contains ".pdf", with the title passed through 'cleanTitle'.
--
-- No type signature is given on purpose: the arrow type comes from
-- 'extractLinksWithText' in "ShadowLibrary.Core" and is left to inference
-- (the file enables NoMonomorphismRestriction).
extractRecords =
  extractLinksWithText "//a[contains(@href, '.pdf')]" -- address-title pairs
    >>> second (arr cleanTitle)                         -- tidy up the title only

-- | Strip the layout whitespace found in link titles and replace selected
-- Polish letters with ASCII look-alikes.
--
-- The substitutions are applied strictly in the order listed in
-- 'titleReplacements'; the longer whitespace run must be removed before the
-- shorter one, otherwise a stray tab would be left behind.
cleanTitle :: String -> String
cleanTitle rawTitle = foldl' substitute rawTitle titleReplacements
  where
    substitute acc (needle, replacement) = replace needle replacement acc

-- | Ordered (needle, replacement) table used by 'cleanTitle'.
titleReplacements :: [(String, String)]
titleReplacements =
  [ ("\n\t\t\t\t\t\t\t\t\t\t\t", "") -- unwanted whitespace in the title
  , ("\n\t\t\t\t\t\t\t\t\t\t", "")   -- same, one tab shorter
  , ("\324", "n") -- U+0144 (n with acute)
  , ("\281", "e") -- U+0119 (e with ogonek)
  , ("\380", "z") -- U+017C (z with dot above)
  , ("\322", "l") -- U+0142 (l with stroke)
  , ("\243", "o") -- U+00F3 (o with acute)
  , ("\347", "s") -- U+015B (s with acute)
  , ("\263", "c") -- U+0107 (c with acute)
  , ("\346", "S") -- U+015A (capital S with acute)
  ]

-- | Convert an (address, title) pair into the target 'ShadowItem' structure.
--
-- The item starts from @defaultShadowItem url title@ and overrides the date,
-- item type, format, final URL and description. Date and description are
-- both parsed out of the title text.
toShadowItem :: (String, String) -> ShadowItem
toShadowItem (url, articleTitle) =
  (defaultShadowItem url articleTitle)
    { originalDate = extractMonthAndYear articleTitle
    , itype        = "periodical"
    , format       = Just "pdf"
    , finalUrl     = url
    , description  = extractFileSize articleTitle
    }

-- | First substring of the form "<lowercase letters> <year>", where the year
-- is four digits starting with 16, 17, 18, 19 or 20. The letter run may be
-- empty. Returns 'Nothing' when the title contains no such substring.
extractMonthAndYear :: String -> Maybe String
extractMonthAndYear title =
  -- (=~~) in the Maybe monad already yields Nothing on a failed match,
  -- so no extra case analysis is needed.
  title =~~ ("[a-z]* (1[6789]|20)[0-9][0-9]" :: String)

-- | First substring of the form "<digits>.<digits> <unit>" with unit one of
-- B, kB or MB. Returns 'Nothing' when the title contains no such substring.
extractFileSize :: String -> Maybe String
extractFileSize title =
  title =~~ ("([0-9]*)\\.([0-9]*) (B|kB|MB)" :: String)

-- | Entry point: crawl the start page and turn every extracted record into
-- a 'ShadowItem' via 'extractItemsStartingFromUrl'.
main = do
  let start = "http://podkarpacki.civitaschristiana.pl/formacja/zeszyty-formacyjne/"
  let shadowLibrary = ShadowLibrary
        { logoUrl = Nothing
        , lname   = "Podkarpacki"
        , abbrev  = "podk"
        , lLevel  = 0
        , webpage = start
        }
  extractItemsStartingFromUrl shadowLibrary start (extractRecords >>> arr toShadowItem)