{-# LANGUAGE Arrows #-}
{-# LANGUAGE NoMonomorphismRestriction #-}

-- Scraper for the Polish Botanical Society repository: starting from a
-- search-results page it follows the record links, collects the PDF links
-- found on each record page and turns them into 'ShadowItem's.

import ShadowLibrary.Core

import Text.XML.HXT.Core
import Text.XML.HXT.XPath
import Text.XML.HXT.Curl

import Data.List
import Data.List.Utils (replace)

import Text.Regex.Posix
import Text.Printf

-- | Arrow that produces the raw records to be converted by 'toShadowItem'.
--
-- The stages are:
--
-- 1. take the links matching @//a[\@class='image-link']@ as
--    (URL, title) pairs;
-- 2. replace every newline in the title with a space;
-- 3. fetch the page behind each URL and extract the links on it that match
--    the PDF XPath expression.
--
-- The final result is a stream of triples
-- @((URL, article title), yearly title)@.
extractRecords =
  extractLinksWithText "//a[@class='image-link']" -- (URL, title) pairs
    >>> second (arr $ replace "\n" " ") -- flatten newlines in the title
    -- Fetch the page at the URL and pull out the links on that page that
    -- match the XPath expression.
    >>> first (extractLinksWithText "//div/a[contains(@href,'.pdf') and not(@class)]")

-- | Convert one triple produced by 'extractRecords' into the target
-- 'ShadowItem' structure.
--
-- The article title (second component of the inner pair) is not used; the
-- item title is built from the yearly title only. The date is derived from
-- the URL via 'getDate'.
toShadowItem :: ((String, String), String) -> ShadowItem
toShadowItem ((url, _articleTitle), yearlyTitle) =
  (defaultShadowItem url title)
    { originalDate = Just date
    , itype = "periodical"
    , format = Just "pdf"
    , finalUrl = url
    }
  where
    title = "Pbsociety " ++ yearlyTitle
    date = getDate url

-- | Extract the year from a URL.
--
-- The year is a four-digit number starting with @19@ or @20@ that forms a
-- whole path segment (i.e. is enclosed in slashes). The pattern below only
-- accepts a result consisting of exactly one match with one capture group;
-- any other result (no match, or several matches) aborts with
-- @error "unexpected url: ..."@.
getDate :: String -> String
getDate url =
  case url =~~ "/(19[0-9][0-9]|20[0-9][0-9])/" :: Maybe [[String]] of
    Just [[_, year]] -> year
    -- A wildcard is used instead of a variable named @otherwise@, which
    -- would merely shadow 'Prelude.otherwise'.
    _ -> error $ "unexpected url: " ++ url

-- | Entry point: describe the library and run the extraction starting from
-- the repository's search-results page.
main = do
  let start = "https://pbsociety.org.pl/repository/discover?filtertype_1=has_content_in_original_bundle&filter_relational_operator_1=equals&filter_1=true&filtertype_2=title&filter_relational_operator_2=contains&filter_2=&submit_apply_filter=Apply&query=&scope=%2F&rpp=260&sort_by=score&order=desc"
      shadowLibrary =
        ShadowLibrary
          { logoUrl = Nothing
          , lname = "Polish Botanical Society Repository"
          , abbrev = "Pbsociety"
          , lLevel = 0
          , webpage = start
          }
  extractItemsStartingFromUrl shadowLibrary start (extractRecords >>> arr toShadowItem)