{-# LANGUAGE Arrows, NoMonomorphismRestriction #-} import ShadowLibrary.Core import Text.XML.HXT.Core import Text.XML.HXT.XPath import Data.List import Data.List.Utils (replace) import Text.Regex.Posix import Text.Printf extractRecords = extractLinksWithText "//div[@class='artifact-title']/a" -- pary adres-tytuł extractPages = extractLinksWithText "//div[@class='pagination-masked clearfix top']//a[@class='next-page-link']" -- pary adres-tytuł extractPublicationFiles = extractLinksWithText "//div[@class='file-link']/a" -- pary adres-tytuł runExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractPages) runDocumentsExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractRecords) runFileExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractPublicationFiles) mapToUrl :: ([Char], [Char]) -> [Char] mapToUrl (url, title) = url merge [] ys = ys merge (x:xs) ys = x:merge ys xs toShadowItem :: (([Char], [Char]), ([Char], [Char])) -> ShadowItem toShadowItem ((fileUrl, fileTitle), (publicationUrl, publicationTitle)) = (defaultShadowItem fileUrl title) { itype = "periodical", format = Just "pdf", finalUrl = fileUrl } where title = "Polskie Towarzystwo Botaniczne " ++ publicationTitle withEmptyCheck current [] = do publications <- runDocumentsExtractor current let publicationUrls = map mapToUrl publications publicationFiles <- mapM runFileExtractor publicationUrls let publicationFileItems = map toShadowItem (zip (map head publicationFiles) publications) return publicationFileItems withEmptyCheck current nextUrls = do let single = head nextUrls publications <- runDocumentsExtractor current let publicationUrls = map mapToUrl publications publicationFiles <- mapM runFileExtractor publicationUrls let publicationFileItems = map toShadowItem (zip (map head publicationFiles) publications) recursive <- getAllPages single let results = merge publicationFileItems recursive return results getAllPages url = do items <- runExtractor url let urls = map mapToUrl items results <- (withEmptyCheck url urls) return results main = do let start = "https://pbsociety.org.pl/repository/discover?filtertype=has_content_in_original_bundle&filter_relational_operator=equals&filter=true" results <- getAllPages start print results