{-# LANGUAGE Arrows, NoMonomorphismRestriction #-}

-- | Crawler for the Polish Botanical Society repository ("discover" listing):
-- walks the paginated result list, visits every publication entry, and
-- collects the URL of the first downloadable file attached to each one.

import ShadowLibrary.Core

import Text.XML.HXT.Core
import Text.XML.HXT.XPath

import Data.List
import Data.List.Utils (replace)
import Data.Maybe (listToMaybe, mapMaybe)
import Text.Regex.Posix
import Text.Printf

-- | (url, title) pairs for each publication entry on a listing page.
extractRecords = extractLinksWithText "//div[@class='artifact-title']/a"

-- | (url, title) pair for the "next page" pagination link (empty when on
-- the last page).
extractPages = extractLinksWithText "//div[@class='pagination-masked clearfix top']//a[@class='next-page-link']"

-- | (url, title) pairs for the downloadable files on a publication page.
extractPublicationFiles = extractLinksWithText "//div[@class='file-link']/a"

-- Run the respective extractor arrow against a single URL.
runExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractPages)
runDocumentsExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractRecords)
runFileExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractPublicationFiles)

-- | Keep only the URL half of an extracted (url, title) pair.
mapToUrl :: ([Char], [Char]) -> [Char]
mapToUrl (url, title) = url

-- | Round-robin interleave of two lists (kept as in the original; the order
-- of the collected file URLs is not significant for this crawler).
merge [] ys = ys
merge (x:xs) ys = x : merge ys xs

-- | For every publication linked on the listing page 'current', fetch its
-- page and return the URL of its first file link. Publications that expose
-- no file link are skipped ('mapMaybe listToMaybe') instead of crashing —
-- the previous 'map head' was partial and died on an empty link list.
extractFileUrls current = do
  publications <- runDocumentsExtractor current
  let publicationUrls = map mapToUrl publications
  publicationFiles <- mapM runFileExtractor publicationUrls
  return (map mapToUrl (mapMaybe listToMaybe publicationFiles))

-- | Process the current listing page; when a next-page link exists, recurse
-- into it and merge the results. Pattern-matching '(next:_)' replaces the
-- partial 'head nextUrls' of the original.
withEmptyCheck current [] = extractFileUrls current
withEmptyCheck current (next:_) = do
  publicationFileUrls <- extractFileUrls current
  recursive <- getAllPages next
  return (merge publicationFileUrls recursive)

-- | Crawl all listing pages reachable from 'url' via next-page links and
-- return every collected file URL.
getAllPages url = do
  items <- runExtractor url
  withEmptyCheck url (map mapToUrl items)

main = do
  let start = "https://pbsociety.org.pl/repository/discover?filtertype=has_content_in_original_bundle&filter_relational_operator=equals&filter=true"
  results <- getAllPages start
  print results