2022-03-23 12:43:37 +01:00
|
|
|
|
|
|
|
{-# LANGUAGE Arrows, NoMonomorphismRestriction #-}
|
|
|
|
import ShadowLibrary.Core
|
|
|
|
|
|
|
|
import Text.XML.HXT.Core
|
|
|
|
import Text.XML.HXT.XPath
|
|
|
|
import Data.List
|
|
|
|
import Data.List.Utils (replace)
|
|
|
|
|
|
|
|
import Text.Regex.Posix
|
|
|
|
import Text.Printf
|
|
|
|
|
|
|
|
|
2022-03-28 21:06:41 +02:00
|
|
|
extractRecords = extractLinksWithText "//div[@class='artifact-title']/a" -- pary adres-tytuł
|
2022-03-30 10:31:59 +02:00
|
|
|
|
|
|
|
extractPages = extractLinksWithText "//div[@class='pagination-masked clearfix top']//a[@class='next-page-link']" -- pary adres-tytuł
|
|
|
|
|
|
|
|
extractPublicationFiles = extractLinksWithText "//div[@class='file-link']/a" -- pary adres-tytuł
|
2022-03-23 12:43:37 +01:00
|
|
|
|
2022-03-28 21:06:41 +02:00
|
|
|
runExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractPages)
|
|
|
|
|
|
|
|
runDocumentsExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractRecords)
|
|
|
|
|
2022-03-30 10:31:59 +02:00
|
|
|
runFileExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractPublicationFiles)
|
|
|
|
|
2022-03-28 21:06:41 +02:00
|
|
|
|
|
|
|
mapToUrl :: ([Char], [Char]) -> [Char]
|
|
|
|
mapToUrl (url, title) = url
|
|
|
|
|
2022-03-30 10:31:59 +02:00
|
|
|
|
2022-03-28 21:06:41 +02:00
|
|
|
merge [] ys = ys
|
|
|
|
merge (x:xs) ys = x:merge ys xs
|
|
|
|
|
|
|
|
|
|
|
|
withEmptyCheck current [] = do
|
|
|
|
publications <- runDocumentsExtractor current
|
|
|
|
let publicationUrls = map mapToUrl publications
|
2022-03-30 10:31:59 +02:00
|
|
|
publicationFiles <- mapM runFileExtractor publicationUrls
|
|
|
|
let publicationFileUrls = map mapToUrl (map head publicationFiles)
|
|
|
|
return publicationFileUrls
|
2022-03-28 21:06:41 +02:00
|
|
|
|
|
|
|
withEmptyCheck current nextUrls = do
|
|
|
|
let single = head nextUrls
|
|
|
|
publications <- runDocumentsExtractor current
|
|
|
|
let publicationUrls = map mapToUrl publications
|
2022-03-30 10:31:59 +02:00
|
|
|
publicationFiles <- mapM runFileExtractor publicationUrls
|
|
|
|
let publicationFileUrls = map mapToUrl (map head publicationFiles)
|
2022-03-28 21:06:41 +02:00
|
|
|
recursive <- getAllPages single
|
2022-03-30 10:31:59 +02:00
|
|
|
let results = merge publicationFileUrls recursive
|
2022-03-28 21:06:41 +02:00
|
|
|
return results
|
|
|
|
|
2022-03-30 10:31:59 +02:00
|
|
|
|
2022-03-28 21:06:41 +02:00
|
|
|
getAllPages url = do
|
|
|
|
items <- runExtractor url
|
|
|
|
let urls = map mapToUrl items
|
|
|
|
results <- (withEmptyCheck url urls)
|
|
|
|
return results
|
|
|
|
|
2022-03-23 12:43:37 +01:00
|
|
|
|
|
|
|
main = do
|
2022-03-28 21:06:41 +02:00
|
|
|
let start = "https://pbsociety.org.pl/repository/discover?filtertype=has_content_in_original_bundle&filter_relational_operator=equals&filter=true"
|
|
|
|
results <- getAllPages start
|
|
|
|
print results
|