twilight-library/app/pbsociety.hs

74 lines
2.3 KiB
Haskell
Raw Normal View History

2022-03-23 12:43:37 +01:00
{-# LANGUAGE Arrows, NoMonomorphismRestriction #-}
import ShadowLibrary.Core
import Text.XML.HXT.Core
import Text.XML.HXT.XPath
import Data.List
import Data.List.Utils (replace)
import Text.Regex.Posix
import Text.Printf
2022-03-28 21:06:41 +02:00
extractRecords = extractLinksWithText "//div[@class='artifact-title']/a" -- pary adres-tytuł
2022-03-30 10:31:59 +02:00
extractPages = extractLinksWithText "//div[@class='pagination-masked clearfix top']//a[@class='next-page-link']" -- pary adres-tytuł
extractPublicationFiles = extractLinksWithText "//div[@class='file-link']/a" -- pary adres-tytuł
2022-03-23 12:43:37 +01:00
2022-03-28 21:06:41 +02:00
runExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractPages)
runDocumentsExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractRecords)
2022-03-30 10:31:59 +02:00
runFileExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractPublicationFiles)
2022-03-28 21:06:41 +02:00
mapToUrl :: ([Char], [Char]) -> [Char]
mapToUrl (url, title) = url
2022-03-30 10:31:59 +02:00
2022-03-28 21:06:41 +02:00
merge [] ys = ys
merge (x:xs) ys = x:merge ys xs
2022-05-09 09:14:23 +02:00
toShadowItem :: (([Char], [Char]), ([Char], [Char])) -> ShadowItem
toShadowItem ((fileUrl, fileTitle), (publicationUrl, publicationTitle)) =
(defaultShadowItem fileUrl title) {
2022-05-09 08:42:14 +02:00
itype = "periodical",
format = Just "pdf",
2022-05-09 09:14:23 +02:00
finalUrl = fileUrl
2022-05-09 08:42:14 +02:00
}
2022-05-09 09:14:23 +02:00
where title = "Polskie Towarzystwo Botaniczne " ++ publicationTitle
2022-05-09 08:42:14 +02:00
2022-03-28 21:06:41 +02:00
withEmptyCheck current [] = do
publications <- runDocumentsExtractor current
let publicationUrls = map mapToUrl publications
2022-03-30 10:31:59 +02:00
publicationFiles <- mapM runFileExtractor publicationUrls
2022-05-09 09:14:23 +02:00
let publicationFileItems = map toShadowItem (zip (map head publicationFiles) publications)
2022-05-09 08:42:14 +02:00
return publicationFileItems
2022-03-28 21:06:41 +02:00
withEmptyCheck current nextUrls = do
let single = head nextUrls
publications <- runDocumentsExtractor current
let publicationUrls = map mapToUrl publications
2022-03-30 10:31:59 +02:00
publicationFiles <- mapM runFileExtractor publicationUrls
2022-05-09 09:14:23 +02:00
let publicationFileItems = map toShadowItem (zip (map head publicationFiles) publications)
2022-03-28 21:06:41 +02:00
recursive <- getAllPages single
2022-05-09 08:42:14 +02:00
let results = merge publicationFileItems recursive
2022-03-28 21:06:41 +02:00
return results
2022-03-30 10:31:59 +02:00
2022-03-28 21:06:41 +02:00
getAllPages url = do
items <- runExtractor url
let urls = map mapToUrl items
results <- (withEmptyCheck url urls)
return results
2022-03-23 12:43:37 +01:00
main = do
2022-03-28 21:06:41 +02:00
let start = "https://pbsociety.org.pl/repository/discover?filtertype=has_content_in_original_bundle&filter_relational_operator=equals&filter=true"
results <- getAllPages start
print results