From b56459626dc68851c8502657ce1b2c98245867fc Mon Sep 17 00:00:00 2001
From: Jakub Adamski
Date: Mon, 28 Mar 2022 21:06:41 +0200
Subject: [PATCH] bot-working

---
 app/pbsociety.hs | 49 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 46 insertions(+), 3 deletions(-)

diff --git a/app/pbsociety.hs b/app/pbsociety.hs
index 4891207..f1bfb07 100644
--- a/app/pbsociety.hs
+++ b/app/pbsociety.hs
@@ -12,11 +12,14 @@ import Text.Regex.Posix
 import Text.Printf
 
-extractRecords = extractLinksWithText "//a" -- address-title pairs
+extractRecords = extractLinksWithText "//div[@class='artifact-title']/a" -- address-title pairs
 -- >>> second (arr $ replace "\r\n " " ") -- clean the second element of the pair (the title) of unneeded whitespace
 -- >>> first (arr ((++"tr") . init)) -- modify the first element of the pair, i.e. the URL
 -- >>> first (extractLinksWithText "//li/a[contains(@href,'.pdf')]") -- fetch the page at that URL and extract the links matching the XPath expression
 -- ultimately this yields triples ((URL, article title), yearbook title)
 
+-- extract the "next page" link from the paginated result listing
+extractPages = extractLinksWithText "//div[@class='pagination-masked clearfix top']//a[@class='next-page-link']"
+
 -- ... and here we turn those triples into the target ShadowItem structure
 toShadowItem :: ((String, String), String) -> ShadowItem
 toShadowItem ((url, articleTitle), yearlyTitle) =
@@ -35,7 +38,47 @@ getDate url =
     Just [[_, year]] -> year
     otherwise -> error $ "unexpected url: " ++ url
 
+-- run the page-link extractor / the record extractor against a single URL
+runExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractPages)
+
+runDocumentsExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractRecords)
+
+runExtractorMultiple (url, title) = runExtractor url -- currently unused
+
+-- keep only the URL of a (url, title) pair
+mapToUrl :: ([Char], [Char]) -> [Char]
+mapToUrl (url, title) = url
+
+-- interleave two lists; note that page order is not preserved
+merge [] ys = ys
+merge (x:xs) ys = x : merge ys xs
+
+addDimension array = [array] -- currently unused
+
+-- no further pages: return the publication URLs found on the current page
+withEmptyCheck current [] = do
+  publications <- runDocumentsExtractor current
+  let publicationUrls = map mapToUrl publications
+  return publicationUrls
+
+-- otherwise follow the first "next page" link and combine both result sets
+withEmptyCheck current nextUrls = do
+  let single = head nextUrls
+  publications <- runDocumentsExtractor current
+  let publicationUrls = map mapToUrl publications
+  -- TODO: combine the full (url, title) pairs, not just the URLs
+  recursive <- getAllPages single
+  let results = merge publicationUrls recursive
+  return results
+
+-- crawl the whole pagination chain starting from the given URL
+getAllPages url = do
+  items <- runExtractor url
+  let urls = map mapToUrl items
+  withEmptyCheck url urls
+
 main = do
-  let start = "https://pbsociety.org.pl/repository/"
+  let start = "https://pbsociety.org.pl/repository/discover?filtertype=has_content_in_original_bundle&filter_relational_operator=equals&filter=true"
   let shadowLibrary = ShadowLibrary {logoUrl=Nothing, lname="Polskie Towarzystwo Botaniczne", abbrev="PBSociety", lLevel=0, webpage=start}
-  extractItemsStartingFromUrl shadowLibrary start extractRecords
+  results <- getAllPages start
+  print results
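
Note on the extractors: runExtractor and runDocumentsExtractor lean on the repo's
extractLinksWithText XPath helper. For readers without that helper, a rough
raw-HXT equivalent of the next-page extraction might look like the sketch below.
This is an illustration, not the patch's code: it assumes the hxt and hxt-curl
packages, nextPageLinks is a hypothetical name, the class test is an exact match,
and the enclosing div scoping from the XPath above is omitted.

    import Text.XML.HXT.Core
    import Text.XML.HXT.Curl (withCurl)  -- assumption: hxt-curl for HTTP access

    -- fetch a page and return the href of every <a class="next-page-link">
    nextPageLinks :: String -> IO [String]
    nextPageLinks url = runX $
      readDocument [withParseHTML yes, withWarnings no, withCurl []] url
      >>> deep (isElem >>> hasName "a"
                >>> hasAttrValue "class" (== "next-page-link"))
      >>> getAttrValue "href"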
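
Note on the crawl logic: getAllPages and withEmptyCheck implement a recursive
pagination walk: scrape the current page's records, follow the first next-page
link if one exists, and combine the two result lists. Below is a minimal,
self-contained sketch of that control flow with HXT and the network replaced by
a hypothetical in-memory site table; Page, site, and the sample URLs are
illustrative stand-ins, not part of the patch.

    import qualified Data.Map as Map

    -- one listing page: its record URLs plus an optional next-page link
    data Page = Page { records :: [String], next :: Maybe String }

    -- hypothetical three-page result listing chained by next-page links
    site :: Map.Map String Page
    site = Map.fromList
      [ ("p1", Page ["a.pdf", "b.pdf"] (Just "p2"))
      , ("p2", Page ["c.pdf"]          (Just "p3"))
      , ("p3", Page ["d.pdf"]          Nothing)
      ]

    -- same recursion as getAllPages/withEmptyCheck: collect this page's
    -- records, then recurse while a next-page link exists
    getAllPages :: String -> [String]
    getAllPages url = case Map.lookup url site of
      Nothing             -> []
      Just (Page rs more) -> rs ++ maybe [] getAllPages more

    main :: IO ()
    main = print (getAllPages "p1")  -- ["a.pdf","b.pdf","c.pdf","d.pdf"]

One behavioral difference: the sketch appends with (++), so results stay in page
order, whereas the patch's merge interleaves the current page's URLs with those
returned from later pages.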