From b56459626dc68851c8502657ce1b2c98245867fc Mon Sep 17 00:00:00 2001
From: Jakub Adamski
Date: Mon, 28 Mar 2022 21:06:41 +0200
Subject: [PATCH] bot-working

---
 app/pbsociety.hs | 49 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 46 insertions(+), 3 deletions(-)

diff --git a/app/pbsociety.hs b/app/pbsociety.hs
index 4891207..f1bfb07 100644
--- a/app/pbsociety.hs
+++ b/app/pbsociety.hs
@@ -12,11 +12,14 @@ import Text.Regex.Posix
 import Text.Printf
 
-extractRecords = extractLinksWithText "//a" -- address-title pairs
+extractRecords = extractLinksWithText "//div[@class='artifact-title']/a" -- address-title pairs
 -- >>> second (arr $ replace "\r\n " " ") -- clean the second element of the pair (the title) of unneeded whitespace
 -- >>> first (arr ((++"tr") . init)) -- modify the first element of the pair, i.e. the URL
 -- >>> first (extractLinksWithText "//li/a[contains(@href,'.pdf')]") -- fetch the page at that URL and extract the links matching the XPath expression
 -- ultimately this yields triples ((URL, article title), yearbook title)
 
+-- extract the "next page" link from the paginated result listing
+extractPages = extractLinksWithText "//div[@class='pagination-masked clearfix top']//a[@class='next-page-link']"
+
 -- ... and here we turn those triples into the target ShadowItem structure
 toShadowItem :: ((String, String), String) -> ShadowItem
 toShadowItem ((url, articleTitle), yearlyTitle) =
@@ -35,7 +38,47 @@ getDate url =
     Just [[_, year]] -> year
     otherwise -> error $ "unexpected url: " ++ url
 
+-- run the page-link extractor / the record extractor against a single URL
+runExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractPages)
+
+runDocumentsExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractRecords)
+
+runExtractorMultiple (url, title) = runExtractor url -- currently unused
+
+-- keep only the URL of a (url, title) pair
+mapToUrl :: ([Char], [Char]) -> [Char]
+mapToUrl (url, title) = url
+
+-- interleave two lists; note that page order is not preserved
+merge [] ys = ys
+merge (x:xs) ys = x : merge ys xs
+
+addDimension array = [array] -- currently unused
+
+-- no further pages: return the publication URLs found on the current page
+withEmptyCheck current [] = do
+  publications <- runDocumentsExtractor current
+  let publicationUrls = map mapToUrl publications
+  return publicationUrls
+
+-- otherwise follow the first "next page" link and combine both result sets
+withEmptyCheck current nextUrls = do
+  let single = head nextUrls
+  publications <- runDocumentsExtractor current
+  let publicationUrls = map mapToUrl publications
+  -- TODO: combine the full (url, title) pairs, not just the URLs
+  recursive <- getAllPages single
+  let results = merge publicationUrls recursive
+  return results
+
+-- crawl the whole pagination chain starting from the given URL
+getAllPages url = do
+  items <- runExtractor url
+  let urls = map mapToUrl items
+  withEmptyCheck url urls
+
 main = do
-  let start = "https://pbsociety.org.pl/repository/"
+  let start = "https://pbsociety.org.pl/repository/discover?filtertype=has_content_in_original_bundle&filter_relational_operator=equals&filter=true"
   let shadowLibrary = ShadowLibrary {logoUrl=Nothing, lname="Polskie Towarzystwo Botaniczne", abbrev="PBSociety", lLevel=0, webpage=start}
-  extractItemsStartingFromUrl shadowLibrary start extractRecords
+  results <- getAllPages start
+  print results
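
Note on the extractors: runExtractor and runDocumentsExtractor lean on the repo's
extractLinksWithText XPath helper. For readers without that helper, a rough
raw-HXT equivalent of the next-page extraction might look like the sketch below.
This is an illustration, not the patch's code: it assumes the hxt and hxt-curl
packages, nextPageLinks is a hypothetical name, the class test is an exact match,
and the enclosing div scoping from the XPath above is omitted.

    import Text.XML.HXT.Core
    import Text.XML.HXT.Curl (withCurl)  -- assumption: hxt-curl for HTTP access

    -- fetch a page and return the href of every <a class="next-page-link">
    nextPageLinks :: String -> IO [String]
    nextPageLinks url = runX $
      readDocument [withParseHTML yes, withWarnings no, withCurl []] url
      >>> deep (isElem >>> hasName "a"
                >>> hasAttrValue "class" (== "next-page-link"))
      >>> getAttrValue "href"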
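
Note on the crawl logic: getAllPages and withEmptyCheck implement a recursive
pagination walk: scrape the current page's records, follow the first next-page
link if one exists, and combine the two result lists. Below is a minimal,
self-contained sketch of that control flow with HXT and the network replaced by
a hypothetical in-memory site table; Page, site, and the sample URLs are
illustrative stand-ins, not part of the patch.

    import qualified Data.Map as Map

    -- one listing page: its record URLs plus an optional next-page link
    data Page = Page { records :: [String], next :: Maybe String }

    -- hypothetical three-page result listing chained by next-page links
    site :: Map.Map String Page
    site = Map.fromList
      [ ("p1", Page ["a.pdf", "b.pdf"] (Just "p2"))
      , ("p2", Page ["c.pdf"]          (Just "p3"))
      , ("p3", Page ["d.pdf"]          Nothing)
      ]

    -- same recursion as getAllPages/withEmptyCheck: collect this page's
    -- records, then recurse while a next-page link exists
    getAllPages :: String -> [String]
    getAllPages url = case Map.lookup url site of
      Nothing             -> []
      Just (Page rs more) -> rs ++ maybe [] getAllPages more

    main :: IO ()
    main = print (getAllPages "p1")  -- ["a.pdf","b.pdf","c.pdf","d.pdf"]

One behavioral difference: the sketch appends with (++), so results stay in page
order, whereas the patch's merge interleaves the current page's URLs with those
returned from later pages.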