bot-working

2022-03-28 21:06:41 +02:00 · 2022-03-28 21:06:41 +02:00 · b56459626d
commit b56459626d
parent e8a02c5f07
1 changed files with 59 additions and 3 deletions
--- a/app/pbsociety.hs
+++ b/app/pbsociety.hs
@@ -12,12 +12,14 @@ import Text.Regex.Posix
 import Text.Printf


-extractRecords = extractLinksWithText "//a"  -- pary adres-tytuł
+extractRecords = extractLinksWithText "//div[@class='artifact-title']/a"  -- (URL, title) pairs taken from the links inside "artifact-title" divs
                 -- >>> second (arr $ replace "\r\n            " " ") -- clean the second element of the pair, i.e. the title, of unneeded whitespace
                 -- >>> first (arr ((++"tr") . init))  -- modify the first element of the pair, i.e. the URL
                 -- >>> first (extractLinksWithText "//li/a[contains(@href,'.pdf')]") -- fetch the page at that URL and extract the links on it that match the XPath expression
                 -- in the end this yields triples ((URL, article title), yearly volume title)

+-- | Extractor for the "next-page-link" anchors found inside the top
+-- pagination bar; callers in this file treat each result as a (URL, text) pair.
+extractPages = extractLinksWithText "//div[@class='pagination-masked clearfix top']//a[@class='next-page-link']" 
+
 -- ... a tutaj te trójki przerabiamy do docelowej struktury ShadowItem
 toShadowItem :: ((String, String), String) -> ShadowItem
 toShadowItem ((url, articleTitle), yearlyTitle) =
@@ -35,12 +37,66 @@ getDate url =
    Just [[_, year]] -> year
    otherwise -> error $ "unexpected url: " ++ url

+-- | Run the pagination extractor ('extractPages') against @url@ and collect
+-- its results. The trace level is set to 1 before extraction.
+runExtractor url = runX pagesFrom
+  where
+    pagesFrom = arr (const url) >>> setTraceLevel 1 >>> extractPages
+
+-- | Run the record extractor ('extractRecords') against @url@ and collect
+-- its results. The trace level is set to 1 before extraction.
+runDocumentsExtractor url = runX recordsFrom
+  where
+    recordsFrom = arr (const url) >>> setTraceLevel 1 >>> extractRecords
+
+-- | Variant of 'runExtractor' that accepts a (URL, title) pair and ignores
+-- the title.
+runExtractorMultiple (url, _) = runExtractor url
+
+-- | Project the URL (first component) out of a (URL, title) pair.
+mapToUrl :: ([Char], [Char]) -> [Char]
+mapToUrl = fst
+
+-- | Interleave two lists: elements are taken alternately, starting with the
+-- first list; once one list runs out the rest of the other is appended.
+-- Note that this is not concatenation.
+merge :: [a] -> [a] -> [a]
+merge left right = case left of
+  []         -> right
+  (l : rest) -> l : merge right rest
+
+-- | Wrap a single value in a one-element list.
+addDimension :: a -> [a]
+addDimension element = element : []
+
+-- | Collect the publication URLs listed on @current@ and, when a next-page
+-- link was found, on the pages reachable through it.
+--
+-- @nextUrls@ holds the next-page URLs scraped from @current@. Only the first
+-- one is followed; an empty list ends the crawl.
+withEmptyCheck current nextUrls = do
+  publicationUrls <- map mapToUrl <$> runDocumentsExtractor current
+  case nextUrls of
+    [] -> return publicationUrls
+    -- TODO how to combine publications? 'merge' alternates the elements of
+    -- its two arguments, so the combined list is not in page order.
+    (next : _) -> merge publicationUrls <$> getAllPages next
+
+-- | Crawl the listing starting at @url@: look up the next-page links on that
+-- page and hand them, together with @url@ itself, to 'withEmptyCheck'.
+getAllPages url = do
+  nextLinks <- runExtractor url
+  withEmptyCheck url (map mapToUrl nextLinks)
+

+-- | Entry point: crawl the repository's discovery listing from @start@ and
+-- print the list of URLs returned by 'getAllPages'.
 main = do
-    let start = "https://pbsociety.org.pl/repository/"
+    let start = "https://pbsociety.org.pl/repository/discover?filtertype=has_content_in_original_bundle&filter_relational_operator=equals&filter=true"
    let shadowLibrary = ShadowLibrary {logoUrl=Nothing,
                                       lname="Polskie Towarzystwo Botaniczne",
                                       abbrev="PBSociety",
                                       lLevel=0,
                                       webpage=start}
-    extractItemsStartingFromUrl shadowLibrary start extractRecords
+
+    -- NOTE(review): with the extractItemsStartingFromUrl call removed,
+    -- shadowLibrary is no longer used by any live statement in main.
+    --items <- runExtractor start
+    --let items2 = map mapToUrl items
+    --items3 <- mapM runExtractor items2
+
+
+    --let items3 = map runExtractor items2
+    --extractItemsStartingFromUrl shadowLibrary start extractRecords2
+    --map putStrLn items3
+    --mapM_ (putStrLn . show) items
+    --putStrLn items3
+
+    -- Follow the pagination from the start page and dump the collected URLs.
+    results <- getAllPages start
+    print results
+    --print items