added file extraction

2022-03-30 10:31:59 +02:00 · 2022-03-30 10:31:59 +02:00 · 4e6c365121
commit 4e6c365121
parent b56459626d
1 changed files with 12 additions and 51 deletions
--- a/app/pbsociety.hs
+++ b/app/pbsociety.hs
@ -4,7 +4,6 @@ import ShadowLibrary.Core

 import Text.XML.HXT.Core
 import Text.XML.HXT.XPath
-- import Text.XML.HXT.Curl
 import Data.List
 import Data.List.Utils (replace)

@ -13,64 +12,44 @@ import Text.Printf


 extractRecords = extractLinksWithText "//div[@class='artifact-title']/a"  -- pary adres-tytuł
-                 -- >>> second (arr $ replace "\r\n            " " ") -- czyścimy drugi element pary, czyli tytuł z niepotrzebnych białych znaków
-                 -- >>> first (arr ((++"tr") . init))  -- modyfikujemy pierwszy element pary, czyli adres URL
-                 -- >>> first (extractLinksWithText "//li/a[contains(@href,'.pdf')]") -- pobieramy stronę z adresu URL i wyciągamy linki z tej strony pasujące do wyrażenia XPathowego
-                 -- ostatecznie wyjdą trójki ((adres URL, tytuł artykułu), tytuł rocznika)

-extractPages = extractLinksWithText "//div[@class='pagination-masked clearfix top']//a[@class='next-page-link']" 
+extractPages = extractLinksWithText "//div[@class='pagination-masked clearfix top']//a[@class='next-page-link']" -- (URL, link text) pairs for the next-page link

-- ... a tutaj te trójki przerabiamy do docelowej struktury ShadowItem
-toShadowItem :: ((String, String), String) -> ShadowItem
-toShadowItem ((url, articleTitle), yearlyTitle) =
-  (defaultShadowItem url title) {
-    originalDate = Just date,
-    itype = "periodical",
-    format = Just "pdf",
-    finalUrl = url
-    }
-  where title = "Almanach Muszyny " ++ yearlyTitle ++ " " ++ (replace "\r\n" "" (replace "\r\n          " "" articleTitle))
-        date = getDate url
-
-getDate url =
-  case url =~~ "/(19[0-9][0-9]|20[0-9][0-9])/" :: Maybe [[String]] of
-    Just [[_, year]] -> year
-    otherwise -> error $ "unexpected url: " ++ url
+extractPublicationFiles = extractLinksWithText "//div[@class='file-link']/a" -- (URL, link text) pairs for each file link

 runExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractPages)

 runDocumentsExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractRecords)

-runExtractorMultiple (url, title) = runExtractor url
+runFileExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractPublicationFiles) -- feeds url into extractPublicationFiles with HXT trace level 1
+

 mapToUrl :: ([Char], [Char]) -> [Char]
 mapToUrl (url, title) = url

+
 merge [] ys = ys
 merge (x:xs) ys = x:merge ys xs

-addDimension array = [array]

 withEmptyCheck current [] = do 
  publications <- runDocumentsExtractor current
  let publicationUrls = map mapToUrl publications
-  return publicationUrls
+  publicationFiles <- mapM runFileExtractor publicationUrls  -- run the file-link extractor on every publication URL
+  let publicationFileUrls = map mapToUrl (concatMap (take 1) publicationFiles)  -- first file link per publication; total, unlike head: publications without a link are skipped
+  return publicationFileUrls
  
 withEmptyCheck current nextUrls = do
  let single = head nextUrls
  publications <- runDocumentsExtractor current
  let publicationUrls = map mapToUrl publications
-  --print publicationUrls
-  --TODO how to combine publications?
-  --let mapped = map addDimension publications
-  --print mapped
-  --print current
+  publicationFiles <- mapM runFileExtractor publicationUrls  -- run the file-link extractor on every publication URL
+  let publicationFileUrls = map mapToUrl (concatMap (take 1) publicationFiles)  -- first file link per publication; total, unlike head: publications without a link are skipped
  recursive <- getAllPages single
-  --print recursive
-  let results = merge publicationUrls recursive
-  --print results
+  let results = merge publicationFileUrls recursive  -- combine this page's file URLs with those from the following pages
  return results

+
 getAllPages url = do
  items <- runExtractor url
  let urls = map mapToUrl items
@ -80,23 +59,5 @@ getAllPages url = do

 main = do
    let start = "https://pbsociety.org.pl/repository/discover?filtertype=has_content_in_original_bundle&filter_relational_operator=equals&filter=true"
-    let shadowLibrary = ShadowLibrary {logoUrl=Nothing,
-                                       lname="Polskie Towarzystwo Botaniczne",
-                                       abbrev="PBSociety",
-                                       lLevel=0,
-                                       webpage=start}
-
-    --items <- runExtractor start
-    --let items2 = map mapToUrl items
-    --items3 <- mapM runExtractor items2
-
-
-    --let items3 = map runExtractor items2
-    --extractItemsStartingFromUrl shadowLibrary start extractRecords2
-    --map putStrLn items3
-    --mapM_ (putStrLn . show) items
-    --putStrLn items3
-
    results <- getAllPages start
    print results
-    --print items