next version

Łukasz Jędyk 2021-04-12 15:59:18 +02:00
parent 27d91d8817
commit 9027097319


@@ -12,13 +12,50 @@ import Text.Regex.Posix
import Text.Printf
extractRecords = extractLinksWithText "//a[@class='roczniki']" -- (URL, title) pairs
>>> second (arr $ replace "\r\n " " ") -- clean the second element of the pair (the title) of stray whitespace
>>> first (arr ((++"tr") . init)) -- rewrite the first element of the pair, i.e. the URL
>>> first (extractLinksWithText "//li/a[contains(@href,'.pdf')]") -- download the page at that URL and extract the links on it that match the XPath expression
-- the end result is triples of the form ((article URL, article title), yearbook title)
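
Aside, not part of this commit: a minimal standalone sketch of the pair-wiring used above, with plain functions in place of the HXT arrows; `second` transforms the title half of the pair and `first` rewrites the URL half. The sample data is made up.

import Control.Arrow
import Data.Char (toUpper)

demo :: (String, String) -> (String, String)
demo = second (map toUpper) >>> first ((++ "tr") . init)

-- demo ("rocznik-1938/", "Rocznik 1938") == ("rocznik-1938tr", "ROCZNIK 1938")
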
-- transliterate ISO-8859-2 (Latin-2) byte codes of Polish diacritics to plain ASCII
cleanText = arr (replace "\177" "a")
>>> arr (replace "\230" "c")
>>> arr (replace "\234" "e")
>>> arr (replace "\179" "l")
>>> arr (replace "\241" "n")
>>> arr (replace "\243" "o")
>>> arr (replace "\182" "s")
>>> arr (replace "\188" "z") -- replace performs literal substitution, so the two z-letters
>>> arr (replace "\191" "z") -- need separate rules instead of a regex alternation
>>> arr (replace "\160" "") -- drop non-breaking spaces
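
Aside, not part of this commit: the same transliteration idea as cleanText, written as a plain function. It assumes the literal (non-regex) replace from MissingH's Data.List.Utils, which is what the pipeline above appears to rely on; the numeric escapes are the ISO-8859-2 (Latin-2) codes of the Polish diacritics.

import Data.List.Utils (replace)

latin2ToAscii :: String -> String
latin2ToAscii =
    replace "\177" "a"  -- ą
  . replace "\230" "c"  -- ć
  . replace "\234" "e"  -- ę
  . replace "\179" "l"  -- ł
  . replace "\241" "n"  -- ń
  . replace "\243" "o"  -- ó
  . replace "\182" "s"  -- ś
  . replace "\188" "z"  -- ź
  . replace "\191" "z"  -- ż
  . replace "\160" ""   -- non-breaking space

-- latin2ToAscii "Pi\179sudski" == "Pilsudski"
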
toShadowItem :: ((String, String), String) -> ShadowItem
cleanDate = arr (replace "\160" "") -- drop non-breaking spaces
>>> arr (replace " " "") -- and ordinary spaces from the date string
getTitle = proc doc -> do
xpathTrees <- getXPathTrees "//title" -< doc
title <- (getXPathTrees "//text()" >>> listA (deep isText >>> getText) >>> arr (intercalate " ")) -< xpathTrees
titleCleaned <- cleanText -< title
returnA -< titleCleaned
getDate = proc doc -> do
xpathTrees <- getXPathTrees "//tr[td/text() = ' Data dokumentu ']/td[2]" -< doc -- the cell next to the ' Data dokumentu ' (document date) label
date <- (getXPathTrees "//text()" >>> listA (deep isText >>> getText) >>> arr (intercalate " ")) -< xpathTrees
cleanedDate <- cleanDate -< date
returnA -< cleanedDate
getUrls = proc doc -> do
xpathTrees <- getXPathTrees "//div[@style='padding: 4px 7px;float:left;']/a" -< doc
urlList <- (listA (getAttrValue "href") >>> arr (intercalate "//")) -< xpathTrees -- collect the href attributes and join them with "//"
returnA -< urlList
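
Aside, not part of this commit: the listA / getAttrValue / intercalate pattern used by getUrls, run standalone with runLA on a hand-written XML fragment (the fragment is made up).

import Data.List (intercalate)
import Text.XML.HXT.Core

joinedHrefs :: String -> [String]
joinedHrefs = runLA $
  xread
  >>> listA (deep (isElem >>> hasName "a") >>> getAttrValue "href")
  >>> arr (intercalate "//")

-- joinedHrefs "<div><a href=\"a.pdf\">x</a><a href=\"b.pdf\">y</a></div>" == ["a.pdf//b.pdf"]
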
extractRecordData = proc recordUrl -> do
doc <- downloadDocument -< recordUrl
recordTitle <- getTitle -< doc
recordDate <- getDate -< doc
recordUrls <- getUrls -< doc
-- only the joined file URLs are returned for now; the extracted title and date are not used yet
returnA -< recordUrls
-- follow the '.php' links down three levels of the listing tables, then scrape each record page
extractRecordLinks = extractLinks "//table[@class='tabelka']//a[contains(@href,'.php')]"
>>> extractLinks "//table[@class='tabelka']//td[@width='68%']/a[contains(@href,'.php')]"
>>> extractLinks "//table[@class='tabelka']//td[@width='76%']/a[contains(@href,'.php')]"
>>> extractRecordData
toShadowItem :: (String, String) -> ShadowItem
toShadowItem (url, title) =
(defaultShadowItem url title) {
originalDate = Just date,
@@ -27,19 +64,14 @@ toShadowItem (url, title) =
finalUrl = url
}
where title = title
date = '12-34-5678'
getDate url =
case url =~~ "/(19[0-9][0-9]|20[0-9][0-9])/" :: Maybe [[String]] of
Just [[_, year]] -> year
otherwise -> error $ "unexpected url: " ++ url
date = "12-34-5678"
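
Aside, not part of this commit: the regex-based year extraction from the getDate helper above, as a standalone total function; the example URL is made up.

import Text.Regex.Posix

yearFromUrl :: String -> Maybe String
yearFromUrl url =
  case url =~~ "/(19[0-9][0-9]|20[0-9][0-9])/" :: Maybe [[String]] of
    Just [[_, year]] -> Just year
    _                -> Nothing

-- yearFromUrl "https://example.org/archiwum/1936/dokument.pdf" == Just "1936"
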
main = do
let start = "https://archiwa.pilsudski.org/"
let start = "https://archiwa.pilsudski.org/dokument.php?nonav=0&nrar=701&nrzesp=4&sygn=4&handle=701.180/16036"
let shadowLibrary = ShadowLibrary {logoUrl=Nothing,
lname="Archiwa Pilsudski",
abbrev="ArchPil",
lLevel=0,
webpage=start}
extractItemsStartingFromUrl shadowLibrary start (extractRecords >>> arr toShadowItem)
extractItemsStartingFromUrl shadowLibrary start (extractRecordData)