From 7b1bbb03c280715c5b405d377847bb888410fa79 Mon Sep 17 00:00:00 2001
From: Andrzej Preibisz <andrzejjpreibisz@gmail.com>
Date: Wed, 30 Mar 2022 21:18:39 +0200
Subject: [PATCH] Zagadnienia ekonomiki rolnej

---
 app/zadanie.hs       | 64 ++++++++++++++++++++++++++++++++++++++++++++
 shadow-library.cabal | 13 +++++++++
 2 files changed, 77 insertions(+)
 create mode 100755 app/zadanie.hs

diff --git a/app/zadanie.hs b/app/zadanie.hs
new file mode 100755
index 0000000..15dce1f
--- /dev/null
+++ b/app/zadanie.hs
@@ -0,0 +1,64 @@
+{-# LANGUAGE Arrows, NoMonomorphismRestriction #-}
+import ShadowLibrary.Core
+
+import Text.XML.HXT.Core
+import Text.XML.HXT.XPath
+-- import Text.XML.HXT.Curl
+import Data.List
+import Data.List.Utils (replace)
+
+import Text.Regex.Posix
+import Text.Printf
+
+
+-- Nested volumes: take each 'archiveVolume' link that is not itself a PDF as an (address, title) pair, strip the
+-- layout whitespace from the title, then (per the original note) fetch the page behind the address and collect the
+-- links on it that match the PDF XPath. The results are triples shaped ((article URL, article title), yearly title).
+extractNestedRecords = extractLinksWithText "//a[@class='archiveVolume' and not(contains(@href, '.PDF')) and not(contains(@href, '.pdf'))]"
+  >>> second (arr (replace "\r\n\t\t\t\t" "")) >>> first (extractLinksWithText "//a[contains(@href,'.pdf')]")
+-- Top-level volumes: 'archiveVolume' links pointing straight at a PDF, as (address, title) pairs with tabs in the title replaced by spaces.
+extractRecords = extractLinksWithText "//a[@class='archiveVolume' and (contains(@href,'.pdf') or contains(@href,'.PDF'))]" >>> second (arr $ replace "\t" " ")
+
+-- | Build the 'ShadowItem' for an article reached through a yearly page, from a
+-- ((article URL, article title), yearly title) triple; CR/LF line breaks are removed from the article title.
+toShadowItem :: ((String, String), String) -> ShadowItem
+toShadowItem ((url, articleTitle), yearlyTitle) =
+  (defaultShadowItem url title)
+    { originalDate = Just date
+    , itype = "periodical"
+    , format = Just "pdf"
+    , finalUrl = url }
+  where title = "Zagadnienia ekonomiki rolnej " ++ yearlyTitle ++ " " ++ replace "\r\n" "" (replace "\r\n          " "" articleTitle)
+        date = getDate yearlyTitle  -- indented with spaces: must line up with 'title' under the layout rule
+
+-- | Build the 'ShadowItem' for a PDF linked directly from the archive index, from an (address, title) pair; the date is the year found in the URL.
+toShadowItemTop :: (String, String) -> ShadowItem
+toShadowItemTop (url, articleTitle) =
+  (defaultShadowItem url title)
+    { originalDate = Just date
+    , itype = "periodical"
+    , format = Just "pdf"
+    , finalUrl = url }
+  where title = "Zagadnienia ekonomiki rolnej " ++ replace "\r\n" "" (replace "\r\n          " "" articleTitle)  -- one separator; the extra `++ " "` doubled the space
+        date = getDateTop url  -- indented with spaces: must line up with 'title' under the layout rule
+
+-- | Return the whole text of the first "N/YYYY"-style match in the argument (the year alone is only the capture group); the first
+-- match wins when there are several, and 'error' is called when there is none. NOTE(review): confirm the whole match, not the year, is wanted.
+getDate url = case url =~~ "[0-9]?[0-9]/(19[0-9][0-9]|20[0-9][0-9])" :: Maybe [[String]] of
+  Just ([wholeMatch, _year] : _) -> wholeMatch
+  _ -> error $ "unexpected url: " ++ url
+
+-- | Return the year (19xx or 20xx) captured by the first regex match in the URL; calls 'error' when nothing matches.
+getDateTop url = case url =~~ "[0-9]*(19[0-9][0-9]|20[0-9][0-9])" :: Maybe [[String]] of
+  Just ([_, year] : _) -> year
+  _ -> error $ "unexpected url: " ++ url
+
+-- Entry point: run the extraction from the archive index twice, first with the nested-page pipeline, then with the direct-PDF pipeline.
+main = do
+    let archiveUrl = "http://www.zer.waw.pl/Archive"
+        library = ShadowLibrary { logoUrl = Nothing
+                                , lname = "Ekonomika Rolna"
+                                , abbrev = "EkonRol"
+                                , lLevel = 0, webpage = archiveUrl }
+    extractItemsStartingFromUrl library archiveUrl (extractNestedRecords >>> arr toShadowItem)
+    extractItemsStartingFromUrl library archiveUrl (extractRecords >>> arr toShadowItemTop)
diff --git a/shadow-library.cabal b/shadow-library.cabal
index cd77195..819355f 100644
--- a/shadow-library.cabal
+++ b/shadow-library.cabal
@@ -58,6 +58,19 @@ executable almanachmuszyny
                       , regex-posix
                       , shadow-library
    default-language:    Haskell2010
+
+-- Scraper executable built from app/zadanie.hs (the "Zagadnienia ekonomiki rolnej" archive).
+executable ekonrol
+   hs-source-dirs:      app
+   main-is:             zadanie.hs
+   ghc-options:         -threaded -rtsopts -with-rtsopts=-N
+   build-depends:       base
+                      , hxt
+                      , hxt-xpath
+                      , MissingH
+                      , regex-posix
+                      , shadow-library
+   default-language:    Haskell2010
 
 
 source-repository head