Zagadnienia ekonomiki rolnej

2022-03-30 21:18:39 +02:00 · 2022-03-30 21:18:39 +02:00 · 7b1bbb03c2
commit 7b1bbb03c2
parent 8883a924b4
2 changed files with 77 additions and 0 deletions
--- a/app/zadanie.hs
+++ b/app/zadanie.hs
@ -0,0 +1,64 @@
 {-# LANGUAGE Arrows, NoMonomorphismRestriction #-}
 import ShadowLibrary.Core
 import Text.XML.HXT.Core
 import Text.XML.HXT.XPath
 -- import Text.XML.HXT.Curl
 import Data.List
 import Data.List.Utils (replace)
 import Text.Regex.Posix
 import Text.Printf
 -- | Two-stage link extraction for volumes that are not linked as a PDF.
 --
 -- Stage one yields (URL, link text) pairs for @archiveVolume@ anchors whose
 -- href contains neither ".pdf" nor ".PDF"; the link text is the volume title.
 -- Stage two cleans that title and then, per the original author's note,
 -- fetches the page behind the URL and extracts the links on it whose href
 -- contains ".pdf".
 --
 -- The items produced are triples shaped
 -- ((article URL, article title), volume title).
 extractNestedRecords =
   volumeLinks >>> second tidyVolumeTitle >>> first articleLinks
   where
     -- (URL, title) pairs for the volume anchors
     volumeLinks = extractLinksWithText "//a[@class='archiveVolume' and not(contains(@href, '.PDF')) and not(contains(@href, '.pdf'))]"
     -- remove the line break plus tab run from the title (second pair element)
     tidyVolumeTitle = arr (replace "\r\n\t\t\t\t" "")
     -- links matching the XPath expression, taken from the volume URL
     articleLinks = extractLinksWithText "//a[contains(@href,'.pdf')]"
 -- | Single-stage link extraction for @archiveVolume@ anchors whose href
 -- contains ".pdf" or ".PDF".
 --
 -- Yields (URL, link text) pairs in which every tab character of the link
 -- text has been replaced by a single space.
 extractRecords =
   directPdfLinks >>> second (arr untab)
   where
     directPdfLinks = extractLinksWithText "//a[@class='archiveVolume' and (contains(@href,'.pdf') or contains(@href,'.PDF'))]"
     untab = replace "\t" " "
 -- | Convert one ((article URL, article title), volume title) triple into the
 -- target 'ShadowItem' structure.
 --
 -- The item title is the fixed journal name, the volume title and the
 -- article title (with its line breaks removed) joined by single spaces.
 -- The original date is derived from the volume title via 'getDate', so an
 -- unparsable volume title surfaces as the 'error' raised there.
 toShadowItem :: ((String, String), String) -> ShadowItem
 toShadowItem ((url, articleTitle), yearlyTitle) =
   (defaultShadowItem url title)
     { originalDate = Just date
     , itype = "periodical"
     , format = Just "pdf"
     , finalUrl = url
     }
   where
     -- Bindings are indented with spaces only: the previous layout aligned
     -- `date` under `title` with a hard tab, which only works for one
     -- particular tab width.
     --
     -- Remove the line break that is followed by indentation first, so the
     -- indentation goes with it; then remove any remaining bare line breaks.
     cleanTitle = replace "\r\n" "" (replace "\r\n          " "" articleTitle)
     title = "Zagadnienia ekonomiki rolnej " ++ yearlyTitle ++ " " ++ cleanTitle
     date = getDate yearlyTitle
 -- | Convert one (PDF URL, link text) pair into a 'ShadowItem'.
 --
 -- The item title is the fixed journal name followed by the link text with
 -- its line breaks removed. The original date is derived from the URL via
 -- 'getDateTop', so a URL without a recognisable year surfaces as the
 -- 'error' raised there.
 toShadowItemTop :: (String, String) -> ShadowItem
 toShadowItemTop (url, articleTitle) =
   (defaultShadowItem url title)
     { originalDate = Just date
     , itype = "periodical"
     , format = Just "pdf"
     , finalUrl = url
     }
   where
     -- Bindings are indented with spaces only: the previous layout aligned
     -- `date` under `title` with a hard tab, which only works for one
     -- particular tab width.
     --
     -- Remove the line break that is followed by indentation first, so the
     -- indentation goes with it; then remove any remaining bare line breaks.
     cleanTitle = replace "\r\n" "" (replace "\r\n          " "" articleTitle)
     -- The journal name already ends with a space; the extra `++ " "` that
     -- used to follow it produced a double space in every title.
     title = "Zagadnienia ekonomiki rolnej " ++ cleanTitle
     date = getDateTop url
 -- | Extract the "N/YYYY" fragment (one or two digits, a slash, then a year
 -- in 1900-2099) from the given text.
 --
 -- Despite the parameter name, the caller in this file passes the volume
 -- title rather than a URL.
 --
 -- Succeeds only when the match result has exactly one entry made of two
 -- strings; the first of them is returned. With regex-posix that first
 -- string is the whole matched text and the second is the captured year.
 -- Any other result (no match, several matches) ends in 'error'.
 --
 -- NOTE(review): the value ends up in @originalDate@ as e.g. "3/2010"; the
 -- captured year is discarded. Confirm that this is the intended date form.
 getDate :: String -> String
 getDate url = fromMatches (url =~~ issuePattern :: Maybe [[String]])
   where
     issuePattern = "[0-9]?[0-9]/(19[0-9][0-9]|20[0-9][0-9])"
     fromMatches (Just [[issueAndYear, _]]) = issueAndYear
     fromMatches _ = error $ "unexpected url: " ++ url
 -- | Extract a four-digit year (1900-2099) from a URL.
 --
 -- The pattern allows any run of digits directly before the year. Succeeds
 -- only when the match result has exactly one entry made of two strings;
 -- the second one (the capture group, i.e. the year) is returned. Any other
 -- result (no match, several matches) ends in 'error'.
 getDateTop :: String -> String
 getDateTop url = fromMatches (url =~~ yearPattern :: Maybe [[String]])
   where
     yearPattern = "[0-9]*(19[0-9][0-9]|20[0-9][0-9])"
     fromMatches (Just [[_, year]]) = year
     fromMatches _ = error $ "unexpected url: " ++ url
 -- | Entry point: run the extraction twice from the archive index page,
 -- first for volumes that have their own page of article links, then for
 -- volumes that are linked directly as a PDF.
 main = do
     let archiveUrl = "http://www.zer.waw.pl/Archive"
         library = ShadowLibrary
           { logoUrl = Nothing
           , lname = "Ekonomika Rolna"
           , abbrev = "EkonRol"
           , lLevel = 0
           , webpage = archiveUrl
           }
         -- both passes share the library description and the start URL
         crawl = extractItemsStartingFromUrl library archiveUrl
     crawl (extractNestedRecords >>> arr toShadowItem)
     crawl (extractRecords >>> arr toShadowItemTop)
--- a/shadow-library.cabal
+++ b/shadow-library.cabal
@ -58,6 +58,19 @@ executable almanachmuszyny
                      , regex-posix
                      , shadow-library
   default-language:    Haskell2010
 -- Executable `ekonrol`: built from app/zadanie.hs with the threaded
 -- runtime; the baked-in RTS option -N is applied at start-up.
 executable ekonrol
   hs-source-dirs:      app
   main-is:             zadanie.hs
   ghc-options:         -threaded -rtsopts -with-rtsopts=-N
   build-depends:       base
                      , hxt
                      , hxt-xpath
                      , MissingH
                      , regex-posix
                      , shadow-library
   default-language:    Haskell2010
 source-repository head