diff --git a/app/ZborBielawa.hs b/app/ZborBielawa.hs
new file mode 100644
index 0000000..30aa58d
--- /dev/null
+++ b/app/ZborBielawa.hs
@@ -0,0 +1,55 @@
+
+{-# LANGUAGE Arrows, NoMonomorphismRestriction #-}
+import ShadowLibrary.Core
+
+import Text.XML.HXT.Core
+import Text.XML.HXT.XPath
+-- import Text.XML.HXT.Curl
+import Data.List
+import Data.List.Utils (replace)
+
+import Text.Regex.Posix
+import Text.Printf
+
+-- For every tree matched by xpathCondition in the downloaded document, pair each href found under "//a" with that tree's text nodes joined by " ".
+extractNestedLinksWithText xpathCondition = (downloadDocument &&& this) -- keep the unchanged input (this) next to the downloaded document
+  >>> first (getXPathTrees xpathCondition -- only the document half is searched
+             >>> ((getXPathTrees "//a" >>> getAttrValue "href") -- href attribute of each "//a" match
+                  &&& (listA (deep isText >>> getText) -- gather the text nodes found by deep isText into one list
+                       >>> arr (intercalate " ")))) -- and join them into a single string
+  >>> arr rotateSecTh -- regroups the nested pair; helper is not defined in this file (presumably ShadowLibrary.Core) - TODO confirm the resulting shape
+  >>> first expandURIFixed -- NOTE(review): presumably makes the link absolute using the input URL - confirm against the helper
+
+extractRecords = extractLinksWithText "(//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link'])[1]" -- (URL, title) pairs for the subpage; the trailing [1] keeps only the first matching menu link
+  >>> first (extractLinksWithText "(//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link'])[1]") -- fetch the subpage and follow its menu in turn (again only the first link, because of [1])
+  >>> first (first (extractNestedLinksWithText "//big/a[contains(@href,'.pdf')][img]")) -- fetch the page at that URL and extract the links on it that match the XPath expression
+  -- the final result is tuples (((URL, issue title), title of subpage 2), title of subpage 1)
+
+-- ... 
and here those tuples are converted into the target ShadowItem structure
+toShadowItem :: (((String, String), String), String) -> ShadowItem
+toShadowItem (((url, releaseTitle), collectionTitle), categoryTitle) = -- input nesting: (((URL, release title), collection title), category title)
+  (defaultShadowItem url title) {
+    originalDate = Just date,
+    itype = "periodical",
+    format = Just "pdf",
+    finalUrl = url
+  }
+  where title = categoryTitle ++ " " ++ collectionTitle -- item title is "<category> <collection>"
+        date = getDate releaseTitle -- a year when getDate finds one, otherwise the release title itself
+
+-- Return the four-digit year (19xx or 20xx) found between slashes in the given text, or the text itself when the pattern does not yield exactly one match.
+getDate :: String -> String
+getDate yearlyTitle =
+  case yearlyTitle =~~ "/(19[0-9][0-9]|20[0-9][0-9])/" :: Maybe [[String]] of -- NOTE(review): the year must be wrapped in '/', as in a URL path; a plain title will rarely match - confirm the intended input
+    Just [[_, year]] -> year -- exactly one match: [whole match, captured year]
+    _ -> yearlyTitle -- deliberate fallback to the raw text instead of failing with error "unexpected yearlyTitle: ..."
+
+-- Entry point: describe the library and run the extraction starting from the archive page.
+main = do
+    let start = "http://zborbielawa.pl/archiwum/"
+    let shadowLibrary = ShadowLibrary {logoUrl=Nothing,
+                                       lname="Zbór Bielawa",
+                                       abbrev="ZboBiel",
+                                       lLevel=0,
+                                       webpage=start}
+    extractItemsStartingFromUrl shadowLibrary start (extractRecords >>> arr toShadowItem)
diff --git a/app/zborbielawa.hs b/app/zborbielawa.hs deleted file mode 100644 index 6b1a1bd..0000000 --- a/app/zborbielawa.hs +++ /dev/null @@ -1,57 +0,0 @@ - -{-# LANGUAGE Arrows, NoMonomorphismRestriction #-} -import ShadowLibrary.Core - -import Text.XML.HXT.Core -import Text.XML.HXT.XPath --- import Text.XML.HXT.Curl -import Data.List -import Data.List.Utils (replace) - -import Text.Regex.Posix -import Text.Printf - - -extractRecords = extractLinksWithText "//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link']" -- pary adres-tytuł - -- >>> second (arr $ replace "\r\n " "") -- czyścimy drugi element pary, czyli tytuł z niepotrzebnych białych znaków - -- >>> second (arr $ replace " " "") -- czyścimy drugi element pary, czyli tytuł z niepotrzebnych białych znaków --- >>> first (arr ((++"tr") . 
init)) -- modyfikujemy pierwszy element pary, czyli adres URL - >>> first (extractLinksWithText "//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link']") -- pobieramy stronę z adresu URL i wyciągamy linki z tej strony pasujące do wyrażenia XPathowego - >>> first (second (arr $ replace "\r\n " "")) - >>> first (first ( - extractLinksWithText "//a[contains(@href,'.pdf')]" - >>> second (arr $ replace "\r\n " "") - -- >>> first (arr $ replace "//" "/") - ) - ) -- pobieramy stronę z adresu URL i wyciągamy linki z tej strony pasujące do wyrażenia XPathowego - -- ostatecznie wyjdą trójki ((adres URL, tytuł artykułu), tytuł rocznika) - --- ... a tutaj te trójki przerabiamy do docelowej struktury ShadowItem -toShadowItem :: (((String, String), String), String) -> ShadowItem -toShadowItem (((url, chapterTitle), articleTitle), yearlyTitle) = - (defaultShadowItem url title) { - originalDate = Just date, - itype = "periodical", - format = Just "pdf", - finalUrl = url - } - where title = articleTitle ++ (replace " " "" chapterTitle) - date = yearlyTitle - -getDate yearlyTitle = - case yearlyTitle =~~ "/(19[0-9][0-9]|20[0-9][0-9])/" :: Maybe [[String]] of - Just [[_, year]] -> year - otherwise -> error $ "unexpected yearlyTitle: " ++ yearlyTitle - - - - - -main = do - let start = "http://zborbielawa.pl/archiwum/" - let shadowLibrary = ShadowLibrary {logoUrl=Nothing, - lname="Zbór Bielawa", - abbrev="ZboBiel", - lLevel=0, - webpage=start} - extractItemsStartingFromUrl shadowLibrary start (extractRecords >>> arr toShadowItem) diff --git a/shadow-library.cabal b/shadow-library.cabal index 30cd5ae..dda3c81 100644 --- a/shadow-library.cabal +++ b/shadow-library.cabal @@ -62,7 +62,7 @@ executable almanachmuszyny executable zborbielawa hs-source-dirs: app - main-is: zborbielawa.hs + main-is: ZborBielawa.hs ghc-options: -threaded -rtsopts -with-rtsopts=-N build-depends: base , hxt