From 3e9ad1fa0bb3daca60bc9dfc369659ef5948ff7a Mon Sep 17 00:00:00 2001 From: Anna Nowak Date: Tue, 6 Apr 2021 23:25:55 +0200 Subject: [PATCH] Funkcja do wydobycia roku z url, funkcja do wydobycia url i tekstu --- app/ptd.hs | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/app/ptd.hs b/app/ptd.hs index 2adf3f6..6c11946 100644 --- a/app/ptd.hs +++ b/app/ptd.hs @@ -1,5 +1,7 @@ +{-# LANGUAGE UTF #-} {-# LANGUAGE Arrows, NoMonomorphismRestriction #-} + import ShadowLibrary.Core import Text.XML.HXT.Core @@ -10,27 +12,35 @@ import Data.List.Utils (replace) import Text.Regex.Posix import Text.Printf +extractNestedLinksWithText xpathCondition = (downloadDocumentWithEncoding "UTF-8" &&& this) + >>> first (getXPathTrees xpathCondition + >>> ((getXPathTrees "//a" >>> getAttrValue "href") + &&& (listA (deep isText >>> getText) + >>> arr (intercalate " ")))) + >>> arr rotateSecTh + >>> first expandURIFixed -extractRecords = extractLinksWithText "//div[@class='entry-content']/p[count(a)=1]/a" - >>> second (arr $ replace "\r\n " " ") - >>> first (extractLinksWithText "//li/a[contains(@href,'.pdf')]") +extractRecords = extractLinksWithText "//div[@class='entry-content']/p/a[contains(@href, 'id')]" + >>> first (extractNestedLinksWithText "//div[@class='entry-content']/p[strong[a]] | //div[@class='entry-content']/p[a]") --- ... a tutaj te trójki przerabiamy do docelowej struktury ShadowItem toShadowItem :: ((String, String), String) -> ShadowItem -toShadowItem ((url, articleTitle), yearlyTitle) = +toShadowItem ((url, articleTitle), magazineTitle) = (defaultShadowItem url title) { originalDate = Just date, itype = "periodical", format = Just "pdf", finalUrl = url } - where title = "Almanach Muszyny " ++ yearlyTitle ++ " " ++ (replace "\r\n" "" (replace "\r\n " "" articleTitle)) - date = getDate url + where title = magazineTitle ++ " - " ++ (replace " \8211 pdf" "" articleTitle) + date = getYear url -getDate url = - case url =~~ "/(19[0-9][0-9]|20[0-9][0-9])/" :: Maybe [[String]] of +getYear :: String -> String +getYear url = + case url =~~ "/rocznik[0-9]{2}/" :: Maybe [[String]] of Just [[_, year]] -> year - otherwise -> error $ "unexpected url: " ++ url + otherwise -> case url =~~ "/(19[0-9][0-9]|20[0-9][0-9])/" :: Maybe [[String]] of + Just [[_, year]] -> "19" ++ (replace "rocznik" "" year) + otherwise -> "" main = do @@ -40,4 +50,4 @@ main = do abbrev="ptd", lLevel=0, webpage=baseUrl} - extractItemsStartingFromUrl shadowLibrary baseUrl (extractRecords >>> arr toShadowItem) + extractItemsStartingFromUrl shadowLibrary baseUrl (extractRecords >>> arr toShadowItem) \ No newline at end of file