From ad3f714bf62f856843ea460d968e45a92ee2353f Mon Sep 17 00:00:00 2001 From: Jan Nowak Date: Sun, 18 Apr 2021 15:40:42 +0200 Subject: [PATCH] Pobieranie tytulow artykulow. --- app/elektronikapraktyczna.hs | 45 ++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/app/elektronikapraktyczna.hs b/app/elektronikapraktyczna.hs index 0322803..c9b422e 100644 --- a/app/elektronikapraktyczna.hs +++ b/app/elektronikapraktyczna.hs @@ -33,7 +33,7 @@ mToString n extractMonth :: String -> String extractMonth n = case n =~~ ("[A-za-z]+" :: String) of - Just month -> eliminate (baseMonthNameToNumber (toLowerString month)) + Just month -> "-" ++ eliminate (baseMonthNameToNumber (toLowerString month)) otherwise -> "" mExtractYear :: String -> String @@ -42,36 +42,51 @@ mExtractYear n = Just year -> year otherwise -> "" -changeDate :: String -> String -changeDate a = eliminate (extractYear a) ++ "-" ++ extractMonth a - +changeDate :: String -> Maybe String +changeDate a = Just (eliminate (extractYear a) ++ extractMonth a) -extractRecords = extractLinksWithText "(//a[@class='magazine-list__year-item'])[last()]" -- pary adres-tytuł ---extractRecords = extractLinksWithText "//a[@class='magazine-list__year-item']" -- pary adres-tytuł +extractLinksWithArticleTitle xpathCondition = (downloadDocument &&& this) + >>> first (getXPathTrees xpathCondition + >>> ( + (getXPathTrees "//div[@class='text']" >>> (listA (deep isText >>> getText) + >>> arr (intercalate " "))) + &&& + (getXPathTrees "//div[@class='files__item']/a[contains(@href,'.pdf')]" >>> (getAttrValue "href")) + )) + +--extractRecords = extractLinksWithText "(//a[@class='magazine-list__year-item'])[last()]" -- pary adres-tytuł +extractRecords = extractLinksWithText "//a[@class='magazine-list__year-item']" -- pary adres-tytuł >>> second (arr $ replace "\r\n " "") -- czyścimy drugi element pary, czyli tytuł z niepotrzebnych białych znaków >>> second (arr $ replace " " "") -- czyścimy drugi element pary, czyli tytuł z niepotrzebnych białych znaków -- >>> first (arr ((++"tr") . init)) -- modyfikujemy pierwszy element pary, czyli adres URL >>> first (extractLinksWithText "//div[@class='magazine-list__item']/a") -- pobieramy stronę z adresu URL i wyciągamy linki z tej strony pasujące do wyrażenia XPathowego >>> first (second (arr $ replace "\r\n " "")) >>> first (first ( - extractLinksWithText "//div[@class='files__item']/a[contains(@href,'.pdf')]" - >>> second (arr $ replace "\r\n " "") + -- extractLinksWithArticleTitle "//div[@class='files__item']/a[contains(@href,'.pdf')]" + (extractLinksWithArticleTitle "//div[@class='magazine-single__content-title article text']") + >>> + first ( + first (arr $ replace " " "") + >>> first (arr $ replace "\r\n " "") + ) + -- >>> first (arr $ replace "\r\n" "") -- >>> first (arr $ replace "//" "/") ) - >>> second (arr $ changeDate) + -- >>> second (arr $ changeDate) -- Zmiana nazwy miesiąca na wartość liczbową + >>> second (arr $ replace " " "") >>> second (arr $ replace " " "") ) -- pobieramy stronę z adresu URL i wyciągamy linki z tej strony pasujące do wyrażenia XPathowego -- ostatecznie wyjdą trójki ((adres URL, tytuł artykułu), tytuł rocznika) -- ... a tutaj te trójki przerabiamy do docelowej struktury ShadowItem -toShadowItem :: (((String, String), String), String) -> ShadowItem -toShadowItem (((url, chapterTitle), articleTitle), yearlyTitle) = +toShadowItem :: ((((String, String), String), String), String) -> ShadowItem +toShadowItem ((((chapterTitle, url), finalUrl), articleTitle), yearlyTitle) = (defaultShadowItem url title) { - originalDate = Just date, + originalDate = changeDate articleTitle, itype = "periodical", format = Just "pdf", - finalUrl = url + finalUrl = finalUrl } - where title = "Elektronika Praktyczna " ++ (replace " " "" articleTitle) + where title = "Elektronika Praktyczna " ++ articleTitle ++ " - " ++ chapterTitle date = yearlyTitle getDate yearlyTitle = @@ -87,4 +102,4 @@ main = do abbrev="EP", lLevel=0, webpage=start} - extractItemsStartingFromUrl shadowLibrary start (extractRecords)-- >>> arr toShadowItem) + extractItemsStartingFromUrl shadowLibrary start (extractRecords >>> arr toShadowItem)