diff --git a/app/archiwapilsudski.hs b/app/archiwapilsudski.hs
index 50899d0..7c46a63 100644
--- a/app/archiwapilsudski.hs
+++ b/app/archiwapilsudski.hs
@@ -28,6 +28,12 @@ cleanText = arr (replace "\261" "a")
 cleanDate = arr (replace "\160" "")
             >>> arr (replace " " "")
 
+getLang = proc doc -> do
+  xpathTrees <- getXPathTrees "//tr[td/text() = ' J\281zyki ']/td[2]" -< doc
+  lang <- (getXPathTrees "//text()" >>> listA (deep isText >>> getText) >>> arr (intercalate " ")) -< xpathTrees
+  langCleaned <- cleanDate -< lang
+  returnA -< langCleaned
+
 getTitle = proc doc -> do
   xpathTrees <- getXPathTrees "//title" -< doc
   title <- (getXPathTrees "//text()" >>> listA (deep isText >>> getText) >>> arr (intercalate " ")) -< xpathTrees
@@ -50,7 +56,8 @@ extractRecordData = proc recordUrl -> do
   recordTitle <- getTitle -< doc
   recordDate <- getDate -< doc
   recordUrls <- getUrls -< doc
-  returnA -< (recordUrls, recordTitle, recordDate)
+  recordLang <- getLang -< doc
+  returnA -< (recordUrls, recordTitle, recordDate, recordLang)
 
 extractRecordLinks = extractLinks "//table[@class='tabelka']//a[contains(@href,'.php')]"
                      >>> extractLinks "//table[@class='tabelka']//td[@width='68%']/a[contains(@href,'.php')]"
@@ -58,16 +65,18 @@ extractRecordLinks = extractLinks "//table[@class='tabelka']//a[contains(@href,'
                      >>> extractRecordData
 
 
-toShadowItem :: (String, String, String) -> ShadowItem
-toShadowItem (url, recordTitle, recordDate) =
+toShadowItem :: (String, String, String, String) -> ShadowItem
+toShadowItem (url, recordTitle, recordDate, recordLang) =
   (defaultShadowItem url title) {
     originalDate = Just date,
     itype = "periodical",
     format = Just "pdf",
-    finalUrl = url
+    finalUrl = url,
+    lang = Just language
   }
   where title = recordTitle
         date = recordDate
+        language = recordLang
 
 
 main = do