extractclanguage

This commit is contained in:
Łukasz Jędyk 2021-04-17 12:42:55 +02:00
parent c5c01e7d81
commit 5877347916

View File

@@ -28,6 +28,12 @@ cleanText = arr (replace "\261" "a")
cleanDate = arr (replace "\160" "") cleanDate = arr (replace "\160" "")
>>> arr (replace " " "") >>> arr (replace " " "")
-- | Arrow that pulls the language text out of a record document.
--
-- It selects the second @td@ of every table row containing a cell whose
-- text equals @" J\281zyki "@, descends to the text nodes under that cell,
-- joins the text collected for each node with single spaces, and finally
-- runs the result through 'cleanDate'.
getLang =
  getXPathTrees "//tr[td/text() = ' J\281zyki ']/td[2]"
    >>> getXPathTrees "//text()"
    >>> listA (deep isText >>> getText)
    >>> arr (intercalate " ")
    -- 'cleanDate' is reused here as a generic text scrubber; in this diff it
    -- is shown stripping "\160" and space characters.
    >>> cleanDate
getTitle = proc doc -> do getTitle = proc doc -> do
xpathTrees <- getXPathTrees "//title" -< doc xpathTrees <- getXPathTrees "//title" -< doc
title <- (getXPathTrees "//text()" >>> listA (deep isText >>> getText) >>> arr (intercalate " ")) -< xpathTrees title <- (getXPathTrees "//text()" >>> listA (deep isText >>> getText) >>> arr (intercalate " ")) -< xpathTrees
@@ -50,7 +56,8 @@ extractRecordData = proc recordUrl -> do
recordTitle <- getTitle -< doc recordTitle <- getTitle -< doc
recordDate <- getDate -< doc recordDate <- getDate -< doc
recordUrls <- getUrls -< doc recordUrls <- getUrls -< doc
returnA -< (recordUrls, recordTitle, recordDate) recordLang <- getLang -< doc
returnA -< (recordUrls, recordTitle, recordDate, recordLang)
extractRecordLinks = extractLinks "//table[@class='tabelka']//a[contains(@href,'.php')]" extractRecordLinks = extractLinks "//table[@class='tabelka']//a[contains(@href,'.php')]"
>>> extractLinks "//table[@class='tabelka']//td[@width='68%']/a[contains(@href,'.php')]" >>> extractLinks "//table[@class='tabelka']//td[@width='68%']/a[contains(@href,'.php')]"
@@ -58,16 +65,18 @@ extractRecordLinks = extractLinks "//table[@class='tabelka']//a[contains(@href,'
>>> extractRecordData >>> extractRecordData
toShadowItem :: (String, String, String) -> ShadowItem toShadowItem :: (String, String, String, String) -> ShadowItem
toShadowItem (url, recordTitle, recordDate) = toShadowItem (url, recordTitle, recordDate, recordLang) =
(defaultShadowItem url title) { (defaultShadowItem url title) {
originalDate = Just date, originalDate = Just date,
itype = "periodical", itype = "periodical",
format = Just "pdf", format = Just "pdf",
finalUrl = url finalUrl = url,
lang = Just language
} }
where title = recordTitle where title = recordTitle
date = recordDate date = recordDate
language = recordLang
main = do main = do