This commit is contained in:
nlitkowski 2021-04-07 02:37:50 +02:00
parent 8fd481ac15
commit 141728f14f

View File

@ -19,9 +19,25 @@ extractNestedLinksWithText xpathCondition = (downloadDocument &&& this)
&&& (listA (deep isText >>> getText)
>>> arr (intercalate " "))
))
>>> arr rotateSecTh
>>> arr rotateSecTh -- ((a, b), c) -> ((a, c), b)
>>> first expandURIFixed
-- | For every tree selected by @xpathCondition@ in the input document,
-- produce a pair of:
--
--   * the @href@ attribute of each node found by applying the XPath @//a@
--     to that selected tree, and
--   * all text nodes found inside the selected tree, joined with single spaces.
getLinkAndText xpathCondition = proc page -> do
  matched   <- getXPathTrees xpathCondition -< page
  linkHref  <- anchorHref                   -< matched
  linkLabel <- joinedText                   -< matched
  returnA -< (linkHref, linkLabel)
  where
    -- href attribute of the anchors located via the XPath "//a"
    anchorHref = getXPathTrees "//a" >>> getAttrValue "href"
    -- every text node below the current tree, collected and space-separated
    joinedText = listA (deep isText >>> getText) >>> arr (intercalate " ")
-- | Proc-notation variant of 'extractNestedLinksWithText'.
--
-- Downloads the document referred to by the input, extracts
-- @(href, text)@ pairs from the trees matching @xpathCondition@
-- (via 'getLinkAndText'), and passes each @href@ together with the original
-- input value to 'expandURIFixed'.
--
-- Result: @(expanded link, link text)@.
extractNestedLinksWithText2 xpathCondition = proc x -> do
  doc         <- downloadDocument              -< x
  -- keep the original input; it is paired with the href for expandURIFixed
  thisValue   <- this                          -< x
  -- getLinkAndText yields a flat (href, text) pair, not a nested tuple
  (href, txt) <- getLinkAndText xpathCondition -< doc
  -- same pairing the point-free version obtains through rotateSecTh:
  -- ((href, txt), this) -> ((href, this), txt)
  uriFixed    <- expandURIFixed                -< (href, thisValue)
  returnA -< (uriFixed, txt)
extractRecords = extractLinksWithText "(//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link'])[1]" -- pary adres-tytuł podstrony
>>> first (extractLinksWithText "(//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link'])[1]") -- pobieramy podstronę i kolejne podstrony z menu
>>> first (first (extractNestedLinksWithText "//big/a[contains(@href,'.pdf')][img]")) -- pobieramy stronę z adresu URL i wyciągamy linki z tej strony pasujące do wyrażenia XPathowego