diff --git a/app/ZborBielawa.hs b/app/ZborBielawa.hs
index 1beec09..999803c 100644
--- a/app/ZborBielawa.hs
+++ b/app/ZborBielawa.hs
@@ -19,9 +19,31 @@ extractNestedLinksWithText xpathCondition = (downloadDocument &&& this)
                                           &&& (listA (deep isText >>> getText)
                                                >>> arr (intercalate " "))
                                          ))
-  >>> arr rotateSecTh
+  >>> arr rotateSecTh -- ((a, b), c) -> ((a, c), b)
   >>> first expandURIFixed
 
+
+-- | For every tree selected by @xpathCondition@, pair the @href@ attribute of
+-- the anchors selected inside it with all of its text nodes joined by single
+-- spaces.
+getLinkAndText xpathCondition = proc doc -> do
+  xpathTrees <- getXPathTrees xpathCondition -< doc
+  href <- (getXPathTrees "//a" >>> getAttrValue "href") -< xpathTrees
+  txt <- (listA (deep isText >>> getText) >>> arr (intercalate " ")) -< xpathTrees
+  returnA -< (href, txt)
+
+
+-- | Arrow-notation variant of 'extractNestedLinksWithText': download the
+-- document for the input, extract (href, text) pairs with 'getLinkAndText',
+-- and pass each href together with the original input to 'expandURIFixed'.
+extractNestedLinksWithText2 xpathCondition = proc x -> do
+  doc <- downloadDocument -< x
+  thisValue <- this -< x
+  (href, txt) <- getLinkAndText xpathCondition -< doc
+  uriFixed <- expandURIFixed -< (href, thisValue)
+  returnA -< (uriFixed, txt)
+
+
 extractRecords = extractLinksWithText "(//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link'])[1]" -- pary adres-tytuł podstrony
   >>> first (extractLinksWithText "(//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link'])[1]") -- pobieramy podstronę i kolejne podstrony z menu
   >>> first (first (extractNestedLinksWithText "//big/a[contains(@href,'.pdf')][img]")) -- pobieramy stronę z adresu URL i wyciągamy linki z tej strony pasujące do wyrażenia XPathowego