WIP

2021-04-07 02:37:50 +02:00 · 2021-04-07 02:37:50 +02:00 · 141728f14f
commit 141728f14f
parent 8fd481ac15
1 changed files with 17 additions and 1 deletions
--- a/app/ZborBielawa.hs
+++ b/app/ZborBielawa.hs
@@ -19,9 +19,25 @@ extractNestedLinksWithText xpathCondition = (downloadDocument &&& this)
                                                    &&& (listA (deep isText >>> getText)
                                                           >>> arr (intercalate " "))
                                                  ))
-                                      >>> arr rotateSecTh
+                                      >>> arr rotateSecTh -- ((a, b), c) -> ((a, c), b)
                                      >>> first expandURIFixed
 -- | For every subtree selected by @xpathCondition@, pair the @href@ of each
 -- anchor matched by @//a@ with the subtree's text nodes joined by single
 -- spaces. Emits one (href, text) result per matched anchor.
 getLinkAndText xpathCondition =
  getXPathTrees xpathCondition
   >>> ( (getXPathTrees "//a" >>> getAttrValue "href")               -- link target(s) of the subtree
         &&& (listA (deep isText >>> getText) >>> arr (intercalate " ")) -- all text nodes, space-joined
       )
 -- | Proc-notation counterpart of 'extractNestedLinksWithText': runs
 -- 'downloadDocument' on the input, extracts (href, text) pairs from the
 -- result with 'getLinkAndText', and passes each href together with the
 -- original input through 'expandURIFixed'.
 -- Result: (value produced by 'expandURIFixed', link text).
 -- NOTE(review): presumably 'expandURIFixed' resolves the href against the
 -- input URL -- confirm against its definition.
 extractNestedLinksWithText2 xpathCondition = proc x -> do
  doc <- downloadDocument -< x
  thisValue <- this -< x
  -- getLinkAndText yields a flat (href, text) pair, not a nested tuple.
  (href, txt) <- getLinkAndText xpathCondition -< doc
  -- Same wiring as the point-free version after rotateSecTh
  -- (((a, b), c) -> ((a, c), b)): the href is paired with the original
  -- input, and the text is carried through untouched.
  uriFixed <- expandURIFixed -< (href, thisValue)
  returnA -< (uriFixed, txt)
 extractRecords = extractLinksWithText "(//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link'])[1]"  -- pary adres-tytuł podstrony
                 >>> first (extractLinksWithText "(//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link'])[1]") -- pobieramy podstronę i kolejne podstrony z menu
                 >>> first (first (extractNestedLinksWithText "//big/a[contains(@href,'.pdf')][img]")) -- pobieramy stronę z adresu URL i wyciągamy linki z tej strony pasujące do wyrażenia XPathowego