Fetching text

2021-04-07 03:54:56 +02:00 · 2021-04-07 03:54:56 +02:00 · cb5948925d
commit cb5948925d
parent 10c06c503c
1 changed files with 4 additions and 5 deletions
--- a/app/ZborBielawa.hs
+++ b/app/ZborBielawa.hs
@@ -4,21 +4,20 @@ import ShadowLibrary.Core

 import Text.XML.HXT.Core
 import Text.XML.HXT.XPath
-- import Text.XML.HXT.Curl
+
 import Data.List
 import Data.List.Utils (replace)

 import Text.Regex.Posix
 import Text.Printf
-import Debug.Trace


 getLinkAndText xpathCondition = proc doc -> do
  xpathTrees <- getXPathTrees xpathCondition -< doc
  name <- getElemName -< xpathTrees
-  txt <- (getXPathTrees "../text()" >>> getText) -< xpathTrees
+  txt <- (listA (deep isText >>> getText)  >>> arr (intercalate " ")) -< xpathTrees  -- gather the text nodes under the match, joined with single spaces
  href <- (getXPathTrees "//a" >>> getAttrValue "href") -< xpathTrees
-  returnA -< trace ("AAAAAAAAAAAAAAAA " ++ show name) $ traceShowId (href, txt)
+  returnA -< (href, txt)  -- the pair must be parenthesised: `-<` takes a single expression


 extractNestedLinksWithText xpathCondition = proc url -> do
@@ -31,7 +30,7 @@ extractNestedLinksWithText xpathCondition = proc url -> do
 extractRecords = proc x -> do 
                 (a, b) <- extractLinksWithText "(//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link'])[1]" -< x  -- (address, title) pairs for the subpage
                 (a', b') <- (extractLinksWithText "(//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link'])[1]") -< a -- fetch the subpage and the further subpages from the menu
-                 a'' <- (extractNestedLinksWithText "//big/a[contains(@href,'.pdf')][img]") -< a' -- fetch the page at the URL and extract the links on that page that match the XPath expression
+                 a'' <- (extractNestedLinksWithText "//big[a[contains(@href,'.pdf')][img]]") -< a' -- fetch the page at the URL and extract the links on that page that match the XPath expression
                 returnA -< ((a'', b'), b)
                 -- the final result is tuples of the form (((URL, issue title), title of subpage 2), title of subpage 1)