clean output

2021-04-07 23:28:39 +02:00 · 2021-04-07 23:28:39 +02:00 · 1d077001b8
commit 1d077001b8
parent 0d45d42336
1 changed file with 12 additions and 9 deletions
--- a/app/ZborBielawa.hs
+++ b/app/ZborBielawa.hs
@@ -8,7 +8,6 @@ import Text.XML.HXT.XPath
 import Data.List
 import Data.List.Utils (replace)

-- import Text.Regex.Posix
 import Text.Printf

 import Control.Lens.Regex.Text
@@ -31,13 +30,17 @@ extractNestedLinksWithText xpathCondition = proc url -> do


 extractRecords = proc startUrl -> do 
-                --  (catUrl, catText) <- extractLinksWithText "(//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link'])[2]" -< startUrl  -- pary adres-tytuł podstrony
-                 (catUrl, catText) <- extractLinksWithText "//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link']" -< startUrl  -- pary adres-tytuł podstrony
-                 (collUrl, collText) <- (extractLinksWithText "(//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link'])[1]") -< catUrl -- pobieramy podstronę kategorii i kolejne podstrony z menu
-                --  (collUrl, collText) <- (extractLinksWithText "//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link']") -< catUrl -- pobieramy podstronę kategorii i kolejne podstrony z menu
-                 (relUrl, relText) <- (extractNestedLinksWithText "(//big[a[contains(@href,'.pdf')]])[1]") -< collUrl -- pobieramy stronę z wydaniami z danej kolekcji i linki do PDFów
-                --  (relUrl, relText) <- (extractNestedLinksWithText "//big[a[contains(@href,'.pdf')]]") -< collUrl -- pobieramy stronę z wydaniami z danej kolekcji i linki do PDFów
-                 returnA -< (relUrl, relText, collText, catText) -- ostatecznie wyjdą krotki (adres URL PDFa wydania, tytuł wydania, tytuł zbioru, tytuł kategorii)
+  (catUrl, catText) <- extractLinksWithText "//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link']" -< startUrl  -- (URL, title) pairs of the subpages linked from the start page's mega-menu widget
+  (collUrl, collText) <- (extractLinksWithText "//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link']") -< catUrl -- fetch the category subpage and take the further subpages linked from its menu
+  (relUrl, relText) <- (extractNestedLinksWithText "//big[a[contains(@href,'.pdf')]]") -< collUrl -- fetch the page listing a collection's releases and take the links to the PDFs
+  returnA -< (relUrl, relText, collText, catText) -- the final result is tuples (release PDF URL, release title, collection title, category title)
+
+cleanReleaseTitle :: String -> String -- Tidy a release title: drop " |<nbsp>" sequences, turn remaining no-break spaces into plain spaces, then drop "<en dash> " sequences.
+cleanReleaseTitle = proc text -> do -- arrow notation over the plain function arrow (->), so every step below is an ordinary String -> String application
+  endCleaned <- replace " |\160" "" -< text -- delete each space + '|' + U+00A0 sequence; replace is not anchored, so matches anywhere in the string are removed, not only at the end
+  nbspCleaned <- replace "\160" " " -< endCleaned -- \160 is U+00A0 (no-break space); must run after the step above, whose pattern contains that same character
+  res <- replace "\8211 " "" -< nbspCleaned -- \8211 is U+2013 (en dash); matches only when followed by a plain space, including spaces produced by the previous step
+  returnA -< res

 -- ... a tutaj te krotki przerabiamy do docelowej struktury ShadowItem
 toShadowItem :: (String, String, String, String) -> ShadowItem
@@ -48,7 +51,7 @@ toShadowItem (url, releaseTitle, collectionTitle, categoryTitle) =
    format = Just "pdf",
    finalUrl = url
  }
-  where title = categoryTitle ++ (" " ++ collectionTitle)
+  where title = categoryTitle ++ (" " ++ arr cleanReleaseTitle releaseTitle)
        date = getDate $ T.pack $ releaseTitle