From 1d077001b8bff5b2a8135efde99df181ed7f0b1a Mon Sep 17 00:00:00 2001
From: Norbert Litkowski <norbert.litkowski@gmail.com>
Date: Wed, 7 Apr 2021 23:28:39 +0200
Subject: [PATCH] clean output

---
 app/ZborBielawa.hs | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/app/ZborBielawa.hs b/app/ZborBielawa.hs
index a551f3c..f8e9d85 100644
--- a/app/ZborBielawa.hs
+++ b/app/ZborBielawa.hs
@@ -8,7 +8,6 @@ import Text.XML.HXT.XPath
 import Data.List
 import Data.List.Utils (replace)
 
--- import Text.Regex.Posix
 import Text.Printf
 
 import Control.Lens.Regex.Text
@@ -31,13 +30,17 @@ extractNestedLinksWithText xpathCondition = proc url -> do
 
 
 extractRecords = proc startUrl -> do 
-                --  (catUrl, catText) <- extractLinksWithText "(//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link'])[2]" -< startUrl  -- pary adres-tytuł podstrony
-                 (catUrl, catText) <- extractLinksWithText "//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link']" -< startUrl  -- pary adres-tytuł podstrony
-                 (collUrl, collText) <- (extractLinksWithText "(//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link'])[1]") -< catUrl -- pobieramy podstronę kategorii i kolejne podstrony z menu
-                --  (collUrl, collText) <- (extractLinksWithText "//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link']") -< catUrl -- pobieramy podstronę kategorii i kolejne podstrony z menu
-                 (relUrl, relText) <- (extractNestedLinksWithText "(//big[a[contains(@href,'.pdf')]])[1]") -< collUrl -- pobieramy stronę z wydaniami z danej kolekcji i linki do PDFów
-                --  (relUrl, relText) <- (extractNestedLinksWithText "//big[a[contains(@href,'.pdf')]]") -< collUrl -- pobieramy stronę z wydaniami z danej kolekcji i linki do PDFów
-                 returnA -< (relUrl, relText, collText, catText) -- ostatecznie wyjdą krotki (adres URL PDFa wydania, tytuł wydania, tytuł zbioru, tytuł kategorii)
+  (catUrl, catText) <- extractLinksWithText "//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link']" -< startUrl  -- (address, title) pairs of the subpages linked from the start page's menu
+  (collUrl, collText) <- (extractLinksWithText "//aside[@class='widget widget_maxmegamenu']//a[@class='mega-menu-link']") -< catUrl -- fetch the category subpage and the further subpages linked from its menu
+  (relUrl, relText) <- (extractNestedLinksWithText "//big[a[contains(@href,'.pdf')]]") -< collUrl -- fetch the page listing the releases of the given collection, together with the links to the PDFs
+  returnA -< (relUrl, relText, collText, catText) -- the final output is tuples of (release PDF URL, release title, collection title, category title)
+
+-- | Tidy a scraped release title. Steps run bottom-up: drop every " |" followed by a no-break space (U+00A0), turn remaining no-break spaces into plain spaces, then drop every en dash (U+2013) followed by a space.
+cleanReleaseTitle :: String -> String
+cleanReleaseTitle =
+  replace "\8211 " ""
+    . replace "\160" " "
+    . replace " |\160" ""
 
 -- ... a tutaj te krotki przerabiamy do docelowej struktury ShadowItem
 toShadowItem :: (String, String, String, String) -> ShadowItem
@@ -48,7 +51,7 @@ toShadowItem (url, releaseTitle, collectionTitle, categoryTitle) =
     format = Just "pdf",
     finalUrl = url
   }
-  where title = categoryTitle ++ (" " ++ collectionTitle)
+  where title = categoryTitle ++ (" " ++ arr cleanReleaseTitle releaseTitle)
         date = getDate $ T.pack $ releaseTitle