title extraction

Robot for Teatr Lalek
2022-06-15 22:27:32 +02:00 · 2022-04-10 23:54:07 +02:00 · 2022-04-10 23:51:13 +02:00 · 2021-03-13 18:31:04 +01:00
3 changed files with 59 additions and 16 deletions
--- a/app/almanachmuszyny.hs
+++ b/app/almanachmuszyny.hs
@ -11,11 +11,14 @@ import Data.List.Utils (replace)
 import Text.Regex.Posix
 import Text.Printf

-extractRecords = extractLinksWithText "//a[@class='roczniki']"
-                 >>> second (arr $ replace "\r\n              " "")
-                 >>> first (arr ((++"tr") . init))
-                 >>> first (extractLinksWithText "//li/a[contains(@href,'.pdf')]")

+-- Arrow pipeline that starts from the yearly-volume links (anchors with
+-- class 'roczniki') and ends with the PDF links found behind each of them.
+extractRecords = extractLinksWithText "//a[@class='roczniki']"  -- address-title pairs
+                 >>> second (arr $ replace "\r\n            " " ") -- clean the second element of the pair, i.e. the title, of unneeded whitespace
+                 >>> first (arr ((++"tr") . init))  -- modify the first element of the pair, i.e. the URL: drop its last character and append "tr"
+                 >>> first (extractLinksWithText "//li/a[contains(@href,'.pdf')]") -- fetch the page at that URL and extract the links on it that match the XPath expression
+                 -- the end result is triples ((URL, article title), yearly volume title)
+
+-- ... and here those triples are converted into the target ShadowItem structure
 toShadowItem :: ((String, String), String) -> ShadowItem
 toShadowItem ((url, articleTitle), yearlyTitle) =
  (defaultShadowItem url title) {
--- a/app/teatrLalek.hs
+++ b/app/teatrLalek.hs
@ -0,0 +1,39 @@
+
+{-# LANGUAGE Arrows, NoMonomorphismRestriction #-}
+import ShadowLibrary.Core
+
+import Text.XML.HXT.Core
+import Text.XML.HXT.XPath
+-- import Text.XML.HXT.Curl
+import Data.List
+import Data.List.Utils (replace)
+
+import Text.Regex.Posix
+import Text.Printf
+
+
+-- Arrow selecting the links to harvest: the XPath matches every <a> element
+-- whose href contains ".pdf". Its output is fed to toShadowItem in main, so
+-- each result is a String.
+-- NOTE(review): extractLinks comes from ShadowLibrary.Core (not shown here);
+-- presumably the String is the link's URL - confirm.
+extractRecords = extractLinks "//a[contains(@href, '.pdf')]"
+
+
+-- | Build the 'ShadowItem' for a single PDF link.
+--
+-- The publication year and the title are both recovered from the URL text:
+--
+-- * the date is the last four-digit number of the form 19xx or 20xx found
+--   in the URL, or 'Nothing' when the URL contains no such number;
+-- * the title is the first run of ASCII letters/spaces inside the last
+--   URL fragment shaped like "/letters-and-spaces" followed by '-' or '_',
+--   or the empty string when no such fragment exists.
+--
+-- Earlier revisions applied 'last' directly to the match list, which
+-- raised an exception for any link lacking a year or a title-like segment.
+toShadowItem :: String -> ShadowItem
+toShadowItem url =
+  (defaultShadowItem url title) {
+    originalDate = date,
+    itype = "periodical",
+    format = Just "pdf",
+    finalUrl = url
+    }
+  where
+    -- Last match of a POSIX regex in the input, if there is any match.
+    lastMatch :: String -> String -> Maybe String
+    lastMatch pat input =
+      case reverse (getAllTextMatches (input =~ pat) :: [String]) of
+        (m:_) -> Just m
+        []    -> Nothing
+    date = lastMatch "(19[0-9][0-9]|20[0-9][0-9])" url
+    -- Falls back to "" so the title extraction below yields "" as well.
+    titleToProcess = maybe "" id (lastMatch "/[a-zA-Z ]+[-_]" url)
+    -- Strips the leading '/' and the trailing '-' / '_' separator.
+    title = titleToProcess =~ "[a-zA-Z ]+" :: String
+
+
+-- Entry point: describes the "Teatr Lalek" library and runs the extraction
+-- from its start page, turning every record produced by extractRecords into
+-- a ShadowItem via toShadowItem.
+main =
+    extractItemsStartingFromUrl library startUrl (extractRecords >>> arr toShadowItem)
+  where
+    -- Page the crawl starts from; also recorded as the library's webpage.
+    startUrl = "http://polunima.pl/teatr-lalek/"
+    library = ShadowLibrary
+      { logoUrl = Nothing
+      , lname = "Teatr Lalek"
+      , abbrev = "Teatr"
+      , lLevel = 0
+      , webpage = startUrl
+      }
--- a/shadow-library.cabal
+++ b/shadow-library.cabal
@ -34,18 +34,19 @@ library
                     , tz
  default-language:    Haskell2010

-- executable maly-modelarz-exe
--    hs-source-dirs:      app
--    main-is:             malymodelarz.hs
--    ghc-options:         -threaded -rtsopts -with-rtsopts=-N
--    build-depends:       base
--                       , hxt
--                       , hxt-curl
--                       , hxt-xpath
--                       , MissingH
--                       , regex-posix
--                       , shadow-library
--    default-language:    Haskell2010
+-- Executable for the Teatr Lalek robot, built from app/teatrLalek.hs.
+-- NOTE(review): regex-tdfa is listed below, but the teatrLalek.hs shown in
+-- this change imports only Text.Regex.Posix, and its Text.XML.HXT.Curl
+-- import is commented out - confirm whether regex-tdfa and hxt-curl are
+-- actually needed by this target.
+executable teatrLalek
+    hs-source-dirs:      app
+    main-is:             teatrLalek.hs
+    ghc-options:         -threaded -rtsopts -with-rtsopts=-N
+    build-depends:       base
+                       , hxt
+                       , hxt-curl
+                       , hxt-xpath
+                       , MissingH
+                       , regex-posix
+                       , shadow-library
+                       , regex-tdfa
+    default-language:    Haskell2010

 executable almanachmuszyny
   hs-source-dirs:      app
Author	SHA1	Message	Date
s470623	0114a102d4	title extraction	2022-06-15 22:27:32 +02:00
s470623	1f23d94e7c	Robot for Teatr Lalek	2022-04-10 23:54:07 +02:00
s470623	953d7242e8	Robot for Teatr Lalek	2022-04-10 23:51:13 +02:00
Filip Gralinski	8883a924b4	Add comments	2021-03-13 18:31:04 +01:00