fix chelmek

add project chelmek
2022-05-05 20:30:14 +02:00 · 2022-04-16 21:07:17 +02:00
3 changed files with 71 additions and 10 deletions
--- a/ShadowLibrary/Core.hs
+++ b/ShadowLibrary/Core.hs
@ -33,7 +33,7 @@ import Data.Tree.NTree.TypeDefs
 import Data.Maybe
 import Control.Monad.Trans
 import Text.XML.HXT.XPath
-- import Text.XML.HXT.Curl
+import Text.XML.HXT.Curl
 import Text.XML.HXT.HTTP

 import Text.Regex.TDFA
@ -64,8 +64,8 @@ downloadDocument = readFromDocument [withParseHTML yes,
                                     withEncodingErrors no,
                                     withPreserveComment yes,
                                     withStrictInput yes,
-                                     withHTTP []
--                                     withCurl [("curl--user-agent","AMU Digital Libraries Indexing Agent")]
+--                                     withHTTP []
+                                     withCurl [("curl--user-agent","AMU Digital Libraries Indexing Agent")]
                                    ]

 downloadDocumentWithEncoding enc = readFromDocument [withParseHTML yes,
@ -73,13 +73,13 @@ downloadDocumentWithEncoding enc = readFromDocument [withParseHTML yes,
                                                     withEncodingErrors no,
                                                     withPreserveComment yes,
                                                     withInputEncoding enc,
-					             withHTTP []]		
--                                                     withCurl []]
+--					             withHTTP []]
+                                                     withCurl []]

 downloadXmlDocument = readFromDocument [withWarnings no,
                                        withEncodingErrors no,
-					withHTTP []]
--                                        withCurl [] ]
+--					withHTTP []]
+                                        withCurl [] ]


 data ShadowLibrary = ShadowLibrary { logoUrl :: Maybe String,
--- a/app/chelmek.hs
+++ b/app/chelmek.hs
@ -0,0 +1,47 @@
+
+{-# LANGUAGE Arrows, NoMonomorphismRestriction #-}
+import ShadowLibrary.Core
+
+import Text.XML.HXT.Core
+import Text.XML.HXT.XPath
+-- import Text.XML.HXT.Curl
+import Data.List
+import Data.List.Utils (replace)
+
+import Text.Regex.Posix
+import Text.Printf
+
+
+-- | Arrow that collects the raw records to index.
+-- The first extraction yields address/title pairs; the second one is applied
+-- to the address component only (via 'first'), so the final values have the
+-- shape ((URL, article title), yearly title).
+extractRecords = sectionLinks >>> first pdfLinks
+  where
+    -- address/title pairs for the links selected on the start page
+    sectionLinks = extractLinksWithText "//div[@class='span4']//h2[@itemprop='name']/a[contains(@href,'o-nas')]"
+    -- fetch the page behind the URL and pull out the links on it that match the XPath expression
+    pdfLinks = extractLinksWithText "//a[contains(@href,'.pdf')]"
+
+
+-- | Turn one extracted record into a 'ShadowItem'.
+-- The input is ((URL, article title), yearly title).  The item starts from
+-- 'defaultShadowItem' and is then marked as a PDF periodical whose final URL
+-- is the record URL and whose original date is derived from the yearly title.
+toShadowItem :: ((String, String), String) -> ShadowItem
+toShadowItem ((url, articleTitle), yearlyTitle) =
+  baseItem
+    { originalDate = Just (getDate yearlyTitle)
+    , itype = "periodical"
+    , format = Just "pdf"
+    , finalUrl = url
+    }
+  where
+    -- double quotes in the title are replaced with single quotes
+    baseItem = defaultShadowItem url (replace "\"" "'" articleTitle)
+
+-- | Extract a four-digit year (1900-2099) from a yearly title.
+-- When the title contains several years, the first match is returned.
+-- When no year is found, a diagnostic string containing the offending
+-- title is returned instead, so the caller always receives a 'String'.
+getDate :: String -> String
+getDate yearlyTitle =
+  case yearlyTitle =~~ "(19[0-9][0-9]|20[0-9][0-9])" :: Maybe [[String]] of
+    -- each match is [whole match, group 1]; only the first match is used,
+    -- so titles mentioning more than one year are still accepted
+    Just ((_ : year : _) : _) -> year
+    _ -> "unexpected yearlyTitle: " ++ yearlyTitle
+
+
+-- | Entry point: describe the Chelmek library and run the item extraction
+-- from the start URL, converting every extracted record with 'toShadowItem'.
+main = extractItemsStartingFromUrl shadowLibrary start (extractRecords >>> arr toShadowItem)
+  where
+    -- the start URL, also used as the library's web page
+    start = "http://moksir.chelmek.pl/o-nas/echo-chelmka"
+    shadowLibrary = ShadowLibrary
+      { logoUrl = Nothing
+      , lname = "Chelmek"
+      , abbrev = "Chelmek"
+      , lLevel = 0
+      , webpage = start
+      }
+    
+    
+
--- a/shadow-library.cabal
+++ b/shadow-library.cabal
@ -5,8 +5,8 @@ description:         Please see README.md
 homepage:            http://github.com/name/project
 license:             Proprietary
 license-file:        LICENSE
-author:              Your name here
-maintainer:          your.address@example.com
+author:              Maciej Ścigacz
+maintainer:          macsci1@st.amu.edu.pl
 -- copyright:
 category:            Web
 build-type:          Simple
@ -20,6 +20,7 @@ library
                     , HTTP
                     , hxt
                     , hxt-http
+                     , hxt-curl
                     , hxt-xpath
                     , MissingH
                     , monad-logger
@ -54,12 +55,25 @@ executable almanachmuszyny
   build-depends:       base
                      , hxt
                      , hxt-xpath
+                      , hxt-curl
                      , MissingH
                      , regex-posix
                      , shadow-library
   default-language:    Haskell2010

+executable chelmek
+   hs-source-dirs:      app
+   main-is:             chelmek.hs
+   ghc-options:         -threaded -rtsopts -with-rtsopts=-N
+   build-depends:       base
+                      , hxt
+                      , hxt-xpath
+                      , hxt-curl
+                      , MissingH
+                      , regex-posix
+                      , shadow-library
+   default-language:    Haskell2010

 source-repository head
  type:     git
-  location: https://github.com/name/project
+  location: https://git.wmi.amu.edu.pl/s444476/twilight-library
Author	SHA1	Message	Date
Maciej Ścigacz	ce0342076d	fix chelmek	2022-05-05 20:30:14 +02:00
Maciej Ścigacz	a96e26c4ef	add project chelmek	2022-04-16 21:07:17 +02:00