Compare commits

...

7 Commits

Author SHA1 Message Date
5877347916 extractclanguage 2021-04-17 12:42:55 +02:00
c5c01e7d81 next version 2021-04-13 10:25:17 +02:00
974098d4ec small tweak 2021-04-12 16:00:07 +02:00
9027097319 next version 2021-04-12 15:59:18 +02:00
27d91d8817 Merge branch 'withcurl' of git://gonito.net/twilight-library 2021-04-11 15:02:15 +02:00
fe298e2ea7 initial setup 2021-04-11 15:02:13 +02:00
107793cbf1 Change to Curl 2019-03-17 21:52:13 +01:00
5 changed files with 131 additions and 8 deletions

View File

@ -33,7 +33,7 @@ import Data.Tree.NTree.TypeDefs
import Data.Maybe
import Control.Monad.Trans
import Text.XML.HXT.XPath
-- import Text.XML.HXT.Curl
import Text.XML.HXT.Curl
import Text.XML.HXT.HTTP
import Text.Regex.TDFA
@ -64,8 +64,8 @@ downloadDocument = readFromDocument [withParseHTML yes,
withEncodingErrors no,
withPreserveComment yes,
withStrictInput yes,
withHTTP []
-- withCurl [("curl--user-agent","AMU Digital Libraries Indexing Agent")]
-- withHTTP []
withCurl [("curl--user-agent","AMU Digital Libraries Indexing Agent")]
]
downloadDocumentWithEncoding enc = readFromDocument [withParseHTML yes,
@ -73,13 +73,13 @@ downloadDocumentWithEncoding enc = readFromDocument [withParseHTML yes,
withEncodingErrors no,
withPreserveComment yes,
withInputEncoding enc,
withHTTP []]
-- withCurl []]
-- withHTTP []]
withCurl []]
downloadXmlDocument = readFromDocument [withWarnings no,
withEncodingErrors no,
withHTTP []]
-- withCurl [] ]
-- withHTTP []]
withCurl [] ]
data ShadowLibrary = ShadowLibrary { logoUrl :: Maybe String,

View File

@ -4,7 +4,7 @@ import ShadowLibrary.Core
import Text.XML.HXT.Core
import Text.XML.HXT.XPath
-- import Text.XML.HXT.Curl
--import Text.XML.HXT.Curl
import Data.List
import Data.List.Utils (replace)

89
app/archiwapilsudski.hs Normal file
View File

@ -0,0 +1,89 @@
{-# LANGUAGE Arrows, NoMonomorphismRestriction #-}
import ShadowLibrary.Core
import Text.XML.HXT.Core
import Text.XML.HXT.XPath
-- import Text.XML.HXT.Curl
import Data.List
import Data.List.Utils (replace)
import Text.Regex.Posix
import Text.Printf
-- | Transliterate Polish diacritics to ASCII and strip non-breaking spaces
-- from a scraped text fragment.
--
-- NOTE: 'replace' (Data.List.Utils, MissingH) performs *literal substring*
-- replacement, not regex matching, so every character needs its own call.
cleanText = arr (replace "\261" "a")   -- ą
        >>> arr (replace "\263" "c")   -- ć
        >>> arr (replace "\281" "e")   -- ę
        >>> arr (replace "\322" "l")   -- ł
        >>> arr (replace "\321" "L")   -- Ł
        >>> arr (replace "\324" "n")   -- ń
        >>> arr (replace "\243" "o")   -- ó
        >>> arr (replace "\347" "s")   -- ś
        >>> arr (replace "\346" "S")   -- Ś
        -- BUG FIX: the originals passed the regex-looking strings
        -- "(\378)|(\380)" and "(\377)|(\379)" to 'replace', which matches
        -- them literally and therefore never fires; split into one
        -- 'replace' per character instead.
        >>> arr (replace "\378" "z")   -- ź
        >>> arr (replace "\380" "z")   -- ż
        >>> arr (replace "\377" "Z")   -- Ź
        >>> arr (replace "\379" "Z")   -- Ż
        >>> arr (replace "\160" "")    -- U+00A0 non-breaking space
-- | Normalise a scraped date string: drop non-breaking spaces (U+00A0)
-- first, then drop ordinary spaces.
cleanDate = arr (replace " " "" . replace "\160" "")
-- | Pull the language cell out of the record's metadata table and
-- normalise it with 'cleanDate' (which just strips spaces/NBSPs).
-- Point-free pipeline equivalent to the original linear @proc@ chain.
getLang = getXPathTrees "//tr[td/text() = ' J\281zyki ']/td[2]"
      >>> getXPathTrees "//text()"
      >>> listA (deep isText >>> getText)
      >>> arr (intercalate " ")
      >>> cleanDate
-- | Extract the page title's text content and transliterate Polish
-- diacritics via 'cleanText'.
-- Point-free pipeline equivalent to the original linear @proc@ chain.
getTitle = getXPathTrees "//title"
       >>> getXPathTrees "//text()"
       >>> listA (deep isText >>> getText)
       >>> arr (intercalate " ")
       >>> cleanText
-- | Pull the document-date cell out of the record's metadata table and
-- strip whitespace with 'cleanDate'.
-- Point-free pipeline equivalent to the original linear @proc@ chain.
getDate = getXPathTrees "//tr[td/text() = ' Data dokumentu ']/td[2]"
      >>> getXPathTrees "//text()"
      >>> listA (deep isText >>> getText)
      >>> arr (intercalate " ")
      >>> cleanDate
-- | Collect the PDF download links from the record page, prefixing each
-- relative @href@ with the site's base URL.
-- Point-free pipeline equivalent to the original linear @proc@ chain.
getUrls = getXPathTrees "//div[@style='padding: 4px 7px;float:left;']/a[contains(@href,'pdf')]"
      >>> getAttrValue "href"
      >>> arr ("http://archiwa.pilsudski.org/" ++)
-- | Download the record page at the given URL and extract the fields the
-- shadow library needs: (download URLs, title, date, language).
-- Kept in arrow notation: the four extractors all consume the same parsed
-- document, and the exact bind order under HXT's list arrow is preserved.
extractRecordData = proc recordUrl -> do
  doc <- downloadDocument -< recordUrl
  recordTitle <- getTitle -< doc
  recordDate <- getDate -< doc
  recordUrls <- getUrls -< doc
  recordLang <- getLang -< doc
  -- Note the tuple order differs from the bind order: urls come first.
  returnA -< (recordUrls, recordTitle, recordDate, recordLang)
-- | Crawl from the start page down to individual records and extract each
-- record's data.
--
-- NOTE(review): the three 'extractLinks' stages are composed with (>>>),
-- so each one runs on the pages reached by the previous stage — this walks
-- three levels deep, it does NOT try three alternative selectors on the
-- same page. 'extractLinks' is defined elsewhere (ShadowLibrary.Core);
-- confirm this three-level traversal matches the site's actual structure.
extractRecordLinks = extractLinks "//table[@class='tabelka']//a[contains(@href,'.php')]"
                 >>> extractLinks "//table[@class='tabelka']//td[@width='68%']/a[contains(@href,'.php')]"
                 >>> extractLinks "//table[@class='tabelka']//td[@width='76%']/a[contains(@href,'.php')]"
                 >>> extractRecordData
-- | Build a 'ShadowItem' from the (urls, title, date, language) tuple
-- produced by the record extractor. Everything is a PDF periodical; the
-- extracted URL doubles as the final download URL.
toShadowItem :: (String, String, String, String) -> ShadowItem
toShadowItem (url, recordTitle, recordDate, recordLang) =
  (defaultShadowItem url recordTitle)
    { originalDate = Just recordDate
    , itype        = "periodical"
    , format       = Just "pdf"
    , finalUrl     = url
    , lang         = Just recordLang
    }
-- | Entry point: crawl the Piłsudski archive starting from its front page
-- and convert every extracted record into a 'ShadowItem'.
main = do
  let start = "https://archiwa.pilsudski.org/"
      -- NOTE(review): the start URL is https:// while 'getUrls' prefixes
      -- links with http:// — presumably both resolve; verify against the
      -- live site.
      shadowLibrary = ShadowLibrary { logoUrl = Nothing
                                    , lname   = "Archiwa Pilsudski"
                                    , abbrev  = "ArchPil"
                                    , lLevel  = 0
                                    , webpage = start
                                    }
  extractItemsStartingFromUrl shadowLibrary start (extractRecordLinks >>> arr toShadowItem)

View File

@ -20,6 +20,7 @@ library
, HTTP
, hxt
, hxt-http
, hxt-curl
, hxt-xpath
, MissingH
, monad-logger
@ -51,6 +52,20 @@ executable almanachmuszyny
hs-source-dirs: app
main-is: almanachmuszyny.hs
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends: base
, hxt
, hxt-xpath
, hxt-curl
, MissingH
, regex-posix
, shadow-library
default-language: Haskell2010
executable archiwapilsudski
hs-source-dirs: app
main-is: archiwapilsudski.hs
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends: base
, hxt
, hxt-xpath

19
stack.yaml.lock Normal file
View File

@ -0,0 +1,19 @@
# This file was autogenerated by Stack.
# You should not edit this file by hand.
# For more information, please see the documentation at:
# https://docs.haskellstack.org/en/stable/lock_files
packages:
- completed:
hackage: hxt-xpath-9.1.2.2@sha256:9cd590ae93a04573db8f90fa4094625ebd97dded45da7667c577ce6b38a42900,1999
pantry-tree:
size: 2225
sha256: aee2f75974e868ff429b8ff349a29667536c60397098f5dfedc968d1951511bb
original:
hackage: hxt-xpath-9.1.2.2
snapshots:
- completed:
size: 507596
url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/11/9.yaml
sha256: 42f472dbf06482da1b3319241f3e3b3593a45bd7d4f537d2789f21386b9b2ad3
original: lts-11.9