From e5561156becb2ff1a52994c2b3bda5bcb30ad41f Mon Sep 17 00:00:00 2001 From: Wojciech Pakulski Date: Tue, 6 Apr 2021 02:25:53 +0200 Subject: [PATCH] Add WUP bot --- app/urzadpatentowy.hs | 65 +++++++++++++++++++++++++++++++++++++++++++ shadow-library.cabal | 18 ++++++++++-- 2 files changed, 80 insertions(+), 3 deletions(-) create mode 100644 app/urzadpatentowy.hs diff --git a/app/urzadpatentowy.hs b/app/urzadpatentowy.hs new file mode 100644 index 0000000..469374a --- /dev/null +++ b/app/urzadpatentowy.hs @@ -0,0 +1,65 @@ +{-# LANGUAGE Arrows, NoMonomorphismRestriction #-} +import ShadowLibrary.Core + +import Text.XML.HXT.Core +import Text.XML.HXT.XPath +--import Text.XML.HXT.Curl +import Data.List +import Data.List.Utils (replace) + +import Text.Regex.Posix +import Text.Printf + +import Control.Monad + + +extractFirstPage = (downloadDocument &&& this) + >>> first (getXPathTrees "//td[@class='ms-gb'][1]" >>> getChildren >>> getText) + +wupExtractor = (downloadDocument &&& this) + >>> first ( getXPathTrees "//tr[contains(@class, 'ms-itmhover') or contains(@class, 'ms-alternatingstrong')]" ) + >>> first (deep ( + (getXPathTrees "//td[@class='ms-vb2'][6]" >>> getChildren >>> getText) &&& + (getXPathTrees "//td[@class='ms-vb2'][7]/a" >>> getAttrValue "href") + )) + +fetchLinks year xs failedTries = do + items <- runX $ (arr (const ( + "https://grab.uprp.pl/sites/Wydawnictwa/WydawnictwaArchiwum/_layouts/15/inplview.aspx?List={D7CA1A2D-B281-4EB6-A9DE-E20ED7654A37}&View={AFA62008-9FDA-4B91-B630-AA3EC8CFB774}&ViewCount=9&IsXslView=TRUE&GroupString=%3B%23" ++ + show year ++ + "%3B%23&IsGroupRender=TRUE&WebPartID={AFA62008-9FDA-4B91-B630-AA3EC8CFB774}" + )) >>> setTraceLevel 1 >>> wupExtractor >>> arr toShadowItem) + + if length items == 0 then + if failedTries == 10 then + mapM_ (putStrLn . show) xs + else + fetchLinks (year + 1) xs (failedTries + 1) + else + fetchLinks (year + 1) (xs ++ items) 0 + + +toShadowItem :: ((String, String), String) -> ShadowItem +toShadowItem ((date, relativeFileUrl), url) = + (defaultShadowItem url title) { + originalDate = Just date, + itype = "periodical", + format = Just "pdf", + finalUrl = fileUrl + } + where fileUrl = "https://grab.uprp.pl" ++ relativeFileUrl + title = "WUP" + +main = do + let start = "https://grab.uprp.pl/sites/Wydawnictwa/WydawnictwaArchiwum/WydawnictwaArchiwum/Forms/AllItems.aspx" + let shadowLibrary = ShadowLibrary {logoUrl=Nothing, + lname="Wiadomości Urzędu Patentowego", + abbrev="WUP", + lLevel=0, + webpage=start} + + firstPageItems <- runX $ (arr (const start) >>> setTraceLevel 1 >>> extractFirstPage) + let firstYear = read $ (head . tail . words . fst . head) firstPageItems :: Int + + fetchLinks firstYear [] 0 + diff --git a/shadow-library.cabal b/shadow-library.cabal index 3e1bd4e..a0f4daa 100644 --- a/shadow-library.cabal +++ b/shadow-library.cabal @@ -48,9 +48,22 @@ library -- , shadow-library -- default-language: Haskell2010 -executable almanachmuszyny +-- executable almanachmuszyny +-- hs-source-dirs: app +-- main-is: almanachmuszyny.hs +-- ghc-options: -threaded -rtsopts -with-rtsopts=-N +-- build-depends: base +-- , hxt +-- , hxt-xpath +-- , hxt-curl +-- , MissingH +-- , regex-posix +-- , shadow-library +-- default-language: Haskell2010 + +executable urzadpatentowy hs-source-dirs: app - main-is: almanachmuszyny.hs + main-is: urzadpatentowy.hs ghc-options: -threaded -rtsopts -with-rtsopts=-N build-depends: base , hxt @@ -61,7 +74,6 @@ executable almanachmuszyny , shadow-library default-language: Haskell2010 - source-repository head type: git location: https://github.com/name/project