{-# LANGUAGE Arrows, NoMonomorphismRestriction #-} import ShadowLibrary.Core import Text.XML.HXT.Core import Text.XML.HXT.XPath --import Text.XML.HXT.Curl import Data.List import Data.List.Utils (replace) import Text.Regex.Posix import Text.Printf import Control.Monad extractFirstPage = (downloadDocument &&& this) >>> first (getXPathTrees "//td[@class='ms-gb'][1]" >>> getChildren >>> getText) wupExtractor = (downloadDocument &&& this) >>> first ( getXPathTrees "//tr[contains(@class, 'ms-itmhover') or contains(@class, 'ms-alternatingstrong')]" ) >>> first (deep ( (getXPathTrees "//td[@class='ms-vb2'][6]" >>> getChildren >>> getText) &&& (getXPathTrees "//td[@class='ms-vb2'][7]/a" >>> getAttrValue "href") &&& (getXPathTrees "//td[@class='ms-vb2'][5]" >>> getChildren >>> getText) )) fetchLinks year xs failedTries = do items <- runX $ (arr (const ( "https://grab.uprp.pl/sites/Wydawnictwa/WydawnictwaArchiwum/_layouts/15/inplview.aspx?List={D7CA1A2D-B281-4EB6-A9DE-E20ED7654A37}&View={AFA62008-9FDA-4B91-B630-AA3EC8CFB774}&ViewCount=9&IsXslView=TRUE&GroupString=%3B%23" ++ show year ++ "%3B%23&IsGroupRender=TRUE&WebPartID={AFA62008-9FDA-4B91-B630-AA3EC8CFB774}" )) >>> setTraceLevel 1 >>> wupExtractor) >>> arr toShadowItem if length items == 0 then if failedTries == 10 then return xs else fetchLinks (year + 1) xs (failedTries + 1) else fetchLinks (year + 1) (xs ++ items) 0 toShadowItem :: ((String, (String, String)), String) -> ShadowItem toShadowItem ((date, (relativeFileUrl, itemNr)), url) = (defaultShadowItem url title) { originalDate = Just date, itype = "periodical", format = Just "pdf", finalUrl = fileUrl } where fileUrl = "https://grab.uprp.pl" ++ relativeFileUrl title = "Wiadomosci Urzedu Patentowego Nr. " ++ itemNr main = do let start = "https://grab.uprp.pl/sites/Wydawnictwa/WydawnictwaArchiwum/WydawnictwaArchiwum/Forms/AllItems.aspx" let shadowLibrary = ShadowLibrary {logoUrl=Nothing, lname="Wiadomości Urzędu Patentowego", abbrev="WUP", lLevel=0, webpage=start} firstPageItems <- runX $ (arr (const start) >>> setTraceLevel 1 >>> extractFirstPage) let firstYear = read $ (head . tail . words . fst . head) firstPageItems :: Int shadowItems <- fetchLinks firstYear [] 0 mapM_ (putStrLn . show) shadowItems