{-# LANGUAGE Arrows #-}
{-# LANGUAGE NoMonomorphismRestriction #-}

-- Scraper for the "Archiwum Harcerskie" site: follows links from the start
-- page down to PDF viewer URLs and turns each hit into a 'ShadowItem'.

import ShadowLibrary.Core

import Text.XML.HXT.Core
import Text.XML.HXT.XPath
-- import Text.XML.HXT.Curl
import Data.List
import Data.List.Utils (replace)

import Text.Regex.Posix
import Text.Printf

-- | Arrow that extracts the PDF records, starting from the start-page URL.
--
-- The output is fed to 'toShadowItem', so each result has the shape
-- @((url, fileTitle), text)@.  No explicit signature is given here; the
-- arrow type comes from the ShadowLibrary.Core helpers and is left to
-- inference (the file enables NoMonomorphismRestriction).
--
-- NOTE(review): the per-stage comments below assume the ShadowLibrary.Core
-- helpers follow every link matched by the XPath and emit the link target
-- (plus the link text for the @...WithText@ variant) -- confirm against the
-- library.
extractRecords =
  -- Stage 1: category links in the navigation block.
  extractLinksWithText "//nav//a[contains(@href,'title=Kategoria:')]"
    -- Stage 2: on each category page, links to file pages mentioning "pdf".
    >>> first (extractLinksWithText "//a[contains(@href,'title=Plik') and contains(@href,'pdf')]")
    -- Stage 3: on each file page, the @src@ attribute of the embedded iframe.
    >>> first (first (extractLinksGeneralized "//div[@id='file']//iframe" "src"))
    -- Stage 4: drop the "#page=1" fragment from the URL.
    >>> first (first (arr $ replace "#page=1" ""))

-- Attempts at downloading both pdf and jpg (kept for reference, disabled):
--extractRecords = extractLinksWithText "//nav//a[contains(@href,'title=Kategoria:14_WDH')]"
--                 >>> first (extractLinksWithText "//a[contains(@href,'title=Plik') and (contains(@href,'pdf') or contains(@href,'jpg'))]")
--                 >>> first ( first (
--                      downloadDocument
--                      >>> (getXPathTrees "//div[@id='file']//iframe"
--                      >>> getAttrValue "src") *** (getXPathTrees "//div[@id='file']//a"
--                      >>> getAttrValue "href")
--                      >>> first (expandURIFixed)
--                      ))

-- | Build a 'ShadowItem' from one record produced by 'extractRecords'.
--
-- The title is the file title prefixed with @"Archiwum Harcerskie - "@; the
-- date and format are both derived from the URL.  The third tuple component
-- is not used (presumably the text of the category link -- verify against
-- 'extractRecords').
toShadowItem :: ((String, String), String) -> ShadowItem
toShadowItem ((url, fileTitle), _navLinkText) =
  (defaultShadowItem url title)
    { originalDate = extractDate url
    , itype = "periodical"
    , format = extractFormat url
    , finalUrl = url
    }
  where
    title = "Archiwum Harcerskie - " ++ fileTitle

-- | Find a date-like substring in the given string.
--
-- Returns 'Just' the text matched by the POSIX regex below, or 'Nothing'
-- when the regex does not match.  The pattern accepts a four-digit year
-- (@19xx@, or @2@ followed by @0@-@2@ and two digits), optionally followed
-- by @-MM-DD@, or by @-MM@ plus two separator characters.
--
-- NOTE(review): in the last alternative the two characters after the month
-- are part of the match, so they are included in the returned string --
-- confirm this is intended.
extractDate :: String -> Maybe String
extractDate n = n =~~ datePattern
  where
    -- Using the 'Maybe' result of '=~~' directly: a failed match is already
    -- 'Nothing', so no extra case analysis is needed.
    datePattern :: String
    datePattern = "(((19[0-9]{2})|(2[0-2]{1}[0-9]{2}))(((-[0-1]{1}[0-9]{1}-[0-9]{2})|)|((-[0-1]{1}[0-9]{1}[^0-9][ _-]{1}))))"

-- | Entry point: describe the library and run the extraction pipeline
-- starting from the site's main page.
main = do
  let start = "http://archiwumharcerskie.pl/index.php?title=Strona_główna"
      shadowLibrary =
        ShadowLibrary
          { logoUrl = Nothing
          , lname = "Archiwum Harcerskie"
          , abbrev = "ArchHarc"
          , lLevel = 0
          , webpage = start
          }
  extractItemsStartingFromUrl shadowLibrary start (extractRecords >>> arr toShadowItem)
  -- extractItemsStartingFromUrl shadowLibrary start (extractRecords)