{-# LANGUAGE Arrows, NoMonomorphismRestriction #-} import ShadowLibrary.Core import Text.XML.HXT.Core import Text.XML.HXT.XPath import Data.List import Data.List.Utils (replace) import Text.Regex.Posix import Text.Printf import System.IO extractYears = extractLinksWithText "//div[@class='inner clearfix']//div[@class='row']//div[@class='col-sm-6']//ul//li//a" -- pary adres-tytuł runYearsExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractYears) extractNewFiles = extractLinksWithText "//a[@title='Pobierz plik']" -- pary adres-tytuł runNewFilesExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractNewFiles) extractOldFiles = extractLinksWithText "//div[@class='content']//div[@class='inner clearfix']//ul//li//a" -- pary adres-tytuł runOldFilesExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractOldFiles) --367-376 extractOldYears = extractLinksWithText "//nav[@class='nav-left']//ul//li[@class='single-article']/a" -- pary adres-tytuł runOldYearsExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractOldYears) mapToUrl :: ([Char], [Char]) -> [Char] mapToUrl (url, title) = url merge [] ys = ys merge (x:xs) ys = x:merge ys xs repeatNTimes 0 = return [] repeatNTimes n = do let link = "https://bip.um.gdynia.pl/module/Contents/controller/Default/action/loadSubcategory/subcategory_id/" ++ show (377-n) items <- runOldYearsExtractor link let urls = map mapToUrl items recursive <- repeatNTimes (n-1) let results = merge urls recursive return results flatten :: [[a]] -> [a] flatten xs = (\z n -> foldr (\x y -> foldr z y x) n xs) (:) [] getAllPages url = do items <- runYearsExtractor url let urls = map mapToUrl items newFiles <- mapM runNewFilesExtractor urls let fileUrls = map mapToUrl (flatten newFiles) return fileUrls getOldFiles = do oldPages <- repeatNTimes 10 --let sub = take 2 oldPages oldFiles <- mapM runOldFilesExtractor oldPages let fileUrls = map mapToUrl (flatten oldFiles) return fileUrls main = do let start = "https://bip.um.gdynia.pl/wyszukiwarka-uchwal-rady-miasta,7485/wyszukiwarka-uchwal,504923" oldResults <- getOldFiles oldF <- openFile "oldFiles.json" WriteMode hPrint oldF oldResults hClose oldF --print (length results) newResults <- getAllPages start newF <- openFile "newFiles.json" WriteMode hPrint newF newResults hClose newF