This commit is contained in:
Jakub Adamski 2022-04-11 12:01:01 +02:00
parent c9457d0168
commit 07cf42c6d8
4 changed files with 10271 additions and 0 deletions

71
app/bip.hs Normal file
View File

@ -0,0 +1,71 @@
{-# LANGUAGE Arrows, NoMonomorphismRestriction #-}
import ShadowLibrary.Core
import Text.XML.HXT.Core
import Text.XML.HXT.XPath
import Data.List
import Data.List.Utils (replace)
import Text.Regex.Posix
import Text.Printf
import System.IO
extractYears = extractLinksWithText "//div[@class='inner clearfix']//div[@class='row']//div[@class='col-sm-6']//ul//li//a" -- pary adres-tytuł
runYearsExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractYears)
extractNewFiles = extractLinksWithText "//a[@title='Pobierz plik']" -- pary adres-tytuł
runNewFilesExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractNewFiles)
extractOldFiles = extractLinksWithText "//div[@class='content']//div[@class='inner clearfix']//ul//li//a" -- pary adres-tytuł
runOldFilesExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractOldFiles)
--367-376
extractOldYears = extractLinksWithText "//nav[@class='nav-left']//ul//li[@class='single-article']/a" -- pary adres-tytuł
runOldYearsExtractor url = runX $ (arr (const url) >>> setTraceLevel 1 >>> extractOldYears)
mapToUrl :: ([Char], [Char]) -> [Char]
mapToUrl (url, title) = url
merge [] ys = ys
merge (x:xs) ys = x:merge ys xs
repeatNTimes 0 = return []
repeatNTimes n = do
let link = "https://bip.um.gdynia.pl/module/Contents/controller/Default/action/loadSubcategory/subcategory_id/" ++ show (377-n)
items <- runOldYearsExtractor link
let urls = map mapToUrl items
recursive <- repeatNTimes (n-1)
let results = merge urls recursive
return results
flatten :: [[a]] -> [a]
flatten xs = (\z n -> foldr (\x y -> foldr z y x) n xs) (:) []
getAllPages url = do
items <- runYearsExtractor url
let urls = map mapToUrl items
newFiles <- mapM runNewFilesExtractor urls
let fileUrls = map mapToUrl (flatten newFiles)
return fileUrls
getOldFiles = do
oldPages <- repeatNTimes 10
--let sub = take 2 oldPages
oldFiles <- mapM runOldFilesExtractor oldPages
let fileUrls = map mapToUrl (flatten oldFiles)
return fileUrls
main = do
let start = "https://bip.um.gdynia.pl/wyszukiwarka-uchwal-rady-miasta,7485/wyszukiwarka-uchwal,504923"
oldResults <- getOldFiles
oldF <- openFile "oldFiles.json" WriteMode
hPrint oldF oldResults
hClose oldF
--print (length results)
newResults <- getAllPages start
newF <- openFile "newFiles.json" WriteMode
hPrint newF newResults
hClose newF

8267
newFiles.json Normal file

File diff suppressed because it is too large Load Diff

1921
oldFiles.json Normal file

File diff suppressed because it is too large Load Diff

View File

@ -72,6 +72,18 @@ executable pbsociety
, shadow-library , shadow-library
default-language: Haskell2010 default-language: Haskell2010
executable bip
hs-source-dirs: app
main-is: bip.hs
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends: base
, hxt
, hxt-xpath
, MissingH
, regex-posix
, shadow-library
default-language: Haskell2010
source-repository head source-repository head
type: git type: git
location: https://github.com/name/project location: https://github.com/name/project