This commit is contained in:
Kacper 2022-05-19 23:28:20 +02:00
parent 53fff21f4d
commit 6153264fcb
2 changed files with 8 additions and 5 deletions

View File

@ -28,7 +28,7 @@ toShadowItem ((url, articleTitle), yearlyTitle) =
date = getDate url date = getDate url
getDate url = getDate url =
case url =~~ "/(19[0-9][0-9]|20[0-9][0-9])/" :: Maybe [[String]] of case url =~~ "/(20[0-9][0-9])/" :: Maybe [[String]] of
Just [[_, year]] -> year Just [[_, year]] -> year
otherwise -> error $ "unexpected url: " ++ url otherwise -> error $ "unexpected url: " ++ url

View File

@ -1,4 +1,3 @@
{-# LANGUAGE Arrows, NoMonomorphismRestriction #-} {-# LANGUAGE Arrows, NoMonomorphismRestriction #-}
import ShadowLibrary.Core import ShadowLibrary.Core
@ -14,15 +13,19 @@ import Text.Printf
-- | Record extractor: applies 'extractLinksWithText' to the XPath selecting
-- anchors inside table cells whose @href@ contains ".pdf".
-- (This line is an unchanged context line; both diff columns are identical.)
extractRecords = extractLinksWithText "//td/a[contains(@href,'.pdf')]" extractRecords = extractLinksWithText "//td/a[contains(@href,'.pdf')]"
-- | Build a 'ShadowItem' from one extracted (url, link text) pair.
-- Starts from 'defaultShadowItem' and overrides @originalDate@, @itype@
-- ("periodical"), @format@ ("pdf") and @finalUrl@ (the url itself).
-- Each line below shows the old version followed by the new version.
--
-- NOTE(review): on the new side the tuple component was renamed from @text@
-- to @title@, but the @where@ clause still binds @title = ""@. A where-bound
-- name shadows the pattern variable, so @defaultShadowItem url title@ appears
-- to always receive the empty string instead of the scraped link text.
-- Confirm, and drop the @where title = ""@ binding if so.
--
-- NOTE(review): on the new side @date@ is derived from the url after
-- replacing "%20" with a space; it is wrapped in 'Just' even when 'getDate'
-- yields an empty string -- verify that downstream code tolerates @Just ""@.
toShadowItem :: (String, String) -> ShadowItem toShadowItem :: (String, String) -> ShadowItem
toShadowItem (url, text) = toShadowItem (url, title) =
(defaultShadowItem url text) { (defaultShadowItem url title) {
originalDate = Just date, originalDate = Just date,
itype = "periodical", itype = "periodical",
format = Just "pdf", format = Just "pdf",
finalUrl = url finalUrl = url
} }
where title = "" where title = ""
date = "" date = getDate $ replace "%20" " " url
-- | Extract the first four-digit year of the form 20xx found in the url.
--
-- Returns the matched text, or the empty string when the url contains no
-- such year (the 'String' result context of '=~' yields "" on no match).
--
-- The pattern previously stopped at 2022, so any issue published later
-- silently produced an empty date; it now accepts every year 2000-2099.
-- The capture group was dropped because only the whole match is returned.
getDate :: String -> String
getDate url = url Text.Regex.Posix.=~ "20[0-9][0-9]" :: String
main = do main = do
let start = "https://www.pip.gov.pl/pl/inspektor-pracy/66546,archiwum-inspektora-pracy-.html" let start = "https://www.pip.gov.pl/pl/inspektor-pracy/66546,archiwum-inspektora-pracy-.html"