ISI-twilight/app/gazety.hs
Natalia Gawron 4430f3a593 fix date
2021-04-03 21:31:03 +02:00

53 lines
1.7 KiB
Haskell

{-# LANGUAGE Arrows, NoMonomorphismRestriction #-}
import ShadowLibrary.Core
import Text.XML.HXT.Core
import Text.XML.HXT.XPath
import Text.XML.HXT.Curl
import Data.List
import Data.List.Utils (replace)
import Text.Regex.Posix
import Text.Printf
-- | Arrow producing one record per issue link reachable from the site's top
-- menu.
--
-- Stage 1 applies 'extractLinksWithText' to the top-menu anchors and cleans the
-- text half of each result. Stage 2 feeds the link half of each result into a
-- second 'extractLinksWithText' that selects anchors containing "nr" inside the
-- article body, then transliterates three Polish letters in that link text.
--
-- The resulting value has the shape destructured by 'toShadowItem':
-- @((issueUrl, issueTitle), menuTitle)@.
--
-- NOTE(review): 'extractLinksWithText' is defined in "ShadowLibrary.Core";
-- presumably it fetches the page at its input URL and emits
-- @(href, link text)@ for every node matching the XPath -- confirm there.
extractRecords =
  extractLinksWithText menuAnchors
    >>> second (arr tidyMenuTitle)
    >>> first (extractLinksWithText issueAnchors >>> second (arr foldDiacritics))
  where
    -- Top-menu items whose class is "first", "last", or absent.
    menuAnchors = "//ul[@class='menu-topmenu menu-iconmenu']//li[@class='first' or @class='last' or not(@class)]//a"
    -- Anchors in the article body whose text contains "nr".
    issueAnchors = "//div[@class='jsn-article-content']//a[contains(.,'nr')]"
    -- Remove two specific whitespace sequences from the menu link text
    -- (applied in the same order as written: the longer one first).
    tidyMenuTitle = replace "\t \n " "" . replace "\n\t \n\t\t" ""
    -- Replace s-acute, n-acute and z-acute with plain ASCII letters;
    -- presumably so month names can match the ASCII-only pattern used by
    -- 'extractDate' -- TODO confirm.
    foldDiacritics = replace "ź" "z" . replace "ń" "n" . replace "ś" "s"
-- | Build the 'ShadowItem' for a single issue link.
--
-- The input is @((link, linkText), menuTitle)@. The link text is used both as
-- the item title and as the string from which the original date is extracted
-- (via 'extractDate'). The link itself is used as the item URL and as
-- 'finalUrl'. The menu title (second tuple component) is not used.
toShadowItem :: ((String, String), String) -> ShadowItem
toShadowItem ((link, linkText), _) =
  baseItem
    { originalDate = extractDate linkText
    , itype = "periodical"
    , format = Just "pdf"
    , finalUrl = link
    }
  where
    baseItem = defaultShadowItem link linkText
-- | Extract a date string from an issue title.
--
-- Looks for a run of ASCII letters, a single space and four digits
-- (for example @"marzec 2020"@) and returns the matched text verbatim.
-- When the title contains no such match, the decision is delegated to
-- 'extractOtherDate'.
--
-- The argument is the link text passed in by 'toShadowItem', not a URL.
extractDate :: String -> Maybe String
extractDate title =
  case title =~~ ("[A-Za-z]+ [0-9]{4}" :: String) of
    Just date -> Just date
    -- Match the constructor explicitly: a pattern spelled @otherwise@ would
    -- merely bind a new variable that shadows 'Prelude.otherwise'.
    Nothing -> extractOtherDate title
-- | Fallback date extraction used when 'extractDate' finds no
-- "letters space digits" match.
--
-- Looks for four digits, an underscore and two digits (presumably
-- @year_month@) and returns the match with the underscore replaced by a
-- hyphen, e.g. @"2020_03"@ becomes @"2020-03"@.
--
-- When that pattern is absent too, the whole input string is returned
-- unchanged inside 'Just', so this function never yields 'Nothing'.
extractOtherDate :: String -> Maybe String
extractOtherDate title =
  -- 'maybe' replaces the former case expression whose catch-all branch was a
  -- variable pattern named @otherwise@ (shadowing 'Prelude.otherwise').
  Just (maybe title (replace "_" "-") underscoreDate)
  where
    underscoreDate :: Maybe String
    underscoreDate = title =~~ ("[0-9]{4}_[0-9]{2}" :: String)
-- | Entry point: describe the "Nasze Popowice" library and run the extraction
-- starting at the site's home page, converting every record produced by
-- 'extractRecords' into a 'ShadowItem' with 'toShadowItem'.
main =
  extractItemsStartingFromUrl library startUrl (extractRecords >>> arr toShadowItem)
  where
    -- Used both as the crawl starting point and as the library's web page.
    startUrl = "https://www.smpopowice.pl/"
    library =
      ShadowLibrary
        { logoUrl = Nothing
        , lname = "Nasze Popowice"
        , abbrev = "NaszPop"
        , lLevel = 0
        , webpage = startUrl
        }