-- twilight-library-s434708/app/archiwapilsudski.hs


{-# LANGUAGE Arrows, NoMonomorphismRestriction #-}
import ShadowLibrary.Core

import Text.XML.HXT.Core
import Text.XML.HXT.XPath
-- import Text.XML.HXT.Curl
import Data.List
import Data.List.Utils (replace)

import Text.Regex.Posix
import Text.Printf


-- | Replace a fixed set of non-ASCII characters with plain ASCII letters and
-- drop the character with code 160.
--
-- The translation is done one character at a time.  The previous version
-- passed the pattern @"(\\188)|(\\191)"@ to 'replace', which looks for that
-- exact sequence of characters rather than treating it as a regular
-- expression, so codes 188 and 191 were never turned into @z@.  Both are now
-- handled individually.
--
-- NOTE(review): the codes look like ISO-8859-2 Polish letters that were
-- decoded as Latin-1 (e.g. 177 for "a with ogonek") -- confirm against the
-- encoding of the scraped pages.
cleanText :: Arrow a => a String String
cleanText = arr (concatMap transliterate)
  where
    -- Map one input character to its replacement text; characters that are
    -- not listed pass through unchanged.
    transliterate :: Char -> String
    transliterate ch = case ch of
      '\177' -> "a"
      '\230' -> "c"
      '\234' -> "e"
      '\179' -> "l"
      '\241' -> "n"
      '\243' -> "o"
      '\182' -> "s"
      '\188' -> "z"
      '\191' -> "z"
      '\160' -> ""
      _      -> [ch]

-- | Remove every character with code 160 and every ordinary space from the
-- input string; all other characters are kept in their original order.
cleanDate :: Arrow a => a String String
cleanDate = arr (filter keep)
  where
    -- A character survives when it is neither of the two removed characters.
    keep ch = ch /= '\160' && ch /= ' '

-- | Extract title text from a document tree.
--
-- Pipeline: select the trees matching @//title@, select @//text()@ within
-- each of them, gather the text found there into a list, join that list with
-- single spaces, and finally pass the result through 'cleanText'.
--
-- NOTE(review): 'listA' is applied after the @//text()@ selection, so it
-- appears to run once per selected text node rather than once per title --
-- confirm that one result per text node is what is wanted.
getTitle =
      getXPathTrees "//title"
  >>> getXPathTrees "//text()"
  >>> listA (deep isText >>> getText)
  >>> arr (intercalate " ")
  >>> cleanText

-- | Extract the document date from a document tree.
--
-- Pipeline: select the second cell of every table row whose cell text is
-- exactly @' Data dokumentu '@, select @//text()@ within it, gather the text
-- found there into a list, join that list with single spaces, and pass the
-- result through 'cleanDate'.
--
-- NOTE(review): 'listA' is applied after the @//text()@ selection, so it
-- appears to run once per selected text node -- confirm that this is the
-- intended granularity.
getDate =
      getXPathTrees "//tr[td/text() = ' Data dokumentu ']/td[2]"
  >>> getXPathTrees "//text()"
  >>> listA (deep isText >>> getText)
  >>> arr (intercalate " ")
  >>> cleanDate

-- | Collect the links of a document page into one string.
--
-- Every @a@ element directly inside a @div@ carrying the exact inline style
-- below contributes its @href@, prefixed with the site address.  All
-- resulting URLs are gathered with 'listA' and joined with @"//"@ as the
-- separator, so the arrow yields exactly one string per input document (the
-- empty string when nothing matches).
--
-- The previous version placed the plain function @intercalate "//"@ directly
-- in the arrow chain and fed it a single 'String' instead of a list, which
-- does not type-check; the join is now lifted with 'arr' and applied to the
-- collected list.
--
-- NOTE(review): the prefix is prepended unconditionally -- confirm that the
-- hrefs on the page are relative and do not start with a slash.
getUrls =
      listA
        (   getXPathTrees "//div[@style='padding: 4px 7px;float:left;']/a"
        >>> getAttrValue "href"
        >>> arr ("http://archiwa.pilsudski.org/" ++)
        )
  >>> arr (intercalate "//")

-- | Download the page behind a record URL and return the value produced by
-- 'getUrls' for it.
--
-- 'getTitle' and 'getDate' are also run on the downloaded page, but their
-- values are not part of the result.  The two steps are kept because, with
-- list-valued arrows, removing them could change how many results are
-- produced.
--
-- NOTE(review): the title and date look like they were meant to be returned
-- alongside the URLs -- confirm against what the caller expects.
extractRecordData = proc pageUrl -> do
  page   <- downloadDocument -< pageUrl
  _title <- getTitle -< page
  _date  <- getDate -< page
  links  <- getUrls -< page
  returnA -< links

-- | Follow three successive levels of links and run 'extractRecordData' on
-- whatever the last level yields.
--
-- Each XPath in 'linkXPaths' is handed to 'extractLinks'; the stages are
-- composed left to right in list order, with 'extractRecordData' as the
-- final stage.
extractRecordLinks = foldr followLinks extractRecordData linkXPaths
  where
    -- Put one link-following stage in front of the rest of the pipeline.
    followLinks xpath rest = extractLinks xpath >>> rest

    -- XPath expressions for the link levels, outermost first.
    linkXPaths =
      [ "//table[@class='tabelka']//a[contains(@href,'.php')]"
      , "//table[@class='tabelka']//td[@width='68%']/a[contains(@href,'.php')]"
      , "//table[@class='tabelka']//td[@width='76%']/a[contains(@href,'.php')]"
      ]


-- | Build a 'ShadowItem' from a @(url, title)@ pair.
--
-- Starts from @defaultShadowItem url title@ and overrides the date, item
-- type, format and final URL.
--
-- The previous version had a @where title = title@ binding.  That binding
-- shadowed the @title@ argument and referred only to itself, so evaluating
-- the title never terminated.  It has been removed and the argument is used
-- directly.
toShadowItem :: (String, String) -> ShadowItem
toShadowItem (url, title) =
  (defaultShadowItem url title)
    { originalDate = Just placeholderDate
    , itype = "periodical"
    , format = Just "pdf"
    , finalUrl = url
    }
  where
    -- Hard-coded value used for every item.
    -- NOTE(review): this looks like a placeholder -- confirm whether the
    -- scraped document date should be used instead.
    placeholderDate = "12-34-5678"


-- | Program entry point: describe the library being scraped and run
-- 'extractRecordData' starting from a single document page.
main =
    extractItemsStartingFromUrl library startUrl extractRecordData
  where
    -- Page the extraction starts from; also recorded as the library webpage.
    startUrl = "https://archiwa.pilsudski.org/dokument.php?nonav=0&nrar=701&nrzesp=4&sygn=4&handle=701.180/16036"

    -- Description of the scraped library handed to the extraction driver.
    library = ShadowLibrary
      { logoUrl = Nothing
      , lname = "Archiwa Pilsudski"
      , abbrev = "ArchPil"
      , lLevel = 0
      , webpage = startUrl
      }