twilight-library/app/archiwumharcerskie.hs


{-# LANGUAGE Arrows, NoMonomorphismRestriction #-}
import ShadowLibrary.Core

import Text.XML.HXT.Core
import Text.XML.HXT.XPath
-- import Text.XML.HXT.Curl
import Data.List
import Data.List.Utils (replace)

import Text.Regex.Posix
import Text.Printf

-- Extracting the PDFs.
--
-- Pipeline, in order:
--   1. follow the category links found in the page's <nav>;
--   2. on each of those, follow the links to PDF file pages ("title=Plik");
--   3. on each file page, take the "src" of the iframe inside div#file;
--   4. strip the "#page=1" fragment from that address.
-- The result has the shape ((address, file link text), category link text),
-- which is what 'toShadowItem' consumes.
-- NOTE(review): the exact outputs of 'extractLinksWithText' and
-- 'extractLinksGeneralized' come from ShadowLibrary.Core -- confirm there.
extractRecords =
  categoryLinks
    >>> first pdfPageLinks
    >>> first (first viewerSource)
    >>> first (first dropPageFragment)
  where
    categoryLinks =
      extractLinksWithText "//nav//a[contains(@href,'title=Kategoria:')]"
    pdfPageLinks =
      extractLinksWithText "//a[contains(@href,'title=Plik') and contains(@href,'pdf')]"
    viewerSource =
      extractLinksGeneralized "//div[@id='file']//iframe" "src"
    dropPageFragment =
      arr (replace "#page=1" "")

-- attempts at fetching both pdf and jpg files (disabled experiment)
--extractRecords = extractLinksWithText "//nav//a[contains(@href,'title=Kategoria:14_WDH')]"
--               >>> first (extractLinksWithText "//a[contains(@href,'title=Plik') and (contains(@href,'pdf') or contains(@href,'jpg'))]") 
--               >>> first ( first (
--                               downloadDocument
--                               >>> (getXPathTrees "//div[@id='file']//iframe"
--                                   >>> getAttrValue "src") *** (getXPathTrees "//div[@id='file']//a"
--                                   >>> getAttrValue "href")
--                               >>> first (expandURIFixed)
--                   ))

-- | Turn one scraped record into a 'ShadowItem'.
--
-- The input is @((url, fileTitle), third)@:
--
-- * @url@ is used both as the item's primary address and as 'finalUrl';
--   the date and the format are also derived from it.
-- * @fileTitle@ is appended to the fixed prefix @"Archiwum Harcerskie - "@
--   to form the item title.
-- * the third component is ignored.
toShadowItem :: ((String, String), String) -> ShadowItem
toShadowItem ((url, fileTitle), _unused) =
  let itemTitle = "Archiwum Harcerskie - " ++ fileTitle
      baseItem  = defaultShadowItem url itemTitle
  in baseItem
       { originalDate = extractDate url
       , itype = "periodical"
       , format = extractFormat url
       , finalUrl = url
       }

-- | Pull the first date-like substring out of a string (called on file URLs).
--
-- The POSIX pattern accepts a year in 1900-1999 or 2000-2299, followed by
-- one of:
--
-- * a @-MM-DD@ shaped suffix,
-- * nothing at all (bare year),
-- * a @-MM@ suffix followed by one non-digit and one of space, @_@ or @-@.
--
-- Returns @Just@ the matched text, or @Nothing@ when the pattern does not
-- match anywhere in the input.
--
-- NOTE(review): in the third form the two trailing separator characters are
-- part of the match and therefore part of the returned string -- confirm
-- this is intended.
extractDate :: String -> Maybe String
extractDate n = n =~~ datePattern
  -- '=~~' in the 'Maybe' monad already yields 'Nothing' on a failed match,
  -- so no extra case analysis is needed.
  where
    datePattern :: String
    datePattern = "(((19[0-9]{2})|(2[0-2]{1}[0-9]{2}))(((-[0-1]{1}[0-9]{1}-[0-9]{2})|)|((-[0-1]{1}[0-9]{1}[^0-9][ _-]{1}))))"


-- Entry point: describe the "Archiwum Harcerskie" library and run the
-- extraction from its start page, converting every scraped record to a
-- ShadowItem via 'toShadowItem'.
main =
  extractItemsStartingFromUrl shadowLibrary start (extractRecords >>> arr toShadowItem)
  where
    -- Start page of the crawl; also recorded as the library's webpage.
    start = "http://archiwumharcerskie.pl/index.php?title=Strona_główna"
    shadowLibrary =
      ShadowLibrary
        { logoUrl = Nothing
        , lname = "Archiwum Harcerskie"
        , abbrev = "ArchHarc"
        , lLevel = 0
        , webpage = start
        }
--    extractItemsStartingFromUrl shadowLibrary start (extractRecords)
archiwum harserskie test 2021-04-11 14:09:01 +02:00
			`{-# LANGUAGE Arrows, NoMonomorphismRestriction #-}`
			`import ShadowLibrary.Core`

			`import Text.XML.HXT.Core`
			`import Text.XML.HXT.XPath`
			`-- import Text.XML.HXT.Curl`
			`import Data.List`
			`import Data.List.Utils (replace)`

			`import Text.Regex.Posix`
			`import Text.Printf`

			`-- wyciaganie pdf-ow`
archiwumharcerskie.hs changes 2021-04-11 14:24:25 +02:00			`extractRecords = extractLinksWithText "//nav//a[contains(@href,'title=Kategoria:')]"`
archiwum harserskie test 2021-04-11 14:09:01 +02:00			`>>> first (extractLinksWithText "//a[contains(@href,'title=Plik') and contains(@href,'pdf')]")`
			`>>> first (first (extractLinksGeneralized "//div[@id='file']//iframe" "src"))`
remove from url #page=1 2021-04-18 21:42:41 +02:00			`>>> first (first (arr $ replace "#page=1" ""))`
archiwum harserskie test 2021-04-11 14:09:01 +02:00
			`-- proby pobrania pdf i jpg`
			`--extractRecords = extractLinksWithText "//nav//a[contains(@href,'title=Kategoria:14_WDH')]"`
			`-- >>> first (extractLinksWithText "//a[contains(@href,'title=Plik') and (contains(@href,'pdf') or contains(@href,'jpg'))]")`
			`-- >>> first ( first (`
			`-- downloadDocument`
			`-- >>> (getXPathTrees "//div[@id='file']//iframe"`
			`-- >>> getAttrValue "src") *** (getXPathTrees "//div[@id='file']//a"`
			`-- >>> getAttrValue "href")`
			`-- >>> first (expandURIFixed)`
			`-- ))`

			`toShadowItem :: ((String, String), String) -> ShadowItem`
			`toShadowItem ((url, fileTitle), emptyTmp) =`
			`(defaultShadowItem url title) {`
			`originalDate = date,`
			`itype = "periodical",`
			`format = ext,`
			`finalUrl = url`
			`}`
			`where title = "Archiwum Harcerskie - " ++ fileTitle`
			`date = extractDate url`
			`ext = extractFormat url`

			`extractDate :: String -> Maybe String`
			`extractDate n =`
			`case n =~~ ("(((19[0-9]{2})\|(2[0-2]{1}[0-9]{2}))(((-[0-1]{1}[0-9]{1}-[0-9]{2})\|)\|((-[0-1]{1}[0-9]{1}[^0-9][ _-]{1}))))" :: String) of`
			`Just date -> Just date`
			`otherwise -> Nothing`


			`main = do`
			`let start = "http://archiwumharcerskie.pl/index.php?title=Strona_główna"`
			`let shadowLibrary = ShadowLibrary {logoUrl=Nothing,`
			`lname="Archiwum Harcerskie",`
			`abbrev="ArchHarc",`
			`lLevel=0,`
			`webpage=start}`
			`extractItemsStartingFromUrl shadowLibrary start (extractRecords >>> arr toShadowItem)`
			`-- extractItemsStartingFromUrl shadowLibrary start (extractRecords)`