diff --git a/README.md b/README.md new file mode 100644 index 0000000..dd4f667 --- /dev/null +++ b/README.md @@ -0,0 +1,7 @@ +# Build and run project + +`stack install` + +`stack build` + +`stack exec kulturaparyska` \ No newline at end of file diff --git a/ShadowLibrary/Core.hs b/ShadowLibrary/Core.hs index 678df33..9753935 100644 --- a/ShadowLibrary/Core.hs +++ b/ShadowLibrary/Core.hs @@ -33,7 +33,7 @@ import Data.Tree.NTree.TypeDefs import Data.Maybe import Control.Monad.Trans import Text.XML.HXT.XPath --- import Text.XML.HXT.Curl +import Text.XML.HXT.Curl import Text.XML.HXT.HTTP import Text.Regex.TDFA @@ -64,8 +64,8 @@ downloadDocument = readFromDocument [withParseHTML yes, withEncodingErrors no, withPreserveComment yes, withStrictInput yes, - withHTTP [] --- withCurl [("curl--user-agent","AMU Digital Libraries Indexing Agent")] + -- withHTTP [] + withCurl [("curl--user-agent","AMU Digital Libraries Indexing Agent")] ] downloadDocumentWithEncoding enc = readFromDocument [withParseHTML yes, @@ -73,13 +73,13 @@ downloadDocumentWithEncoding enc = readFromDocument [withParseHTML yes, withEncodingErrors no, withPreserveComment yes, withInputEncoding enc, - withHTTP []] --- withCurl []] + -- withHTTP []] + withCurl []] downloadXmlDocument = readFromDocument [withWarnings no, withEncodingErrors no, - withHTTP []] --- withCurl [] ] + -- withHTTP []] + withCurl [] ] data ShadowLibrary = ShadowLibrary { logoUrl :: Maybe String, diff --git a/app/almanachmuszyny.hs b/app/almanachmuszyny.hs index ebc4f5c..edc881b 100644 --- a/app/almanachmuszyny.hs +++ b/app/almanachmuszyny.hs @@ -20,6 +20,7 @@ extractRecords = extractLinksWithText "//a[@class='roczniki']" -- pary adres-ty -- ... a tutaj te trójki przerabiamy do docelowej struktury ShadowItem toShadowItem :: ((String, String), String) -> ShadowItem + toShadowItem ((url, articleTitle), yearlyTitle) = (defaultShadowItem url title) { originalDate = Just date, diff --git a/app/kulturaparyska.hs b/app/kulturaparyska.hs index ebc4f5c..084ae42 100644 --- a/app/kulturaparyska.hs +++ b/app/kulturaparyska.hs @@ -12,10 +12,10 @@ import Text.Regex.Posix import Text.Printf -extractRecords = extractLinksWithText "//a[@class='roczniki']" -- pary adres-tytuł +extractRecords = extractLinksWithText "//a[@class='year-anchor ']" -- pary adres-tytuł >>> second (arr $ replace "\r\n " " ") -- czyścimy drugi element pary, czyli tytuł z niepotrzebnych białych znaków - >>> first (arr ((++"tr") . init)) -- modyfikujemy pierwszy element pary, czyli adres URL - >>> first (extractLinksWithText "//li/a[contains(@href,'.pdf')]") -- pobieramy stronę z adresu URL i wyciągamy linki z tej strony pasujące do wyrażenia XPathowego + -- >>> first (arr ((++"f") . init)) -- modyfikujemy pierwszy element pary, czyli adres URL + >>> first (extractLinksWithText "//a[contains(@href,'.pdf')]") -- pobieramy stronę z adresu URL i wyciągamy linki z tej strony pasujące do wyrażenia XPathowego -- ostatecznie wyjdą trójki ((adres URL, tytuł artykułu), tytuł rocznika) -- ... a tutaj te trójki przerabiamy do docelowej struktury ShadowItem @@ -27,8 +27,8 @@ toShadowItem ((url, articleTitle), yearlyTitle) = format = Just "pdf", finalUrl = url } - where title = "Almanach Muszyny " ++ yearlyTitle ++ " " ++ (replace "\r\n" "" (replace "\r\n " "" articleTitle)) - date = getDate url + where title = "Kultura Paryska " ++ yearlyTitle ++ " " ++ (replace "\r\n" "" (replace "\r\n " "" articleTitle)) + date = yearlyTitle getDate url = case url =~~ "/(19[0-9][0-9]|20[0-9][0-9])/" :: Maybe [[String]] of @@ -37,10 +37,11 @@ getDate url = main = do - let start = "http://www.almanachmuszyny.pl/" + let start = "https://kulturaparyska.com/pl/publication/4/year/1946" let shadowLibrary = ShadowLibrary {logoUrl=Nothing, - lname="Almanach Muszyny", - abbrev="AlmMusz", + lname="Kultura Paryska", + abbrev="kultParys", lLevel=0, webpage=start} + putStrLn "Program started" extractItemsStartingFromUrl shadowLibrary start (extractRecords >>> arr toShadowItem) diff --git a/shadow-library.cabal b/shadow-library.cabal index cd77195..a71fb7f 100644 --- a/shadow-library.cabal +++ b/shadow-library.cabal @@ -2,11 +2,11 @@ name: shadow-library version: 0.1.0.0 synopsis: Initial project template from stack description: Please see README.md -homepage: http://github.com/name/project +homepage: https://git.wmi.amu.edu.pl/s444463/twilight-library license: Proprietary license-file: LICENSE -author: Your name here -maintainer: your.address@example.com +author: Mikołaj Pokrywka +maintainer: - -- copyright: category: Web build-type: Simple @@ -19,6 +19,7 @@ library build-depends: base >= 4.7 && < 5 , HTTP , hxt + , hxt-curl , hxt-http , hxt-xpath , MissingH @@ -59,7 +60,20 @@ executable almanachmuszyny , shadow-library default-language: Haskell2010 +executable kulturaparyska + hs-source-dirs: app + main-is: kulturaparyska.hs + ghc-options: -threaded -rtsopts -with-rtsopts=-N + build-depends: base + , hxt + , hxt-curl + , hxt-xpath + , MissingH + , regex-posix + , shadow-library + default-language: Haskell2010 + source-repository head type: git - location: https://github.com/name/project + location: https://git.wmi.amu.edu.pl/s444463/twilight-library