Pobieranie tytulow artykulow.

Optymalizacje
Zrobiona zamiana nazwy miesiaca na wartosc liczbowa.
2021-04-18 15:40:42 +02:00 · 2021-04-17 18:13:05 +02:00 · 2021-04-17 18:07:25 +02:00 · 2021-04-06 21:43:15 +02:00 · 2021-04-06 21:20:43 +02:00 · 2021-04-06 20:54:18 +02:00
5 changed files with 140 additions and 14 deletions
--- a/ShadowLibrary/Core.hs
+++ b/ShadowLibrary/Core.hs
@ -33,7 +33,7 @@ import Data.Tree.NTree.TypeDefs
 import Data.Maybe
 import Control.Monad.Trans
 import Text.XML.HXT.XPath
-- import Text.XML.HXT.Curl
+import Text.XML.HXT.Curl
 import Text.XML.HXT.HTTP
 import Text.Regex.TDFA
@ -64,8 +64,8 @@ downloadDocument = readFromDocument [withParseHTML yes,
                                     withEncodingErrors no,
                                     withPreserveComment yes,
                                     withStrictInput yes,
-                                     withHTTP []
+--                                     withHTTP []
--                                     withCurl [("curl--user-agent","AMU Digital Libraries Indexing Agent")]
+                                     withCurl [("curl--user-agent","AMU Digital Libraries Indexing Agent")]
                                    ]
 downloadDocumentWithEncoding enc = readFromDocument [withParseHTML yes,
@ -73,13 +73,13 @@ downloadDocumentWithEncoding enc = readFromDocument [withParseHTML yes,
                                                     withEncodingErrors no,
                                                     withPreserveComment yes,
                                                     withInputEncoding enc,
-					             withHTTP []]		
+--                       		             withHTTP []]
--                                                     withCurl []]
+                                                     withCurl []]
 downloadXmlDocument = readFromDocument [withWarnings no,
                                        withEncodingErrors no,
-					withHTTP []]
+--					withHTTP []]
--                                        withCurl [] ]
+                                        withCurl [] ]
 data ShadowLibrary = ShadowLibrary { logoUrl :: Maybe String,
--- a/app/almanachmuszyny.hs
+++ b/app/almanachmuszyny.hs
@ -4,7 +4,7 @@ import ShadowLibrary.Core
 import Text.XML.HXT.Core
 import Text.XML.HXT.XPath
-- import Text.XML.HXT.Curl
+--import Text.XML.HXT.Curl
 import Data.List
 import Data.List.Utils (replace)
--- a/app/elektronikapraktyczna.hs
+++ b/app/elektronikapraktyczna.hs
@ -0,0 +1,105 @@
 {-# LANGUAGE Arrows, NoMonomorphismRestriction #-}
 import ShadowLibrary.Core
 import Text.XML.HXT.Core
 import Text.XML.HXT.XPath
 -- import Text.XML.HXT.Curl
 import Data.List
 import Data.List.Utils (replace)
 import Text.Regex.Posix
 import Text.Printf
 import Data.Char (toLower)
 -- Types that have a designated "empty" value, used by 'eliminate' as the
 -- replacement for a missing ('Nothing') value.
 class Nothingish a where
    -- The empty value of the type.
    nada :: a
 -- For lists (and therefore for 'String') the empty value is the empty list.
 instance Nothingish [a] where
    nada = []
 -- Unwrap a 'Maybe': a present value is returned unchanged, while 'Nothing'
 -- is replaced by the type's 'nada' value.
 eliminate :: (Nothingish a) => Maybe a -> a
 eliminate = maybe nada id
 -- Lower-case every character of the given string.
 toLowerString :: [Char] -> [Char]
 toLowerString = map toLower
 -- Convert an optional string into a plain string; a missing value becomes
 -- the empty string.
 mToString :: Maybe String -> String
 mToString Nothing    = ""
 mToString (Just str) = str
 -- Build the month suffix of a date ("-" followed by the month number) from
 -- free text.
 --
 -- The first run of ASCII letters in the input is lower-cased and looked up
 -- with 'baseMonthNameToNumber' (defined outside this file; presumably in
 -- ShadowLibrary.Core -- confirm). The result is "" when the text contains no
 -- letters or when the word is not a known month name, so callers never get a
 -- dangling "-".
 --
 -- The character class is "[A-Za-z]": the range "A-z" would also match the
 -- punctuation characters that sit between 'Z' and 'a' in ASCII.
 extractMonth :: String -> String
 extractMonth n =
   case n =~~ ("[A-Za-z]+" :: String) of
     Just word ->
       case baseMonthNameToNumber (toLowerString word) of
         Just number -> "-" ++ number
         Nothing     -> ""
     Nothing -> ""
 -- Return the first substring that looks like a year (16xx-19xx or 20xx) in
 -- the given text, or "" when there is none.
 mExtractYear :: String -> String
 mExtractYear n = maybe "" id firstYear
   where
     firstYear :: Maybe String
     firstYear = n =~~ ("(1[6789]|20)[0-9][0-9]" :: String)
 -- Turn a textual date into the year followed by the month suffix produced
 -- by 'extractMonth'.
 --
 -- Returns 'Nothing' when 'extractYear' (defined outside this file) finds no
 -- year, instead of wrapping an empty or month-only string in 'Just'.
 changeDate :: String -> Maybe String
 changeDate a = fmap (++ extractMonth a) (extractYear a)
 -- Arrow that downloads a page and extracts (text, PDF link) pairs from it.
 --
 -- The input value is fed to 'downloadDocument' and is also kept unchanged
 -- (via 'this'), so every result has the shape ((text, href), input).
 -- In the downloaded tree the nodes matching 'xpathCondition' are selected and,
 -- for each of them, two things are paired up:
 --   * all text nodes found under "//div[@class='text']", joined with single
 --     spaces;
 --   * the "href" attribute of each link matching
 --     "//div[@class='files__item']/a[contains(@href,'.pdf')]".
 -- NOTE(review): both inner XPath expressions start with "//"; whether they are
 -- evaluated relative to the node selected by 'xpathCondition' or against the
 -- whole document depends on 'getXPathTrees' -- confirm.
 -- NOTE(review): the input is presumably a URL string (see 'extractRecords') --
 -- confirm.
 extractLinksWithArticleTitle xpathCondition = (downloadDocument &&& this)
                                      >>> first (getXPathTrees xpathCondition
                                                >>> (
                                                  (getXPathTrees "//div[@class='text']" >>> (listA (deep isText >>> getText)
                                                            >>> arr (intercalate " "))) 
                                                    &&& 
                                                    (getXPathTrees "//div[@class='files__item']/a[contains(@href,'.pdf')]" >>> (getAttrValue "href"))
                                                ))
 -- Crawl pipeline: archive page -> yearly pages -> issue pages -> article
 -- titles with their PDF links. 'extractLinksWithText' is defined outside this
 -- file; per the inline notes below it yields (URL, link text) pairs.
 -- Alternative kept for reference (only the last yearly link):
 --extractRecords = extractLinksWithText "(//a[@class='magazine-list__year-item'])[last()]"  -- address-title pairs
 extractRecords = extractLinksWithText "//a[@class='magazine-list__year-item']"  -- address-title pairs
                 >>> second (arr $ replace "\r\n                        " "") -- clean the second element of the pair (the title) of unneeded whitespace
                 >>> second (arr $ replace "    " "") -- clean the second element of the pair (the title) of unneeded whitespace
 --                 >>> first (arr ((++"tr") . init))  -- modify the first element of the pair, i.e. the URL
                 >>> first (extractLinksWithText "//div[@class='magazine-list__item']/a") -- download the page at the URL and extract the links on it that match the XPath expression
                 >>> first (second (arr $ replace "\r\n                                " ""))
                 >>> first (first (
                                    -- extractLinksWithArticleTitle "//div[@class='files__item']/a[contains(@href,'.pdf')]" 
                                    (extractLinksWithArticleTitle "//div[@class='magazine-single__content-title article text']")
                                    >>> 
                                                first (
                                                  first (arr $ replace "                                                                                    " "")
                                                  >>> first (arr $ replace "\r\n        " "")
                                                )
                                                -- >>> first (arr $ replace "\r\n" "")
                                    -- >>> first (arr $ replace "//" "/")
                                  )
                                  -- >>> second (arr $ changeDate) -- convert the month name to its numeric value
                                  >>> second (arr $ replace "                                " "") >>> second (arr $ replace "     " "")
                            ) -- download the page at the URL and extract the links on it that match the XPath expression
                 -- the final result has the nested shape
                 -- ((((chapter title, PDF URL), issue page URL), issue title), yearly title),
                 -- which is exactly the pattern that toShadowItem takes apart
 -- Convert one nested tuple produced by extractRecords into the target
 -- ShadowItem structure.
 --
 -- The item is created from the PDF URL and a title of the form
 -- "Elektronika Praktyczna  <issue title> - <chapter title>"; its date is
 -- derived from the issue title with 'changeDate', and the page the PDF was
 -- found on is stored in the 'finalUrl' field. The yearly title is not used.
 toShadowItem :: ((((String, String), String), String), String) -> ShadowItem
 toShadowItem ((((chapterName, pdfUrl), pageUrl), issueName), _yearName) =
   baseItem
     { originalDate = changeDate issueName
     , itype = "periodical"
     , format = Just "pdf"
     , finalUrl = pageUrl
     }
   where
     fullTitle = concat ["Elektronika Praktyczna  ", issueName, " - ", chapterName]
     baseItem = defaultShadowItem pdfUrl fullTitle
 -- Extract the four-digit year (19xx or 20xx) that appears between two
 -- slashes in the given text, e.g. the "2020" of ".../2020/...".
 -- Calls 'error' unless the pattern matches exactly once.
 getDate yearlyTitle = pickYear matches
   where
     matches :: Maybe [[String]]
     matches = yearlyTitle =~~ "/(19[0-9][0-9]|20[0-9][0-9])/"
     pickYear (Just [[_, year]]) = year
     pickYear _ = error ("unexpected yearlyTitle: " ++ yearlyTitle)
 -- Entry point: describe the "Elektronika praktyczna" library and run the
 -- extraction pipeline starting from the archive page, converting every
 -- extracted record with 'toShadowItem'.
 main =
    extractItemsStartingFromUrl shadowLibrary start (extractRecords >>> arr toShadowItem)
   where
    -- Archive index page; also used as the library's web page.
    start = "https://ep.com.pl/archiwum/"
    shadowLibrary =
      ShadowLibrary
        { logoUrl = Nothing
        , lname = "Elektronika praktyczna"
        , abbrev = "EP"
        , lLevel = 0
        , webpage = start
        }
--- a/shadow-library.cabal
+++ b/shadow-library.cabal
@ -2,11 +2,11 @@ name:                shadow-library
 version:             0.1.0.0
 synopsis:            Initial project template from stack
 description:         Please see README.md
-homepage:            http://github.com/name/project
+homepage:            https://git.wmi.amu.edu.pl/s426206/twilight-library.git
 license:             Proprietary
 license-file:        LICENSE
-author:              Your name here
+author:              Jan Nowak
-maintainer:          your.address@example.com
+maintainer:          jannow2@st.amu.edu.pl
 -- copyright:
 category:            Web
 build-type:          Simple
@ -20,6 +20,7 @@ library
                     , HTTP
                     , hxt
                     , hxt-http
                     , hxt-curl
                     , hxt-xpath
                     , MissingH
                     , monad-logger
@ -47,13 +48,14 @@ library
 --                       , shadow-library
 --    default-language:    Haskell2010
-executable almanachmuszyny
+executable elektronikapraktyczna
   hs-source-dirs:      app
-   main-is:             almanachmuszyny.hs
+   main-is:             elektronikapraktyczna.hs
   ghc-options:         -threaded -rtsopts -with-rtsopts=-N
   build-depends:       base
                      , hxt
                      , hxt-xpath
                      , hxt-curl
                      , MissingH
                      , regex-posix
                      , shadow-library
@ -62,4 +64,4 @@ executable almanachmuszyny
 source-repository head
  type:     git
-  location: https://github.com/name/project
+  location: https://git.wmi.amu.edu.pl/s426206/twilight-library.git
--- a/stack.yaml.lock
+++ b/stack.yaml.lock
@ -0,0 +1,19 @@
 # This file was autogenerated by Stack.
 # You should not edit this file by hand.
 # For more information, please see the documentation at:
 #   https://docs.haskellstack.org/en/stable/lock_files
 packages:
 - completed:
    hackage: hxt-xpath-9.1.2.2@sha256:9cd590ae93a04573db8f90fa4094625ebd97dded45da7667c577ce6b38a42900,1999
    pantry-tree:
      size: 2225
      sha256: aee2f75974e868ff429b8ff349a29667536c60397098f5dfedc968d1951511bb
  original:
    hackage: hxt-xpath-9.1.2.2
 snapshots:
 - completed:
    size: 507596
    url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/11/9.yaml
    sha256: 42f472dbf06482da1b3319241f3e3b3593a45bd7d4f537d2789f21386b9b2ad3
  original: lts-11.9
Author	SHA1	Message	Date
Jan Nowak	ad3f714bf6	Pobieranie tytulow artykulow.	2021-04-18 15:40:42 +02:00
Jan Nowak	bf137cf60a	Optymalizacje	2021-04-17 18:13:05 +02:00
Jan Nowak	bc3ebf2eaa	Zrobiona zamiana nazwy miesiaca na wartosc liczbowa.	2021-04-17 18:07:25 +02:00
Jan Nowak	79dede34f5	Specyfikacja robota	2021-04-06 21:43:15 +02:00
Jan Nowak	4991806c95	Poprawione zapisywanie tytulu, odpytywanie wszystkich rocznikow	2021-04-06 21:20:43 +02:00
Jan Nowak	06300a987c	Zrobiony robot pobierajacy informacje o pdf.	2021-04-06 20:54:18 +02:00
Jan Nowak	3d2acc4df8	Merge branch 'withcurl' of git://gonito.net/twilight-library	2021-04-06 17:52:49 +02:00
Jan Nowak	600e21e915	Elektronika praktyczna	2021-04-06 17:51:18 +02:00
Filip Gralinski	107793cbf1	Change to Curl	2019-03-17 21:52:13 +01:00