Replace encoded symbols with Latin letters and filter out titleless ShadowItems

This commit is contained in:
Aleksy Wroblewski 2021-03-18 20:37:50 +01:00
parent 4d891e1735
commit 2721426b06
2 changed files with 20 additions and 2 deletions

View File

@ -298,7 +298,15 @@ extractItems shadowLibrary start extractor = do
-- insertIntoDatabase shadowLibrary items
putStrLn (show items)
-- | Return the title of a 'ShadowItem', i.e. the second positional field
-- of the constructor.
getTitle :: ShadowItem -> String
getTitle item = case item of
  ShadowItem _ title _ _ _ _ _ _ _ -> title
-- | 'True' exactly when the given string has no characters.
--
-- Uses 'null', which inspects only the first list cell, instead of
-- computing the full length and comparing it with zero.
isEmpty :: String -> Bool
isEmpty = null
-- | Keep only those items whose title (as returned by 'getTitle') is not
-- empty according to 'isEmpty'. The relative order of items is preserved.
removeTitleless :: [ShadowItem] -> [ShadowItem]
removeTitleless = filter hasTitle
  where
    hasTitle = not . isEmpty . getTitle
-- | Run @extractor@ on the constant input @start@ (after setting the trace
-- level to 1), collect the results with 'runX' and print them to stdout,
-- one item per line, using 'show'.
--
-- @shadowLibrary@ is only mentioned in the commented-out database insert.
--
-- NOTE(review): this text was recovered from a rendered diff whose +/- markers
-- were lost. The unfiltered print is presumably the removed line and the
-- 'removeTitleless' print the added one; confirm that only one of them
-- is meant to remain.
extractItemsStartingFromUrl shadowLibrary start extractor = do
items <- runX $ (arr (const start) >>> setTraceLevel 1 >>> extractor)
-- insertIntoDatabase shadowLibrary items
mapM_ (putStrLn . show) items
mapM_ (putStrLn . show) (removeTitleless items)

View File

@ -15,6 +15,16 @@ import Text.Printf
-- | Extract (address, title) pairs for every link whose @href@ contains
-- ".pdf", strip the layout whitespace from the title and transliterate
-- Polish diacritics in the title to their base Latin letters.
extractRecords = extractLinksWithText "//a[contains(@href, '.pdf')]" -- address-title pairs
  >>> second (arr $ replace "\n\t\t\t\t\t\t\t\t\t\t\t" "") -- strip unneeded whitespace from the title (second element of the pair)
  >>> second (arr $ replace "\n\t\t\t\t\t\t\t\t\t\t" "") -- same, for the variant with one tab fewer
  >>> second (arr $ map toLatin) -- single pass instead of one `replace` per letter
  where
    -- Map a Polish diacritic to its base Latin letter; any other character
    -- is returned unchanged.
    toLatin c = maybe c id (lookup c polishToLatin)

    -- Complete lower- and upper-case Polish alphabet extensions, written as
    -- decimal code points so the source file stays ASCII-only.
    polishToLatin =
      [ ('\261', 'a') -- a with ogonek
      , ('\263', 'c') -- c with acute
      , ('\281', 'e') -- e with ogonek
      , ('\322', 'l') -- l with stroke
      , ('\324', 'n') -- n with acute
      , ('\243', 'o') -- o with acute
      , ('\347', 's') -- s with acute
      , ('\378', 'z') -- z with acute
      , ('\380', 'z') -- z with dot above
      , ('\260', 'A')
      , ('\262', 'C')
      , ('\280', 'E')
      , ('\321', 'L')
      , ('\323', 'N')
      , ('\211', 'O')
      , ('\346', 'S')
      , ('\377', 'Z')
      , ('\379', 'Z')
      ]
-- ... and here we convert these pairs into the target ShadowItem structure
toShadowItem :: (String, String) -> ShadowItem