Replace encoded symbols with latin letters and filter away titleless ShadowItems
This commit is contained in:
parent
4d891e1735
commit
2721426b06
@ -298,7 +298,15 @@ extractItems shadowLibrary start extractor = do
|
|||||||
-- insertIntoDatabase shadowLibrary items
|
-- insertIntoDatabase shadowLibrary items
|
||||||
putStrLn (show items)
|
putStrLn (show items)
|
||||||
|
|
||||||
|
-- | Project the title out of a 'ShadowItem'.
--
-- The title is the second of the constructor's nine positional fields; every
-- other field is ignored.
getTitle :: ShadowItem -> String
getTitle item =
  case item of
    ShadowItem _ itemTitle _ _ _ _ _ _ _ -> itemTitle
|
||||||
|
|
||||||
|
-- | Report whether a string contains no characters.
--
-- Implemented with 'null', which only inspects the first list cell, rather
-- than comparing @length str@ with 0, which traverses the whole string and
-- never returns on an infinite one.
isEmpty :: String -> Bool
isEmpty str = null str
|
||||||
|
|
||||||
|
-- | Keep only the items whose title is non-empty, preserving their order.
removeTitleless :: [ShadowItem] -> [ShadowItem]
removeTitleless items = [item | item <- items, hasTitle item]
  where
    -- An item qualifies when its title has at least one character.
    hasTitle = not . isEmpty . getTitle
|
||||||
|
|
||||||
-- Runs `extractor` with `start` as its input and prints the resulting items,
-- one per line, via `show`.
-- `shadowLibrary` is referenced only by the commented-out insertIntoDatabase
-- call; no visible live line uses it.
extractItemsStartingFromUrl shadowLibrary start extractor = do
|
extractItemsStartingFromUrl shadowLibrary start extractor = do
|
||||||
-- `arr (const start)` ignores the arrow's input and emits `start`; the trace
-- level is set to 1 before `extractor` runs.
-- NOTE(review): runX presumably collects every result of the arrow into a list — confirm against HXT.
items <- runX $ (arr (const start) >>> setTraceLevel 1 >>> extractor)
|
-- `arr (const start)` ignores the arrow's input and emits `start`; the trace
-- level is set to 1 before `extractor` runs.
items <- runX $ (arr (const start) >>> setTraceLevel 1 >>> extractor)
|
||||||
-- insertIntoDatabase shadowLibrary items
|
-- Print only the items that survive removeTitleless.
mapM_ (putStrLn . show) (removeTitleless items)
|
||||||
-- Print every extracted item, unfiltered.
mapM_ (putStrLn . show) items
|
|
||||||
|
@ -15,6 +15,16 @@ import Text.Printf
|
|||||||
-- Builds (address, title) pairs from links whose href contains ".pdf", then
-- cleans the title (the second element of each pair): two specific
-- newline-plus-tabs runs are removed and eight Polish letters, written as
-- decimal character escapes, are replaced with plain Latin letters.
-- NOTE(review): ą (\261) and ź (\378) are not mapped, and Ś is the only
-- upper-case letter handled — confirm whether that is intentional.
extractRecords = extractLinksWithText "//a[contains(@href, '.pdf')]" -- address-title pairs
|
extractRecords = extractLinksWithText "//a[contains(@href, '.pdf')]" -- address-title pairs
|
||||||
>>> second (arr $ replace "\n\t\t\t\t\t\t\t\t\t\t\t" "") -- clean the second element of the pair, i.e. the title, of unneeded whitespace
|
>>> second (arr $ replace "\n\t\t\t\t\t\t\t\t\t\t\t" "") -- clean the second element of the pair, i.e. the title, of unneeded whitespace
|
||||||
>>> second (arr $ replace "\n\t\t\t\t\t\t\t\t\t\t" "") -- clean the second element of the pair, i.e. the title, of unneeded whitespace
|
>>> second (arr $ replace "\n\t\t\t\t\t\t\t\t\t\t" "") -- clean the second element of the pair, i.e. the title, of unneeded whitespace
|
||||||
|
>>> second (arr $ replace "\324" "n") -- ń -> n
|
||||||
|
>>> second (arr $ replace "\281" "e") -- ę -> e
|
||||||
|
>>> second (arr $ replace "\380" "z") -- ż -> z
|
||||||
|
>>> second (arr $ replace "\322" "l") -- ł -> l
|
||||||
|
>>> second (arr $ replace "\243" "o") -- ó -> o
|
||||||
|
>>> second (arr $ replace "\347" "s") -- ś -> s
|
||||||
|
>>> second (arr $ replace "\263" "c") -- ć -> c
|
||||||
|
>>> second (arr $ replace "\346" "S") -- Ś -> S
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
-- ... and here we convert these pairs into the target ShadowItem structure
|
-- ... and here we convert these pairs into the target ShadowItem structure
|
||||||
toShadowItem :: (String, String) -> ShadowItem
|
toShadowItem :: (String, String) -> ShadowItem
|
||||||
|
Loading…
Reference in New Issue
Block a user