Replace encoded symbols with latin letters and filter away titleless ShadowItems
This commit is contained in:
parent
4d891e1735
commit
2721426b06
@ -298,7 +298,15 @@ extractItems shadowLibrary start extractor = do
|
||||
-- insertIntoDatabase shadowLibrary items
|
||||
putStrLn (show items)
|
||||
|
||||
-- | Return the title of a 'ShadowItem'.
--
-- The title is taken from the second of the constructor's nine
-- positional fields; all other fields are ignored.
getTitle :: ShadowItem -> String
getTitle item =
  case item of
    ShadowItem _ itemTitle _ _ _ _ _ _ _ -> itemTitle
|
||||
|
||||
-- | Report whether a string contains no characters.
--
-- Implemented with 'null', which only inspects whether the list has a
-- first element. Comparing 'length' against zero would traverse the
-- entire string and would never terminate on an infinite one.
isEmpty :: String -> Bool
isEmpty = null
|
||||
|
||||
-- | Keep only the items whose title (as returned by 'getTitle') is
-- non-empty according to 'isEmpty'. The relative order of the remaining
-- items is preserved.
removeTitleless :: [ShadowItem] -> [ShadowItem]
removeTitleless = filter (not . isEmpty . getTitle)
|
||||
|
||||
-- Runs the arrow `arr (const start) >>> setTraceLevel 1 >>> extractor`
-- with `runX`, binds the results to `items`, and prints them one per
-- line using `show`.
--
-- `shadowLibrary` is only referenced by the commented-out
-- `insertIntoDatabase` call below, so it is currently unused.
--
-- NOTE(review): this listing is a rendered diff without +/- markers.
-- The two `mapM_` lines are presumably the removed (all items) and the
-- added (`removeTitleless items`) version of the same statement, not
-- two consecutive prints -- confirm against the actual file.
extractItemsStartingFromUrl shadowLibrary start extractor = do
|
||||
items <- runX $ (arr (const start) >>> setTraceLevel 1 >>> extractor)
|
||||
-- insertIntoDatabase shadowLibrary items
|
||||
mapM_ (putStrLn . show) items
|
||||
mapM_ (putStrLn . show) (removeTitleless items)
|
||||
|
@ -15,6 +15,16 @@ import Text.Printf
|
||||
-- Arrow pipeline that starts from `extractLinksWithText` with an XPath
-- matching `a` elements whose `href` contains ".pdf", then applies a
-- sequence of `replace` steps to the second component of each pair
-- (via `second`), leaving the first component untouched.
--
-- The steps first strip a newline followed by a run of tabs from the
-- title, then map individual Polish letters (written as decimal
-- character escapes) to plain ASCII letters.
--
-- NOTE(review): `replace` is presumably a substring replacement
-- (needle, replacement, haystack) -- confirm which module it comes from.
-- NOTE(review): this listing is a rendered diff without +/- markers; the
-- two whitespace-stripping lines differ only in the number of tabs and
-- may be the old and new version of one line -- confirm.
-- NOTE(review): only the letters listed below are handled; e.g. "\261"
-- and "\378" and most uppercase variants are not replaced.
extractRecords = extractLinksWithText "//a[contains(@href, '.pdf')]" -- address-title pairs
|
||||
>>> second (arr $ replace "\n\t\t\t\t\t\t\t\t\t\t\t" "") -- clean the second element of the pair, i.e. the title, of unneeded whitespace
|
||||
>>> second (arr $ replace "\n\t\t\t\t\t\t\t\t\t\t" "") -- clean the second element of the pair, i.e. the title, of unneeded whitespace
|
||||
>>> second (arr $ replace "\324" "n") -- U+0144 (n with acute) -> n
|
||||
>>> second (arr $ replace "\281" "e") -- U+0119 (e with ogonek) -> e
|
||||
>>> second (arr $ replace "\380" "z") -- U+017C (z with dot above) -> z
|
||||
>>> second (arr $ replace "\322" "l") -- U+0142 (l with stroke) -> l
|
||||
>>> second (arr $ replace "\243" "o") -- U+00F3 (o with acute) -> o
|
||||
>>> second (arr $ replace "\347" "s") -- U+015B (s with acute) -> s
|
||||
>>> second (arr $ replace "\263" "c") -- U+0107 (c with acute) -> c
|
||||
>>> second (arr $ replace "\346" "S") -- U+015A (capital S with acute) -> S
|
||||
|
||||
|
||||
|
||||
-- ... and here we convert these pairs into the target ShadowItem structure
|
||||
toShadowItem :: (String, String) -> ShadowItem
|
||||
|
Loading…
Reference in New Issue
Block a user