54 KiB
Zagęszczamy wektory
Podstawowy problem z wektorową reprezentacją typu tf-idf polega na tym, że wektory dokumentów (i macierz całej kolekcji dokumentów) są _rzadkie_, tzn. zawierają dużo zer. W praktyce potrzebujemy bardziej "gęstej" czy "kompaktowej" reprezentacji numerycznej dokumentów.
_Hashing trick_
Powierzchownie problem możemy rozwiązać przez użycie tzw. _sztuczki z haszowaniem_ (_hashing trick_). Będziemy potrzebować funkcji mieszającej (haszującej) $H$, która rzutuje napisy na liczby, których reprezentacja binarna składa się z $b$ bitów:
$$H : \Sigma^{*} \rightarrow \{0,\dots,2^b-1\}$$
($\Sigma^{*}$ to zbiór wszystkich napisów.)
Pytanie: Czy funkcja $H$ może być różnowartościowa?
Jako funkcji $H$ możemy np. użyć funkcji MurmurHash2 lub 3.
import Data.Digest.Murmur64
-- Case matters: "Komputer" and "komputer" produce different digests.
hash64 "Komputer"
hash64 "komputer"
-- Hashing the same string a second time yields the same digest.
hash64 "komputer"
-- Strings sharing the prefix "komputer", followed by an unrelated word.
hash64 "komputerze"
hash64 "komputerek"
hash64 "abrakadabra"
-- Edge cases: the empty string and a single space.
hash64 ""
hash64 " "
Hash64 0x4a80abc136f926e7
Hash64 0x6c3a641663470e2c
Hash64 0x6c3a641663470e2c
Hash64 0xa714568917576314
Hash64 0x875d9e7e413747c8
Hash64 0x13ce831936ebc69e
Hash64 0xb04ce6229407c882
Hash64 0x6ecd7bae29ae0450
Pytanie: podobne napisy mają zupełnie różne wartości funkcji haszującej, czy to dobrze, czy to źle?
Musimy tylko sparametryzować naszą funkcję rozmiarem "odcisku" (parametr $b$).
{-# LANGUAGE OverloadedStrings #-}
import Data.Text
-- | Helper that converts a value of the dedicated 'Hash64' type into an
-- ordinary 'Integer' by widening the word returned by 'asWord64'.
hashValueAsInteger :: Hash64 -> Integer
hashValueAsInteger digest = toInteger (asWord64 digest)
-- | Hash a term into one of @2^b@ buckets.
--
-- 'unpack' converts the 'Text' argument to a 'String' (the opposite
-- direction, String -> Text, is 'pack'); the string is fed to 'hash64'
-- and the resulting digest is reduced modulo @2^b@.
--
-- @b@ must be non-negative: '(^)' raises an error for a negative exponent.
hash :: Integer -> Text -> Integer
hash b t = hashValueAsInteger (hash64 (unpack t)) `mod` bucketCount
  where
    -- number of distinct fingerprints available with b bits
    bucketCount = 2 ^ b
-- 16-bit fingerprints: every result is reduced modulo 2^16.
hash 16 "komputer"
hash 16 "komputerze"
hash 16 "komputerem"
hash 16 "abrakadabra"
-- With b = 4 the result is reduced modulo 2^4, i.e. only 16 buckets exist.
hash 4 "komputer"
3628
25364
2877
50846
12
Pytanie: Jakie wartości $b$ będą bezsensowne?
Sztuczka z haszowaniem polega na tym, że zamiast numerować słowa korzystając ze słownika, po prostu używamy funkcji haszującej. W ten sposób wektor będzie _zawsze_ miał rozmiar $2^b$ - bez względu na rozmiar słownika.
Zacznijmy od przywołania wszystkich potrzebnych definicji.
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
import Data.Text hiding(map, filter, zip)
import Text.Regex.PCRE.Heavy
-- | Decide whether a token should be dropped as a stop word.
--
-- A token is a stop word when it is one of the three hard-coded Polish
-- function words, or when it matches the regex for a run of one or more
-- punctuation characters.
isStopWord :: Text -> Bool
isStopWord w = case w of
  "w" -> True
  "jest" -> True
  "że" -> True
  _ -> w ≈ [re|^\p{P}+$|]
-- | Keep only the tokens that 'isStopWord' does not reject, preserving order.
removeStopWords :: [Text] -> [Text]
removeStopWords tokens = [tok | tok <- tokens, not (isStopWord tok)]
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE FlexibleContexts #-}
import Data.Text hiding(map, filter, zip)
import Prelude hiding(words, take)
import Text.Regex.PCRE.Heavy
import Data.Map as Map hiding(take, map, filter)
import Data.Set as Set hiding(map)
-- | Split text into tokens.
--
-- The pattern has three alternatives: the literal @C++@, a run of letters
-- and/or ASCII digits, or a single punctuation character. Only the first
-- component of each pair returned by 'scan' is kept.
tokenize :: Text -> [Text]
tokenize input = [matched | (matched, _) <- scan tokenPattern input]
  where
    tokenPattern = [re|C\+\+|[\p{L}0-9]+|\p{P}|]
-- | Tiny hand-written inflection dictionary mapping an inflected word form
-- to its lemma. It is written here grouped by lemma and flattened into
-- (form, lemma) pairs.
mockInflectionDictionary :: Map Text Text
mockInflectionDictionary =
  Map.fromList
    [ (form, lemma)
    | (lemma, forms) <- lemmaTable
    , form <- forms
    ]
  where
    -- each lemma together with the inflected forms that map to it
    lemmaTable =
      [ ("kot", ["kota"])
      , ("but", ["butach", "buta"])
      , ("mieć", ["masz", "ma"])
      , ("zgubić", ["zgubiłem"])
      ]
-- | Look a word up in the inflection dictionary; return its lemma when the
-- word is a key, and the word itself otherwise.
lemmatizeWord :: Map Text Text -> Text -> Text
lemmatizeWord dict w =
  case Map.lookup w dict of
    Just lemma -> lemma
    Nothing -> w
-- | Apply 'lemmatizeWord' with the given dictionary to every token.
lemmatize :: Map Text Text -> [Text] -> [Text]
lemmatize dict tokens = [lemmatizeWord dict tok | tok <- tokens]
-- | Crude substitute for stemming: keep at most the first six characters
-- of a word.
poorMansStemming :: Text -> Text
poorMansStemming = Data.Text.take 6
-- | Full normalization pipeline for one document: tokenize, lemmatize with
-- the mock dictionary, lowercase, drop stop words, then truncate each word
-- with 'poorMansStemming'.
--
-- NOTE(review): lemmatization runs before lowercasing, and every key of the
-- mock dictionary is lowercase, so a capitalized form is not lemmatized.
-- Confirm whether that order is intended.
normalize :: Text -> [Text]
normalize rawText = map poorMansStemming contentWords
  where
    tokens = tokenize rawText
    lemmas = lemmatize mockInflectionDictionary tokens
    contentWords = removeStopWords (map toLower lemmas)
-- | Build the vocabulary of a collection of raw documents: normalize each
-- document and take the union of the resulting term sets.
getVocabulary :: [Text] -> Set Text
getVocabulary docs = Set.unions [Set.fromList (normalize d) | d <- docs]
-- | Inverse document frequency of term @t@ in the collection @coll@:
-- the natural logarithm of (number of documents / number of documents
-- containing @t@).
--
-- If no document contains @t@ the divisor is 0, so the result is Infinity
-- for a non-empty collection and NaN for an empty one.
idf :: [[Text]] -> Text -> Double
idf coll t = log (fromIntegral docCount / fromIntegral docsWithTerm)
  where
    docCount = Prelude.length coll
    docsWithTerm = Prelude.length [d | d <- coll, t `elem` d]
-- | Sparse tf-idf vector of a document.
--
-- For every index @0 .. vecSize - 1@ the term stored under that index in
-- @v@ is looked up and weighted by (occurrences of the term in @doc@) times
-- its 'idf' in @coll@. The lookup uses '!', which fails if an index is
-- missing from @v@.
vectorizeTfIdf :: Int -> [[Text]] -> Map Int Text -> [Text] -> [Double]
vectorizeTfIdf vecSize coll v doc = map weightAt [0 .. (vecSize - 1)]
  where
    weightAt i = termFrequency term * idf coll term
      where
        term = v ! i
    -- number of occurrences of a term in the document, as a number
    termFrequency term = fromIntegral (Prelude.length [w | w <- doc, w == term])
import System.IO
import Data.List.Split as SP
-- Open the corpus file and read it as UTF-8.
-- NOTE(review): hGetContents reads lazily and the handle is never closed in
-- the code visible here.
legendsh <- openFile "legendy.txt" ReadMode
hSetEncoding legendsh utf8
contents <- hGetContents legendsh
-- One record per line; fields are separated by a tab character.
ls = Prelude.lines contents
items = map (map pack . SP.splitOn "\t") ls
-- The first field is used as the label, the second as the document text.
-- NOTE(review): (!!1) fails on any line that has no tab-separated second
-- field.
labelsL = map Prelude.head items
collectionL = map (!!1) items
collectionLNormalized = map normalize collectionL
-- getVocabulary normalizes the raw documents itself, so it is given
-- collectionL rather than collectionLNormalized.
voc' = getVocabulary collectionL
vocLSize = Prelude.length voc'
-- Index -> term, numbering the terms in the order of Set.toList.
vocL :: Map Int Text
vocL = Map.fromList $ zip [0..] $ Set.toList voc'
-- Term -> index, the inverse of vocL.
invvocL :: Map Text Int
invvocL = Map.fromList $ zip (Set.toList voc') [0..]
-- tf-idf vector of every document, one component per vocabulary term.
lVectorized = map (vectorizeTfIdf vocLSize collectionLNormalized vocL) collectionLNormalized
import Text.Printf
import Data.List (take)
-- | Render a number with the printf format @"% 7.2f"@ (space flag,
-- minimum width 7, two decimal places).
formatNumber :: Double -> String
formatNumber = printf "% 7.2f"
-- | One row of the similarity matrix: the similarity of the vector at
-- position @ix@ to every vector in @vs@, each formatted with
-- 'formatNumber' and joined with spaces. Indexing uses '!!', so @ix@ must
-- be a valid position in @vs@.
similarTo :: ([Double] -> [Double] -> Double) -> [[Double]] -> Int -> Text
similarTo simFun vs ix = pack (Prelude.unwords (map scoreCell vs))
  where
    reference = vs !! ix
    scoreCell other = formatNumber (simFun reference other)
-- | Render the full similarity matrix as text: a header line listing the
-- labels, then one line per vector consisting of its label followed by the
-- row produced by 'similarTo'.
paintMatrix :: ([Double] -> [Double] -> Double) -> [Text] -> [[Double]] -> Text
paintMatrix simFun labels vs = header <> "\n" <> Data.Text.unlines rows
  where
    header = " " <> Data.Text.unwords (map labelCell labels)
    -- label padded with the printf format "% 7s"
    labelCell l = pack $ printf "% 7s" l
    rows = map row (zip labels [0..(Prelude.length vs - 1)])
    row (lab, ix) = lab <> " " <> similarTo simFun vs ix
-- Only the first `limit` documents are shown, to keep the matrix readable.
limit = 13
labelsLimited = Data.List.take limit labelsL
limitedL = Data.List.take limit lVectorized
-- | Euclidean length of a vector: the square root of the sum of squared
-- components.
vectorNorm :: [Double] -> Double
vectorNorm vs = sqrt (sum [x * x | x <- vs])
-- | Scale a vector to unit Euclidean length.
--
-- A vector whose norm is 0 (all components zero, or the empty vector) is
-- returned unchanged instead of being divided by zero, which would turn
-- every component into NaN and propagate into any similarity computed
-- from it.
toUnitVector :: [Double] -> [Double]
toUnitVector vs
  | n == 0 = vs
  | otherwise = map (/ n) vs
  where
    n = vectorNorm vs
-- | Dot product of two vectors. Components are paired positionally with
-- 'Prelude.zipWith', so any surplus components of the longer vector are
-- ignored.
(✕) :: [Double] -> [Double] -> Double
xs ✕ ys = sum (Prelude.zipWith (*) xs ys)
-- | Cosine similarity: the dot product of the two vectors after each has
-- been scaled to unit length with 'toUnitVector'.
cosineSim :: [Double] -> [Double] -> Double
cosineSim v1 v2 = toUnitVector v1 ✕ toUnitVector v2
paintMatrix cosineSim labelsLimited limitedL
na_ak w_lud ba_hy w_lap ne_dz be_wy zw_oz mo_zu be_wy ba_hy mo_zu be_wy w_lud na_ak 1.00 0.02 0.01 0.01 0.03 0.02 0.02 0.04 0.03 0.02 0.01 0.02 0.03 w_lud 0.02 1.00 0.02 0.05 0.04 0.01 0.03 0.04 0.06 0.01 0.02 0.03 0.06 ba_hy 0.01 0.02 1.00 0.01 0.02 0.03 0.03 0.04 0.08 0.22 0.01 0.04 0.01 w_lap 0.01 0.05 0.01 1.00 0.01 0.01 0.00 0.01 0.02 0.00 0.00 0.00 0.00 ne_dz 0.03 0.04 0.02 0.01 1.00 0.04 0.03 0.07 0.08 0.06 0.03 0.03 0.05 be_wy 0.02 0.01 0.03 0.01 0.04 1.00 0.01 0.03 0.21 0.01 0.02 0.25 0.01 zw_oz 0.02 0.03 0.03 0.00 0.03 0.01 1.00 0.04 0.03 0.00 0.01 0.02 0.02 mo_zu 0.04 0.04 0.04 0.01 0.07 0.03 0.04 1.00 0.10 0.02 0.09 0.05 0.04 be_wy 0.03 0.06 0.08 0.02 0.08 0.21 0.03 0.10 1.00 0.05 0.03 0.24 0.04 ba_hy 0.02 0.01 0.22 0.00 0.06 0.01 0.00 0.02 0.05 1.00 0.01 0.02 0.00 mo_zu 0.01 0.02 0.01 0.00 0.03 0.02 0.01 0.09 0.03 0.01 1.00 0.01 0.02 be_wy 0.02 0.03 0.04 0.00 0.03 0.25 0.02 0.05 0.24 0.02 0.01 1.00 0.02 w_lud 0.03 0.06 0.01 0.00 0.05 0.01 0.02 0.04 0.04 0.00 0.02 0.02 1.00
Powyższa macierz reprezentuje porównanie przy użyciu podobieństwa kosinusowego. Spróbujmy teraz użyć gęstszych wektorów przy użyciu hashing trick. Jako wartość $b$ przyjmijmy 6.
Zobaczmy najpierw, w które "przegródki" będą wpadały poszczególne wyrazy słownika.
map (\t -> (t, hash 6 t)) $ Data.List.take 100 $ Set.toList voc'
[("0",32),("00",4),("01",4),("07",40),("09",44),("1",1),("10",61),("100",27),("12",58),("13",51),("131",37),("15",30),("16",21),("17",58),("18",55),("19",35),("1997r",61),("2",62),("20",28),("2006",44),("2008",19),("2009",4),("2010",3),("22",27),("23",34),("24",7),("25",29),("26",35),("27",44),("28",61),("29",30),("3",56),("30",55),("300",38),("31",45),("4",53),("40",39),("42",43),("48",53),("49",13),("5",31),("50",32),("56",38),("57",55),("6",59),("7",27),("8",34),("a",27),("aaa",33),("absolu",11),("absurd",18),("aby",12),("adnym",10),("adres",15),("adrese",62),("afroam",3),("afryce",46),("agresy",57),("ah",37),("aha",42),("aig",56),("akadem",18),("akcja",0),("akcje",21),("akompa",13),("aktor",26),("akurat",7),("albino",27),("albo",44),("ale",7),("alfa",58),("alkoho",56),("altern",38),("ameryk",11),("amp",62),("anakon",34),("analiz",62),("andrze",63),("anegdo",43),("ang",37),("anga\380o",27),("anglii",33),("ani",22),("anonsu",36),("antono",3),("antykr",41),("apetyt",16),("apolit",39),("apropo",54),("apteki",20),("aqua",59),("archit",61),("aromat",44),("artyku",31),("asami",22),("astron",59),("asy\347ci",60),("atmosf",37),("audycj",50),("auta",38)]
Pytanie: Czy jakieś dwa termy wpadły do jednej przegródki?
Stwórzmy najpierw funkcję, która będzie wektoryzowała pojedynczy term $t$. Po prostu stworzymy wektor, który będzie miał rozmiar $2^b$, wszędzie będzie miał 0 z wyjątkiem pozycji o numerze $H_b(t)$ - tam wpiszmy odwrotną częstość dokumentową.
$$\vec{t} = [0,\dots,\operatorname{idf}_c(t),\dots,0]$$
Teraz dla dokumentu $d = (t_1,\dots,t_n)$ i dla schematu ważenia tf-idf:
$$\vec{d} = \sum_{i=1}^{n} \vec{t_i}$$
-- | Hashed vector of a single term: a list of @2^b@ components that is 0
-- everywhere except at the position given by @hash b term@, which holds the
-- term's 'idf' in @coll@.
wordVector :: Integer -> [[Text]] -> Text -> [Double]
wordVector b coll term = [component i | i <- [0 .. lastIndex]]
  where
    -- positions run from 0 to 2^b - 1, giving 2^b components in total
    lastIndex = 2 ^ b - 1
    bucket = hash b term
    weight = idf coll term
    component i = if i == bucket then weight else 0.0
-- In the bucket listing for b = 6, "aromat" and "albo" both map to bucket 44
-- while "akcja" maps to bucket 0.
wordVector 6 collectionLNormalized "aromat"
wordVector 6 collectionLNormalized "albo"
wordVector 6 collectionLNormalized "akcja"
[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]
[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.268683541318364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]
[3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]
Teraz wystarczy zsumować wektory dla poszczególnych słów, żeby otrzymać wektor dokumentu. Najpierw zdefiniujmy sobie sumę wektorową.
-- | Component-wise sum of two vectors. Components are paired positionally
-- with 'Prelude.zipWith', so the result is as long as the shorter argument.
(+++) :: [Double] -> [Double] -> [Double]
xs +++ ys = Prelude.zipWith (+) xs ys
[0.2, 0.5, 1.0] +++ [1.0, 3.5, 2.0]
[1.2,4.0,3.0]
Przydatna będzie jeszcze funkcja, która tworzy wektor z samymi zerami o zadanej długości:
-- | Vector of the given length consisting only of zeros; the result is
-- empty when the length is zero or negative.
zero :: Int -> [Double]
zero s = [0.0 | _ <- [1 .. s]]
zero (2^6)
[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]
-- | Hashed document vector: the component-wise sum of the 'wordVector' of
-- every term in @doc@, starting from a zero vector of length @2^b@.
vectorizeWithHashingTrick :: Integer -> [[Text]] -> [Text] -> [Double]
vectorizeWithHashingTrick b coll doc = Prelude.foldr addTerm (zero (2 ^ b)) doc
  where
    addTerm term acc = wordVector b coll term +++ acc
-- A real document from the collection.
vectorizeWithHashingTrick 6 collectionLNormalized $ collectionLNormalized !! 3
-- The same three terms in two different orders.
vectorizeWithHashingTrick 6 collectionLNormalized ["aromat", "albo", "akcja"]
vectorizeWithHashingTrick 6 collectionLNormalized ["akcja", "aromat", "albo"]
-- "albo" repeated: its weight is added to the same bucket a second time.
vectorizeWithHashingTrick 6 collectionLNormalized ["akcja", "aromat", "albo", "albo"]
-- "09" maps to bucket 44 in the b = 6 listing, the same bucket as "aromat".
vectorizeWithHashingTrick 6 collectionLNormalized ["akcja", "aromat", "09"]
[5.242936783195232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.856470206220483,0.0,0.0,1.1700712526502546,0.5947071077466928,0.0,5.712940412440966,3.0708470981669183,0.0,0.0,4.465908118654584,0.0,3.7727609380946383,0.0,0.0,0.0,0.0,4.788681510917635,0.0,3.7727609380946383,0.0,1.575536360758419,0.0,3.079613757534693,0.0,4.465908118654584,0.0,4.588010815455483,4.465908118654584,0.0,1.5214691394881432,0.0,0.0,0.0,0.0,4.465908118654584,2.5199979695992702,0.0,1.5214691394881432,8.388148398070203e-2,0.0,4.465908118654584,0.0,0.0,3.367295829986474,0.0,3.7727609380946383,0.0,1.5214691394881432,0.0,3.7727609380946383,0.0,0.0,0.0,3.367295829986474,0.0]
[3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.734591659972947,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]
[3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.734591659972947,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]
[3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.003275201291313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]
[3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.931816237309167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]
Zobaczmy, jak zagęszczenie wpływa na macierz podobieństwa.
-- Vectorize every document with the hashing trick and print the cosine
-- similarity matrix of the first `limit` documents.
-- NOTE(review): b = 8 (2^8 = 256 buckets) is used here, whereas the prose
-- above proposes b = 6 — confirm which value is intended.
lVectorized' = map (vectorizeWithHashingTrick 8 collectionLNormalized) collectionLNormalized
limitedL' = Data.List.take limit lVectorized'
paintMatrix cosineSim labelsLimited limitedL'
na_ak w_lud ba_hy w_lap ne_dz be_wy zw_oz mo_zu be_wy ba_hy mo_zu be_wy w_lud na_ak 1.00 0.37 0.21 0.28 0.35 0.22 0.32 0.45 0.47 0.21 0.25 0.20 0.39 w_lud 0.37 1.00 0.28 0.18 0.38 0.15 0.20 0.35 0.36 0.14 0.17 0.19 0.33 ba_hy 0.21 0.28 1.00 0.08 0.20 0.18 0.24 0.29 0.30 0.27 0.17 0.15 0.24 w_lap 0.28 0.18 0.08 1.00 0.10 0.11 0.11 0.30 0.17 0.06 0.07 0.13 0.21 ne_dz 0.35 0.38 0.20 0.10 1.00 0.32 0.30 0.52 0.44 0.27 0.36 0.26 0.41 be_wy 0.22 0.15 0.18 0.11 0.32 1.00 0.26 0.26 0.39 0.15 0.23 0.43 0.22 zw_oz 0.32 0.20 0.24 0.11 0.30 0.26 1.00 0.38 0.36 0.06 0.18 0.20 0.29 mo_zu 0.45 0.35 0.29 0.30 0.52 0.26 0.38 1.00 0.54 0.23 0.39 0.38 0.51 be_wy 0.47 0.36 0.30 0.17 0.44 0.39 0.36 0.54 1.00 0.26 0.37 0.42 0.48 ba_hy 0.21 0.14 0.27 0.06 0.27 0.15 0.06 0.23 0.26 1.00 0.24 0.10 0.27 mo_zu 0.25 0.17 0.17 0.07 0.36 0.23 0.18 0.39 0.37 0.24 1.00 0.20 0.34 be_wy 0.20 0.19 0.15 0.13 0.26 0.43 0.20 0.38 0.42 0.10 0.20 1.00 0.29 w_lud 0.39 0.33 0.24 0.21 0.41 0.22 0.29 0.51 0.48 0.27 0.34 0.29 1.00
Pytanie: Co się stanie, gdy zwiększymy $b$, a co jeśli zmniejszymy?
Zalety sztuczki z haszowaniem:
- zagwarantowany stały rozmiar wektora
- szybsze obliczenia
- w naturalny sposób uwzględniamy termy, których nie było w początkowej kolekcji (ale uwaga na idf!)
- nie musimy pamiętać odwzorowania rzutującego słowa na ich numery
Wady:
- dwa różne słowa mogą wpaść do jednej przegródki (szczególnie częste, jeśli $b$ jest za małe)
- jeśli $b$ ustawimy za duże, wektory mogą być nawet większe niż w przypadku standardowego podejścia
Word2vec
A może istnieje dobra wróżka, która dałaby nam dobre wektory słów (z których będziemy składali proste wektory dokumentów przez sumowanie)?
Pytanie: Jakie własności powinny mieć dobre wektory słów?
Tak! Istnieją gotowe "bazy danych" wektorów. Jedną z najpopularniejszych (i najstarszych) metod uzyskiwania takich wektorów jest Word2vec. Jak dokładnie działa Word2vec, dowiemy się później, na dzisiaj po prostu użyjmy tych wektorów.
Najpierw wprowadźmy alternatywną normalizację zgodną z tym, jak został wygenerowany model.
-- | Alternative normalization: tokenize, lowercase and drop stop words,
-- with no lemmatization and no truncation of words.
normalize' :: Text -> [Text]
normalize' rawText = removeStopWords loweredTokens
  where
    loweredTokens = map toLower (tokenize rawText)
normalize' "Ala ma kota."
ala
ma
kota
-- Re-normalize the whole collection with normalize' and inspect the
-- document at index 3.
collectionLNormalized' = map normalize' collectionL
collectionLNormalized' !! 3
mam
kumpla
ktory
zdawal
walentynki
i
polozyl
koperte
dla
laski
z
kartka
na
desce
rozdzielczej
egzaminator
wziol
ta
karteke
i
powiedzial
ze
ma
znade
wypisal
mu
papierek
i
po
egzaminie
hehe
filmik
dobry
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE BangPatterns #-}
import Data.Word2Vec.Model
import Data.Maybe (catMaybes, fromJust)
import qualified Data.Vector.Storable as V
-- Load the Word2vec model from the file "tiny.bin".
-- NOTE(review): the path is relative, so this presumably depends on the
-- notebook's working directory — confirm.
model <- readWord2VecModel "tiny.bin"
-- | Convert a model vector into the plain @[Double]@ representation used in
-- this notebook. Only the first field of 'WVector' is used; the second is
-- ignored.
toOurVector :: WVector -> [Double]
toOurVector (WVector components _) = [realToFrac c | c <- V.toList components]
-- Look up the vector of a single word and check its dimensionality.
-- NOTE(review): fromJust fails if the model has no vector for the word.
balwanV = toOurVector $ fromJust $ getVector model "bałwan"
balwanV
Prelude.length balwanV
-- | Document vector built from Word2vec: the component-wise sum of the
-- model vectors of all terms of @d@ that the model knows; terms for which
-- 'getVector' returns Nothing are skipped. The accumulator starts as a zero
-- vector of length 100, the length reported for the example vector in this
-- notebook.
vectorizeWord2vec model d = Prelude.foldr (+++) (zero 100) knownVectors
  where
    knownVectors = map toOurVector (catMaybes (map (getVector model) d))
-- Word2vec-based vector of every document in the collection.
collectionLVectorized'' = map (vectorizeWord2vec model) collectionLNormalized'
[-2.305081844329834,0.3418600857257843,4.44999361038208,0.9008448719978333,-2.1629886627197266,1.0206516981124878,4.157524108886719,2.5060904026031494,-0.17275184392929077,4.085052967071533,2.236677408218384,-2.3315281867980957,0.5224806070327759,0.15804219245910645,-1.5636622905731201,-1.2624900341033936,-0.3161393105983734,-1.971177101135254,1.4859644174575806,-0.1742715835571289,1.209444284439087,4.063786193728447e-2,-0.2808700501918793,-0.5895432233810425,-4.126195430755615,-2.690922260284424,1.4975452423095703,-0.25380706787109375,-4.5767364501953125,-1.7726246118545532,2.938936710357666,-0.7173141837120056,-2.4317402839660645,-4.206724643707275,0.6768773198127747,2.236821413040161,4.1044291108846664e-2,1.6991114616394043,1.2354476377367973e-2,-3.079916000366211,-1.7430219650268555,1.8969229459762573,-0.4897139072418213,1.1981141567230225,2.431124687194824,0.39453181624412537,1.9735784530639648,2.124225378036499,-4.338796138763428,-0.954145610332489,3.3927927017211914,0.8821511268615723,5.120451096445322e-3,2.917816638946533,-2.035374164581299,3.3221969604492188,-4.981880187988281,-1.105080008506775,-4.093905448913574,-1.5998111963272095,0.6372298002243042,-0.7565107345581055,0.4038744270801544,0.685226321220398,2.137610912322998,-0.4390018582344055,1.007287859916687,0.19681350886821747,-2.598611354827881,-1.8872140645980835,1.6989527940750122,1.6458508968353271,-5.091184616088867,1.4902764558792114,-0.4839307367801666,-2.840092420578003,1.0180696249008179,0.7615311741828918,1.8135554790496826,-0.30493396520614624,3.5879104137420654,1.4585649967193604,3.2775094509124756,-1.1610190868377686,-2.3159284591674805,4.1530327796936035,-4.67172384262085,-0.8594478964805603,-0.860812783241272,-0.31788957118988037,0.7260096669197083,0.1879102736711502,-0.15789580345153809,1.9434200525283813,-1.9945732355117798,1.8799400329589844,-0.5253798365592957,-0.2834266722202301,-0.8012301921844482,1.5093021392822266]
100
collectionLVectorized'' !! 3
[-26.834667675197124,2.568521626293659,37.66925026476383,9.381511189043522,-32.04328362643719,-19.734033070504665,55.21128339320421,14.215368987061083,23.60182836651802,38.74189975857735,0.16257449332624674,-47.983866568654776,-36.917382495012134,36.08420217037201,13.996580198407173,-30.473296120762825,21.28328724205494,30.601420499384403,-40.5945385559462,16.043263137340546,-8.694086126983166,-41.90418399870396,-10.448782376945019,-0.21028679609298706,9.586350612342358,-46.172676257789135,46.27567541599274,11.25023115798831,9.00947591662407,-43.525397814810276,22.09978771582246,56.93886440992355,-23.428963833488524,-1.4649565666913986,21.969609811902046,-21.504647210240364,24.955158293247223,-8.328911297023296,-31.118815276771784,0.22846409678459167,12.212224327027798,-28.337586268782616,-24.105730276554823,3.36764569953084,8.270942151546478,33.71851025521755,30.665825616568327,-24.134687054902315,-31.72916578501463,35.20022106170654,71.15121555328369,-15.448215141892433,-41.27439119666815,3.0322337672114372,9.768462024629116,38.911416467279196,-9.848581969738007,-20.030757322907448,6.734442539513111,-84.9070791369304,38.147536396980286,4.3607237339019775,-25.426255017518997,5.240264508873224,-32.71464269608259,2.095752328634262,2.4292337521910667,32.93906496465206,-51.44473773613572,0.5551527962088585,-6.1982685178518295,20.187213011085987,-52.809339098632336,-10.458874322474003,13.979218572378159,-38.16066548228264,27.336308609694242,5.3437707126140594,-32.01269288826734,-38.117460787296295,-9.337415304034948,38.90077601373196,-2.158842660486698,-44.878454223275185,23.69188129901886,-54.10413733869791,-41.30505630373955,-37.28948371112347,-65.8488347530365,32.51569982431829,3.781733974814415,72.77320172637701,6.847739472985268,63.77478001266718,24.26227615773678,7.260737741366029,10.931276574730873,-17.388786104973406,9.978045962750912,5.968699499964714]
-- Cosine similarity matrix of the first `limit` Word2vec document vectors.
limitedL'' = Data.List.take limit collectionLVectorized''
paintMatrix cosineSim labelsLimited limitedL''
na_ak w_lud ba_hy w_lap ne_dz be_wy zw_oz mo_zu be_wy ba_hy mo_zu be_wy w_lud na_ak 1.00 0.92 0.85 0.77 0.87 0.90 0.92 0.88 0.87 0.87 0.89 0.89 0.89 w_lud 0.92 1.00 0.92 0.72 0.93 0.93 0.91 0.94 0.95 0.86 0.94 0.94 0.96 ba_hy 0.85 0.92 1.00 0.69 0.89 0.91 0.83 0.89 0.95 0.86 0.87 0.94 0.90 w_lap 0.77 0.72 0.69 1.00 0.60 0.74 0.67 0.65 0.68 0.58 0.68 0.73 0.66 ne_dz 0.87 0.93 0.89 0.60 1.00 0.90 0.87 0.95 0.94 0.86 0.93 0.90 0.95 be_wy 0.90 0.93 0.91 0.74 0.90 1.00 0.89 0.89 0.91 0.85 0.91 0.96 0.94 zw_oz 0.92 0.91 0.83 0.67 0.87 0.89 1.00 0.89 0.86 0.86 0.91 0.85 0.90 mo_zu 0.88 0.94 0.89 0.65 0.95 0.89 0.89 1.00 0.97 0.85 0.95 0.91 0.96 be_wy 0.87 0.95 0.95 0.68 0.94 0.91 0.86 0.97 1.00 0.84 0.93 0.95 0.95 ba_hy 0.87 0.86 0.86 0.58 0.86 0.85 0.86 0.85 0.84 1.00 0.83 0.85 0.84 mo_zu 0.89 0.94 0.87 0.68 0.93 0.91 0.91 0.95 0.93 0.83 1.00 0.91 0.96 be_wy 0.89 0.94 0.94 0.73 0.90 0.96 0.85 0.91 0.95 0.85 0.91 1.00 0.94 w_lud 0.89 0.96 0.90 0.66 0.95 0.94 0.90 0.96 0.95 0.84 0.96 0.94 1.00
Możemy próbować mnożyć wektory z modelu Word2vec z idf. Najpierw zdefiniujmy mnożenie przez skalar.
-- | Multiply every component of a vector by a scalar.
(***) :: Double -> [Double] -> [Double]
s *** vs = [x * s | x <- vs]
2.5 *** [1.0, 0.0, 2.0]
[2.5,0.0,5.0]
Teraz będziemy przemnażali wektory Word2vec przez idf (jako skalar).
import Data.Maybe (isJust)
-- | Document vector built from Word2vec vectors weighted by idf: for every
-- term of @d@ that the model knows, its model vector is multiplied by the
-- term's 'idf' in @coll@, and the scaled vectors are summed component-wise.
-- Terms for which 'getVector' returns Nothing are skipped.
--
-- The comprehension's @Just v <- ...@ generator performs the filtering and
-- the unwrapping in one total step, so no incomplete lambda pattern is
-- needed. The accumulator starts as a zero vector of length 100, the length
-- reported for the example vector in this notebook.
vectorizeWord2vecIdf model coll d =
  Prelude.foldr (+++) (zero 100)
    [ idf coll t *** toOurVector v
    | t <- d
    , Just v <- [getVector model t]
    ]
-- idf-weighted Word2vec vector of every document; idf is computed over the
-- same normalized collection that is being vectorized.
collectionLVectorized''' = map (vectorizeWord2vecIdf model collectionLNormalized') collectionLNormalized'
collectionLVectorized''' !! 3
[-35.63830397762308,32.606312678971506,102.20663646169147,56.00417395285867,-130.56709475346878,-14.916644370325773,55.15817632053957,83.2241937686228,26.432875116296394,48.94350344147367,11.370669191277202,-59.54579267200742,-116.01687192456801,60.53824040579282,39.84659684249884,-34.37377085402866,104.53525319069323,45.53363024094972,-34.25020197907558,-43.9007702604392,35.36538495508536,-59.81737728971619,-1.5823889595648828,-50.211106838043655,14.83789867297237,-109.45917608219175,86.56767915592452,-32.170794763065615,29.559930839016644,-126.81686726526162,-9.918908360030228,47.14965938694648,5.955083439147183,41.24417782948478,3.592410260515919,72.10649687523313,61.374776273461855,60.28687760276824,-28.886499026001676,-8.710633131022206,-68.73464623080284,-37.95272838994007,-26.390548039392165,-14.241950251566944,74.6286124718925,46.21889022510431,72.23999508751568,-19.597547074284556,-20.160749174807382,99.49036127458763,131.98057386978817,-23.842794956628147,-62.381675411749846,-19.366936151725387,1.4839595614144327,60.40520721416763,-7.70311857607342,-31.75784386529525,48.71818084466781,-202.41827342135582,138.5639100010709,12.447619757719652,-39.38375639132277,27.877688543771935,-87.00559882214534,56.45689362090545,37.89098984507379,103.78465196444151,-166.10094891357176,-50.83382060940457,11.574060187412977,74.00519869734406,-97.00170731343235,32.18159534728971,-11.280059681646494,-40.701643971890256,74.64230137346699,0.7613112917269982,-6.103424218278271,-150.47551072570587,-21.714627635239918,91.26690441786137,62.91576955719526,-92.35700140312395,-25.421583980267307,-67.87480813505826,-120.16245846953592,-68.89155479679258,-122.00206448376261,35.263603445401785,6.416282520155956,203.41225708856086,-62.42983953251155,59.36113672119048,40.00275897200196,-62.55633545667429,89.66866371308245,-42.287712072353834,-72.59490110281287,52.23637641217955]
-- Cosine similarity matrix of the first `limit` idf-weighted Word2vec
-- document vectors.
limitedL''' = Data.List.take limit collectionLVectorized'''
paintMatrix cosineSim labelsLimited limitedL'''
na_ak w_lud ba_hy w_lap ne_dz be_wy zw_oz mo_zu be_wy ba_hy mo_zu be_wy w_lud na_ak 1.00 0.83 0.78 0.63 0.78 0.81 0.83 0.76 0.77 0.80 0.77 0.79 0.79 w_lud 0.83 1.00 0.82 0.60 0.84 0.84 0.84 0.85 0.86 0.74 0.86 0.83 0.90 ba_hy 0.78 0.82 1.00 0.57 0.78 0.84 0.77 0.79 0.90 0.75 0.74 0.89 0.85 w_lap 0.63 0.60 0.57 1.00 0.38 0.60 0.50 0.43 0.52 0.45 0.55 0.65 0.47 ne_dz 0.78 0.84 0.78 0.38 1.00 0.81 0.79 0.90 0.89 0.77 0.81 0.81 0.90 be_wy 0.81 0.84 0.84 0.60 0.81 1.00 0.82 0.76 0.83 0.74 0.81 0.92 0.88 zw_oz 0.83 0.84 0.77 0.50 0.79 0.82 1.00 0.77 0.77 0.74 0.82 0.75 0.83 mo_zu 0.76 0.85 0.79 0.43 0.90 0.76 0.77 1.00 0.93 0.74 0.87 0.80 0.90 be_wy 0.77 0.86 0.90 0.52 0.89 0.83 0.77 0.93 1.00 0.72 0.81 0.89 0.92 ba_hy 0.80 0.74 0.75 0.45 0.77 0.74 0.74 0.74 0.72 1.00 0.66 0.73 0.72 mo_zu 0.77 0.86 0.74 0.55 0.81 0.81 0.82 0.87 0.81 0.66 1.00 0.80 0.88 be_wy 0.79 0.83 0.89 0.65 0.81 0.92 0.75 0.80 0.89 0.73 0.80 1.00 0.87 w_lud 0.79 0.90 0.85 0.47 0.90 0.88 0.83 0.90 0.92 0.72 0.88 0.87 1.00