aitech-eks-pub/wyk/05_Geste_wektory.ipynb

55 KiB

Logo 1

Ekstrakcja informacji

5. Gęste reprezentacje wektorowe [wykład]

Filip Graliński (2021)

Logo 2

Zagęszczamy wektory

Podstawowy problem z wektorową reprezentacją typu tf-idf polega na tym, że wektory dokumentów (i macierz całej kolekcji dokumentów) są _rzadkie_, tzn. zawierają dużo zer. W praktyce potrzebujemy bardziej "gęstej" czy "kompaktowej" reprezentacji numerycznej dokumentów.

_Hashing trick_

Powierzchownie problem możemy rozwiązać przez użycie tzw. _sztuczki z haszowaniem_ (_hashing trick_). Będziemy potrzebować funkcji mieszającej (haszującej) $H$, która rzutuje napisy na liczby, których reprezentacja binarna składa się z $b$ bitów:

$$H : \Sigma^{*} \rightarrow \{0,\dots,2^b-1\}$$

($\Sigma^{*}$ to zbiór wszystkich napisów.)

Pytanie: Czy funkcja $H$ może być różnowartościowa?

Jako funkcji $H$ możemy np. użyć funkcji MurmurHash2 lub 3.

import Data.Digest.Murmur64

hash64 "Komputer"
hash64 "komputer"
hash64 "komputer"
hash64 "komputerze"
hash64 "komputerek"
hash64 "abrakadabra"
hash64 ""
hash64 " "
Hash64 0x4a80abc136f926e7
Hash64 0x6c3a641663470e2c
Hash64 0x6c3a641663470e2c
Hash64 0xa714568917576314
Hash64 0x875d9e7e413747c8
Hash64 0x13ce831936ebc69e
Hash64 0xb04ce6229407c882
Hash64 0x6ecd7bae29ae0450

Pytanie: podobne napisy mają zupełnie różne wartości funkcji haszującej, czy to dobrze, czy to źle?

Musimy tylko sparametryzować naszą funkcję rozmiarem „odcisku” (parametr $b$).

{-# LANGUAGE OverloadedStrings #-}

import Data.Text

-- Helper that converts a value of the dedicated Hash64 type into
-- an ordinary Integer by way of its underlying 64-bit word.
hashValueAsInteger :: Hash64 -> Integer
hashValueAsInteger digest = toInteger (asWord64 digest)

-- Hash a text into a b-bit fingerprint, i.e. a number between 0 and 2^b - 1.
-- `unpack` converts the Text into the String that hash64 is applied to;
-- taking the 64-bit hash value modulo 2^b keeps only its b lowest bits.
hash :: Integer -> Text -> Integer
hash b t = fingerprint `mod` buckets
  where
    fingerprint = hashValueAsInteger (hash64 (unpack t))
    buckets = 2 ^ b

hash 16 "komputer"
hash 16 "komputerze"
hash 16 "komputerem"
hash 16 "abrakadabra"
hash 4 "komputer"
3628
25364
2877
50846
12

Pytanie: Jakie wartości $b$ będą bezsensowne?

Sztuczka z haszowaniem polega na tym, że zamiast numerować słowa korzystając ze słownika, po prostu używamy funkcji haszującej. W ten sposób wektor będzie _zawsze_ miał rozmiar $2^b$ - bez względu na rozmiar słownika.

Zacznijmy od przywołania wszystkich potrzebnych definicji.

{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}

import Data.Text hiding(map, filter, zip)
import Text.Regex.PCRE.Heavy

-- A token is a stop word when it is one of the three listed function words
-- or when it matches the punctuation-only regular expression.
isStopWord :: Text -> Bool
isStopWord w =
  w == "w" || w == "jest" || w == "że" || (w ≈ [re|^\p{P}+$|])


-- Keep only those tokens that are not stop words.
removeStopWords :: [Text] -> [Text]
removeStopWords tokens = [tok | tok <- tokens, not (isStopWord tok)]
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE FlexibleContexts #-}

import Data.Text hiding(map, filter, zip)
import Prelude hiding(words, take)
import Text.Regex.PCRE.Heavy
import Data.Map as Map hiding(take, map, filter)
import Data.Set as Set hiding(map)

-- Split a text into tokens. A token is the literal "C++", a run of letters
-- and digits, or a single punctuation character; only the matched text of
-- every regex hit is kept.
tokenize :: Text -> [Text]
tokenize input = [matched | (matched, _) <- scan [re|C\+\+|[\p{L}0-9]+|\p{P}|] input]


-- Toy inflection dictionary: maps a handful of inflected word forms
-- to their lemmas.
mockInflectionDictionary :: Map Text Text
mockInflectionDictionary =
  Map.fromList
    [ ("kota", "kot")
    , ("butach", "but")
    , ("masz", "mieć")
    , ("ma", "mieć")
    , ("buta", "but")
    , ("zgubiłem", "zgubić")
    ]

-- Look a word up in the inflection dictionary; a word that is not listed
-- is returned unchanged.
lemmatizeWord :: Map Text Text -> Text -> Text
lemmatizeWord dict w =
  case Map.lookup w dict of
    Just lemma -> lemma
    Nothing -> w

-- Lemmatize every word of a token list with the given dictionary.
lemmatize :: Map Text Text -> [Text] -> [Text]
lemmatize dict ws = [lemmatizeWord dict w | w <- ws]


-- Crude stemming: keep at most the first six characters of a word.
poorMansStemming :: Text -> Text
poorMansStemming word = Data.Text.take 6 word

-- Normalization pipeline for one document: tokenize, lemmatize with the
-- mock dictionary, lowercase, drop stop words and finally stem.
normalize :: Text -> [Text]
normalize doc = map poorMansStemming content
  where
    lemmas = lemmatize mockInflectionDictionary (tokenize doc)
    content = removeStopWords (map toLower lemmas)

-- Set of all normalized terms that occur in a collection of raw documents.
getVocabulary :: [Text] -> Set Text
getVocabulary docs = Set.unions [Set.fromList (normalize doc) | doc <- docs]
  
-- Inverse document frequency of term t in collection coll: log (N / df),
-- where N is the number of documents and df the number of documents that
-- contain t. There is no guard for df == 0.
idf :: [[Text]] -> Text -> Double
idf coll t = log (fromIntegral total / fromIntegral containing)
  where
    total = Prelude.length coll
    containing = Prelude.length [doc | doc <- coll, t `elem` doc]
        
-- tf-idf vector of a document: component i is the number of occurrences in
-- doc of the term stored under key i in v, multiplied by that term's idf
-- in coll. Keys 0 .. vecSize - 1 are looked up in v.
vectorizeTfIdf :: Int -> [[Text]] -> Map Int Text -> [Text] -> [Double]
vectorizeTfIdf vecSize coll v doc = [weight (v ! i) | i <- [0 .. vecSize - 1]]
  where
    weight term = termFrequency term * idf coll term
    termFrequency term = fromIntegral (Prelude.length [w | w <- doc, w == term])
import System.IO
import Data.List.Split as SP

-- Open "legendy.txt" and read its contents as UTF-8 text.
-- NOTE(review): hGetContents reads lazily and the handle is never closed
-- explicitly in this cell.
legendsh <- openFile "legendy.txt" ReadMode
hSetEncoding legendsh utf8
contents <- hGetContents legendsh
-- One item per input line; every line is split on tab characters.
ls = Prelude.lines contents
items = map (map pack . SP.splitOn "\t") ls

-- The first field of a line is taken as the label, the second as the
-- document text (both accesses fail on a line with too few fields).
labelsL = map Prelude.head items
collectionL = map (!!1) items

-- Normalized documents and the vocabulary of the whole collection
-- (getVocabulary normalizes the raw documents itself).
collectionLNormalized = map normalize collectionL
voc' = getVocabulary collectionL

vocLSize = Prelude.length voc'

-- Vocabulary as a map from term number to term ...
vocL :: Map Int Text
vocL = Map.fromList $ zip [0..] $ Set.toList voc'

-- ... and the inverse map from term to term number.
invvocL :: Map Text Int
invvocL = Map.fromList $ zip (Set.toList voc') [0..]

-- tf-idf vector of every document in the collection.
lVectorized = map (vectorizeTfIdf vocLSize collectionLNormalized vocL) collectionLNormalized
import Text.Printf
import Data.List (take)

-- Format a number with two decimals using the printf format "% 7.2f".
formatNumber :: Double -> String
formatNumber = printf "% 7.2f"

-- One row of the similarity matrix: the similarities between the vector at
-- position ix and every vector in vs, formatted and joined with spaces.
similarTo :: ([Double] -> [Double] -> Double) -> [[Double]] -> Int -> Text
similarTo simFun vs ix = pack (Prelude.unwords [formatNumber (simFun pivot v) | v <- vs])
  where
    pivot = vs !! ix

-- Render a similarity matrix as text: a header line with the labels,
-- followed by one labelled row per vector.
paintMatrix :: ([Double] -> [Double] -> Double) -> [Text] -> [[Double]] -> Text
paintMatrix simFun labels vs = header <> "\n" <> Data.Text.unlines rows
  where
    header = "      " <> Data.Text.unwords (map (pack . printf "% 7s") labels)
    rows = Prelude.zipWith row labels [0 .. Prelude.length vs - 1]
    row lab ix = lab <> " " <> similarTo simFun vs ix
Eta reduce
Found:
formatNumber x = printf "% 7.2f" x
Why Not:
formatNumber = printf "% 7.2f"
Use zipWith
Found:
map (\ (lab, ix) -> lab <> " " <> similarTo simFun vs ix) $ zip labels [0 .. (Prelude.length vs - 1)]
Why Not:
zipWith (curry (\ (lab, ix) -> lab <> " " <> similarTo simFun vs ix)) labels [0 .. (Prelude.length vs - 1)]
Avoid lambda
Found:
\ l -> pack $ printf "% 7s" l
Why Not:
pack . printf "% 7s"
-- Only the first `limit` labels and tf-idf vectors are kept for display.
limit = 13
labelsLimited =  Data.List.take limit labelsL
limitedL = Data.List.take limit lVectorized

-- Euclidean norm of a vector: the square root of the sum of squares.
vectorNorm :: [Double] -> Double
vectorNorm vs = sqrt (sum [x * x | x <- vs])

-- Divide every component of a vector by the vector's Euclidean norm.
toUnitVector :: [Double] -> [Double]
toUnitVector vs = [x / len | x <- vs]
  where
    len = vectorNorm vs


-- Dot product of two vectors; when the lengths differ, the surplus
-- components of the longer vector are ignored.
(✕) :: [Double] -> [Double] -> Double
v1 ✕ v2 = sum [a * b | (a, b) <- Prelude.zip v1 v2]

-- Cosine similarity: the dot product of the two vectors after each of them
-- has been scaled by toUnitVector.
cosineSim :: [Double] -> [Double] -> Double
cosineSim v1 v2 = unit1 ✕ unit2
  where
    unit1 = toUnitVector v1
    unit2 = toUnitVector v2

paintMatrix cosineSim labelsLimited limitedL
        na_ak   w_lud   ba_hy   w_lap   ne_dz   be_wy   zw_oz   mo_zu   be_wy   ba_hy   mo_zu   be_wy   w_lud
na_ak    1.00    0.02    0.01    0.01    0.03    0.02    0.02    0.04    0.03    0.02    0.01    0.02    0.03
w_lud    0.02    1.00    0.02    0.05    0.04    0.01    0.03    0.04    0.06    0.01    0.02    0.03    0.06
ba_hy    0.01    0.02    1.00    0.01    0.02    0.03    0.03    0.04    0.08    0.22    0.01    0.04    0.01
w_lap    0.01    0.05    0.01    1.00    0.01    0.01    0.00    0.01    0.02    0.00    0.00    0.00    0.00
ne_dz    0.03    0.04    0.02    0.01    1.00    0.04    0.03    0.07    0.08    0.06    0.03    0.03    0.05
be_wy    0.02    0.01    0.03    0.01    0.04    1.00    0.01    0.03    0.21    0.01    0.02    0.25    0.01
zw_oz    0.02    0.03    0.03    0.00    0.03    0.01    1.00    0.04    0.03    0.00    0.01    0.02    0.02
mo_zu    0.04    0.04    0.04    0.01    0.07    0.03    0.04    1.00    0.10    0.02    0.09    0.05    0.04
be_wy    0.03    0.06    0.08    0.02    0.08    0.21    0.03    0.10    1.00    0.05    0.03    0.24    0.04
ba_hy    0.02    0.01    0.22    0.00    0.06    0.01    0.00    0.02    0.05    1.00    0.01    0.02    0.00
mo_zu    0.01    0.02    0.01    0.00    0.03    0.02    0.01    0.09    0.03    0.01    1.00    0.01    0.02
be_wy    0.02    0.03    0.04    0.00    0.03    0.25    0.02    0.05    0.24    0.02    0.01    1.00    0.02
w_lud    0.03    0.06    0.01    0.00    0.05    0.01    0.02    0.04    0.04    0.00    0.02    0.02    1.00

Powyższa macierz reprezentuje porównanie przy użyciu podobieństwa kosinusowego. Spróbujmy teraz użyć gęstszych wektorów przy użyciu hashing trick. Jako wartość $b$ przyjmijmy 6.

Zobaczmy najpierw, w które "przegródki" będą wpadały poszczególne wyrazy słownika.

map (\t -> (t, hash 6 t)) $ Data.List.take 100 $ Set.toList voc'
[("0",32),("00",4),("01",4),("07",40),("09",44),("1",1),("10",61),("100",27),("12",58),("13",51),("131",37),("15",30),("16",21),("17",58),("18",55),("19",35),("1997r",61),("2",62),("20",28),("2006",44),("2008",19),("2009",4),("2010",3),("22",27),("23",34),("24",7),("25",29),("26",35),("27",44),("28",61),("29",30),("3",56),("30",55),("300",38),("31",45),("4",53),("40",39),("42",43),("48",53),("49",13),("5",31),("50",32),("56",38),("57",55),("6",59),("7",27),("8",34),("a",27),("aaa",33),("absolu",11),("absurd",18),("aby",12),("adnym",10),("adres",15),("adrese",62),("afroam",3),("afryce",46),("agresy",57),("ah",37),("aha",42),("aig",56),("akadem",18),("akcja",0),("akcje",21),("akompa",13),("aktor",26),("akurat",7),("albino",27),("albo",44),("ale",7),("alfa",58),("alkoho",56),("altern",38),("ameryk",11),("amp",62),("anakon",34),("analiz",62),("andrze",63),("anegdo",43),("ang",37),("anga\380o",27),("anglii",33),("ani",22),("anonsu",36),("antono",3),("antykr",41),("apetyt",16),("apolit",39),("apropo",54),("apteki",20),("aqua",59),("archit",61),("aromat",44),("artyku",31),("asami",22),("astron",59),("asy\347ci",60),("atmosf",37),("audycj",50),("auta",38)]

Pytanie: Czy jakieś dwa termy wpadły do jednej przegródki?

Stwórzmy najpierw funkcję, która będzie wektoryzowała pojedynczy term $t$. Po prostu stworzymy wektor, który będzie miał rozmiar $2^b$ i wszędzie będzie miał 0 z wyjątkiem pozycji o numerze $H_b(t)$ - tam wpiszmy odwrotną częstość dokumentową.

$$\vec{t} = [0,\dots,\operatorname{idf}_c t,\dots,0]$$

Teraz dla dokumentu $d = (t_1,\dots,t_n)$ i dla schematu ważenia tf-idf:

$$\vec{d} = \sum_{i=1}^{n} \vec{t_i}$$

-- Vector of a single term under the hashing trick: 2^b components, all zero
-- except the one at the position given by the term's b-bit hash, which
-- holds the term's idf in coll.
wordVector :: Integer -> [[Text]] -> Text -> [Double]
wordVector b coll term = [component i | i <- [0 .. lastIndex]]
  where
    lastIndex = 2 ^ b - 1
    bucket = hash b term
    component i = if i == bucket then idf coll term else 0.0

wordVector 6 collectionLNormalized "aromat"
wordVector 6 collectionLNormalized "albo"
wordVector 6 collectionLNormalized "akcja"
[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]
[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.268683541318364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]
[3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]

Teraz wystarczy zsumować wektory dla poszczególnych słów, żeby otrzymać wektor dokumentu. Najpierw zdefiniujmy sobie sumę wektorową.

-- Component-wise sum of two vectors; the result is as long as the shorter
-- of the two arguments.
(+++) :: [Double] -> [Double] -> [Double]
xs +++ ys = [x + y | (x, y) <- Prelude.zip xs ys]

[0.2, 0.5, 1.0] +++ [1.0, 3.5, 2.0]
[1.2,4.0,3.0]

Przydatna będzie jeszcze funkcja, która tworzy wektor z samymi zerami o zadanej długości:

-- Vector of the given length with every component equal to zero.
zero :: Int -> [Double]
zero s = [0.0 | _ <- [1 .. s]]

zero (2^6)
[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]

-- Document vector under the hashing trick: the word vectors of all terms of
-- the document are summed onto a zero vector of length 2^b.
vectorizeWithHashingTrick :: Integer -> [[Text]] -> [Text] -> [Double]
vectorizeWithHashingTrick b coll doc = Prelude.foldr addTerm (zero (2 ^ b)) doc
  where
    addTerm term acc = wordVector b coll term +++ acc

vectorizeWithHashingTrick 6 collectionLNormalized $ collectionLNormalized !! 3
vectorizeWithHashingTrick 6 collectionLNormalized ["aromat", "albo", "akcja"]
vectorizeWithHashingTrick 6 collectionLNormalized ["akcja", "aromat", "albo"]
vectorizeWithHashingTrick 6 collectionLNormalized ["akcja", "aromat", "albo", "albo"]
vectorizeWithHashingTrick 6 collectionLNormalized ["akcja", "aromat", "09"]
Eta reduce
Found:
vectorizeWithHashingTrick b coll doc = Prelude.foldr ((+++) . wordVector b coll) (zero $ 2 ^ b) doc
Why Not:
vectorizeWithHashingTrick b coll = Prelude.foldr ((+++) . wordVector b coll) (zero $ 2 ^ b)
[5.242936783195232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.856470206220483,0.0,0.0,1.1700712526502546,0.5947071077466928,0.0,5.712940412440966,3.0708470981669183,0.0,0.0,4.465908118654584,0.0,3.7727609380946383,0.0,0.0,0.0,0.0,4.788681510917635,0.0,3.7727609380946383,0.0,1.575536360758419,0.0,3.079613757534693,0.0,4.465908118654584,0.0,4.588010815455483,4.465908118654584,0.0,1.5214691394881432,0.0,0.0,0.0,0.0,4.465908118654584,2.5199979695992702,0.0,1.5214691394881432,8.388148398070203e-2,0.0,4.465908118654584,0.0,0.0,3.367295829986474,0.0,3.7727609380946383,0.0,1.5214691394881432,0.0,3.7727609380946383,0.0,0.0,0.0,3.367295829986474,0.0]
[3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.734591659972947,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]
[3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.734591659972947,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]
[3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.003275201291313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]
[3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.931816237309167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]

Zobaczmy, jak zagęszczenie wpływa na macierz podobieństwa.

-- Hashing-trick vectors for the whole collection.
-- NOTE(review): b = 8 is used here, whereas the hash and wordVector
-- examples earlier in the notebook use b = 6.
lVectorized' = map (vectorizeWithHashingTrick 8 collectionLNormalized) collectionLNormalized
limitedL' = Data.List.take limit lVectorized'

paintMatrix cosineSim labelsLimited limitedL'
        na_ak   w_lud   ba_hy   w_lap   ne_dz   be_wy   zw_oz   mo_zu   be_wy   ba_hy   mo_zu   be_wy   w_lud
na_ak    1.00    0.37    0.21    0.28    0.35    0.22    0.32    0.45    0.47    0.21    0.25    0.20    0.39
w_lud    0.37    1.00    0.28    0.18    0.38    0.15    0.20    0.35    0.36    0.14    0.17    0.19    0.33
ba_hy    0.21    0.28    1.00    0.08    0.20    0.18    0.24    0.29    0.30    0.27    0.17    0.15    0.24
w_lap    0.28    0.18    0.08    1.00    0.10    0.11    0.11    0.30    0.17    0.06    0.07    0.13    0.21
ne_dz    0.35    0.38    0.20    0.10    1.00    0.32    0.30    0.52    0.44    0.27    0.36    0.26    0.41
be_wy    0.22    0.15    0.18    0.11    0.32    1.00    0.26    0.26    0.39    0.15    0.23    0.43    0.22
zw_oz    0.32    0.20    0.24    0.11    0.30    0.26    1.00    0.38    0.36    0.06    0.18    0.20    0.29
mo_zu    0.45    0.35    0.29    0.30    0.52    0.26    0.38    1.00    0.54    0.23    0.39    0.38    0.51
be_wy    0.47    0.36    0.30    0.17    0.44    0.39    0.36    0.54    1.00    0.26    0.37    0.42    0.48
ba_hy    0.21    0.14    0.27    0.06    0.27    0.15    0.06    0.23    0.26    1.00    0.24    0.10    0.27
mo_zu    0.25    0.17    0.17    0.07    0.36    0.23    0.18    0.39    0.37    0.24    1.00    0.20    0.34
be_wy    0.20    0.19    0.15    0.13    0.26    0.43    0.20    0.38    0.42    0.10    0.20    1.00    0.29
w_lud    0.39    0.33    0.24    0.21    0.41    0.22    0.29    0.51    0.48    0.27    0.34    0.29    1.00

Pytanie: Co się stanie, gdy zwiększymy $b$, a co jeśli zmniejszymy?

Zalety sztuczki z haszowaniem:

  • zagwarantowany stały rozmiar wektora
  • szybsze obliczenia
  • w naturalny sposób uwzględniamy termy, których nie było w początkowej kolekcji (ale uwaga na idf!)
  • nie musimy pamiętać odwzorowania rzutującego słowa na ich numery

Wady:

  • dwa różne słowa mogą wpaść do jednej przegródki (szczególnie częste, jeśli $b$ jest za małe)
  • jeśli $b$ ustawimy za duże, wektory mogą być nawet większe niż w przypadku standardowego podejścia

Word2vec

A może istnieje dobra wróżka, która dałaby nam dobre wektory słów (z których będziemy składali proste wektory dokumentów przez sumowanie)?

Pytanie: Jakie własności powinny mieć dobre wektory słów?

Tak! Istnieją gotowe "bazy danych" wektorów. Jedną z najpopularniejszych (i najstarszych) metod uzyskiwania takich wektorów jest Word2vec. Jak dokładnie działa Word2vec, dowiemy się później, na dzisiaj po prostu użyjmy tych wektorów.

Najpierw wprowadźmy alternatywną normalizację zgodną z tym, jak został wygenerowany model.

-- Alternative normalization: tokenize, lowercase and drop stop words,
-- without lemmatization or stemming.
normalize' :: Text -> [Text]
normalize' doc = removeStopWords [toLower tok | tok <- tokenize doc]

normalize' "Ala ma kota."
ala
ma
kota
collectionLNormalized' = map normalize' collectionL
collectionLNormalized' !! 3
mam
kumpla
ktory
zdawal
walentynki
i
polozyl
koperte
dla
laski
z
kartka
na
desce
rozdzielczej
egzaminator
wziol
ta
karteke
i
powiedzial
ze
ma
znade
wypisal
mu
papierek
i
po
egzaminie
hehe
filmik
dobry
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE BangPatterns #-}

import Data.Word2Vec.Model
import Data.Maybe (catMaybes, fromJust)
import qualified Data.Vector.Storable as V

model <- readWord2VecModel "tiny.bin"

-- Convert the storable vector held in a WVector into a list of Doubles.
toOurVector :: WVector -> [Double]
toOurVector (WVector v _) = [realToFrac x | x <- V.toList v]

balwanV = toOurVector $ fromJust $ getVector model "bałwan"
balwanV
Prelude.length balwanV

-- Document vector as the sum of the model vectors of its words, added onto
-- a 100-component zero vector. Words for which getVector returns Nothing
-- are skipped.
vectorizeWord2vec model d = Prelude.foldr (+++) (zero 100) wordVectors
  where
    wordVectors = map toOurVector (catMaybes (map (getVector model) d))

collectionLVectorized'' = map (vectorizeWord2vec model) collectionLNormalized'
[-2.305081844329834,0.3418600857257843,4.44999361038208,0.9008448719978333,-2.1629886627197266,1.0206516981124878,4.157524108886719,2.5060904026031494,-0.17275184392929077,4.085052967071533,2.236677408218384,-2.3315281867980957,0.5224806070327759,0.15804219245910645,-1.5636622905731201,-1.2624900341033936,-0.3161393105983734,-1.971177101135254,1.4859644174575806,-0.1742715835571289,1.209444284439087,4.063786193728447e-2,-0.2808700501918793,-0.5895432233810425,-4.126195430755615,-2.690922260284424,1.4975452423095703,-0.25380706787109375,-4.5767364501953125,-1.7726246118545532,2.938936710357666,-0.7173141837120056,-2.4317402839660645,-4.206724643707275,0.6768773198127747,2.236821413040161,4.1044291108846664e-2,1.6991114616394043,1.2354476377367973e-2,-3.079916000366211,-1.7430219650268555,1.8969229459762573,-0.4897139072418213,1.1981141567230225,2.431124687194824,0.39453181624412537,1.9735784530639648,2.124225378036499,-4.338796138763428,-0.954145610332489,3.3927927017211914,0.8821511268615723,5.120451096445322e-3,2.917816638946533,-2.035374164581299,3.3221969604492188,-4.981880187988281,-1.105080008506775,-4.093905448913574,-1.5998111963272095,0.6372298002243042,-0.7565107345581055,0.4038744270801544,0.685226321220398,2.137610912322998,-0.4390018582344055,1.007287859916687,0.19681350886821747,-2.598611354827881,-1.8872140645980835,1.6989527940750122,1.6458508968353271,-5.091184616088867,1.4902764558792114,-0.4839307367801666,-2.840092420578003,1.0180696249008179,0.7615311741828918,1.8135554790496826,-0.30493396520614624,3.5879104137420654,1.4585649967193604,3.2775094509124756,-1.1610190868377686,-2.3159284591674805,4.1530327796936035,-4.67172384262085,-0.8594478964805603,-0.860812783241272,-0.31788957118988037,0.7260096669197083,0.1879102736711502,-0.15789580345153809,1.9434200525283813,-1.9945732355117798,1.8799400329589844,-0.5253798365592957,-0.2834266722202301,-0.8012301921844482,1.5093021392822266]
100
collectionLVectorized'' !! 3
[-26.834667675197124,2.568521626293659,37.66925026476383,9.381511189043522,-32.04328362643719,-19.734033070504665,55.21128339320421,14.215368987061083,23.60182836651802,38.74189975857735,0.16257449332624674,-47.983866568654776,-36.917382495012134,36.08420217037201,13.996580198407173,-30.473296120762825,21.28328724205494,30.601420499384403,-40.5945385559462,16.043263137340546,-8.694086126983166,-41.90418399870396,-10.448782376945019,-0.21028679609298706,9.586350612342358,-46.172676257789135,46.27567541599274,11.25023115798831,9.00947591662407,-43.525397814810276,22.09978771582246,56.93886440992355,-23.428963833488524,-1.4649565666913986,21.969609811902046,-21.504647210240364,24.955158293247223,-8.328911297023296,-31.118815276771784,0.22846409678459167,12.212224327027798,-28.337586268782616,-24.105730276554823,3.36764569953084,8.270942151546478,33.71851025521755,30.665825616568327,-24.134687054902315,-31.72916578501463,35.20022106170654,71.15121555328369,-15.448215141892433,-41.27439119666815,3.0322337672114372,9.768462024629116,38.911416467279196,-9.848581969738007,-20.030757322907448,6.734442539513111,-84.9070791369304,38.147536396980286,4.3607237339019775,-25.426255017518997,5.240264508873224,-32.71464269608259,2.095752328634262,2.4292337521910667,32.93906496465206,-51.44473773613572,0.5551527962088585,-6.1982685178518295,20.187213011085987,-52.809339098632336,-10.458874322474003,13.979218572378159,-38.16066548228264,27.336308609694242,5.3437707126140594,-32.01269288826734,-38.117460787296295,-9.337415304034948,38.90077601373196,-2.158842660486698,-44.878454223275185,23.69188129901886,-54.10413733869791,-41.30505630373955,-37.28948371112347,-65.8488347530365,32.51569982431829,3.781733974814415,72.77320172637701,6.847739472985268,63.77478001266718,24.26227615773678,7.260737741366029,10.931276574730873,-17.388786104973406,9.978045962750912,5.968699499964714]
limitedL'' = Data.List.take limit collectionLVectorized''

paintMatrix cosineSim labelsLimited limitedL''
        na_ak   w_lud   ba_hy   w_lap   ne_dz   be_wy   zw_oz   mo_zu   be_wy   ba_hy   mo_zu   be_wy   w_lud
na_ak    1.00    0.92    0.85    0.77    0.87    0.90    0.92    0.88    0.87    0.87    0.89    0.89    0.89
w_lud    0.92    1.00    0.92    0.72    0.93    0.93    0.91    0.94    0.95    0.86    0.94    0.94    0.96
ba_hy    0.85    0.92    1.00    0.69    0.89    0.91    0.83    0.89    0.95    0.86    0.87    0.94    0.90
w_lap    0.77    0.72    0.69    1.00    0.60    0.74    0.67    0.65    0.68    0.58    0.68    0.73    0.66
ne_dz    0.87    0.93    0.89    0.60    1.00    0.90    0.87    0.95    0.94    0.86    0.93    0.90    0.95
be_wy    0.90    0.93    0.91    0.74    0.90    1.00    0.89    0.89    0.91    0.85    0.91    0.96    0.94
zw_oz    0.92    0.91    0.83    0.67    0.87    0.89    1.00    0.89    0.86    0.86    0.91    0.85    0.90
mo_zu    0.88    0.94    0.89    0.65    0.95    0.89    0.89    1.00    0.97    0.85    0.95    0.91    0.96
be_wy    0.87    0.95    0.95    0.68    0.94    0.91    0.86    0.97    1.00    0.84    0.93    0.95    0.95
ba_hy    0.87    0.86    0.86    0.58    0.86    0.85    0.86    0.85    0.84    1.00    0.83    0.85    0.84
mo_zu    0.89    0.94    0.87    0.68    0.93    0.91    0.91    0.95    0.93    0.83    1.00    0.91    0.96
be_wy    0.89    0.94    0.94    0.73    0.90    0.96    0.85    0.91    0.95    0.85    0.91    1.00    0.94
w_lud    0.89    0.96    0.90    0.66    0.95    0.94    0.90    0.96    0.95    0.84    0.96    0.94    1.00

Możemy próbować mnożyć wektory z modelu Word2vec z idf. Najpierw zdefiniujmy mnożenie przez skalar.

-- Multiply every component of a vector by a scalar.
(***) :: Double -> [Double] -> [Double]
s *** vs = [x * s | x <- vs]

2.5 *** [1.0, 0.0, 2.0]
[2.5,0.0,5.0]

Teraz będziemy przemnażali wektory Word2vec przez idf (jako skalar).

import Data.Maybe (isJust)

-- Document vector built from model vectors weighted by idf: every word of d
-- for which getVector returns a vector contributes idf(word) times that
-- vector, and the contributions are summed onto a 100-component zero vector.
--
-- The generator pattern `Just v <- [...]` silently skips words without a
-- vector, so the function is total without a separate isJust filter and
-- without a lambda that pattern-matches on `Just` only.
vectorizeWord2vecIdf model coll d =
    Prelude.foldr (+++) (zero 100)
        [ idf coll t *** toOurVector v
        | t <- d
        , Just v <- [getVector model t]
        ]

collectionLVectorized''' = map (vectorizeWord2vecIdf model collectionLNormalized') collectionLNormalized'
Fuse foldr/map
Found:
Prelude.foldr (+++) (zero 100) $ map (\ (t, Just v) -> idf coll t *** toOurVector v) $ Prelude.filter (\ (_, v) -> isJust v) $ map (\ t -> (t, getVector model t)) d
Why Not:
foldr ((+++) . (\ (t, Just v) -> idf coll t *** toOurVector v)) (zero 100) (Prelude.filter (\ (_, v) -> isJust v) $ map (\ t -> (t, getVector model t)) d)
collectionLVectorized''' !! 3
[-35.63830397762308,32.606312678971506,102.20663646169147,56.00417395285867,-130.56709475346878,-14.916644370325773,55.15817632053957,83.2241937686228,26.432875116296394,48.94350344147367,11.370669191277202,-59.54579267200742,-116.01687192456801,60.53824040579282,39.84659684249884,-34.37377085402866,104.53525319069323,45.53363024094972,-34.25020197907558,-43.9007702604392,35.36538495508536,-59.81737728971619,-1.5823889595648828,-50.211106838043655,14.83789867297237,-109.45917608219175,86.56767915592452,-32.170794763065615,29.559930839016644,-126.81686726526162,-9.918908360030228,47.14965938694648,5.955083439147183,41.24417782948478,3.592410260515919,72.10649687523313,61.374776273461855,60.28687760276824,-28.886499026001676,-8.710633131022206,-68.73464623080284,-37.95272838994007,-26.390548039392165,-14.241950251566944,74.6286124718925,46.21889022510431,72.23999508751568,-19.597547074284556,-20.160749174807382,99.49036127458763,131.98057386978817,-23.842794956628147,-62.381675411749846,-19.366936151725387,1.4839595614144327,60.40520721416763,-7.70311857607342,-31.75784386529525,48.71818084466781,-202.41827342135582,138.5639100010709,12.447619757719652,-39.38375639132277,27.877688543771935,-87.00559882214534,56.45689362090545,37.89098984507379,103.78465196444151,-166.10094891357176,-50.83382060940457,11.574060187412977,74.00519869734406,-97.00170731343235,32.18159534728971,-11.280059681646494,-40.701643971890256,74.64230137346699,0.7613112917269982,-6.103424218278271,-150.47551072570587,-21.714627635239918,91.26690441786137,62.91576955719526,-92.35700140312395,-25.421583980267307,-67.87480813505826,-120.16245846953592,-68.89155479679258,-122.00206448376261,35.263603445401785,6.416282520155956,203.41225708856086,-62.42983953251155,59.36113672119048,40.00275897200196,-62.55633545667429,89.66866371308245,-42.287712072353834,-72.59490110281287,52.23637641217955]
limitedL''' = Data.List.take limit collectionLVectorized'''

paintMatrix cosineSim labelsLimited limitedL'''
        na_ak   w_lud   ba_hy   w_lap   ne_dz   be_wy   zw_oz   mo_zu   be_wy   ba_hy   mo_zu   be_wy   w_lud
na_ak    1.00    0.83    0.78    0.63    0.78    0.81    0.83    0.76    0.77    0.80    0.77    0.79    0.79
w_lud    0.83    1.00    0.82    0.60    0.84    0.84    0.84    0.85    0.86    0.74    0.86    0.83    0.90
ba_hy    0.78    0.82    1.00    0.57    0.78    0.84    0.77    0.79    0.90    0.75    0.74    0.89    0.85
w_lap    0.63    0.60    0.57    1.00    0.38    0.60    0.50    0.43    0.52    0.45    0.55    0.65    0.47
ne_dz    0.78    0.84    0.78    0.38    1.00    0.81    0.79    0.90    0.89    0.77    0.81    0.81    0.90
be_wy    0.81    0.84    0.84    0.60    0.81    1.00    0.82    0.76    0.83    0.74    0.81    0.92    0.88
zw_oz    0.83    0.84    0.77    0.50    0.79    0.82    1.00    0.77    0.77    0.74    0.82    0.75    0.83
mo_zu    0.76    0.85    0.79    0.43    0.90    0.76    0.77    1.00    0.93    0.74    0.87    0.80    0.90
be_wy    0.77    0.86    0.90    0.52    0.89    0.83    0.77    0.93    1.00    0.72    0.81    0.89    0.92
ba_hy    0.80    0.74    0.75    0.45    0.77    0.74    0.74    0.74    0.72    1.00    0.66    0.73    0.72
mo_zu    0.77    0.86    0.74    0.55    0.81    0.81    0.82    0.87    0.81    0.66    1.00    0.80    0.88
be_wy    0.79    0.83    0.89    0.65    0.81    0.92    0.75    0.80    0.89    0.73    0.80    1.00    0.87
w_lud    0.79    0.90    0.85    0.47    0.90    0.88    0.83    0.90    0.92    0.72    0.88    0.87    1.00