tau-2020-pytorch-tutorial/word2vec.ipynb
Jakub Pokrywka 80333aca0a pretty view
2020-12-16 01:01:20 +01:00

13 KiB

!pip install gensim 
Requirement already satisfied: gensim in /home/kuba/anaconda3/envs/tau/lib/python3.8/site-packages (3.8.3)
Requirement already satisfied: smart-open>=1.8.1 in /home/kuba/anaconda3/envs/tau/lib/python3.8/site-packages (from gensim) (4.0.1)
Requirement already satisfied: scipy>=0.18.1 in /home/kuba/anaconda3/envs/tau/lib/python3.8/site-packages (from gensim) (1.5.2)
Requirement already satisfied: six>=1.5.0 in /home/kuba/anaconda3/envs/tau/lib/python3.8/site-packages (from gensim) (1.15.0)
Requirement already satisfied: numpy>=1.11.3 in /home/kuba/anaconda3/envs/tau/lib/python3.8/site-packages (from gensim) (1.19.2)
# Notebook setup: display helpers and the gensim model downloader.
# `display` and `HTML` are imported from the public `IPython.display` module;
# the `IPython.core.display` path is deprecated since IPython 7.14 and emits
# a DeprecationWarning on current releases.
from IPython.display import HTML, Image, display
import gensim.downloader

# Stretch the classic-notebook container to the full browser width so that
# wide outputs (long vectors, images) are not squeezed into a narrow column.
display(HTML("<style>.container { width:100% !important; }</style>"))

Mikolov et al. (2013)

title

The cat XXX on the

Nauka nienadzorowana – nie trzeba oetykietowanego korpusu

word_vectors = gensim.downloader.load("glove-wiki-gigaword-100")
[===============-----------------------------------] 30.2% 38.7/128.1MB downloaded
IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

[==================================================] 100.0% 128.1/128.1MB downloaded
word_vectors['dog']
array([ 0.30817  ,  0.30938  ,  0.52803  , -0.92543  , -0.73671  ,
        0.63475  ,  0.44197  ,  0.10262  , -0.09142  , -0.56607  ,
       -0.5327   ,  0.2013   ,  0.7704   , -0.13983  ,  0.13727  ,
        1.1128   ,  0.89301  , -0.17869  , -0.0019722,  0.57289  ,
        0.59479  ,  0.50428  , -0.28991  , -1.3491   ,  0.42756  ,
        1.2748   , -1.1613   , -0.41084  ,  0.042804 ,  0.54866  ,
        0.18897  ,  0.3759   ,  0.58035  ,  0.66975  ,  0.81156  ,
        0.93864  , -0.51005  , -0.070079 ,  0.82819  , -0.35346  ,
        0.21086  , -0.24412  , -0.16554  , -0.78358  , -0.48482  ,
        0.38968  , -0.86356  , -0.016391 ,  0.31984  , -0.49246  ,
       -0.069363 ,  0.018869 , -0.098286 ,  1.3126   , -0.12116  ,
       -1.2399   , -0.091429 ,  0.35294  ,  0.64645  ,  0.089642 ,
        0.70294  ,  1.1244   ,  0.38639  ,  0.52084  ,  0.98787  ,
        0.79952  , -0.34625  ,  0.14095  ,  0.80167  ,  0.20987  ,
       -0.86007  , -0.15308  ,  0.074523 ,  0.40816  ,  0.019208 ,
        0.51587  , -0.34428  , -0.24525  , -0.77984  ,  0.27425  ,
        0.22418  ,  0.20164  ,  0.017431 , -0.014697 , -1.0235   ,
       -0.39695  , -0.0056188,  0.30569  ,  0.31748  ,  0.021404 ,
        0.11837  , -0.11319  ,  0.42456  ,  0.53405  , -0.16717  ,
       -0.27185  , -0.6255   ,  0.12883  ,  0.62529  , -0.52086  ],
      dtype=float32)
len(word_vectors['dog'])
100

$A = (a_1, a_2, \ldots, a_n)$

$B = (b_1, b_2, \ldots, b_n)$

$A \cdot B = a_1 b_1 + a_2 b_2 + \ldots + a_n b_n$

$A \cdot B = |A| |B| \cos(\theta)$

cosine_similarity $= \cos(\theta) = \frac{A \cdot B}{|A| |B|}$

image.png

word_vectors['dog'] + word_vectors['dog'] - word_vectors['man']
array([ 0.24340999,  0.23372999,  0.34519994, -1.19175   , -1.4724072 ,
        0.34235   ,  0.60779   ,  0.261443  ,  0.06009999, -1.37846   ,
       -0.88091004,  0.08861998,  1.05097   , -0.37221998, -0.05504   ,
        2.07504   ,  1.2128501 , -0.17209001,  0.5188256 ,  0.68386996,
        0.26919997,  0.977559  , -0.41735998, -2.29253   ,  0.06891   ,
        1.9723799 , -1.7875899 , -0.1394    , -0.08426201,  0.73421997,
        0.449713  ,  0.27947   ,  1.1328939 ,  1.48901   ,  1.44769   ,
        2.25301   , -0.23492998, -0.721868  ,  0.78779006, -0.73836505,
        0.88069   , -0.447323  , -1.29005   , -1.39741   , -1.10009   ,
        0.50502   , -1.6576351 , -0.055184  ,  0.38991004, -0.76956004,
        0.185334  ,  0.43640798, -0.882702  ,  0.83290005,  0.13615999,
       -0.23210001,  0.58739203,  0.24005997,  0.05180001, -0.398276  ,
        0.99437   ,  1.40552   ,  1.3153701 ,  1.20883   ,  1.23647   ,
        1.692517  , -1.5952799 , -0.22698998,  2.10365   ,  0.15522999,
       -1.87457   , -0.01184002,  0.03998601,  1.0829899 , -0.315964  ,
        0.98266095, -0.86874   ,  0.09540001, -1.0042601 ,  0.83836997,
       -0.29442003,  0.05798   ,  0.063619  ,  0.197066  , -0.7356999 ,
       -0.222     ,  0.5118224 ,  0.73807997,  0.733638  ,  0.577438  ,
       -0.04933   ,  0.14863001,  0.39170003,  1.022125  , -0.08759001,
       -0.589356  , -0.86798   ,  1.19477   ,  1.211442  , -0.50261   ],
      dtype=float32)
word_vectors.most_similar(positive=['orange'])
[('yellow', 0.7358633279800415),
 ('red', 0.7140780687332153),
 ('blue', 0.7118035554885864),
 ('green', 0.7111418843269348),
 ('pink', 0.6775072813034058),
 ('purple', 0.6774232387542725),
 ('black', 0.6709616184234619),
 ('colored', 0.665260910987854),
 ('lemon', 0.6251963376998901),
 ('peach', 0.616862416267395)]
word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])
[('queen', 0.7698541283607483),
 ('monarch', 0.6843380928039551),
 ('throne', 0.6755737066268921),
 ('daughter', 0.6594556570053101),
 ('princess', 0.6520533561706543),
 ('prince', 0.6517034769058228),
 ('elizabeth', 0.6464517116546631),
 ('mother', 0.631171703338623),
 ('emperor', 0.6106470823287964),
 ('wife', 0.6098655462265015)]
word_vectors.most_similar(positive=['paris', 'germany'], negative=['france'])
[('berlin', 0.8846380710601807),
 ('frankfurt', 0.7985544204711914),
 ('vienna', 0.76759934425354),
 ('munich', 0.7542588710784912),
 ('hamburg', 0.718237042427063),
 ('bonn', 0.6890878677368164),
 ('prague', 0.6842441558837891),
 ('cologne', 0.6762093305587769),
 ('zurich', 0.6653269529342651),
 ('leipzig', 0.6619254350662231)]
word_vectors.most_similar(positive=['walking', 'swam'], negative=['swimming'])
[('walked', 0.6780266761779785),
 ('crawled', 0.6523419618606567),
 ('wandered', 0.6384279727935791),
 ('hopped', 0.6131909489631653),
 ('walks', 0.6122221946716309),
 ('walk', 0.6120144128799438),
 ('strolled', 0.6010454893112183),
 ('slept', 0.5912748575210571),
 ('wandering', 0.5861444473266602),
 ('waited', 0.5791574716567993)]
word_vectors.most_similar(positive=['puppy', 'cat'], negative=['dog'])
[('puppies', 0.6867596507072449),
 ('kitten', 0.6866798400878906),
 ('kittens', 0.6383703947067261),
 ('monkey', 0.6171090602874756),
 ('rabbit', 0.6136822700500488),
 ('pup', 0.6054644584655762),
 ('tabby', 0.5937005281448364),
 ('retriever', 0.5934329628944397),
 ('bitch', 0.5817775726318359),
 ('hound', 0.5778555870056152)]
word_vectors.most_similar(positive=['cat'])
[('dog', 0.8798074722290039),
 ('rabbit', 0.7424427270889282),
 ('cats', 0.7323004007339478),
 ('monkey', 0.7288710474967957),
 ('pet', 0.7190139293670654),
 ('dogs', 0.7163873314857483),
 ('mouse', 0.6915251016616821),
 ('puppy', 0.6800068616867065),
 ('rat', 0.6641027331352234),
 ('spider', 0.6501134634017944)]

image.png