13 KiB
13 KiB
!pip install gensim
Requirement already satisfied: gensim in /home/kuba/anaconda3/envs/tau/lib/python3.8/site-packages (3.8.3) Requirement already satisfied: smart-open>=1.8.1 in /home/kuba/anaconda3/envs/tau/lib/python3.8/site-packages (from gensim) (4.0.1) Requirement already satisfied: scipy>=0.18.1 in /home/kuba/anaconda3/envs/tau/lib/python3.8/site-packages (from gensim) (1.5.2) Requirement already satisfied: six>=1.5.0 in /home/kuba/anaconda3/envs/tau/lib/python3.8/site-packages (from gensim) (1.15.0) Requirement already satisfied: numpy>=1.11.3 in /home/kuba/anaconda3/envs/tau/lib/python3.8/site-packages (from gensim) (1.19.2)
# Notebook setup: rich-display helpers and the gensim pretrained-model downloader.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import HTML, Image, display

import gensim.downloader

# Inject CSS that forces the notebook's `.container` element to full width.
display(HTML("<style>.container { width:100% !important; }</style>"))
Mikolov et al. (2013)
The cat XXX on the
Uczenie nienadzorowane – nie trzeba zaetykietowanego korpusu.
# Load the pretrained "glove-wiki-gigaword-100" embeddings through gensim's
# downloader API and keep them in `word_vectors` for the queries in later cells.
word_vectors = gensim.downloader.load("glove-wiki-gigaword-100")
[===============-----------------------------------] 30.2% 38.7/128.1MB downloaded
IOPub message rate exceeded. The notebook server will temporarily stop sending output to the client in order to avoid crashing it. To change this limit, set the config variable `--NotebookApp.iopub_msg_rate_limit`. Current values: NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec) NotebookApp.rate_limit_window=3.0 (secs)
[==================================================] 100.0% 128.1/128.1MB downloaded
# Look up the embedding vector stored for the word 'dog'.
word_vectors['dog']
array([ 0.30817 , 0.30938 , 0.52803 , -0.92543 , -0.73671 , 0.63475 , 0.44197 , 0.10262 , -0.09142 , -0.56607 , -0.5327 , 0.2013 , 0.7704 , -0.13983 , 0.13727 , 1.1128 , 0.89301 , -0.17869 , -0.0019722, 0.57289 , 0.59479 , 0.50428 , -0.28991 , -1.3491 , 0.42756 , 1.2748 , -1.1613 , -0.41084 , 0.042804 , 0.54866 , 0.18897 , 0.3759 , 0.58035 , 0.66975 , 0.81156 , 0.93864 , -0.51005 , -0.070079 , 0.82819 , -0.35346 , 0.21086 , -0.24412 , -0.16554 , -0.78358 , -0.48482 , 0.38968 , -0.86356 , -0.016391 , 0.31984 , -0.49246 , -0.069363 , 0.018869 , -0.098286 , 1.3126 , -0.12116 , -1.2399 , -0.091429 , 0.35294 , 0.64645 , 0.089642 , 0.70294 , 1.1244 , 0.38639 , 0.52084 , 0.98787 , 0.79952 , -0.34625 , 0.14095 , 0.80167 , 0.20987 , -0.86007 , -0.15308 , 0.074523 , 0.40816 , 0.019208 , 0.51587 , -0.34428 , -0.24525 , -0.77984 , 0.27425 , 0.22418 , 0.20164 , 0.017431 , -0.014697 , -1.0235 , -0.39695 , -0.0056188, 0.30569 , 0.31748 , 0.021404 , 0.11837 , -0.11319 , 0.42456 , 0.53405 , -0.16717 , -0.27185 , -0.6255 , 0.12883 , 0.62529 , -0.52086 ], dtype=float32)
# Number of components in the 'dog' vector, i.e. the embedding dimensionality.
len(word_vectors['dog'])
100
$ A = (a_1, a_2, \ldots, a_n)$
$ B = (b_1, b_2, \ldots, b_n)$
$A \cdot B = a_1 b_1 + a_2 b_2 + \ldots + a_n b_n$
$A \cdot B = |A| |B| \cos(\theta)$
cosine_similarity = $\frac{A \cdot B}{|A||B|}$
# Element-wise arithmetic on raw embedding vectors: dog + dog - man.
# NOTE(review): 'dog' is added twice; if an analogy of the form a - b + c was
# intended, one of the operands may be a typo — confirm with the author.
word_vectors['dog'] + word_vectors['dog'] - word_vectors['man']
array([ 0.24340999, 0.23372999, 0.34519994, -1.19175 , -1.4724072 , 0.34235 , 0.60779 , 0.261443 , 0.06009999, -1.37846 , -0.88091004, 0.08861998, 1.05097 , -0.37221998, -0.05504 , 2.07504 , 1.2128501 , -0.17209001, 0.5188256 , 0.68386996, 0.26919997, 0.977559 , -0.41735998, -2.29253 , 0.06891 , 1.9723799 , -1.7875899 , -0.1394 , -0.08426201, 0.73421997, 0.449713 , 0.27947 , 1.1328939 , 1.48901 , 1.44769 , 2.25301 , -0.23492998, -0.721868 , 0.78779006, -0.73836505, 0.88069 , -0.447323 , -1.29005 , -1.39741 , -1.10009 , 0.50502 , -1.6576351 , -0.055184 , 0.38991004, -0.76956004, 0.185334 , 0.43640798, -0.882702 , 0.83290005, 0.13615999, -0.23210001, 0.58739203, 0.24005997, 0.05180001, -0.398276 , 0.99437 , 1.40552 , 1.3153701 , 1.20883 , 1.23647 , 1.692517 , -1.5952799 , -0.22698998, 2.10365 , 0.15522999, -1.87457 , -0.01184002, 0.03998601, 1.0829899 , -0.315964 , 0.98266095, -0.86874 , 0.09540001, -1.0042601 , 0.83836997, -0.29442003, 0.05798 , 0.063619 , 0.197066 , -0.7356999 , -0.222 , 0.5118224 , 0.73807997, 0.733638 , 0.577438 , -0.04933 , 0.14863001, 0.39170003, 1.022125 , -0.08759001, -0.589356 , -0.86798 , 1.19477 , 1.211442 , -0.50261 ], dtype=float32)
# Nearest neighbours of 'orange': a list of (word, similarity score) pairs,
# best match first.
word_vectors.most_similar(positive=['orange'])
[('yellow', 0.7358633279800415), ('red', 0.7140780687332153), ('blue', 0.7118035554885864), ('green', 0.7111418843269348), ('pink', 0.6775072813034058), ('purple', 0.6774232387542725), ('black', 0.6709616184234619), ('colored', 0.665260910987854), ('lemon', 0.6251963376998901), ('peach', 0.616862416267395)]
# Analogy query "man : king :: woman : ?" — 'woman' and 'king' contribute
# positively to the query, 'man' negatively.
word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])
[('queen', 0.7698541283607483), ('monarch', 0.6843380928039551), ('throne', 0.6755737066268921), ('daughter', 0.6594556570053101), ('princess', 0.6520533561706543), ('prince', 0.6517034769058228), ('elizabeth', 0.6464517116546631), ('mother', 0.631171703338623), ('emperor', 0.6106470823287964), ('wife', 0.6098655462265015)]
# Analogy query "france : paris :: germany : ?" (country -> capital relation).
word_vectors.most_similar(positive=['paris', 'germany'], negative=['france'])
[('berlin', 0.8846380710601807), ('frankfurt', 0.7985544204711914), ('vienna', 0.76759934425354), ('munich', 0.7542588710784912), ('hamburg', 0.718237042427063), ('bonn', 0.6890878677368164), ('prague', 0.6842441558837891), ('cologne', 0.6762093305587769), ('zurich', 0.6653269529342651), ('leipzig', 0.6619254350662231)]
# Analogy query "swimming : swam :: walking : ?" (gerund -> past tense relation).
word_vectors.most_similar(positive=['walking', 'swam'], negative=['swimming'])
[('walked', 0.6780266761779785), ('crawled', 0.6523419618606567), ('wandered', 0.6384279727935791), ('hopped', 0.6131909489631653), ('walks', 0.6122221946716309), ('walk', 0.6120144128799438), ('strolled', 0.6010454893112183), ('slept', 0.5912748575210571), ('wandering', 0.5861444473266602), ('waited', 0.5791574716567993)]
# Analogy query "dog : puppy :: cat : ?" (adult animal -> young animal relation).
word_vectors.most_similar(positive=['puppy', 'cat'], negative=['dog'])
[('puppies', 0.6867596507072449), ('kitten', 0.6866798400878906), ('kittens', 0.6383703947067261), ('monkey', 0.6171090602874756), ('rabbit', 0.6136822700500488), ('pup', 0.6054644584655762), ('tabby', 0.5937005281448364), ('retriever', 0.5934329628944397), ('bitch', 0.5817775726318359), ('hound', 0.5778555870056152)]
# Nearest neighbours of 'cat': a list of (word, similarity score) pairs,
# best match first.
word_vectors.most_similar(positive=['cat'])
[('dog', 0.8798074722290039), ('rabbit', 0.7424427270889282), ('cats', 0.7323004007339478), ('monkey', 0.7288710474967957), ('pet', 0.7190139293670654), ('dogs', 0.7163873314857483), ('mouse', 0.6915251016616821), ('puppy', 0.6800068616867065), ('rat', 0.6641027331352234), ('spider', 0.6501134634017944)]