tau-2020-pytorch-tutorial/word2vec.ipynb
Jakub Pokrywka 80333aca0a pretty view
2020-12-16 01:01:20 +01:00

470 lines
13 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: gensim in /home/kuba/anaconda3/envs/tau/lib/python3.8/site-packages (3.8.3)\n",
"Requirement already satisfied: smart-open>=1.8.1 in /home/kuba/anaconda3/envs/tau/lib/python3.8/site-packages (from gensim) (4.0.1)\n",
"Requirement already satisfied: scipy>=0.18.1 in /home/kuba/anaconda3/envs/tau/lib/python3.8/site-packages (from gensim) (1.5.2)\n",
"Requirement already satisfied: six>=1.5.0 in /home/kuba/anaconda3/envs/tau/lib/python3.8/site-packages (from gensim) (1.15.0)\n",
"Requirement already satisfied: numpy>=1.11.3 in /home/kuba/anaconda3/envs/tau/lib/python3.8/site-packages (from gensim) (1.19.2)\n"
]
}
],
"source": [
"# %pip (unlike !pip) installs into the environment of the running kernel.\n",
"# The version is pinned to the one this notebook was executed with.\n",
"%pip install gensim==3.8.3"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import Image\n",
"import gensim.downloader"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>.container { width:100% !important; }</style>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Stretch the notebook container to the full browser width.\n",
"# display/HTML are imported from the public IPython.display module;\n",
"# importing them from IPython.core.display is deprecated.\n",
"from IPython.display import display, HTML\n",
"display(HTML(\"<style>.container { width:100% !important; }</style>\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Mikolov et al. (2013)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![title](w2v.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cat XXX on the"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Nauka nienadzorowana - nie trzeba zaetykietowanego korpusu."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[===============-----------------------------------] 30.2% 38.7/128.1MB downloaded"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"IOPub message rate exceeded.\n",
"The notebook server will temporarily stop sending output\n",
"to the client in order to avoid crashing it.\n",
"To change this limit, set the config variable\n",
"`--NotebookApp.iopub_msg_rate_limit`.\n",
"\n",
"Current values:\n",
"NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
"NotebookApp.rate_limit_window=3.0 (secs)\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[==================================================] 100.0% 128.1/128.1MB downloaded\n"
]
}
],
"source": [
"# Load the pretrained embeddings through gensim's downloader API.\n",
"word_vectors = gensim.downloader.load(\n",
"    \"glove-wiki-gigaword-100\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.30817 , 0.30938 , 0.52803 , -0.92543 , -0.73671 ,\n",
" 0.63475 , 0.44197 , 0.10262 , -0.09142 , -0.56607 ,\n",
" -0.5327 , 0.2013 , 0.7704 , -0.13983 , 0.13727 ,\n",
" 1.1128 , 0.89301 , -0.17869 , -0.0019722, 0.57289 ,\n",
" 0.59479 , 0.50428 , -0.28991 , -1.3491 , 0.42756 ,\n",
" 1.2748 , -1.1613 , -0.41084 , 0.042804 , 0.54866 ,\n",
" 0.18897 , 0.3759 , 0.58035 , 0.66975 , 0.81156 ,\n",
" 0.93864 , -0.51005 , -0.070079 , 0.82819 , -0.35346 ,\n",
" 0.21086 , -0.24412 , -0.16554 , -0.78358 , -0.48482 ,\n",
" 0.38968 , -0.86356 , -0.016391 , 0.31984 , -0.49246 ,\n",
" -0.069363 , 0.018869 , -0.098286 , 1.3126 , -0.12116 ,\n",
" -1.2399 , -0.091429 , 0.35294 , 0.64645 , 0.089642 ,\n",
" 0.70294 , 1.1244 , 0.38639 , 0.52084 , 0.98787 ,\n",
" 0.79952 , -0.34625 , 0.14095 , 0.80167 , 0.20987 ,\n",
" -0.86007 , -0.15308 , 0.074523 , 0.40816 , 0.019208 ,\n",
" 0.51587 , -0.34428 , -0.24525 , -0.77984 , 0.27425 ,\n",
" 0.22418 , 0.20164 , 0.017431 , -0.014697 , -1.0235 ,\n",
" -0.39695 , -0.0056188, 0.30569 , 0.31748 , 0.021404 ,\n",
" 0.11837 , -0.11319 , 0.42456 , 0.53405 , -0.16717 ,\n",
" -0.27185 , -0.6255 , 0.12883 , 0.62529 , -0.52086 ],\n",
" dtype=float32)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Look up the embedding stored for the token 'dog'.\n",
"word_vectors[\"dog\"]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"100"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of components in a single word vector.\n",
"len(word_vectors[\"dog\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$ A = (a_1, a_2, \\ldots, a_n)$\n",
"\n",
"$ B = (b_1, b_2, \\ldots, b_n)$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$A \\cdot B = a_1 b_1 + a_2 b_2 + \\ldots + a_n b_n$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$A \\cdot B = |A| |B| \\cos(\\theta)$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"cosine_similarity = $\\cos(\\theta) = \\frac{A \\cdot B}{|A||B|}$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![image.png](cos.png)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.24340999, 0.23372999, 0.34519994, -1.19175 , -1.4724072 ,\n",
" 0.34235 , 0.60779 , 0.261443 , 0.06009999, -1.37846 ,\n",
" -0.88091004, 0.08861998, 1.05097 , -0.37221998, -0.05504 ,\n",
" 2.07504 , 1.2128501 , -0.17209001, 0.5188256 , 0.68386996,\n",
" 0.26919997, 0.977559 , -0.41735998, -2.29253 , 0.06891 ,\n",
" 1.9723799 , -1.7875899 , -0.1394 , -0.08426201, 0.73421997,\n",
" 0.449713 , 0.27947 , 1.1328939 , 1.48901 , 1.44769 ,\n",
" 2.25301 , -0.23492998, -0.721868 , 0.78779006, -0.73836505,\n",
" 0.88069 , -0.447323 , -1.29005 , -1.39741 , -1.10009 ,\n",
" 0.50502 , -1.6576351 , -0.055184 , 0.38991004, -0.76956004,\n",
" 0.185334 , 0.43640798, -0.882702 , 0.83290005, 0.13615999,\n",
" -0.23210001, 0.58739203, 0.24005997, 0.05180001, -0.398276 ,\n",
" 0.99437 , 1.40552 , 1.3153701 , 1.20883 , 1.23647 ,\n",
" 1.692517 , -1.5952799 , -0.22698998, 2.10365 , 0.15522999,\n",
" -1.87457 , -0.01184002, 0.03998601, 1.0829899 , -0.315964 ,\n",
" 0.98266095, -0.86874 , 0.09540001, -1.0042601 , 0.83836997,\n",
" -0.29442003, 0.05798 , 0.063619 , 0.197066 , -0.7356999 ,\n",
" -0.222 , 0.5118224 , 0.73807997, 0.733638 , 0.577438 ,\n",
" -0.04933 , 0.14863001, 0.39170003, 1.022125 , -0.08759001,\n",
" -0.589356 , -0.86798 , 1.19477 , 1.211442 , -0.50261 ],\n",
" dtype=float32)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Element-wise arithmetic on the raw embedding vectors.\n",
"# NOTE(review): 'dog' is added twice - a different second word may have been intended; confirm.\n",
"(\n",
"    word_vectors['dog']\n",
"    + word_vectors['dog']\n",
"    - word_vectors['man']\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('yellow', 0.7358633279800415),\n",
" ('red', 0.7140780687332153),\n",
" ('blue', 0.7118035554885864),\n",
" ('green', 0.7111418843269348),\n",
" ('pink', 0.6775072813034058),\n",
" ('purple', 0.6774232387542725),\n",
" ('black', 0.6709616184234619),\n",
" ('colored', 0.665260910987854),\n",
" ('lemon', 0.6251963376998901),\n",
" ('peach', 0.616862416267395)]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Words whose vectors are most similar to 'orange'.\n",
"word_vectors.most_similar(\n",
"    positive=['orange'],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('queen', 0.7698541283607483),\n",
" ('monarch', 0.6843380928039551),\n",
" ('throne', 0.6755737066268921),\n",
" ('daughter', 0.6594556570053101),\n",
" ('princess', 0.6520533561706543),\n",
" ('prince', 0.6517034769058228),\n",
" ('elizabeth', 0.6464517116546631),\n",
" ('mother', 0.631171703338623),\n",
" ('emperor', 0.6106470823287964),\n",
" ('wife', 0.6098655462265015)]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Analogy query: king - man + woman.\n",
"word_vectors.most_similar(\n",
"    positive=['woman', 'king'],\n",
"    negative=['man'],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"[('berlin', 0.8846380710601807),\n",
" ('frankfurt', 0.7985544204711914),\n",
" ('vienna', 0.76759934425354),\n",
" ('munich', 0.7542588710784912),\n",
" ('hamburg', 0.718237042427063),\n",
" ('bonn', 0.6890878677368164),\n",
" ('prague', 0.6842441558837891),\n",
" ('cologne', 0.6762093305587769),\n",
" ('zurich', 0.6653269529342651),\n",
" ('leipzig', 0.6619254350662231)]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Analogy query: paris - france + germany.\n",
"word_vectors.most_similar(\n",
"    positive=['paris', 'germany'],\n",
"    negative=['france'],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('walked', 0.6780266761779785),\n",
" ('crawled', 0.6523419618606567),\n",
" ('wandered', 0.6384279727935791),\n",
" ('hopped', 0.6131909489631653),\n",
" ('walks', 0.6122221946716309),\n",
" ('walk', 0.6120144128799438),\n",
" ('strolled', 0.6010454893112183),\n",
" ('slept', 0.5912748575210571),\n",
" ('wandering', 0.5861444473266602),\n",
" ('waited', 0.5791574716567993)]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Analogy query: walking - swimming + swam.\n",
"word_vectors.most_similar(\n",
"    positive=['walking', 'swam'],\n",
"    negative=['swimming'],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('puppies', 0.6867596507072449),\n",
" ('kitten', 0.6866798400878906),\n",
" ('kittens', 0.6383703947067261),\n",
" ('monkey', 0.6171090602874756),\n",
" ('rabbit', 0.6136822700500488),\n",
" ('pup', 0.6054644584655762),\n",
" ('tabby', 0.5937005281448364),\n",
" ('retriever', 0.5934329628944397),\n",
" ('bitch', 0.5817775726318359),\n",
" ('hound', 0.5778555870056152)]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Analogy query: puppy - dog + cat.\n",
"word_vectors.most_similar(\n",
"    positive=['puppy', 'cat'],\n",
"    negative=['dog'],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('dog', 0.8798074722290039),\n",
" ('rabbit', 0.7424427270889282),\n",
" ('cats', 0.7323004007339478),\n",
" ('monkey', 0.7288710474967957),\n",
" ('pet', 0.7190139293670654),\n",
" ('dogs', 0.7163873314857483),\n",
" ('mouse', 0.6915251016616821),\n",
" ('puppy', 0.6800068616867065),\n",
" ('rat', 0.6641027331352234),\n",
" ('spider', 0.6501134634017944)]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Words whose vectors are most similar to 'cat'.\n",
"word_vectors.most_similar(\n",
"    positive=['cat'],\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![image.png](linear-relationships.png)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}