polish-urban-legends-public/Untitled.ipynb

368 lines
42 KiB
Plaintext
Raw Permalink Normal View History

2021-04-20 18:43:03 +02:00
{
"cells": [
{
"cell_type": "code",
2021-04-20 19:11:45 +02:00
"execution_count": 164,
2021-04-20 18:43:03 +02:00
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from many_stop_words import get_stop_words\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from unidecode import unidecode\n",
"from nltk.tokenize import word_tokenize\n",
2021-04-20 19:06:45 +02:00
"import string\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.cluster import KMeans"
2021-04-20 18:43:03 +02:00
]
},
{
"cell_type": "code",
2021-04-20 19:11:45 +02:00
"execution_count": 165,
2021-04-20 18:43:03 +02:00
"metadata": {},
"outputs": [],
"source": [
"data=pd.read_csv('dev-0/in.tsv', sep='\\t', header=None)\n",
2021-04-20 19:06:45 +02:00
"data_test=pd.read_csv('test-A/in.tsv', sep='\\t', header=None)"
2021-04-20 18:43:03 +02:00
]
},
{
"cell_type": "code",
2021-04-20 19:11:45 +02:00
"execution_count": 166,
2021-04-20 18:43:03 +02:00
"metadata": {},
"outputs": [],
"source": [
"def remove_punctuations(text):\n",
"    \"\"\"Return `text` with every character in `string.punctuation` deleted.\n",
"\n",
"    Characters are removed, not replaced by a space, so tokens joined by\n",
"    punctuation are fused (e.g. 'a-b' becomes 'ab').\n",
"    \"\"\"\n",
"    # One translate() pass instead of one str.replace() pass per punctuation character.\n",
"    return text.translate(str.maketrans('', '', string.punctuation))"
]
},
{
"cell_type": "code",
2021-04-20 19:11:45 +02:00
"execution_count": 167,
2021-04-20 18:43:03 +02:00
"metadata": {},
"outputs": [],
"source": [
"data[0] = data[0].str.lower()\n",
2021-04-20 19:06:45 +02:00
"data_test[0] = data_test[0].str.lower()\n",
2021-04-20 18:43:03 +02:00
"stop_words = get_stop_words('pl')"
]
},
{
"cell_type": "code",
2021-04-20 19:11:45 +02:00
"execution_count": 168,
2021-04-20 18:43:03 +02:00
"metadata": {},
"outputs": [],
"source": [
"data[0] = data[0].apply(unidecode)\n",
2021-04-20 19:06:45 +02:00
"data_test[0] = data_test[0].apply(unidecode)\n",
2021-04-20 18:43:03 +02:00
"uni_stop_words = [unidecode(x) for x in stop_words]"
]
},
{
"cell_type": "code",
2021-04-20 19:11:45 +02:00
"execution_count": 169,
2021-04-20 18:43:03 +02:00
"metadata": {},
"outputs": [],
"source": [
2021-04-20 19:06:45 +02:00
"# Strip punctuation from the text column of both the dev and the test frame.\n",
"for frame in (data, data_test):\n",
"    frame[0] = frame[0].apply(remove_punctuations)"
2021-04-20 18:43:03 +02:00
]
},
{
"cell_type": "code",
2021-04-20 19:11:45 +02:00
"execution_count": 170,
2021-04-20 18:43:03 +02:00
"metadata": {},
"outputs": [],
"source": [
2021-04-20 19:06:45 +02:00
"# Set membership is a hash lookup; testing against the list scanned every stop word for every token.\n",
"stop_word_set = set(uni_stop_words)\n",
"\n",
"def drop_stop_words(text):\n",
"    \"\"\"Return `text` with whitespace-separated tokens that are stop words removed.\"\"\"\n",
"    return ' '.join(token for token in text.split() if token not in stop_word_set)\n",
"\n",
"data[0] = data[0].apply(drop_stop_words)\n",
"data_test[0] = data_test[0].apply(drop_stop_words)"
2021-04-20 18:43:03 +02:00
]
},
{
"cell_type": "code",
2021-04-20 19:11:45 +02:00
"execution_count": 171,
2021-04-20 18:43:03 +02:00
"metadata": {},
"outputs": [],
"source": [
"tf=TfidfVectorizer()\n",
2021-04-20 19:06:45 +02:00
"text_tf = tf.fit_transform(data[0])\n",
"# The test set gets its own vectorizer: calling fit_transform on `tf` again would\n",
"# overwrite the dev vocabulary, leaving `tf` out of sync with the columns of `text_tf`.\n",
"tf_test = TfidfVectorizer()\n",
"text_test_tf = tf_test.fit_transform(data_test[0])"
2021-04-20 18:43:03 +02:00
]
},
{
"cell_type": "code",
2021-04-20 19:11:45 +02:00
"execution_count": 174,
2021-04-20 18:43:03 +02:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
2021-04-20 19:11:45 +02:00
"0 opowiesc prawdziwa olsztyn akademik 7 pietro i...\n",
"1 podejrzewam polowaniu mowy prostu znalazl mart...\n",
"2 smutne przypomina historie balwankami wredny f...\n",
"3 kumpla zdawal walentynki polozyl koperte laski...\n",
"4 przypomniala krakowskich urban legends chyba n...\n",
" ... \n",
"82 wczoraj popoludniowej audycji trojce prowadzac...\n",
"83 sluchajcie uwielbiam opowiadacv sluchac jakies...\n",
"84 wczoraj probie koncertu czwartkowego akompania...\n",
"85 zuzanna mala historia przyszla panna mloda kup...\n",
"86 koszmar zaczyna niewinnego spotkania jednym to...\n",
"Name: 0, Length: 87, dtype: object"
2021-04-20 18:43:03 +02:00
]
},
2021-04-20 19:11:45 +02:00
"execution_count": 174,
2021-04-20 18:43:03 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2021-04-20 19:11:45 +02:00
"data[0]"
2021-04-20 18:43:03 +02:00
]
},
{
"cell_type": "code",
2021-04-20 19:11:45 +02:00
"execution_count": 173,
2021-04-20 18:43:03 +02:00
"metadata": {},
"outputs": [
{
"data": {
2021-04-20 19:11:45 +02:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAX8AAAEWCAYAAACOv5f1AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAAtx0lEQVR4nO3dd5xU5fXH8c8BpIOgECU2kCjGBuKiiIIiKjbAXiKKWJAoYkPEGqNJFI0aK7GAvaAUUUFFcEWNoi6IgC0IP6yAFBVEI8Xz++O5K8Oy7S4ze2d3vu/Xa147c+fembPDcu6dp5zH3B0REcktNZIOQEREKp+Sv4hIDlLyFxHJQUr+IiI5SMlfRCQHKfmLiOQgJX/ZgJmdbmZvpjx2M/tDkjGlSzp/FzObb2YHpeO1kmZmp5jZxAy99mtmdlYJz11rZo9l4n2ldEr+OSpKXD+b2Y8pt7uSjgt+O/m4md1WZHuvaPtD5XydEpNOppnZQ2a2qsjne2KaXruOmd1gZl9E/4ZzzOxSM7NyHt8y+hxrFW5z98fd/ZB0xCdVQ62yd5FqrIe7T0o6iBLMBU4ws0vdfU20rQ/w3wRjiusmd7+qogebWa2U3z3VM8CWwOHAJ0Ae8CiwDTCwou8nuUVX/lJeh5vZPDNbYmY3m1kNADOrYWZXmdnnZvatmT1iZptGzz1sZpdE97eKrjbPix63NrNlha9TjIXALKB7tP9mQCfgudSdzKyjmb1lZt+b2QdmdkC0/e9AZ+CuYr7VHBRdLX9vZncXXjGX9rtEz58aPbfUzK6s6AdpZmeb2WfR7/+cmf0+5Tk3s/PMbA4wp5hjuwGHAMe6+2x3X+PuU4HewHmFTVrRt54bzOxdM1tuZuOizxDg9ejn99Fns08JTX3nRp/TCjO7Pvo3eyt6vafNrHa0b1Mze8HMFpvZd9H9rSvwuWxiZk+a2ejC15bMUfKX8jqacIXZHugFnBFtPz26dQW2BxoChYl2CnBAdH9/YB7QJeXxG+7+aynv+QhwWnT/JGAc8Evhk2a2FTAe+BuwGTAIGG1mzd39SuANYIC7N3T3ASmveyTQAdgdOIHoBFPa72JmOwPDgFOB3wObAxVJcAcCN0Tv2wL4HHiqyG5HAXsDOxfzEgcD77j7l6kb3f0d4CugW8rm0wj/Ti2ANcAd0fbCf4Mm0Wfzdgnhdgf2BDoCg4H7CCeZbYBdgZOj/WoADwLbAdsCP7Pub6BczKwe8Czh3/cEd18V53iJT8k/tz0bXf0W3s4uZd+h7r7M3b8A/sW6//inALe6+zx3/xG4HDgpak+eAuwXXd13AW4C9o2O2z96vjRjgQOiq+/TCCeDVL2BCe4+wd1/dfdXgAJCc0hpbnT376PfJR9oV47f5TjgBXd/3d1/Aa4GSjtxAQxK+WyXpLzHCHefHr3O5cA+ZtYy5bgbos/652JesxmwoIT3WxA9X+jR6NvByijeE8ysZhkxp7rJ3Ze7+4fAbGBi9Nn8ALwI7AHg7kvdfbS7/+TuK4C/E/59y6sx8BKhqa+vu6+NcaxUkJJ/bjvK3Zuk3O4vZd/UK83PCVe/RD8/L/JcLWALd58LrCQk187AC8A3ZtaGciT/KPmNB64CNnf3/xTZZTvg+NQTGLAf4Uq3NAtT7v9EuMIv9XeJnvvtM4gS6tIy3uefKZ9tYVJe7z2ik8xSYKuU49a7qi9iCSX/fi2i54t7nc+BTVj/5FCWRSn3fy7mcUMAM6tvZvdGTWLLCc1KTWKcaDoSvoXd6Ko0WWmU/KW8tkm5vy3wTXT/G0ISTn1uDesSxRTCVXNtd/86etwHaArMKMf7PgJcAhQ3HPBLwtVt6gmsgbvfGD0fN5GU9rssIOUzMLP6hKafuNZ7DzNrEL3O1yn7lBb3JGBvM0v998DM9o7iezVlc9F/s9WEk0O6E+wlQBtgb3dvzLpmpXKNPgImEprCJpvZFmmOTUqg5C/ldWnUsbcNcAEwMtr+JH
CRmbUys4bAP4CRKaNUpgADWNfJ+Fr0+M1yfr2fQmjnvrOY5x4DephZdzOraWZ1zeyAlM7GRYS2+/Iq7XcZBRxpZvtFnZHXUbH/P08Cfc2snZnVid7jHXefX56Do9FZkwl9G7tEv3dHwmcxzN1TO4l7m9nO0YnqOmBU9JkvJjRZxflsStOI8E3g+6hT+S9xX8DdbwKeIJwA4nw7kQpS8s9tz9v649DHlrLvOGAa4Wp9PDA82j6CMMzwdeD/gP8B56ccN4WQHAqT/5tA/ZTHpfJgsrsvK+a5Lwmdz1cQEtqXwKWs+7u+HTguGoFyR9Hji1Hi7xK1e59HSFALgO8IHayxRMn7amB09DqtCZ3ZcRxL6Kt4CfiRkPiHs/7nTvS7PERo5qpLNAzU3X8itMv/J2ou6xj39yjiX0A9wreKqVFcsbn79YRO30kpI5MkQ0xNbCLVj5m9Bjzm7g8kHYtkJ135i4jkICV/EZEcpGYfEZEcpCt/EZEcVGUKuzVr1sxbtmyZdBgiIlXKtGnTlrh786Lbq0zyb9myJQUFBUmHISJSpZjZ58VtV7OPiEgOUvIXEclBSv4iIjlIyV9EJAcp+YuI5KBqm/xvugny89fflp8ftouI5Lpqm/w7dIATTlh3AsjPD487dEg2LhGRbFBlxvnH1bUrPP00HH00dO4MU6eGx127Jh2ZiEjyqu2VP4RE36IFvPACdOyoxC8iUqhaJ//8fFiyBFq3DieAwYOTjkhEJDtU2+Rf2Mb/9NMwezbk5cHNN8OQIUlHJiKSvGqb/N97b10bf9268PrrsOeeMHQoPFbcUuAiIjmk2nb4Fm3iqVcvnAB69IA+faBGDfjTn5KJTUQkadX2yr849evD88/D/vvDqafCU08lHZGISDJyKvnDuhNA587Qu3doGhIRyTU5l/wBGjQIo386dQpNP6NGJR2RiEjlysnkD9CwIYwfH8b/n3wyjBmTdEQiIpUnZ5M/QKNG8OKLoeTDiSfCs88mHZGISOXI6eQP4QTw0kthGOgJJ4T+ABGR6i7nkz9A48bw8svQrh0ce2zoDxARqc6U/CObbgoTJ0LbtuEEMGFC0hGJiGSOkn+KJk3CCWDXXeGYY8K3ARGR6ijjyd/MLjKzD81stpk9aWZ1zayVmb1jZp+Z2Ugzq53pOMqraVN45RX44x+hV69wMhARqW4ymvzNbCtgIJDn7rsCNYGTgKHAbe7+B+A74MxMxhHXZpvBpEnQpg0ccQT885/rP68VwUSkqquMZp9aQD0zqwXUBxYABwKFU6seBo6qhDhi2XxzmDwZttkGLr0UbrklbNeKYCJSHWS0sJu7f21m/wS+AH4GJgLTgO/dfU2021fAVpmMo6KaNYN33oG99oJBg2DatNAkpBXBRKSqy3SzT1OgF9AK+D3QADg0xvH9zKzAzAoWL16coShL17x5OAE0awZPPhlGBTVpkkgoIiJpk+lmn4OA/3P3xe6+GhgD7As0iZqBALYGvi7uYHe/z93z3D2vefPmGQ61ZB9+GH526wbz5kH79mFG8KefJhaSiMhGyXTy/wLoaGb1zcyAbsBHQD5wXLRPH2BchuOosNQVwSZNgnHjQmXQ556DXXaBs86CL79MOkoRkXgymvzd/R1Cx+50YFb0fvcBlwEXm9lnwObA8EzGsTFSVwSDsBjMCy+EPoABA+DRR+EPf4CLLoKEWqZERGIzd086hnLJy8vzgoKCpMPYwOefw3XXwUMPhW8EF10El1wS+gZERJJmZtPcPa/ods3w3UjbbQfDh4d+gcMOg+uvh+23D3MDfv456ehERIqn5J8mO+0UmocKCsLQ0EsvDc1B994Lq1cnHZ2IyPqU/NNszz3DGgFTpkDLltC/fygVcfLJYdJYKs0UFpGkKPlnSJcu8OaboXO4YcOwWHz37vD3v4O7ZgqLSLKU/DPILNQGmj49TBDbYgu46qpQM+j44zVTWESSo+RfCWrUgJNOgvnz4eCDYc4c2GQT2HHHpC
MTkVyl5F+J3nwT3n8/tP8vXBhWDvvgg6SjEpFcpORfSVJnCj/xBNx/PyxbBh07atUwEal8Sv6VpOhM4bPOCp3Am20
2021-04-20 18:43:03 +02:00
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"Sum_of_squared_distances = []\n",
2021-04-20 19:06:45 +02:00
"K = range(2,20)\n",
2021-04-20 18:43:03 +02:00
"for k in K:\n",
"    # Fixed seed so the elbow curve is the same after Restart & Run All.\n",
"    km = KMeans(n_clusters=k, max_iter=200, n_init=10, random_state=0)\n",
"    km.fit(text_tf)\n",
"    # inertia_ is the fitted model's sum of squared distances to the closest centroid.\n",
"    Sum_of_squared_distances.append(km.inertia_)\n",
"plt.plot(K, Sum_of_squared_distances, 'bx-')\n",
"plt.xlabel('k')\n",
"plt.ylabel('Sum_of_squared_distances')\n",
"plt.title('Elbow Method For Optimal k')\n",
"plt.show()"
]
},
{
"cell_type": "code",
2021-04-20 19:06:45 +02:00
"execution_count": 161,
2021-04-20 18:43:03 +02:00
"metadata": {},
"outputs": [
{
2021-04-20 19:06:45 +02:00
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAEWCAYAAAB8LwAVAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAAyoElEQVR4nO3dd5iU5dnG4d8FWLFgQWMXG1ETRQQUNSo2sIXYSzS22GKLGrESsRvsRiX22AXBglGxsWKJDSMWQCNRFFQUY1CwgOD9/fG8++2wDrszMMPszl7nccyx87aZ+92BuffpigjMzMzqa1XpAMzMrGlygjAzs7ycIMzMLC8nCDMzy8sJwszM8nKCMDOzvJwgrGCSDpb0fM52SFqrkjGVSinvRdJ4SduV4rUqTdJvJT1Rptd+RtLv53Csn6Q7y/G+VjgnCJtN9uX2naRpOY9rKh0X/H+CCklX1NvfO9v/9wJfZ45fTOUm6e+SZtT7/e5TotdeSNJFkj7KPsP3JJ0iSQVev3r2e2xTuy8i7oqIHUoRnzU/bRo/xVqgXSPiqUoHMQf/AfaWdEpEzMz2HQT8u4IxFat/RJw1txdLapNz77nuA34G7AS8A3QB7gBWAY6f2/ezlsslCJtXO0l6X9IXki6R1ApAUitJZ0n6UNLnkm6XtGR27DZJJ2fPV8r+aj0m215T0pe1r5PHJOAtoGd2/tLAZsDQ3JMkbSrpn5KmSHpD0tbZ/guAXwHX5CkdbZf91T1F0rW1f3k3dC/Z8QOzY/+VdObc/iIlHS5pXHb/QyWtmHMsJB0j6T3gvTzXbgvsAOwREW9HxMyIeAk4ADimtvosKz1dJOkVSV9Leij7HQI8m/2ckv1uus+hWvEP2e9pqqTzss/sn9nrDZK0YHbuUpL+IWmypP9lz1eei9/LApLukTSk9rVt/nCCsHm1G+kv1c5Ab+DQbP/B2aMHsAawGFD7ZTwC2Dp7vhXwPrBlzvZzEfFjA+95O/C77Pm+wEPA9NqDklYCHgHOB5YG/gQMkdQ+Is4EngOOjYjFIuLYnNfdBegKbADsTZaEGroXSesBA4ADgRWBZYC5+RLcBrgoe98VgA+Be+ud9htgE2C9PC+xPfByREzI3RkRLwMTgW1zdv+O9DmtAMwErs72134G7bLfzYtzCLcnsDGwKdAHuIGUiFYBfgHsl53XCrgVWA1YFfiOun8DBZG0CPAg6fPdOyJmFHO9zRsnCMvnweyv6NrH4Q2c+5eI+DIiPgKupO7L4bfA5RHxfkRMA04H9s3qt0cAW2SlhC2B/sDm2XVbZccb8gCwdfZX/O9ICSPXAcCjEfFoRPwYEU8CI0lVLw25OCKmZPdSA3Qq4F72BP4REc9GxHSgL9BQcgP4U87v9ouc97glIv6Vvc7pQHdJq+dcd1H2u/4uz2suC3w6h/f7NDte646slPFNFu/eklo3EnOu/hHxdUSMBt4Gnsh+N18BjwEbAUTEfyNiSER8GxFTgQtIn2+hlgCGkaoVD4mIWUVcayXgBGH5/CYi2uU8bmzg3Ny/WD8k/RVN9vPDesfaAMtHxH+Ab0hfwL8C/gF8IqkjBSSI7AvyEeAsYJmIeKHeKasBe+UmOWAL0l/MDZmU8/xbUkmhwXvJjv3/7yD70v1vI+9zac7vtvaLe7b3yBLRf4GVcq6brXRQzxfM+f5WyI7ne50PgQWYPYE05rOc59/l2V4MQNKikq7Pqt++JlVhtSsiGW1KKs1dHJ5VtCKcIGxerZLzfFXgk+z5J6Qv6txjM6n7MhlB+ut7wYj4ONs+CFgKGFXA+94OnAzk6wo5gfRXcm6SaxsRF2fHi/2yaehePiXndyBpUVI1U7Fmew9JbbPX+TjnnIbifgrYRFLu54GkTbL4hufsrv+Z/UBKIKX+Ej4Z6AhsEhFLUFeFVVCvKuAJUrXb05KWL3FsVgAnCJtXp2SNkasAJwADs/33ACdK6iBpMeBCYGBO75sRwLHUNY
w+k20/X2BVwghSvftf8xy7E9hVUk9JrSUtLGnrnAbSz0htCYVq6F4GA7tI2iJrQD2Xuft/dQ9wiKROkhbK3uPliBhfyMVZr7OnSW0t62f3vSnpdzEgInIbtg+QtF6WzM4FBme/88mk6rFifjcNWZxUopiSNYSfXewLRER/4G5SkiimlGMl4ARh+Tys2fvpP9DAuQ8Br5H+6n8EuDnbfwupi+WzwAfA98BxOdeNIH2B1CaI54FFc7YbFMnTEfFlnmMTSA3mZ5C+9CYAp1D37/0qYM+sZ83V9a/PY473ktXDH0P6EvsU+B+pUbgo2Rd8X2BI9jprkhrgi7EHqe1kGDCNlBxuZvbfO9m9/J1UpbYwWRfYiPiW1E7wQlY1t2mx91HPlcAipNLJS1lcRYuI80gN1U/l9Liy+UCu2jNrOSQ9A9wZETdVOhZr+lyCMDOzvJwgzMwsL1cxmZlZXi5BmJlZXs1+sr5ll102Vl999UqHYWbWrLz22mtfRET7hs5p9gli9dVXZ+TIkZUOw8ysWZH0YWPnuIrJzMzycoIwM7O8nCDMzCwvJwgzM8vLCcLMzPJqcQmif3+oqZl9X01N2m9mZnVaXILo2hX23rsuSdTUpO2uXSsbl5lZU9Psx0EUq0cPGDQI9twTOnaE995L2z16VDoyM7OmpcWVICAlg65d4cUXYemlYZNNKh2RmVnT0yITRE0NvPYa7Lgj/Pvf0K0bfPmTZWfMzFq2FpcgatscBg2CRx+Fs8+G0aOhc2eYWPQ6YGZm1avFJYhXX529zaFfP7jsMpg0CTbbDMaMqWh4ZmZNRrNfD6JLly5Risn6Ro1KVU7Tp8M//pGShZlZtZL0WkR0aeicFleCmJNOneCf/4RlloHttoOHH650RGZmlVX2BCGpnaTBkt6RNFZSd0kDJY3KHuMljco5/3RJ4yS9K6lnuePL1aEDvPACrL8+7LYb3Hrr/Hx3M7OmZX6UIK4ChkXEz4ENgbERsU9EdIqITsAQ4H4ASesB+wLrA72A6yS1ng8x/r/lloPhw2GbbeDQQ1O10/Dhs5/jkddm1hKUNUFIWhLYErgZICJmRMSUnOMC9gbuyXb1Bu6NiOkR8QEwDuhWzhjzWXzx1A6x//4wbBjsvDM8/XQ65pHXZtZSlLsE0QGYDNwq6XVJN0lqm3P8V8BnEfFetr0SMCHn+MRs32wkHSFppKSRkydPLkvgCy4Id9wBJ54I33+fksSZZ9Z1kfXIazOrduVOEG2AzsCAiNgI+AY4Lef4ftSVHgoWETdERJeI6NK+fYNLqs6TVq1SF9j+/VPvpgsvhKOPdnIws5ah3AliIjAxIl7OtgeTEgaS2gC7AwNzzv8YWCVne+VsX8VI0KULLLxwev7Xv/50Nlgzs2pU1gQREZOACZI6Zru2BWqHom0HvBMRueOXhwL7SlpIUgdgbeCVcsbYmNo2h4EDUwN2+/azzwZrZlat5sdsrscBd0laEHgfOCTbvy/1qpciYrSkQaQkMhM4JiJmzYcY5yh35PWMGbDXXnDkkWm/q5rMrJp5JHURItL4iCeegLfegjXXnC9va2ZWch5JXWISXHstLLAAHHFEShhmZtXKCaJIK62UejUNH+6R1mZW3Zwg5sLhh8OWW8LJJ6dZYM3MqpETxFxo1QpuuAG++w6OO67S0ZiZlYcTxFzq2BH+/GcYPBgefLDS0ZiZlZ4TxDw45RTYYAM45hj46qtKR2NmVlpOEPNggQXg5ptTO8Spp1Y6GjOz0nKCmEdduqQJ/a6/Hp59ttLRmJmVjhNECZxzTlps6PDD08yvZmbVwAmiBNq2Tb2a/v1vOO+8SkdjZlYaThAlst12sPHGcPHF8MYbdfu9+pyZNVdOECXUt2+afmPvvWHmTK8+Z2bNmxNECfXuDWedlaqaNt/cq8+ZWfPmBFFi55yTqppeeSWNuG7XrtIRmZnNHSeIEnvmGfjww1R6+OKL1A22b9+0ZKmZWXPiBFFCtW0OgwalFe
geeCANpjv//JQo5tOyFWZmJeEEUUK5q88B/PrX8NhjcPDB8OWXsOmmcMYZHithZs2DV5SbT6ZMSdOD33ILrLsubLM
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
2021-04-20 18:43:03 +02:00
}
],
"source": [
2021-04-20 19:06:45 +02:00
"Sum_of_squared_distances = []\n",
"K = range(2,30)\n",
"for k in K:\n",
"    # Fixed seed so the elbow curve is the same after Restart & Run All.\n",
"    km = KMeans(n_clusters=k, max_iter=200, n_init=10, random_state=0)\n",
"    km.fit(text_test_tf)\n",
"    # inertia_ is the fitted model's sum of squared distances to the closest centroid.\n",
"    Sum_of_squared_distances.append(km.inertia_)\n",
"plt.plot(K, Sum_of_squared_distances, 'bx-')\n",
"plt.xlabel('k')\n",
"plt.ylabel('Sum_of_squared_distances')\n",
"plt.title('Elbow Method For Optimal k')\n",
"plt.show()"
2021-04-20 18:43:03 +02:00
]
},
{
"cell_type": "code",
2021-04-20 19:15:41 +02:00
"execution_count": 179,
2021-04-20 18:43:03 +02:00
"metadata": {},
2021-04-20 19:06:45 +02:00
"outputs": [],
"source": [
"true_k_dev = 10\n",
"# Fixed seed: unseeded KMeans can assign different cluster ids on every run,\n",
"# which makes the labels written to the dev-0 output file non-reproducible.\n",
"model_dev = KMeans(n_clusters=true_k_dev, init='k-means++', max_iter=200, n_init=10, random_state=0)\n",
"model_dev.fit(text_tf)\n",
"labels_dev = model_dev.labels_\n",
"# One row per document, in input order, holding its cluster id.\n",
"clusters_dev = pd.DataFrame(list(labels_dev), columns=['cluster'])"
]
},
{
"cell_type": "code",
"execution_count": 162,
"metadata": {},
"outputs": [],
"source": [
"true_k_test = 28\n",
"# Fixed seed: unseeded KMeans can assign different cluster ids on every run,\n",
"# which makes the labels written to the test-A output file non-reproducible.\n",
"model_test = KMeans(n_clusters=true_k_test, init='k-means++', max_iter=200, n_init=10, random_state=0)\n",
"model_test.fit(text_test_tf)\n",
"labels_test = model_test.labels_\n",
"# One row per document, in input order, holding its cluster id.\n",
"clusters_test = pd.DataFrame(list(labels_test), columns=['cluster'])"
]
},
{
"cell_type": "code",
2021-04-20 19:15:41 +02:00
"execution_count": 180,
2021-04-20 19:06:45 +02:00
"metadata": {},
"outputs": [],
2021-04-20 18:43:03 +02:00
"source": [
2021-04-20 19:06:45 +02:00
"# Forward slash, like the 'dev-0/in.tsv' read path, so the file lands inside dev-0/ on every OS.\n",
"clusters_dev.to_csv(\"dev-0/out.tsv\", sep=\"\\t\",index=False,header=None)"
2021-04-20 18:43:03 +02:00
]
},
{
"cell_type": "code",
2021-04-20 19:06:45 +02:00
"execution_count": 163,
2021-04-20 18:43:03 +02:00
"metadata": {},
"outputs": [],
"source": [
2021-04-20 19:06:45 +02:00
"# Forward slash, like the 'test-A/in.tsv' read path, so the file lands inside test-A/ on every OS.\n",
"clusters_test.to_csv(\"test-A/out.tsv\", sep=\"\\t\",index=False,header=None)"
2021-04-20 18:43:03 +02:00
]
},
2021-04-20 19:11:45 +02:00
{
"cell_type": "code",
2021-04-20 19:15:41 +02:00
"execution_count": 181,
2021-04-20 19:11:45 +02:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>cluster</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
2021-04-20 19:15:41 +02:00
" <td>6</td>\n",
2021-04-20 19:11:45 +02:00
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
2021-04-20 19:15:41 +02:00
" <td>5</td>\n",
2021-04-20 19:11:45 +02:00
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
2021-04-20 19:15:41 +02:00
" <td>2</td>\n",
2021-04-20 19:11:45 +02:00
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
2021-04-20 19:15:41 +02:00
" <td>8</td>\n",
2021-04-20 19:11:45 +02:00
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
2021-04-20 19:15:41 +02:00
" <td>6</td>\n",
2021-04-20 19:11:45 +02:00
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>82</th>\n",
2021-04-20 19:15:41 +02:00
" <td>2</td>\n",
2021-04-20 19:11:45 +02:00
" </tr>\n",
" <tr>\n",
" <th>83</th>\n",
2021-04-20 19:15:41 +02:00
" <td>6</td>\n",
2021-04-20 19:11:45 +02:00
" </tr>\n",
" <tr>\n",
" <th>84</th>\n",
2021-04-20 19:15:41 +02:00
" <td>4</td>\n",
2021-04-20 19:11:45 +02:00
" </tr>\n",
" <tr>\n",
" <th>85</th>\n",
2021-04-20 19:15:41 +02:00
" <td>6</td>\n",
2021-04-20 19:11:45 +02:00
" </tr>\n",
" <tr>\n",
" <th>86</th>\n",
" <td>5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>87 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" cluster\n",
2021-04-20 19:15:41 +02:00
"0 6\n",
"1 5\n",
"2 2\n",
"3 8\n",
"4 6\n",
2021-04-20 19:11:45 +02:00
".. ...\n",
2021-04-20 19:15:41 +02:00
"82 2\n",
"83 6\n",
"84 4\n",
"85 6\n",
2021-04-20 19:11:45 +02:00
"86 5\n",
"\n",
"[87 rows x 1 columns]"
]
},
2021-04-20 19:15:41 +02:00
"execution_count": 181,
2021-04-20 19:11:45 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clusters_dev"
]
},
2021-04-20 18:43:03 +02:00
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}