diff --git a/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/.ipynb_checkpoints/Untitled-checkpoint.ipynb new file mode 100644 index 0000000..533e864 --- /dev/null +++ b/.ipynb_checkpoints/Untitled-checkpoint.ipynb @@ -0,0 +1,179 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from nltk.tokenize import RegexpTokenizer\n", + "from stop_words import get_stop_words\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "data=pd.read_csv('dev-0/in.tsv', sep='\\t', header=None)\n", + "expected_data=pd.read_csv('dev-0/expected.tsv', sep='\\t', header=None)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "data[0] = data[0].str.lower()\n", + "filtered_words = [word for word in data[0] if word not in get_stop_words('polish')]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "token = RegexpTokenizer(r'[a-zA-Z0-9]+')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "cv = CountVectorizer(lowercase=True,ngram_range = (1,1),tokenizer = token.tokenize)\n", + "text_counts= cv.fit_transform(data[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<1x5048 sparse matrix of type ''\n", + "\twith 234 stored elements in Compressed Sparse Row format>" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text_counts" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(\n", + " text_counts, expected_data[0], test_size=0.3, random_state=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MultinomialNB Accuracy: 0.6296296296296297\n" + ] + } + ], + "source": [ + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn import metrics\n", + "clf = MultinomialNB().fit(X_train, y_train)\n", + "predicted= clf.predict(X_test)\n", + "print(\"MultinomialNB Accuracy:\",metrics.accuracy_score(y_test, predicted))" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "tf=TfidfVectorizer()\n", + "text_tf= tf.fit_transform(filtered_words)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " text_tf, expected_data[0], test_size=0.3, random_state=123)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MultinomialNB Accuracy: 0.2222222222222222\n" + ] + } + ], + "source": [ + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn import metrics\n", + "clf = MultinomialNB().fit(X_train, y_train)\n", + "predicted= clf.predict(X_test)\n", + "print(\"MultinomialNB Accuracy:\",metrics.accuracy_score(y_test, predicted))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Untitled.ipynb b/Untitled.ipynb new file mode 100644 index 0000000..963a355 --- /dev/null +++ b/Untitled.ipynb @@ -0,0 +1,354 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 118, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from nltk.tokenize import RegexpTokenizer\n", + "from many_stop_words import get_stop_words\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from unidecode import unidecode\n", + "from nltk.tokenize import word_tokenize\n", + "import string" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [], + "source": [ + "data=pd.read_csv('dev-0/in.tsv', sep='\\t', header=None)\n", + "expected_data=pd.read_csv('dev-0/expected.tsv', sep='\\t', header=None)" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": {}, + "outputs": [], + "source": [ + "def remove_punctuations(text):\n", + " for punctuation in string.punctuation:\n", + " text = text.replace(punctuation, '')\n", + " return text" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": {}, + "outputs": [], + "source": [ + "data[0] = data[0].str.lower()\n", + "stop_words = get_stop_words('pl')" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [], + "source": [ + "data[0] = data[0].apply(unidecode)\n", + "uni_stop_words = [unidecode(x) for x in stop_words]" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 opowiesc prawdziwa... olsztyn, akademik, 7 pie...\n", + "1 ja podejrzewam ze o polowaniu nie bylo mowy, p...\n", + "2 smutne. przypomina mi to historie z balwankami...\n", + "3 mam kumpla ktory zdawal w walentynki i polozyl...\n", + "4 przypomniala mi sie jedna z krakowskich urban ...\n", + " ... \n", + "82 wczoraj w popoludniowej audycji w trojce prowa...\n", + "83 sluchajcie! uwielbiam opowiadacv i sluchac jak...\n", + "84 wczoraj na probie (do koncertu czwartkowego) n...\n", + "85 zuzanna mala byla taka jedna historia ze przys...\n", + "86 koszmar zaczyna sie od niewinnego spotkania w ...\n", + "Name: 0, Length: 87, dtype: object" + ] + }, + "execution_count": 123, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": {}, + "outputs": [], + "source": [ + "data[0] = data[0].apply(remove_punctuations)" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": {}, + "outputs": [], + "source": [ + "data[0] = data[0].apply(lambda x: ' '.join([item for item in x.split() if item not in uni_stop_words]))" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": {}, + "outputs": [], + "source": [ + "tf=TfidfVectorizer()\n", + "text_tf= tf.fit_transform(data[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<87x5203 sparse matrix of type ''\n", + "\twith 8407 stored elements in Compressed Sparse Row format>" + ] + }, + "execution_count": 127, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text_tf" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX8AAAEWCAYAAACOv5f1AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAAtPklEQVR4nO3debxV8/7H8deniUZJCUWRe7uGq9RJSSLdJFO4RDJkyhyJzD9xEZmHRCRlKCmZS+LIlHQaUMkQZYqboRmpPr8/vuvcdumc9q69z9rn7Pfz8diPc/ba0/vs+Ky1vus7mLsjIiK5pVzcAUREpOSp+IuI5CAVfxGRHKTiLyKSg1T8RURykIq/iEgOUvGXvzCz7mb2TsJ9N7Nd48yULun8W8xsnpn9Kx3vFTcz62Zm4zP03m+a2ZlFPNbXzJ7IxOdK8VT8c1RUuH4zs2UJt/vjzgX/2/m4md213vbO0fbHknyfIotOppnZY2a2cr3v9/g0vfcWZtbPzL6O/g0/N7PLzMySfH3D6HusULjN3Z9094PTkU9Khwobf4qUYUe4+4S4QxRhLtDFzC5z91XRtlOBz2LMlKr+7n7Npr7YzCok/O2JngG2Aw4F5gB5wOPAjkDPTf08yS068pdkHWpmX5rZT2Z2m5mVAzCzcmZ2jZnNN7P/mtkwM9sqemyomfWOfq8XHW2eH91vZGa/FL7PBvwAfAx0jJ5fC2gNvJD4JDNrZWbvmdkiM/vQzA6Mtt8E7A/cv4Gzmn9FR8uLzGxA4RFzcX9L9PjJ0WM/m9nVm/pFmtlZZvZF9Pe/YGY7JDzmZna+mX0OfL6B17YHDgb+7e4z3X2Vu78PnAScX9ikFZ319DOzD8xsiZk9H32HAG9FPxdF382+RTT1nRd9T0vN7D/Rv9l70fuNNLNK0XO3NrOXzGyhmf0a/V5/E76XimY23MxGF763ZI6KvyTraMIRZjOgM3B6tL17dGsH7AJUAwoL7UTgwOj3A4AvgbYJ99929zXFfOYw4JTo9xOA54E/Ch80s3rAy8CNQC3gUmC0mdVx96uBt4EL3L2au1+Q8L6HAy2AvYAuRDuY4v4WM9sdGAicDOwAbANsSoE7COgXfe72wHxgxHpPOwpoCey+gbfoAEx2928SN7r7ZOBboH3C5lMI/07bA6uAe6Pthf8GNaPvZlIRcTsCzYFWQB9gEGEnsyOwJ9A1el45YAjQANgJ+I21/w0kxcwqA88R/n27uPvKVF4vqVPxz23PRUe/hbezinnure7+i7t/DdzN2v/xuwF3uvuX7r4MuBI4IWpPngi0iY7u2wL9gf2i1x0QPV6cMcCB0dH3KYSdQaKTgFfc/RV3X+PurwEFhOaQ4tzi7ouivyUfaJrE33Is8JK7v+XufwDXAsXtuAAuTfhuf0r4jEfdfVr0PlcC+5pZw4TX9Yu+69828J61gQVFfN6C6PFCj0dnB8ujvF3MrPxGMifq7+5L3H0WMBMYH303i4GxwN4A7v6zu4929xXuvhS4ifDvm6wawDhCU99p7r46hdfKJlLxz21HuXvNhNvDxTw38UhzPuHol+jn/PUeqwDUdfe5wHJCcd0feAn43swak0Txj4rfy8A1wDbu/u56T2kAHJe4AwPaEI50i/NDwu8rCEf4xf4t0WP/+w6igvrzRj7n9oTvtrAor/MZ0U7mZ6BewuvWOapfz08U/fdtHz2+ofeZD1Rk3Z3DxvyY8PtvG7hfDcDMqpjZQ1GT2BJCs1LNFHY0rQhnYbe4ZposMSr+kqwdE37fCfg++v17QhFOfGwVawvFRMJRcyV3/y66fyqwNTAjic8dBvQGNtQd8BvC0W3iDqyqu98SPZ5qISnub1lAwndgZlUITT+pWuczzKxq9D7fJTynuNwTgJZmlvjvgZm1jPK9kbB5/X+zPwk7h3QX2N5AY6Clu9dgbbNSUr2PgPGEprDXzaxumrNJEVT8JVmXRRf2dgQuAp6Otg8HepnZzmZWDbgZeDqhl8pE4ALWXmR8M7r/TpKn9xMJ7dz3beCxJ4AjzKyjmZU3sy3N7MCEi40/Etruk1Xc3zIKONzM2kQXI29g0/7/GQ6cZmZNzWyL6DMmu/u8ZF4c9c56nXBtY4/o725F+C4GunviReKTzGz3aEd1AzAq+s4XEpqsUvluilOdcCawKLqofF2qb+Du/YGnCDuAVM5OZBOp+Oe2F23dfuhjinnu88BUwtH6y8DgaPujhG6GbwFfAb8DFya8biKhOBQW/3eAKgn3i+XB6+7+ywYe+4Zw8fkqQkH7BriMtf9d3wMcG/VAuXf9129AkX9L1O59PqFALQB+JVxgTUlUvK8FRkfv04hwMTsV/yZcqxgHLCMU/sGs+70T/S2PEZq5tiTqBuruKwjt8u9GzWWtUv071nM3UJlwVvF+lCtl7v4fwkXfCQk9kyRDTE1sImWPmb0JPOHuj8SdRbKTjvxFRHKQir+ISA5Ss4+ISA7Skb+ISA4qNRO71a5d2xs2bBh3DBGRUmXq1Kk/uXud9beXmuLfsGFDCgoK4o4hIlKqmNn8DW1Xs4+ISA5S8RcRyUEq/iIiOUjFX0QkB6n4i4jkoDJb/Pv3h/z8dbfl54ftIiK5rswW/xYtoEuXtTuA/Pxwv0WLeHOJiGSDUtPPP1Xt2sHIkXDUUXDAATBpUrjfrl3cyURE4ldmj/whFPoddoAXX4SttoJdd407kYhIdijTxT8/H376CTp2hLlzoXFjGDQINJediOS6Mlv8C9v4R46EcePgiSdg1So4+2zo0AHmzYs7oYhIfMps8Z8yZd02/m7dYOxYOPpomDwZ9twTBgyANWvizSkiEodSM59/Xl6ep2tit/nz4ayz4LXXwsXgwYOhUaO0vLWISFYxs6nunrf+9jJ75F+cBg3g1VfhkUdg+nT45z/h7rth9eq4k4mIlIycLP4AZnDGGTBrVmga6tUL2raFTz+NO5mISOblbPEvVL8+vPQSDB0Ks2dD06Zw2206CxCRsi3niz+Es4BTTgnFv2NH6NMHWrcO90VEyiIV/wTbbw9jxsBTT4VxAXvvDTffHLqIioiUJSr+6zGDrl3DtYAjj4Srr4aWLeGjj+JOJiKSPir+RahbF555Jty++Qby8uCGG+DPP+NOJiKy+VT8N+LYY0Pb/7HHwnXXhVlBp0+PO5WIyOZR8U9C7drhOsCYMfDDD2EHcM018McfcScTEdk0Kv4pOOqocBZw4olw003QvHmYRkJEpLRR8U9RrVowbFgYG/Drr9CqFVxxBfz+e9zJRESSp+K/iQ47LPQIOu00uPXW0C100qS4U4mIJEfFfzPUrBnmBxo3DpYvh/32g969YcWKuJOJiBRPxT8NOnaEmTOhRw+4805o0gTefjvuVCIiRVPxT5MaNeDBB+H118OI4AMOgJ49wxmBiEi2UfFPs4MOgo8/hgsugPvuC9NF5+fHnUpEZF0q/hlQrRrcey9MnAjlyoUdwrnnwtKlcScTEQlU/DOobdswJ1CvXvDQQ2HpyPHj404lIqLin3FVqoSLwO+8A5Urh4vDZ54JixfHnUxEcpmKfwlp3TrMCdSnDwwZAnvsAa+8EncqEclVGS/+ZtbLzGaZ2UwzG25mW5rZzmY22cy+MLOnzaxSpnNkg8qVw4CwSZNgq63CQLFTTw0jhUVESlJGi7+Z1QN6AnnuvidQHjgBuBW4y913BX4Fzshkjmyzzz4wbVpYK+DJJ2H33cNOYP1eQfn50L9/PBlFpGwriWafCkBlM6sAVAEWAAcBo6LHhwJHlUCOrLLFFnDjjfDBB7DttmG+oE6d4LnnwuP5+dClS5hBVEQk3TJa/N39O+B24GtC0V8MTAUWuXvh4ojfAvU29Hoz62FmBWZWsHDhwkxGjU2zZmFm0L59w0IxxxwT1g7o0gVGjoR27eJOKCJlUaabfbYGOgM7AzsAVYFDkn29uw9y9zx3z6tTp06GUsavUqWwUMz06WEFsdGjoU6dcFFYRCQTMt3s8y/gK3df6O5/As8C+wE1o2YggPrAdxnOUSr8/HOYGqJdO/jkE/jb32D4cHCPO5mIlDWZLv5fA63MrIqZGdAemA3kA8dGzzkVeD7DObJeYRv/yJHwxhvw6KNhdtATT4Sjj4YFC+JOKCJlSabb/CcTLuxOAz6OPm8QcDlwiZl9AWwDDM5kjtJgypR12/hPOw3GjoVDDw1TRu+xR7gorLMAEUkH81JSTfLy8rygoCDuGLH49FM4/XR4772wM3joIahfP+5UIlIamNlUd89bf3vSR/5mtp+ZVY1+P8nM7jSzBukMKRvWuDG89RbcfXdoHtpjDxg8WGcBIrLpUmn2GQisMLMmQG9gLjAsI6nkL8qXh4suCtNF7713mB/okEPg66/jTiYipVEqxX+VhzaizsD97j4AqJ6ZWFKURo3CBeEBA+Ddd8NZwIMPwpo1cScTkdIkleK/1MyuBE4GXjazckDFzMSS4pQrB+edF5aObNUqrBXwr3/Bl1/GnUxESotUiv/xwB/A6e7+A6F//m0ZSSVJadgwrA8waBAUFIRVw+67T2cBIrJxSRf/qOCPBraINv0EjMlEKEmeGZx1FsyaFRaP6dkTDjwQPv887mQiks1S6e1zFqHP/kPRpnrAcxnIJJtgxx3D+gBDhoTVw/baKywis3p13MlEJBul0uxzPmFqhiUA7v45sG0mQsmmMYPu3WH2bOjQAXr3hjZtYM6cuJOJSLZJpfj/4e4rC+9Ec/Oop3kW2mEHeP55eOIJ+OwzaNo0LCKzatVGXyoiOSKV4j/RzK4izM3fAXgGeDEzsWRzmUG3buFawGGHwRVXhKUkZ86MO5mIZINUiv8VwELCHD1nA68A12QilKTPdtvBqFHw9NPw1Vdh/YAbbwxrB4hI7kql+FcGHnX349z9WODRaJtkObMwY+js2WGxmGuvDUtJzpgRdzIRiUsqxf911i32lYEJ6Y0jmVSnDowYAc8+G6aIbtEC/u//YOXKjb9WRMqWVIr/lu6+rPBO9HuV9EeSTDv66HAW0LUr/Oc/kJcHU6fGnUpESlIqxX+5mTUrvGNmzYHf0h9JSkKtWmF9gBdfDCuItWwJV10Fv/8edzIRKQmpFP+LgWfM7G0zewd4GrggI6mkxBx+eOgRdOqp0K9fuCA8eXLcqUQk01KZ3mEK8A/gXOAcYDd3V2NBGVCzZlgfYNw4WLYsdAm99FL4Ted1ImVWqss4tgD2ApoBXc3slPRHkrh07BjGAZx1FtxxBzRpAu+8E3cqEcmEVOb2eRy4HWhD2Am0AP6yNJiUbjVqhPUBJkwIYwHatg2LyCxfHncyEUmnCik8Nw/Y3UvLor+yWdq3D6uGXXkl3HsvvPRSaBo68MC4k4lIOqTS7DMT2C5TQST7VKsW1geYODEMFGvXDs4/H5YujTuZiGyuVIp/bWC2mb1qZi8U3jIVTLJH27ZhmuhevWDgQNhpJ7j99nWfk58P/fvHk09EUpdKs0/fTIWQ7FelSlgf4Nhj4YQT4LLLQsF/6imYNi1MHzFyZNwpRSRZSRd/d5+YySBSOrRuDZ9+CqedFiaL22mn0CQ0ZkxoFhKR0iGV3j6tzGyKmS0zs5VmttrMlmQynGSnypXDHEGnnw5LlsDixWG+oBUr4k4mIslKpc3/fqAr8DlhUrczgQGZCCXZLz8fXnghrBNQuTLcfz/svbdGB4uUFikN8nL3L4Dy7r7a3YcAh2QmlmSz/Py1bfz9+sHLL8NWW8GiRaFZ6JprNFOoSLZL5YLvCjOrBMwws/7AAlIfISxlwJQpofAXtvG3axfa/N9+G+bNg5tuCjuEYcPgn/+MNaqIFMGSHbNlZg2AH4FKQC9gK2CAu8/NXLy18vLyvKCgoCQ+SjbTCy+EKSIWLQpTRvfuDeXLx51KJDeZ2VR3/8tsDKkcuR/l7r+7+xJ3v97dLwEOT19EKSuOPDLMEXTEEXD55XDAATC3RA4RRCRZqRT/UzewrXtxLzCzxmY2I+G2xMwuNrOmZvZ+tK3AzPZJKbVkvTp14Jln4Iknwo6gSZMwZ5AmBxHJDhst/mbW1cxeBHZOHNlrZm8CvxT3Wnf/1N2buntToDmwAhgD9Aeuj7b/X3Rfyhgz6NYtFP/WreHcc6FTJ/juu7iTiUgyF3zfI1zcrQ3ckbB9KfBRCp/VHpjr7vPNzIEa0fatgO9TeB8pZerXh1dfDUf+l14Ke+4JAwaEZSTN4k4nkptSueBbFfjN3deY2d8JC7uMdfc/k3z9o8A0d7/fzHYDXgWMcPbR2t3nb+A1PYAeADvttFPz+fP/8hQpZT7/PKwaNmlSmCpi4ECoXTvuVCJlVzou+L4FbGlm9YDxwMnAY0l+eCXgSOCZaNO5QC9335HQc2jwhl7n7oPcPc/d8+rUqZNCVMlWf/tb6BLarx88/3w4C3jppbhTieSeVIq/ufsK4BjgAXc/Dtgjydd2Ihz1/xjdPxV4Nvr9GUAXfHNI+fJhZHBBAdStG3oFnXlmmCpCREpGSsXfzPYFugEvR9uS7b3dFRiecP974IDo94MIU0ZIjtlrL/jgg7BgzJAh4f6bb8adSiQ3pFL8LwauBMa4+ywz2wXI39iLomsFHVh7pA9wFnCHmX0I3EzUri+5Z4st4Oabw1rBFSuG0cK9emnxeJFMS/qCb9w0wrfsW748DAobMAD+8Q94/HHI0yrRIptlky/4mtnd0c8X1+vnr5W8JK2qVg2zg44fH5aKbNUK+vYNC8mLSHol08//8ejn7cU+SyRNOnQIA8N69oTrrw+9gYYNg913jzuZSNmx0SN/d58a/Zy4oVvmI0ouqlkzFPzRo2H+fGjWLCwjuWZN3MlEyoZkmn0+NrOPirqVREjJXcccE84CDjkkzA7arh189VXcqURKv2R6+xwOHAGMi27dottY4JXMRRMJ6tYN6wUMGQLTp4cuoY88okniRDZHMs0+86OpFzq4ex93/zi6XQ4cnPmIImEOoO7d4eOPoUWLsF7AEUfAggVxJxMpnVId5LVfwp3WKb5eZLM1aAATJsA998Drr4fpIUaOjDuVSOmTSvE+A3jAzOaZ2TzgAeD0jKQSKUa5cqEn0PTp0KgRHH98mCH0l2InGBeRREkXf3ef6u5NgCZAk2ie/mmFj5vZhhZ7EcmYf/wD3nsvLBU5alQ4Cxg7Nu5UIqVDys027r7Y3Rdv4KGL0pBHJCUVKsA118DkyVCrFhx6KJxzDixbFncykeyWzjZ7LcshsWnWLMwSetllMGhQWDbynXfiTiWSvdJZ/NXxTmK15ZbQvz9MnBi6gbZtC336wE03Qf56UxDm54fniuQqHflLmbP//vDhh6E76G23hTEBxxyzdgeQnw9duoQuoyK5Kp3F/900vpfIZqleHR56CF5+Gf74IywUc/jhcPXVofCPHBlGC4vkqo1O6WxmlxT3uLvfmdZERdCUzrKpfvkFzj8fRowI9087DR59NN5MIiVlc9bwrR7d8ghr79aLbucAzdIZUiQTatWCHj2gRo1wRjBkSBgd/OuvcScTiU8y0ztc7+7XA/WBZu7e2917A82BnTIdUGRzFbbxP/ccfPcdHHtsmCa6USMYPlxzBEluSqXNvy6wMuH+ymibSFabMmVtG3/16vDMM/Dgg1ClCpx4YpgxdO7cuFOKlKxUiv8w4AMz62tmfYHJwNCMpBJJoz59/npx9+yzwzoB994LkyaF0cE33QQrV274PUTKmlSmd7gJOA34Nbqd5u43ZyqYSKaVLw8XXgiffBJ6Al1zDTRtCm+/HXcykcxLtatnFWCJu98DfGtmO2cgk0iJqlcvNAW99BKsWBEGh51xBvz8c9zJRDIn6eJvZtcBlwNXRpsqAk9kIpRIHA47DGbNCs1EQ4eGieOGDdMFYSmbUjnyPxo4ElgO4O7fE7qAipQZVavCrbfCtGmw665w6qnQvj18+mncyUTSK5Xiv9LDiDAHMLOqmYkkEr+99oJ33w29gqZNC/f79oXff487mUh6pFL8R5rZQ0BNMzsLmAA8nJlYIvErVy70CpozB/79b7j++rATeOONuJOJbL6kir+ZGfA0MAoYDTQG/s/d78tgNpGssN128NRT8OqrsGZNaAY6+WT473/jTiay6ZIq/lFzzyvu/pq7X+bul7r7axnOJpJVDj44LCB/9dXw9NPhgvAjj4Qdgkhpk0qzzzQz0yS4ktMqV4Ybb4QZM8LAsLPOggMOCL2EREqTVIp/S2CSmc01s4/M7GMz+yhTwUSy2e67w5tvwuDBMHt2GBx21VVhnIBIaZBK8e8INAIOAo4ADo9+iuSkcuXg9NPDBeFu3aBfv3A2MG5c3MlENi6V6R3mu/t84DdCd8//dfsUyWV16sBjj4VeQBUrQqdOcMIJsGBB3MlEipbKCN8jzexz4CtgIjAPGLuR1zQ2sxkJtyVmdnH02IVmNsfMZpmZVlOVUq9dO/joo9AldMyYcEF44EBdEJbslEqzz3+AVsBn7r4z0B54v7gXuPun7t7U3ZsS5v9fAYwxs3ZAZ6CJu+8B3L4p4UWyzRZbwP/9X+gVlJcH550HrVuHNYVFskkqxf9Pd/8ZKGdm5dw9n7C6V7LaA3OjpqNzgVvc/Q8Ad1ePaSlT/v53mDAhzA305ZfQvDlceiksWxZ3MpEgleK/yMyqAW8BT5rZPUTz/CTpBGB49Pvfgf3NbLKZTSyqC6mZ9TCzAjMrWLhwYQofJRI/szAYbM6csG7wHXfAHnvAiy/GnUwkteLfmXCxtxcwDphLkr19zKwSYVK4Z6JNFYBahGakywhTR9j6r3P3Qe6e5+55derUSSGqSPaoVQsefjisE1CtGhx5ZJgu4ttv404muSyV3j7L3X21u69y96Hufm/UDJSMTsA0d/8xuv8t8KwHHwBrgNqpRRcpXdq0genT4eab4ZVXYLfd4J57YPXquJNJLkqlt8/SqLfOEjP73cxWm9mSJF/elbVNPgDPAe2i9/07UAn4KdksIqVVpUpw5ZVhRPB++8HFF8M++8DUqXEnk1yTypF/dXev4e41gMrAv4EHNva6aOrnDsCzCZsfBXYxs5nACODUaP4gkZywyy4wdiyMGAHffx92ABddBDfcAPn56z43Px/6qzO0pJltTs01s+nuvnca8xQpLy/PCwoKSuKjRErUokVhaogHHwzXB/78M4wTOOigUPi7dIGRI/+6CL1IMsxsqrv/pWdmhRTe4JiEu+UI3Ty1tIXIZqpZEx54IKwa1qNHGCjWqROcc06YSlqFXzIhld4+RyTcOgJLCT2ARCQNWraEggK47bYwKvjee6FjRxV+yYykj/zd/bRMBhGRMDdQ8+ZQvTqULw9PPglbbx12BH/tDC2y6VJp9rm3uMfdvefmxxHJbYVt/KNHh53AIYfA/ffD3LnhOsAWW8SdUMqKVJp9tgSaAZ9Ht6aELppTo5uIbKYpU9a28deoAe+8E6aLHjs2XAD+4Ye4E0pZkXRvHzN7H2jj7qui+xWBt929VQbz/Y96+0guGzkSuneHbbaB554LZwUiySiqt08qR/5bAzUS7leLtolIhnXpAu++G9r927QJ4wNENkcqxf8WYLqZPWZmQ4FpwM2ZiSUi69t779As1Lw5dO0axgZorQDZVKmM8B1CWMd3DGG07r7uPjRTwUTkr+rWDSuGnXlmWDayc2dYkuwkKyIJUpnbZz9gqbs/D1QH+phZg4wlE5ENqlQJBg2C++4LF4JbtYIvvog7lZQ2qTT7DARWmFkT4BLClM7DMpJKRIplBhdcAOPHw48/hrmBJkyIO5WUJqkU/1XR5GudgQHuPoBwBiAiMTnooHAdYIcdwmjge+4BTZEoyUil+C81syuBk4CXzawcUDEzsUQkWbvsApMmwRFHhCmizzgD/vgj7lSS7VIp/scDfwBnuPsPQH3gtoykEpGUVK8Ozz4L114LQ4aEQWIaECbFSaW3zw/ufqe7vx3d/9rd/9fmb2aTMhFQRJJTrlxYD2DkSPjwQ8jLCxPFiWxIKkf+G7NlGt9LRDbRcceFAWHly8P++4dpoUXWl87ir8tMIlmiadNwIbhFizA30BVXaK1gWVc6i7+IZJFttw3dP3v0gFtvDQPCFi+OO5Vki40WfzNLdhJZzTYukmUqVQrLQw4YAOPGhQFhn38edyrJBskc+U8CMLPHN/K8kzc/joikmxmcdx689hosXBgGhI0fH3cqiVsyxb+SmZ0ItDazY9a/FT7J3WdmLqaIbK527cJ1gB13DGsE33WXBoTlsmRW8joH6AbUJKzfm8gJk7yJSCmw887w3ntwyilwySVhsfgHH9QKYbloo8Xf3d8B3jGzAncfXAKZRCSDqlWDUaPCmIDrr4c5c8IAse23jzuZlKRUevs8bmY9zWxUdLswWs1LREqZcuWgb9+wE/joozAgbMqUuFNJSUql+D8ANI9+PkBYz3dgJkKJSMn4979DM1DFimFA2JNPxp1ISkoybf6FWrh7k4T7b5jZh+kOJCIlq0mTcNR/7LFw0knhTODmm8MIYSm7UjnyX21mjQrvmNkugMYMipQBdeqErqDnnAP9+8ORR2pAWFmXypH/ZUC+mX1JGNDVADgtI6lEpMRVqgQDB8Jee0HPntCyJbzwAvz973Enk0xIZVbP14G/AT2BC4HG7p5f+LiZdUh/PBEpaeeeG6aF+PnnMCDs1VfjTiSZkNLcPu7+h7t/FN3WXy7i1jTmEpEYHXBAuA7QoAEceijceacGhJU16ZzY7S9z+5hZYzObkXBbYmYXJzze28zczGqnMYeIpEHDhmFq6KOPht69oXt3+P33uFNJumR0Smd3/9Tdm7p7U0I30RXAGAAz2xE4GPg6jRlEJI2qVQuLw/TtC8OGwYEHwvffx51K0qEkp3RuD8x19/nR/buAPmgdAJGsVq4cXHcdjB4NM2eGNQI0IKz0S2fxn7eRx08AhgOYWWfgO3cvdpyAmfUwswIzK1i4cGF6UorIJjnmmDAgrFKlMCDsiSfiTiSbwzzJqzhmVh44DGhIQhdRd78ziddWAr4H9gCWAvnAwe6+2MzmAXnu/lNx75GXl+cFWpBUJHY//RQGhE2cCF26hGUiCweE5eeHs4I+feLNKGuZ2VR3z1t/eypH/i8C3YFtgOoJt2R0Aqa5+49AI2Bn4MOo8NcHppnZdilkEZGY1K4dBoR17hyuB+y7LyxaFAp/ly6hWUiyXyqDvOq7+16b+DldiZp83P1jYNvCB5I98heR7FGxIjz3XJgW+q67oFEjWLMmzA7arl3c6SQZqRz5jzWzg1P9ADOrCnRA8/6LlDl33hnWBvjll3D0P2JE+F2yXyrF/31gjJn9FvXXX2pmSzb2Indf7u7buPsGZwpx94Y66hcpnfLz4ZVXQht/5crw8MPwj3/A0KEaFJbtUin+dwL7AlXcvYa7V3f3GhnKJSJZrrCNf+RIuPVWePll2GqrcE2ge/cwJmD27LhTSlFSKf7fADM92e5BIlKmTZkSCn9hG3+7dqHN/9RTYdAg+PjjMF30FVfA8uXxZpW/SqWr52PALsBY4H/z+iTT1TMd1NVTpHRZuDA0Bz32WJgj6N57w1TRUrLS0dXzK+B1oBKpd/UUkRxTpw4MGQJvvRWmiejcGY46Cr7WhC5ZIemunu5+fSaDiEjZtP/+MH166BJ6/fWw225huohevUKXUYlH0kf+ZpZvZm+sf8tkOBEpGypWDE1As2dDhw5w+eXQtGk4K5B4pNLscylhNa/LgGuBGYAa4UUkaQ0ahMFhL7wQLgIfcEDoGaSpu0peKit5TU24vevulwAHZi6aiJRVRxwBs2aFnkBPPgmNG4ceQmvWxJ0sd6TS7FMr4VbbzA4BtspgNhEpw6pWhX794MMP4Z//hLPPhjZtwn3JvFSafaYSmnkKgPeAS4AzMhFKRHLH7rvDm2+GUcFffAHNm4c5g5YujTtZ2bbR4m9mLcxsO3ff2d13Aa4H5kQ3jd8Tkc1mFuYImjMHzjwT7r47TBMxapSmiciUZI78HwJWAphZW6AfMBRYDAzKXDQRyTW1asGDD4ZFY7bdFo47LiwgP3du3MnKnmSKf3l3L5yn73hgkLuPdvdrgV0zF01EclWrVmH6iLvvDovI77EH3HAD/PHHRl8qSUqq+JtZ4WCw9kBi3/5U1gMQEUlahQpw0UXwySdhdPB114ULwxMmxJ2sbEim+A8HJprZ88BvwNsAZrYroelHRCRj6tWDp5+GV18NXUE7dICuXWHBgriTlW4bLf7ufhPQG3gMaJMwq2c54MLMRRMRWevgg2HmzHAG8Oyz4YLw/ffD6tVxJyudkurq6e7vu/sYd1+esO0zd5+WuWgiIuvackvo2zfsBFq2hAsvDD+nTIk7WemTSj9/EZGs8Le/hWagESPg++/DDuD888NSkpIcFX8RKZXM4Pjjw9iACy8MXUQbN4YnntDYgGSo+ItIqVajBtxzT2j6adgQTj4Z2rcPOwUpmoq/iJQJzZqFwWEDB4b1A/baC665Bn77Le5k2UnFX0TKjPLl4ZxzwlH/CSfATTeFAWKvvBJ3suyj4i8iZU7dujBsGOTnhx5Chx0WBog9/fS6z8vPh/7948kYNxV/ESmzDjwQZsyAm2+Gzz4LZwPnngt//hkKf5cu0KJF3CnjYV5KLovn5eV5QYEWDhORTfPVV3DiifD++2ECuZUrw2CxDh3iTpZZZjbV3fPW364jfxHJCTvvHC4IH388/PILLFsWFpAZNCg3J4xT8ReRnPHmm/D666EXUI0aUKlS2AHsumuYKiKXegap+ItITihs4x85Ev7zn7CQ/M8/w623hoXlL7wQdtkF7rgjLC5f1qn4i0hOmDIlFP527cL9du3CfYC33w47h913h0svDYPF+vWDJUtii5txuuArIpLgvffCmcG4cbD11mFNgZ49w++lUSwXfM2ssZnNSLgtMbOLzew2M5tjZh+Z2Rgzq5nJHCIiyWrdGsaOhQ8+gLZtwyyiDRrAVVfBTz/FnS59Mlr83f1Td2/q7k2B5sAKYAzwGrCnu+8FfAZcmckcIiKpatEiXBeYMQMOOQRuuSXsBC69FH74Ie50m68k2/zbA3Pdfb67j3f3VdH294H6JZhDRCRpTZqEawOzZsExx8Bdd4Vuoz17wrffxp1u05Vk8T+BsCTk+k4HxpZgDhGRlO22Gzz+OHz6aRgsNnAgNGoU5hKaNy/udKkrkeJvZpWAI4Fn1tt+NbAKeLKI1/UwswIzK1i4cGHmg4qIbMSuu8LgwfD553D66TBkSFhc5vTTw7bSoqSO/DsB09z9x8INZtYdOBzo5kV0OXL3Qe6e5+55derUKZmkIiJJaNgwHP1/+WVYRWz48LCucLduMHt23Ok2rqSKf1cSmnzM7BCgD3Cku68ooQwiImlXrx7cfXdo+undG55/HvbcE447Dj78MO50Rct48TezqkAH4NmEzfcD1YHXoi6gD2Y6h4hIJtWtG6aHnjcvdAsdPx6aNoXOnbNzgfmMF393X+7u27j74oRtu7r7joXdQN39nEznEBEpCbVrw403wvz5cMMNYfTwPvuE7qLvvht3urU0vYOISAbUrAnXXht2ArfcAtOmQZs2cNBBYSqJuCdXUPEXEcmg6tXh8svDegJ33hmWmDzoINh//zCFRFw7ARV/EZESULUq9OoVegcNGABffw2dOkHLlvDCCyW/E1DxFxEpQVtuCeedB198AQ8/HKaV7twZ9t4bRo2CNWtKJoeKv4hIDCpVgjPPDCOGhw6F338P3UP33BO6doUJE9Z9froXm1fxFxGJUYUKcMopYe6gESOgfPnws2NH6NMnc4vNaz5/EZEssmZNGCjWp09oGtpqKzALi80XLkSTCi3gLiJSCpQrB0cfDZ99BiecAIsXQ48em1b4i/2c9L6diIikw5tvhnb/a6+FRx8NTT/ppOIvIpJlEhebv+GG8LNLl/TuAFT8RUSyTFGLzadzjiBd8BURKcN0wVdERP5HxV9EJAep+IuI5CAVfxGRHKTiLyKSg0pNbx8zWwjM38SX1wZ+SmOcTCtNeUtTVihdeZU1c0pT3s3N2sDd66y/sdQU/81hZgUb6uqUrUpT3tKUFUpXXmXNnNKUN1NZ1ewjIpKDVPxFRHJQrhT/QXEHSFFpyluaskLpyqusmVOa8mYka060+YuIyLpy5chfREQSqPiLiOSgMl38zWxHM8s3s9lmNsvMLoo7U1HMbEsz+8DMPoyyXh93po0xs/JmNt3MXoo7y8aY2Twz+9jMZphZ1k8Pa2Y1zWyUmc0xs0/MbN+4M22ImTWOvtPC2xIzuzjuXEUxs17R/18zzWy4mW0Zd6bimNlFUdZZ6f5ey3Sbv5ltD2zv7tPMrDowFTjK3WfHHO0vzMyAqu6+zMwqAu8AF7n7+zFHK5KZXQLkATXc/fC48xTHzOYBee5eKgb2mNlQ4G13f8TMKgFV3H1RzLGKZWblge+Alu6+qQMyM8bM6hH+v9rd3X8zs5HAK+7+WLzJNszM9gRGAPsAK4FxwDnu/kU63r9MH/m7+wJ3nxb9vhT4BKgXb6oN82BZdLdidMvaPbOZ1QcOAx6JO0tZY2ZbAW2BwQDuvjLbC3+kPTA3Gwt/ggpAZTOrAFQBvo85T3F2Aya7+wp3XwVMBI5J15uX6eKfyMwaAnsDk2OOUqSoGWUG8F/gNXfP2qzA3UAfYE3MOZLlwHgzm2pmPeIOsxE7AwuBIVGz2iNmVjXuUEk4ARged4iiuPt3wO3A18ACYLG7j483VbFmAvub2TZmVgU4FNgxXW+eE8XfzKoBo4GL3X1J3HmK4u6r3b0pUB/YJzrtyzpmdjjwX3efGneWFLRx92ZAJ+B8M2sbd6BiVACaAQPdfW9gOXBFvJGKFzVNHQk8E3eWopjZ1kBnws51B6CqmZ0Ub6qiufsnwK3AeEKTzwxgdbrev8wX/6j9fDTwpLs/G3eeZESn+PnAITFHKcp+wJFRO/oI4CAzeyLeSMWLjvpw9/8CYwjtqNnqW+DbhDO/UYSdQTbrBExz9x/jDlKMfwFfuftCd/8TeBZoHXOmYrn7YHdv7u5tgV+Bz9L13mW6+EcXUQcDn7j7nXHnKY6Z1TGzmtHvlYEOwJxYQxXB3a909/ru3pBwqv+Gu2ftEZSZVY0u+BM1nxxMOKXOSu7+A/CNmTWONrUHsq6Twnq6ksVNPpGvgVZmViWqDe0J1wGzlpltG/3cidDe/1S63rtCut4oS+0HnAx8HLWlA1zl7q/EF6lI2wNDox4T5YCR7p71XShLibrAmPD/OxWAp9x9XLyRNupC4MmoOeVL4LSY8xQp2qF2AM6OO0tx3H2ymY0CpgGrgOlk/zQPo81sG+BP4Px0Xvgv0109RURkw8p0s4+IiGyYir+ISA5S8RcRyUEq/iIiOUjFX0QkB6n4i2wiM2toZlk7XkCkOCr+IiI5SMVfJA3MbJdoErYWcWcRSUZZH+ErknHRNAwjgO7u/mHceUSSoeIvsnnqAM8Dx2TjIkEiRVGzj8jmWUyYMKxN3EFEUqEjf5HNsxI4GnjVzJa5e9pmXRTJJBV/kc3k7sujBW5ei3YAL8SdSWRjNKuniEgOUpu/iEgOUvEXEclBKv4iIjlIxV9EJAep+IuI5CAVfxGRHKTiLyKSg/4fqToEXnp1tHgAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "from sklearn.cluster import KMeans\n", + "Sum_of_squared_distances = []\n", + "K = range(2,10)\n", + "for k in K:\n", + " km = KMeans(n_clusters=k, max_iter=200, n_init=10)\n", + " km = km.fit(text_tf)\n", + " Sum_of_squared_distances.append(km.inertia_)\n", + "plt.plot(K, Sum_of_squared_distances, 'bx-')\n", + "plt.xlabel('k')\n", + "plt.ylabel('Sum_of_squared_distances')\n", + "plt.title('Elbow Method For Optimal k')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " cluster\n", + "50 0\n", + "81 0\n", + "24 0\n", + "7 0\n", + "75 0\n", + ".. ...\n", + "55 8\n", + "3 9\n", + "39 9\n", + "78 9\n", + "43 9\n", + "\n", + "[87 rows x 1 columns]\n" + ] + } + ], + "source": [ + "true_k = 10\n", + "model = KMeans(n_clusters=true_k, init='k-means++', max_iter=200, n_init=10)\n", + "model.fit(text_tf)\n", + "labels=model.labels_\n", + "clusters=pd.DataFrame(list(labels),columns=['cluster'])\n", + "print(clusters.sort_values(by=['cluster']))" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cluster
01
17
23
39
44
......
823
834
842
857
864
\n", + "

87 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " cluster\n", + "0 1\n", + "1 7\n", + "2 3\n", + "3 9\n", + "4 4\n", + ".. ...\n", + "82 3\n", + "83 4\n", + "84 2\n", + "85 7\n", + "86 4\n", + "\n", + "[87 rows x 1 columns]" + ] + }, + "execution_count": 130, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clusters" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": {}, + "outputs": [], + "source": [ + "clusters.to_csv(\"dev-0\\out.tsv\", sep=\"\\t\",index=False,header=None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/dev-0/out.tsv b/dev-0/out.tsv new file mode 100644 index 0000000..c42233b --- /dev/null +++ b/dev-0/out.tsv @@ -0,0 +1,87 @@ +1 +7 +3 +9 +4 +6 +2 +0 +6 +3 +0 +6 +7 +4 +7 +2 +7 +7 +3 +4 +8 +4 +4 +8 +0 +4 +5 +4 +4 +7 +2 +2 +2 +4 +7 +2 +7 +4 +5 +9 +6 +1 +2 +9 +1 +3 +2 +7 +5 +2 +0 +3 +2 +4 +1 +8 +7 +7 +2 +3 +2 +7 +2 +2 +6 +4 +2 +1 +3 +2 +4 +3 +1 +2 +7 +0 +0 +1 +9 +4 +3 +0 +3 +4 +2 +7 +4