{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from many_stop_words import get_stop_words\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from unidecode import unidecode\n",
    "from nltk.tokenize import word_tokenize\n",
    "import string\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.cluster import KMeans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the dev and test inputs as headerless TSV; the cells below work on column 0.\n",
    "data=pd.read_csv('dev-0/in.tsv', sep='\\t', header=None)\n",
    "data_test=pd.read_csv('test-A/in.tsv', sep='\\t', header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Translation table that deletes every character listed in string.punctuation.\n",
    "# Built once so it is not recomputed for every document.\n",
    "_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)\n",
    "\n",
    "\n",
    "def remove_punctuations(text):\n",
    "    \"\"\"Return `text` with every character of string.punctuation removed.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    text : str\n",
    "        The string to clean.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        A copy of `text` without punctuation characters; all other\n",
    "        characters, including whitespace, are kept as they are.\n",
    "    \"\"\"\n",
    "    return text.translate(_PUNCTUATION_TABLE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[0] = data[0].str.lower()\n",
    "data_test[0] = data_test[0].str.lower()\n",
    "# Stop-word list for the language code 'pl'.\n",
    "stop_words = get_stop_words('pl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 168,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Transliterate the documents with unidecode and apply the same transliteration\n",
    "# to the stop words, so both sides are compared in the same form.\n",
    "data[0] = data[0].apply(unidecode)\n",
    "data_test[0] = data_test[0].apply(unidecode)\n",
    "# A set (instead of a list) makes each `in` test in the stop-word filter constant-time.\n",
    "uni_stop_words = {unidecode(x) for x in stop_words}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[0] = data[0].apply(remove_punctuations)\n",
    "data_test[0] = data_test[0].apply(remove_punctuations)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 170,
   "metadata": {},
   "outputs": [],
   "source": [
    "def remove_stop_words(text):\n",
    "    \"\"\"Drop the whitespace-separated tokens of `text` that are in `uni_stop_words`.\n",
    "\n",
    "    The surviving tokens are re-joined with single spaces.\n",
    "    \"\"\"\n",
    "    return ' '.join(token for token in text.split() if token not in uni_stop_words)\n",
    "\n",
    "\n",
    "data[0] = data[0].apply(remove_stop_words)\n",
    "data_test[0] = data_test[0].apply(remove_stop_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dev and test are vectorised independently (each fit learns its own vocabulary and\n",
    "# IDF weights); the two matrices are only ever clustered separately further down.\n",
    "# A second vectoriser is used for test so that `tf` stays fitted to the dev data\n",
    "# and keeps matching `text_tf`.\n",
    "tf=TfidfVectorizer()\n",
    "text_tf= tf.fit_transform(data[0])\n",
    "tf_test=TfidfVectorizer()\n",
    "text_test_tf= tf_test.fit_transform(data_test[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0 opowiesc prawdziwa olsztyn akademik 7 pietro i...\n",
       "1 podejrzewam polowaniu mowy prostu znalazl mart...\n",
       "2 smutne przypomina historie balwankami wredny f...\n",
       "3 kumpla zdawal walentynki polozyl koperte laski...\n",
       "4 przypomniala krakowskich urban legends chyba n...\n",
       " ... \n",
       "82 wczoraj popoludniowej audycji trojce prowadzac...\n",
       "83 sluchajcie uwielbiam opowiadacv sluchac jakies...\n",
       "84 wczoraj probie koncertu czwartkowego akompania...\n",
       "85 zuzanna mala historia przyszla panna mloda kup...\n",
       "86 koszmar zaczyna niewinnego spotkania jednym to...\n",
       "Name: 0, Length: 87, dtype: object"
      ]
     },
     "execution_count": 174,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[0]"
   ]
  },
  { "cell_type": "code", "execution_count": 173, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ],
   "source": [
    "# Elbow method on the dev TF-IDF matrix: fit KMeans for every candidate k and\n",
    "# record the inertia (sum of squared distances to the closest centroid).\n",
    "Sum_of_squared_distances = []\n",
    "K = range(2,20)\n",
    "for k in K:\n",
    "    # Fixed random_state so the curve is the same on every run of this cell.\n",
    "    km = KMeans(n_clusters=k, max_iter=200, n_init=10, random_state=42)\n",
    "    km.fit(text_tf)\n",
    "    Sum_of_squared_distances.append(km.inertia_)\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "ax.plot(K, Sum_of_squared_distances, 'bx-')\n",
    "ax.set_xlabel('k')\n",
    "ax.set_ylabel('Sum_of_squared_distances')\n",
    "ax.set_title('Elbow Method For Optimal k')\n",
    "plt.show()"
   ]
  },
  { "cell_type": "code", "execution_count": 161, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ],
   "source": [
    "# Elbow method on the test TF-IDF matrix: fit KMeans for every candidate k and\n",
    "# record the inertia (sum of squared distances to the closest centroid).\n",
    "Sum_of_squared_distances = []\n",
    "K = range(2,30)\n",
    "for k in K:\n",
    "    # Fixed random_state so the curve is the same on every run of this cell.\n",
    "    km = KMeans(n_clusters=k, max_iter=200, n_init=10, random_state=42)\n",
    "    km.fit(text_test_tf)\n",
    "    Sum_of_squared_distances.append(km.inertia_)\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "ax.plot(K, Sum_of_squared_distances, 'bx-')\n",
    "ax.set_xlabel('k')\n",
    "ax.set_ylabel('Sum_of_squared_distances')\n",
    "ax.set_title('Elbow Method For Optimal k')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Final clustering of the dev set: one cluster id per input row, in row order.\n",
    "true_k_dev = 10\n",
    "# random_state pins the k-means++ initialisation so the labels are reproducible.\n",
    "model_dev = KMeans(n_clusters=true_k_dev, init='k-means++', max_iter=200, n_init=10, random_state=42)\n",
    "model_dev.fit(text_tf)\n",
    "labels_dev=model_dev.labels_\n",
    "clusters_dev=pd.DataFrame(list(labels_dev),columns=['cluster'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Final clustering of the test set: one cluster id per input row, in row order.\n",
    "true_k_test = 28\n",
    "# random_state pins the k-means++ initialisation so the labels are reproducible.\n",
    "model_test = KMeans(n_clusters=true_k_test, init='k-means++', max_iter=200, n_init=10, random_state=42)\n",
    "model_test.fit(text_test_tf)\n",
    "labels_test=model_test.labels_\n",
    "clusters_test=pd.DataFrame(list(labels_test),columns=['cluster'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 177,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Forward slash keeps the path portable (a backslash is a directory separator only\n",
    "# on Windows) and matches the 'dev-0/in.tsv' path used for reading.\n",
    "clusters_dev.to_csv(\"dev-0/out.tsv\", sep=\"\\t\",index=False,header=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Forward slash keeps the path portable (a backslash is a directory separator only\n",
    "# on Windows) and matches the 'test-A/in.tsv' path used for reading.\n",
    "clusters_test.to_csv(\"test-A/out.tsv\", sep=\"\\t\",index=False,header=False)"
   ]
  },
  { "cell_type": "code", "execution_count": 178, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
cluster
03
19
21
33
45
......
821
835
842
857
865
\n", "

87 rows × 1 columns

\n", "
" ], "text/plain": [ " cluster\n", "0 3\n", "1 9\n", "2 1\n", "3 3\n", "4 5\n", ".. ...\n", "82 1\n", "83 5\n", "84 2\n", "85 7\n", "86 5\n", "\n", "[87 rows x 1 columns]" ] }, "execution_count": 178, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clusters_dev" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }