Compare commits

...

5 Commits

Author     SHA1         Message         Date
bednarco   296fe0638e   final           2021-04-20 19:15:41 +02:00
bednarco   b17760162b   updated         2021-04-20 19:13:31 +02:00
bednarco   b7150f138d   updated         2021-04-20 19:11:45 +02:00
bednarco   6194b5dd46   test-A, script  2021-04-20 19:06:45 +02:00
bednarco   960a201fb5   out             2021-04-20 18:43:03 +02:00
5 changed files with 1399 additions and 0 deletions


@@ -0,0 +1,179 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from nltk.tokenize import RegexpTokenizer\n",
"from stop_words import get_stop_words\n",
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"data=pd.read_csv('dev-0/in.tsv', sep='\\t', header=None)\n",
"expected_data=pd.read_csv('dev-0/expected.tsv', sep='\\t', header=None)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"data[0] = data[0].str.lower()\n",
"filtered_words = [word for word in data[0] if word not in get_stop_words('polish')]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"token = RegexpTokenizer(r'[a-zA-Z0-9]+')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"cv = CountVectorizer(lowercase=True,ngram_range = (1,1),tokenizer = token.tokenize)\n",
"text_counts= cv.fit_transform(data[0])"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<1x5048 sparse matrix of type '<class 'numpy.int64'>'\n",
"\twith 234 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text_counts"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(\n",
" text_counts, expected_data[0], test_size=0.3, random_state=1)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MultinomialNB Accuracy: 0.6296296296296297\n"
]
}
],
"source": [
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn import metrics\n",
"clf = MultinomialNB().fit(X_train, y_train)\n",
"predicted= clf.predict(X_test)\n",
"print(\"MultinomialNB Accuracy:\",metrics.accuracy_score(y_test, predicted))"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"tf=TfidfVectorizer()\n",
"text_tf= tf.fit_transform(filtered_words)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" text_tf, expected_data[0], test_size=0.3, random_state=123)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MultinomialNB Accuracy: 0.2222222222222222\n"
]
}
],
"source": [
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn import metrics\n",
"clf = MultinomialNB().fit(X_train, y_train)\n",
"predicted= clf.predict(X_test)\n",
"print(\"MultinomialNB Accuracy:\",metrics.accuracy_score(y_test, predicted))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
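
The two MultinomialNB cells above differ both in their features (raw counts vs TF-IDF) and in their split seeds, which makes the accuracy drop from 0.63 to 0.22 hard to attribute. Below is a minimal sketch of a like-for-like comparison, fitting each vectorizer on the training rows only and reusing one split for both; the paths are the ones the notebook reads, and the variable names are illustrative, not part of the commit:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Same files the notebook reads: one document per row, labels row-aligned.
docs = pd.read_csv('dev-0/in.tsv', sep='\t', header=None)[0].str.lower()
labels = pd.read_csv('dev-0/expected.tsv', sep='\t', header=None)[0]

# Split once on row indices so both feature sets see identical train/test rows.
train_idx, test_idx = train_test_split(docs.index, test_size=0.3, random_state=1)

for name, vec in [('counts', CountVectorizer()), ('tf-idf', TfidfVectorizer())]:
    X_train = vec.fit_transform(docs[train_idx])  # vocabulary from training rows only
    X_test = vec.transform(docs[test_idx])        # same vocabulary for held-out rows
    clf = MultinomialNB().fit(X_train, labels[train_idx])
    print(name, accuracy_score(labels[test_idx], clf.predict(X_test)))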

367 Untitled.ipynb Normal file

File diff suppressed because one or more lines are too long

87 dev-0/out.tsv Normal file

@@ -0,0 +1,87 @@
1
7
3
9
4
6
2
0
6
3
0
6
7
4
7
2
7
7
3
4
8
4
4
8
0
4
5
4
4
7
2
2
2
4
7
2
7
4
5
9
6
1
2
9
1
3
2
7
5
2
0
3
2
4
1
8
7
7
2
3
2
7
2
2
6
4
2
1
3
2
4
3
1
2
7
0
0
1
9
4
3
0
3
4
2
7
4

75 script.py Normal file

@@ -0,0 +1,75 @@
import pandas as pd
from many_stop_words import get_stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from unidecode import unidecode
import string
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Load the dev and test documents: one document per row, no header.
data = pd.read_csv('dev-0/in.tsv', sep='\t', header=None)
data_test = pd.read_csv('test-A/in.tsv', sep='\t', header=None)
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

# Normalise both splits: lowercase, strip Polish diacritics, drop punctuation and stop words.
data[0] = data[0].str.lower()
data_test[0] = data_test[0].str.lower()
stop_words = get_stop_words('pl')
data[0] = data[0].apply(unidecode)
data_test[0] = data_test[0].apply(unidecode)
# Unidecode the stop words too, so they match the already-transliterated documents.
uni_stop_words = [unidecode(x) for x in stop_words]
data[0] = data[0].apply(remove_punctuations)
data_test[0] = data_test[0].apply(remove_punctuations)
data[0] = data[0].apply(lambda x: ' '.join(item for item in x.split() if item not in uni_stop_words))
data_test[0] = data_test[0].apply(lambda x: ' '.join(item for item in x.split() if item not in uni_stop_words))
# TF-IDF features. Each split is vectorized independently, so the second
# fit_transform refits the vectorizer and the two matrices use different vocabularies.
tf = TfidfVectorizer()
text_tf = tf.fit_transform(data[0])
text_test_tf = tf.fit_transform(data_test[0])
# Elbow method on the dev split.
sum_of_squared_distances = []
K = range(2, 20)
for k in K:
    km = KMeans(n_clusters=k, max_iter=200, n_init=10).fit(text_tf)
    sum_of_squared_distances.append(km.inertia_)
plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum of squared distances')
plt.title('Elbow Method For Optimal k')
plt.show()

# Elbow method on the test split.
sum_of_squared_distances = []
K = range(2, 30)
for k in K:
    km = KMeans(n_clusters=k, max_iter=200, n_init=10).fit(text_test_tf)
    sum_of_squared_distances.append(km.inertia_)
plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum of squared distances')
plt.title('Elbow Method For Optimal k')
plt.show()
# Final clustering with k chosen from the elbow plots.
true_k_dev = 10
model_dev = KMeans(n_clusters=true_k_dev, init='k-means++', max_iter=200, n_init=10)
model_dev.fit(text_tf)
labels_dev = model_dev.labels_
clusters_dev = pd.DataFrame(list(labels_dev), columns=['cluster'])

true_k_test = 28
model_test = KMeans(n_clusters=true_k_test, init='k-means++', max_iter=200, n_init=10)
model_test.fit(text_test_tf)
labels_test = model_test.labels_
clusters_test = pd.DataFrame(list(labels_test), columns=['cluster'])

# Forward slashes keep the output files inside dev-0/ and test-A/ on any OS.
clusters_dev.to_csv("dev-0/out.tsv", sep="\t", index=False, header=False)
clusters_test.to_csv("test-A/out.tsv", sep="\t", index=False, header=False)
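
As a quick sanity check on the chosen k, here is a short follow-up sketch (not part of the commit) that prints the highest-weighted vocabulary terms of each dev cluster. It fits a dedicated vectorizer on the dev split, since tf above was last refit on the test data, and it assumes scikit-learn >= 1.0 for get_feature_names_out:

# Continues script.py: reuses the preprocessed `data` frame and `true_k_dev`.
tf_dev = TfidfVectorizer()
X_dev = tf_dev.fit_transform(data[0])
km = KMeans(n_clusters=true_k_dev, init='k-means++', max_iter=200, n_init=10).fit(X_dev)

# Rank terms by each cluster centre's TF-IDF weight and show the top ten.
terms = tf_dev.get_feature_names_out()
order = km.cluster_centers_.argsort()[:, ::-1]
for i in range(km.n_clusters):
    print(i, [terms[j] for j in order[i, :10]])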

691 test-A/out.tsv Normal file

@@ -0,0 +1,691 @@
27
17
3
19
1
7
27
10
19
1
2
20
15
22
12
1
1
11
1
12
10
15
7
22
25
17
19
13
10
1
4
5
7
6
8
2
20
19
3
27
21
23
1
15
25
21
0
11
3
12
3
24
19
22
9
23
19
3
16
24
21
1
25
17
12
6
22
7
0
12
9
8
1
1
11
19
27
12
21
2
9
26
18
2
17
20
19
19
17
21
22
9
8
17
1
1
27
25
27
14
25
15
1
13
20
0
7
20
11
17
15
3
12
3
20
17
17
12
11
19
11
10
16
21
19
3
1
23
15
23
9
8
21
23
16
8
4
19
18
4
27
10
11
4
8
19
17
4
19
23
1
1
17
12
22
20
1
14
1
15
22
17
4
11
9
20
18
22
8
8
2
19
14
20
1
18
19
16
23
2
26
11
5
1
10
10
10
18
10
9
27
8
20
19
14
14
19
3
19
27
21
24
27
25
1
1
3
11
17
27
15
1
12
7
14
20
12
7
16
10
12
0
9
17
18
8
22
13
18
20
0
13
23
9
7
25
8
22
7
19
27
12
6
13
19
16
9
9
21
11
0
2
26
15
24
18
5
1
22
11
23
15
12
13
4
13
4
2
24
11
24
10
9
19
7
1
25
15
11
1
19
9
23
11
15
27
11
3
1
7
27
0
22
2
9
9
1
27
1
13
25
11
12
9
2
16
19
7
17
2
17
9
6
1
18
2
9
4
5
24
21
18
15
17
21
21
17
7
11
25
7
19
19
23
24
3
19
6
12
19
17
21
15
12
22
11
1
20
0
0
22
7
9
15
1
22
9
1
27
1
5
8
20
20
9
4
3
5
11
22
17
21
20
13
10
14
23
1
22
19
24
2
4
25
27
15
25
20
13
7
19
6
12
3
12
2
27
17
1
21
17
19
23
14
22
12
7
10
10
15
21
27
10
20
23
9
11
9
4
5
20
0
20
7
22
24
3
17
13
12
8
22
11
24
26
12
21
15
22
7
16
3
21
14
1
2
1
26
15
13
24
2
27
13
21
23
20
11
21
9
11
0
23
2
27
1
3
19
7
21
21
23
21
10
1
0
24
23
8
16
22
18
21
0
22
25
19
9
24
17
27
3
11
22
15
11
15
4
17
11
25
3
2
13
19
6
15
1
15
25
7
22
7
2
24
20
2
1
2
11
15
10
22
11
17
13
19
18
16
5
26
27
21
3
19
15
24
12
9
0
3
4
1
11
15
7
16
5
20
15
1
21
24
13
8
26
27
27
8
6
7
3
16
10
13
1
23
19
10
8
3
3
9
2
21
20
15
11
20
19
23
13
10
7
24
9
26
23
19
9
2
20
22
7
15
2
27
20
10
24
3
12
9
12
23
2
16
27
21
1
20
5
27
13
20
19
11
11
2
17
25
15
9
3
12
18
25
9
1
25
20
11
8
1
21
27
18
22
16
4
12
27
8
23
10
22
19
22
13
2
9
13
26
20
12
0
1
24
20
22
20
7
1
19
19
15
16
19
8
19
15
1
16
22
27
18
1
16
16
7
16
8
7
22
5
3
12
13
27
10
22