75 lines
2.4 KiB
Python
75 lines
2.4 KiB
Python
|
import pandas as pd
|
||
|
from many_stop_words import get_stop_words
|
||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||
|
from unidecode import unidecode
|
||
|
from nltk.tokenize import word_tokenize
|
||
|
import string
|
||
|
import matplotlib.pyplot as plt
|
||
|
from sklearn.cluster import KMeans
|
||
|
|
||
|
# Load the dev and test corpora. Both are tab-separated files read without a
# header row, so columns are addressed by position (column 0 is used below).
_TSV_OPTIONS = dict(sep='\t', header=None)
data = pd.read_csv('dev-0/in.tsv', **_TSV_OPTIONS)
data_test = pd.read_csv('test-A/in.tsv', **_TSV_OPTIONS)
|
||
|
|
||
|
# Translation table that deletes every character in string.punctuation;
# built once so each call is a single pass over the text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuations(text):
    """Return *text* with all ASCII punctuation characters removed.

    The characters dropped are exactly those listed in
    ``string.punctuation``; every other character (letters, digits,
    whitespace, non-ASCII symbols) is kept unchanged.

    Args:
        text: The string to clean.

    Returns:
        A new string without the punctuation characters.
    """
    return text.translate(_PUNCTUATION_TABLE)
|
||
|
|
||
|
# --- Text normalisation ------------------------------------------------------
# The same steps are applied to column 0 of the dev and test frames, in order:
# lowercase -> unidecode transliteration -> punctuation removal -> stop words.

data[0] = data[0].str.lower()
data_test[0] = data_test[0].str.lower()
stop_words = get_stop_words('pl')

data[0] = data[0].apply(unidecode)
data_test[0] = data_test[0].apply(unidecode)
# The stop words go through the same transliteration as the text so that the
# two can be compared token-for-token.
uni_stop_words = [unidecode(x) for x in stop_words]
# Hashed copy used for filtering: membership is tested once per token, which
# is O(1) against a set instead of a linear scan of the list.
_uni_stop_word_set = frozenset(uni_stop_words)

data[0] = data[0].apply(remove_punctuations)
data_test[0] = data_test[0].apply(remove_punctuations)


def _drop_stop_words(text):
    """Return *text* rebuilt from its whitespace-separated non-stop-word tokens.

    Tokens are rejoined with single spaces, so runs of whitespace collapse.
    """
    return ' '.join(
        token for token in text.split() if token not in _uni_stop_word_set
    )


data[0] = data[0].apply(_drop_stop_words)
data_test[0] = data_test[0].apply(_drop_stop_words)
|
||
|
|
||
|
# TF-IDF features for both corpora, dev first and then test.
# NOTE: the same vectorizer instance is fitted on each corpus in turn.
# fit_transform re-learns the vocabulary and IDF weights on every call, so the
# two matrices are built independently and `tf` ends up fitted on the test
# corpus only.
tf = TfidfVectorizer()
text_tf, text_test_tf = [
    tf.fit_transform(corpus) for corpus in (data[0], data_test[0])
]
|
||
|
|
||
|
def _elbow_inertias(features, k_values):
    """Fit one KMeans model per value in *k_values* and collect the inertias.

    Args:
        features: Feature matrix passed to ``KMeans.fit``.
        k_values: Iterable of cluster counts to try.

    Returns:
        A list with the ``inertia_`` of each fitted model, in the order of
        *k_values*.
    """
    return [
        KMeans(n_clusters=k, max_iter=200, n_init=10).fit(features).inertia_
        for k in k_values
    ]


def _show_elbow_plot(k_values, inertias):
    """Plot *inertias* against *k_values* and display the figure."""
    plt.plot(k_values, inertias, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum_of_squared_distances')
    plt.title('Elbow Method For Optimal k')
    plt.show()


# Elbow curve for the dev features (k = 2..19).
K = range(2, 20)
Sum_of_squared_distances = _elbow_inertias(text_tf, K)
_show_elbow_plot(K, Sum_of_squared_distances)

# Elbow curve for the test features (k = 2..29).
K = range(2, 30)
Sum_of_squared_distances = _elbow_inertias(text_test_tf, K)
_show_elbow_plot(K, Sum_of_squared_distances)
|
||
|
|
||
|
# Cluster counts used for the final models.
# NOTE(review): presumably read off the elbow plots - confirm.
true_k_dev = 10
true_k_test = 28

# Dev corpus: fit the final model and keep one cluster label per row.
model_dev = KMeans(
    n_clusters=true_k_dev,
    init='k-means++',
    max_iter=200,
    n_init=10,
)
labels_dev = model_dev.fit(text_tf).labels_
clusters_dev = pd.DataFrame(list(labels_dev), columns=['cluster'])

# Test corpus: same procedure with its own cluster count.
model_test = KMeans(
    n_clusters=true_k_test,
    init='k-means++',
    max_iter=200,
    n_init=10,
)
labels_test = model_test.fit(text_test_tf).labels_
clusters_test = pd.DataFrame(list(labels_test), columns=['cluster'])
|
||
|
|
||
|
# Write one cluster label per line, without index or header row.
# Forward slashes are used so the files land inside the dev-0/ and test-A/
# directories on every platform; a backslash is only a path separator on
# Windows and "\o" is an invalid string escape.
clusters_dev.to_csv("dev-0/out.tsv", sep="\t", index=False, header=False)
clusters_test.to_csv("test-A/out.tsv", sep="\t", index=False, header=False)
|