"""Cluster Polish-language texts (dev-0 and test-A sets) with K-Means on TF-IDF.

Per-dataset pipeline: lowercase -> strip diacritics (unidecode) -> remove
punctuation -> drop (unidecoded) Polish stop words -> TF-IDF -> K-Means.
Elbow plots are shown to justify the chosen k; cluster labels are written
to ``out.tsv`` next to each input file.
"""
import string

import matplotlib.pyplot as plt
import pandas as pd
from many_stop_words import get_stop_words
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from unidecode import unidecode

# One C-level translation pass instead of chained str.replace calls.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuations(text):
    """Return *text* with every ASCII punctuation character removed."""
    return text.translate(_PUNCT_TABLE)


def preprocess(series, stop_words):
    """Normalize a Series of documents: lowercase, strip diacritics and
    punctuation, then drop stop words (which must already be unidecoded
    so they match the unidecoded tokens)."""
    series = series.str.lower().apply(unidecode).apply(remove_punctuations)
    return series.apply(
        lambda doc: ' '.join(w for w in doc.split() if w not in stop_words)
    )


def plot_elbow(matrix, k_range):
    """Show the K-Means elbow curve (inertia vs. k) for *matrix*."""
    inertias = []
    for k in k_range:
        km = KMeans(n_clusters=k, max_iter=200, n_init=10).fit(matrix)
        inertias.append(km.inertia_)
    plt.plot(k_range, inertias, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum_of_squared_distances')
    plt.title('Elbow Method For Optimal k')
    plt.show()


def cluster_labels(matrix, n_clusters):
    """Fit K-Means with *n_clusters* and return the labels as a one-column
    DataFrame named 'cluster'."""
    model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=200,
                   n_init=10)
    model.fit(matrix)
    return pd.DataFrame(list(model.labels_), columns=['cluster'])


def main():
    data = pd.read_csv('dev-0/in.tsv', sep='\t', header=None)
    data_test = pd.read_csv('test-A/in.tsv', sep='\t', header=None)

    # Unidecode the stop words too, since the documents are unidecoded
    # before stop-word removal; a set gives O(1) membership tests.
    stop_words = {unidecode(w) for w in get_stop_words('pl')}
    data[0] = preprocess(data[0], stop_words)
    data_test[0] = preprocess(data_test[0], stop_words)

    # BUG FIX: the original reused a single TfidfVectorizer and called
    # fit_transform twice, silently refitting the vocabulary on the test
    # set. The two datasets are clustered independently, so each gets its
    # own vectorizer.
    text_tf = TfidfVectorizer().fit_transform(data[0])
    text_test_tf = TfidfVectorizer().fit_transform(data_test[0])

    # Elbow plots used to pick k for each dataset (inspected manually).
    plot_elbow(text_tf, range(2, 20))
    plot_elbow(text_test_tf, range(2, 30))

    clusters_dev = cluster_labels(text_tf, 10)    # k chosen from dev elbow
    clusters_test = cluster_labels(text_test_tf, 28)  # k from test elbow

    # BUG FIX: the original wrote to backslash paths ("dev-0\out.tsv"),
    # which on POSIX creates a file literally named "dev-0\out.tsv" rather
    # than writing into the directory; use forward slashes like the reads.
    clusters_dev.to_csv('dev-0/out.tsv', sep='\t', index=False, header=None)
    clusters_test.to_csv('test-A/out.tsv', sep='\t', index=False, header=None)


if __name__ == '__main__':
    main()