polish-urban-legends-public/script.py

import pandas as pd
from many_stop_words import get_stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from unidecode import unidecode
from nltk.tokenize import word_tokenize
import string
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Read both tab-separated inputs without a header row, so the columns get
# integer labels (the rest of the script works on column 0).
# NOTE(review): pandas' default quote handling is in effect here; if the TSV
# holds free text with unbalanced double quotes, rows could be merged —
# confirm against the data before relying on the row count.
data, data_test = (
    pd.read_csv(path, sep='\t', header=None)
    for path in ('dev-0/in.tsv', 'test-A/in.tsv')
)

def remove_punctuations(text):
    """Return *text* with every character of ``string.punctuation`` removed.

    All other characters, including whitespace, are left untouched.
    """
    # A translation table that maps each punctuation character to None
    # deletes all of them in a single pass over the string.
    return text.translate(str.maketrans('', '', string.punctuation))

def _drop_stop_words(text, blocked):
    """Return *text* without the whitespace-separated tokens found in *blocked*."""
    return ' '.join(token for token in text.split() if token not in blocked)


# Lower-case the text column of both splits.
data[0] = data[0].str.lower()
data_test[0] = data_test[0].str.lower()
stop_words = get_stop_words('pl')

# Transliterate the documents and the stop words with unidecode so that both
# sides of the stop-word comparison are in the same form.
data[0] = data[0].apply(unidecode)
data_test[0] = data_test[0].apply(unidecode)
uni_stop_words = [unidecode(x) for x in stop_words]
# Membership is tested once per token, so use a set instead of scanning the
# list every time.
_uni_stop_word_set = frozenset(uni_stop_words)

data[0] = data[0].apply(remove_punctuations)
data_test[0] = data_test[0].apply(remove_punctuations)

data[0] = data[0].apply(lambda text: _drop_stop_words(text, _uni_stop_word_set))
data_test[0] = data_test[0].apply(lambda text: _drop_stop_words(text, _uni_stop_word_set))

# The same vectorizer object is fitted twice: first on the dev texts, then
# refitted on the test texts.  The two matrices therefore have independent
# vocabularies, and `tf` ends up fitted on the test split.
tf = TfidfVectorizer()
text_tf = tf.fit_transform(data[0])
text_test_tf = tf.fit_transform(data_test[0])

def _elbow_inertias(matrix, cluster_counts):
    """Fit one KMeans per value in *cluster_counts* on *matrix*; return the inertias in order."""
    return [
        KMeans(n_clusters=count, max_iter=200, n_init=10).fit(matrix).inertia_
        for count in cluster_counts
    ]


def _show_elbow_plot(cluster_counts, inertias):
    """Plot inertia against the number of clusters and display the figure."""
    plt.plot(cluster_counts, inertias, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum_of_squared_distances')
    plt.title('Elbow Method For Optimal k')
    plt.show()


# Elbow curve for the dev matrix, k = 2..19.
K = range(2, 20)
Sum_of_squared_distances = _elbow_inertias(text_tf, K)
_show_elbow_plot(K, Sum_of_squared_distances)

# Elbow curve for the test matrix, k = 2..29.
K = range(2, 30)
Sum_of_squared_distances = _elbow_inertias(text_test_tf, K)
_show_elbow_plot(K, Sum_of_squared_distances)

# Settings shared by both final models.
kmeans_settings = dict(init='k-means++', max_iter=200, n_init=10)

# Final clustering of the dev matrix; the labels become a one-column frame.
true_k_dev = 10  # presumably read off the dev elbow plot — confirm
model_dev = KMeans(n_clusters=true_k_dev, **kmeans_settings).fit(text_tf)
labels_dev = model_dev.labels_
clusters_dev = pd.DataFrame(list(labels_dev), columns=['cluster'])

# Final clustering of the test matrix, same procedure.
true_k_test = 28  # presumably read off the test elbow plot — confirm
model_test = KMeans(n_clusters=true_k_test, **kmeans_settings).fit(text_test_tf)
labels_test = model_test.labels_
clusters_test = pd.DataFrame(list(labels_test), columns=['cluster'])

# Write the cluster ids, one per line, without index or header row.
# Forward slashes match the input paths: a backslash only acts as a directory
# separator on Windows (elsewhere it becomes part of the file name), and "\o"
# is an invalid escape sequence in a normal string literal.
clusters_dev.to_csv("dev-0/out.tsv", sep="\t", index=False, header=False)
clusters_test.to_csv("test-A/out.tsv", sep="\t", index=False, header=False)
test-A, script 2021-04-20 19:06:45 +02:00			`import pandas as pd`
			`from many_stop_words import get_stop_words`
			`from sklearn.feature_extraction.text import TfidfVectorizer`
			`from unidecode import unidecode`
			`from nltk.tokenize import word_tokenize`
			`import string`
			`import matplotlib.pyplot as plt`
			`from sklearn.cluster import KMeans`

			`data=pd.read_csv('dev-0/in.tsv', sep='\t', header=None)`
			`data_test=pd.read_csv('test-A/in.tsv', sep='\t', header=None)`

			`def remove_punctuations(text):`
			`    for punctuation in string.punctuation:`
			`        text = text.replace(punctuation, '')`
			`    return text`

			`data[0] = data[0].str.lower()`
			`data_test[0] = data_test[0].str.lower()`
			`stop_words = get_stop_words('pl')`

			`data[0] = data[0].apply(unidecode)`
			`data_test[0] = data_test[0].apply(unidecode)`
			`uni_stop_words = [unidecode(x) for x in stop_words]`

			`data[0] = data[0].apply(remove_punctuations)`
			`data_test[0] = data_test[0].apply(remove_punctuations)`

			`data[0] = data[0].apply(lambda x: ' '.join([item for item in x.split() if item not in uni_stop_words]))`
			`data_test[0] = data_test[0].apply(lambda x: ' '.join([item for item in x.split() if item not in uni_stop_words]))`

			`tf=TfidfVectorizer()`
			`text_tf= tf.fit_transform(data[0])`
			`text_test_tf= tf.fit_transform(data_test[0])`

			`Sum_of_squared_distances = []`
			`K = range(2,20)`
			`for k in K:`
			`    km = KMeans(n_clusters=k, max_iter=200, n_init=10)`
			`    km = km.fit(text_tf)`
			`    Sum_of_squared_distances.append(km.inertia_)`

			`plt.plot(K, Sum_of_squared_distances, 'bx-')`
			`plt.xlabel('k')`
			`plt.ylabel('Sum_of_squared_distances')`
			`plt.title('Elbow Method For Optimal k')`
			`plt.show()`

			`Sum_of_squared_distances = []`
			`K = range(2,30)`
			`for k in K:`
			`    km = KMeans(n_clusters=k, max_iter=200, n_init=10)`
			`    km = km.fit(text_test_tf)`
			`    Sum_of_squared_distances.append(km.inertia_)`

			`plt.plot(K, Sum_of_squared_distances, 'bx-')`
			`plt.xlabel('k')`
			`plt.ylabel('Sum_of_squared_distances')`
			`plt.title('Elbow Method For Optimal k')`
			`plt.show()`

			`true_k_dev = 10`
			`model_dev = KMeans(n_clusters=true_k_dev, init='k-means++', max_iter=200, n_init=10)`
			`model_dev.fit(text_tf)`
			`labels_dev=model_dev.labels_`
			`clusters_dev=pd.DataFrame(list(labels_dev),columns=['cluster'])`

			`true_k_test = 28`
			`model_test = KMeans(n_clusters=true_k_test, init='k-means++', max_iter=200, n_init=10)`
			`model_test.fit(text_test_tf)`
			`labels_test=model_test.labels_`
			`clusters_test=pd.DataFrame(list(labels_test),columns=['cluster'])`

			`clusters_dev.to_csv("dev-0\out.tsv", sep="\t",index=False,header=None)`
			`clusters_test.to_csv("test-A\out.tsv", sep="\t",index=False,header=None)`