polish-urban-legends-public/Untitled.ipynb
2021-04-20 19:15:41 +02:00

42 KiB
Raw Permalink Blame History

import pandas as pd
from many_stop_words import get_stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from unidecode import unidecode
from nltk.tokenize import word_tokenize
import string
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Raw input corpora: one document per row, tab-separated, no header row,
# so the text ends up in integer column 0 of each frame.
data=pd.read_csv('dev-0/in.tsv', sep='\t', header=None)
data_test=pd.read_csv('test-A/in.tsv', sep='\t', header=None)
def remove_punctuations(text):
    """Return *text* with every character in ``string.punctuation`` removed.

    The original looped over all 32 punctuation characters and did a full
    ``str.replace`` pass for each; ``str.translate`` deletes them all in a
    single C-level pass with identical results.
    """
    return text.translate(str.maketrans('', '', string.punctuation))
# Normalise both corpora identically: lowercase, strip Polish diacritics
# (unidecode), drop punctuation, then remove stop words.
data[0] = data[0].str.lower()
data_test[0] = data_test[0].str.lower()
stop_words = get_stop_words('pl')
data[0] = data[0].apply(unidecode)
data_test[0] = data_test[0].apply(unidecode)
# The stop words must be unidecoded too, or they would never match the
# already-unidecoded text.  A set makes the per-token membership test O(1)
# instead of a linear scan of the whole stop-word list for every word.
uni_stop_words = {unidecode(x) for x in stop_words}
data[0] = data[0].apply(remove_punctuations)
data_test[0] = data_test[0].apply(remove_punctuations)
data[0] = data[0].apply(lambda x: ' '.join(item for item in x.split() if item not in uni_stop_words))
data_test[0] = data_test[0].apply(lambda x: ' '.join(item for item in x.split() if item not in uni_stop_words))
tf = TfidfVectorizer()
text_tf = tf.fit_transform(data[0])
# NOTE(review): the test corpus is deliberately vectorized with its own
# fitted vocabulary (dev and test are clustered by separate KMeans models
# below, so the feature spaces need not match).  Using a fresh vectorizer
# makes that explicit instead of silently re-fitting `tf`.
text_test_tf = TfidfVectorizer().fit_transform(data_test[0])
data[0]
0     opowiesc prawdziwa olsztyn akademik 7 pietro i...
1     podejrzewam polowaniu mowy prostu znalazl mart...
2     smutne przypomina historie balwankami wredny f...
3     kumpla zdawal walentynki polozyl koperte laski...
4     przypomniala krakowskich urban legends chyba n...
                            ...                        
82    wczoraj popoludniowej audycji trojce prowadzac...
83    sluchajcie uwielbiam opowiadacv sluchac jakies...
84    wczoraj probie koncertu czwartkowego akompania...
85    zuzanna mala historia przyszla panna mloda kup...
86    koszmar zaczyna niewinnego spotkania jednym to...
Name: 0, Length: 87, dtype: object
# Elbow plot for the dev corpus: within-cluster sum of squares (inertia)
# against the candidate number of clusters k.
K = range(2, 20)
Sum_of_squared_distances = [
    KMeans(n_clusters=k, max_iter=200, n_init=10).fit(text_tf).inertia_
    for k in K
]
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
# Elbow plot for the test corpus, scanned over a wider k range.
K = range(2, 30)
Sum_of_squared_distances = [
    KMeans(n_clusters=k, max_iter=200, n_init=10).fit(text_test_tf).inertia_
    for k in K
]
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
# Final clustering at the k values chosen from the elbow plots above,
# then one cluster label per document written out for each split.
true_k_dev = 10
model_dev = KMeans(n_clusters=true_k_dev, init='k-means++', max_iter=200, n_init=10)
model_dev.fit(text_tf)
labels_dev = model_dev.labels_
clusters_dev = pd.DataFrame(list(labels_dev), columns=['cluster'])
true_k_test = 28
model_test = KMeans(n_clusters=true_k_test, init='k-means++', max_iter=200, n_init=10)
model_test.fit(text_test_tf)
labels_test = model_test.labels_
clusters_test = pd.DataFrame(list(labels_test), columns=['cluster'])
# BUG FIX: the paths used backslashes ("dev-0\out.tsv"), which on
# Linux/macOS creates a file literally named "dev-0\out.tsv" in the
# working directory instead of writing into the dev-0/ and test-A/
# directories the inputs were read from with '/' separators.
clusters_dev.to_csv("dev-0/out.tsv", sep="\t", index=False, header=None)
clusters_test.to_csv("test-A/out.tsv", sep="\t", index=False, header=None)
clusters_dev
cluster
0 6
1 5
2 2
3 8
4 6
... ...
82 2
83 6
84 4
85 6
86 5

87 rows × 1 columns