polish-urban-legends-public/main.py

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Load the stopword list, one word per line.
stopwords = []
with open('./stopwords.txt') as file:
    for stopword in file.readlines():
        stopwords.append(stopword.strip())
with open("./dev-0/in.tsv") as in_file:
content = in_file.readlines()
content_clear = []
for string in content:
to_add = ""
for word in string.split():
word = word.strip().replace(",", "")
if word not in stopwords:
to_add = to_add + " " + word
content_clear.append(to_add)
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(content_clear)
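# An alternative sketch, not part of the original script: TfidfVectorizer can
# take the stop-word list directly, which would make the manual filtering loop
# above unnecessary. Its tokenizer differs from str.split(), so the vectors
# (and scores) may differ slightly.
#
#   vectorizer = TfidfVectorizer(stop_words=stopwords)
#   vectors = vectorizer.fit_transform(content)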
# The scores I obtained fluctuated between 0.72 and 0.74. I checked this over 20 runs and they never dropped below 0.70.
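# The run-to-run variance noted above comes from KMeans's random centroid
# initialization. A minimal sketch of pinning it down, using scikit-learn's
# random_state parameter (the seed value 42 is an arbitrary choice):
#
#   kmeans = KMeans(n_clusters=50, random_state=42).fit(vectors)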
# Cluster into 50 groups and write one label per line. Joining the labels
# directly avoids the np.array2string round-trip, which would truncate long
# arrays with "..." once they exceed numpy's print threshold.
kmeans = KMeans(n_clusters=50).fit(vectors)
result = "\n".join(str(label) for label in kmeans.labels_)
with open("./dev-0/out.tsv", "w") as file:
    file.write(result + "\n")
with open("./test-A/in.tsv") as in_file:
content = in_file.readlines()
content_clear = []
for string in content:
to_add = ""
for word in string.split():
word = word.strip().replace(",", "")
if word not in stopwords:
to_add = to_add + " " + word
content_clear.append(to_add)
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(content_clear)
kmeans = KMeans(n_clusters=50).fit(vectors)
result=kmeans.labels_
result=np.array2string(result, separator='\n').replace(" ", "").replace("[", "")
with open("./test-A/out.tsv", "w") as file:
file.write(result)
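
# The dev-0 and test-A blocks above differ only in their paths. A minimal
# refactor sketch; cluster_file is a hypothetical helper, not part of the
# original script.
def cluster_file(in_path, out_path, stopwords, n_clusters=50):
    # Clean each line: strip commas and drop stopwords.
    with open(in_path) as in_file:
        content = in_file.readlines()
    content_clear = []
    for line in content:
        kept = []
        for word in line.split():
            word = word.strip().replace(",", "")
            if word not in stopwords:
                kept.append(word)
        content_clear.append(" ".join(kept))
    # Vectorize, cluster, and write one label per line.
    vectors = TfidfVectorizer().fit_transform(content_clear)
    labels = KMeans(n_clusters=n_clusters).fit(vectors).labels_
    with open(out_path, "w") as out_file:
        out_file.write("\n".join(str(label) for label in labels) + "\n")

# Usage:
#   cluster_file("./dev-0/in.tsv", "./dev-0/out.tsv", stopwords)
#   cluster_file("./test-A/in.tsv", "./test-A/out.tsv", stopwords)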