# TF-IDF + KMeans (50 clusters) text clustering script: cleans dev-0 and
# test-A inputs, clusters them, and writes one cluster label per line.
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
import numpy as np
|
|
import sklearn.metrics
|
|
from sklearn.cluster import KMeans
|
|
|
|
# Load the stop-word list ('./stopwords.txt', one word per line).
# Stored as a set: the cleaning loops below test `word not in stopwords`
# once per token, and set membership is O(1) vs. O(n) for a list.
with open('./stopwords.txt') as file:
    stopwords = {stopword.strip() for stopword in file}
|
|
|
|
# --- dev-0: read, clean, vectorize, cluster, write labels ------------------

# One document per line.
with open("./dev-0/in.tsv") as in_file:
    content = in_file.readlines()

# Strip commas from each token and drop stop words.
content_clear = []
for string in content:
    kept = []
    for word in string.split():
        word = word.strip().replace(",", "")
        if word not in stopwords:
            kept.append(word)
    content_clear.append(" ".join(kept))

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(content_clear)

# Scores ranged between 0.72 and 0.74 over 20 runs and never fell below 0.70.
# random_state is pinned so a rerun reproduces the same clustering.
kmeans = KMeans(n_clusters=50, random_state=0).fit(vectors)

# Write one cluster label per line. The original formatted the labels with
# np.array2string, which silently truncates arrays longer than numpy's
# default `threshold` (1000 elements) to "first ... last", corrupting the
# output file; joining the labels directly avoids that entirely.
with open("./dev-0/out.tsv", "w") as file:
    file.write("\n".join(str(label) for label in kmeans.labels_))
|
|
|
|
# --- test-A: same pipeline as dev-0 ----------------------------------------

# One document per line.
with open("./test-A/in.tsv") as in_file:
    content = in_file.readlines()

# Strip commas from each token and drop stop words.
content_clear = []
for string in content:
    kept = []
    for word in string.split():
        word = word.strip().replace(",", "")
        if word not in stopwords:
            kept.append(word)
    content_clear.append(" ".join(kept))

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(content_clear)

# random_state is pinned so a rerun reproduces the same clustering.
kmeans = KMeans(n_clusters=50, random_state=0).fit(vectors)

# BUG FIX: the original stripped "[" but not "]" from np.array2string's
# output here (unlike the dev-0 pass), leaving a stray trailing "]" in the
# file. Writing the labels directly also avoids np.array2string's "..."
# truncation of arrays longer than numpy's default `threshold` (1000).
with open("./test-A/out.tsv", "w") as file:
    file.write("\n".join(str(label) for label in kmeans.labels_))