36 lines
1.4 KiB
36 lines
1.4 KiB
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import sklearn.metrics
from sklearn.cluster import KMeans
# Location of the stopword list and of the data splits. Each split directory
# holds an ``in.tsv`` that is read and receives an ``out.tsv`` with one
# cluster label per input line.
STOPWORDS_PATH = './stopwords.txt'
SPLIT_DIRS = ('./dev-0', './test-A')

# Author's note (translated from Polish): with this setting the score got
# close to 0.8; averaged over 10 runs it came out at 0.76, and it never
# dropped below 0.74.
N_CLUSTERS = 20
MAX_ITER = 3000


def parse_stopwords(lines):
    """Return the set of stopwords contained in *lines*.

    Args:
        lines: Iterable of strings, one candidate stopword per item (for
            example an open text file).

    Returns:
        A set of the non-empty entries with surrounding whitespace removed.
    """
    # Iterating a file keeps the trailing newline on every line; without the
    # strip no entry could ever equal a token produced by ``str.split()``.
    stripped = (line.strip() for line in lines)
    # A set makes the per-token membership test O(1) instead of O(n).
    return {word for word in stripped if word}


def load_stopwords(path=STOPWORDS_PATH):
    """Read the stopword file at *path* and return its entries as a set."""
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default.
    with open(path, encoding="utf-8") as source:
        return parse_stopwords(source)


def strip_stopwords(document, stopwords):
    """Return *document* without the tokens listed in *stopwords*.

    The document is split on whitespace, tokens found in *stopwords* are
    dropped (case-sensitive comparison), and the rest is re-joined with
    single spaces.
    """
    return " ".join(word for word in document.split() if word not in stopwords)


def read_documents(path, stopwords):
    """Load one document per line from *path*, with stopwords removed."""
    with open(path, encoding="utf-8") as source:
        return [strip_stopwords(line, stopwords) for line in source]


def cluster_documents(documents, n_clusters=N_CLUSTERS, max_iter=MAX_ITER):
    """Vectorize *documents* with TF-IDF and return a KMeans label per document.

    The vectorizer is fitted on *documents* only, so every call is an
    independent clustering. KMeans is left unseeded, hence labels may differ
    between runs.
    """
    doc_vectors = TfidfVectorizer().fit_transform(documents)
    return KMeans(n_clusters=n_clusters, max_iter=max_iter).fit_predict(doc_vectors)


def write_predictions(path, predictions):
    """Write *predictions* to *path*, one value per line."""
    with open(path, "w") as result:
        result.writelines(str(prediction) + '\n' for prediction in predictions)


def main():
    """Cluster every split in SPLIT_DIRS and write its ``out.tsv``."""
    stopwords = load_stopwords()
    for split_dir in SPLIT_DIRS:
        documents = read_documents(split_dir + "/in.tsv", stopwords)
        write_predictions(split_dir + "/out.tsv", cluster_documents(documents))


if __name__ == "__main__":
    main()