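# Cluster the dev-0 and test-A document sets with TF-IDF features and KMeans.
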
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
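
# Load the stopword list, one word per line; a set makes the membership checks below fast.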
stopwords = set()
with open('./stopwords.txt') as file:
    for stopword in file.readlines():
        stopwords.add(stopword.strip())
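
# Read the dev-0 documents and strip the stopwords out of each one.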
with open("./dev-0/in.tsv") as source:
    documents_tmp = source.readlines()

documents = []
for doc in documents_tmp:
    documents.append(" ".join([word for word in doc.split() if word not in stopwords]))
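
# Turn the cleaned documents into TF-IDF vectors.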
model = TfidfVectorizer()
doc_vectors = model.fit_transform(documents)

# With these settings I got close to 0.8; the average over 10 runs was 0.76, and the score never dropped below 0.74.
pred = KMeans(n_clusters=20, max_iter=3000).fit_predict(doc_vectors)
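
# Write one cluster label per line.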
with open("./dev-0/out.tsv", "w") as result:
    for prediction in pred:
        result.write(str(prediction) + '\n')
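
# Repeat the same pipeline for the test-A set.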
with open("./test-A/in.tsv") as source:
    documents_tmp = source.readlines()

documents = []
for doc in documents_tmp:
    documents.append(" ".join([word for word in doc.split() if word not in stopwords]))

model = TfidfVectorizer()
doc_vectors = model.fit_transform(documents)
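
# Same vectorizer and clustering settings as for dev-0.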
pred = KMeans(n_clusters=20, max_iter=3000).fit_predict(doc_vectors)
with open("./test-A/out.tsv", "w") as result:
    for prediction in pred:
        result.write(str(prediction) + '\n')