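# Cluster the dev-0 and test-A document sets with TF-IDF features and KMeans.
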
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
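
# Load the stopword list, one word per line; a set makes the membership checks below fast.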
stopwords = set()
with open('./stopwords.txt') as file:
    for stopword in file.readlines():
        stopwords.add(stopword.strip())
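
# Read the dev-0 documents and strip the stopwords out of each one.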
with open("./dev-0/in.tsv") as source:
    documents_tmp = source.readlines()

documents = []
for doc in documents_tmp:
    documents.append(" ".join([word for word in doc.split() if word not in stopwords]))
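
# Turn the cleaned documents into TF-IDF vectors.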
model = TfidfVectorizer()
doc_vectors = model.fit_transform(documents)

# With these settings I got close to 0.8; the average over 10 runs was 0.76, and the score never dropped below 0.74.
pred = KMeans(n_clusters=20, max_iter=3000).fit_predict(doc_vectors)
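
# Write one cluster label per line.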
with open("./dev-0/out.tsv", "w") as result:
    for prediction in pred:
        result.write(str(prediction) + '\n')
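
# Repeat the same pipeline for the test-A set.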
with open("./test-A/in.tsv") as source:
    documents_tmp = source.readlines()

documents = []
for doc in documents_tmp:
    documents.append(" ".join([word for word in doc.split() if word not in stopwords]))

model = TfidfVectorizer()
doc_vectors = model.fit_transform(documents)
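
# Same vectorizer and clustering settings as for dev-0.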
pred = KMeans(n_clusters=20, max_iter=3000).fit_predict(doc_vectors)
with open("./test-A/out.tsv", "w") as result:
    for prediction in pred:
        result.write(str(prediction) + '\n')