# polish-urban-legends-426220/solution.py
#
# NOTE: the repository viewer flagged this file as containing invisible
# Unicode characters. If that is not intentional, inspect the raw bytes.

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import sklearn.metrics
from sklearn.cluster import KMeans
# Stop-word list taken from https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
def remove_stop_words(document, stopwords):
    """Return *document* with every stop word removed.

    The document is split on whitespace, tokens found in *stopwords* are
    dropped, and the remaining tokens are joined with single spaces. As a
    consequence, leading/trailing whitespace (including a trailing newline)
    and runs of whitespace do not survive. Matching is exact and
    case-sensitive.

    Args:
        document: Text to filter.
        stopwords: Collection of words to drop.

    Returns:
        The filtered text as one space-separated string.
    """
    # Membership tests against a list cost O(len(stopwords)) per token; a set
    # makes them O(1). Reuse the caller's set when one is passed in so the
    # conversion is not repeated for every document.
    if isinstance(stopwords, (set, frozenset)):
        stopword_set = stopwords
    else:
        stopword_set = set(stopwords)
    return " ".join(word for word in document.split() if word not in stopword_set)
def write_predictions_to_file(predictions, file):
    """Write the predictions to *file*, one per line.

    Each item of *predictions* is converted with ``str`` and followed by a
    newline. The file is opened in write mode, so existing content is
    replaced.
    """
    with open(file, "w") as sink:
        sink.writelines(str(label) + '\n' for label in predictions)
def load_stop_words(path='stopwords.txt'):
    """Load the stop-word list, one word per line.

    Args:
        path: File to read. Defaults to ``stopwords.txt`` relative to the
            current working directory, the location that was previously
            hard-coded.

    Returns:
        A list with one entry per line of the file, in file order, each
        stripped of surrounding whitespace.
    """
    # Decode explicitly instead of relying on the platform default encoding,
    # which is not UTF-8 everywhere. Assumes the stop-word file is UTF-8
    # encoded (it is a Polish word list, so non-ASCII letters are expected).
    with open(path, encoding='utf-8') as stopwords_file:
        # Strip every line so the trailing newline (or stray spaces) cannot
        # stop a word from matching later on.
        return [stopword.strip() for stopword in stopwords_file]
def load_tsv_file(file):
    """Read *file* and return its lines with stop words removed.

    Every line of the file is treated as one document and passed through
    ``remove_stop_words`` together with the list from ``load_stop_words``.

    Args:
        file: Path of the text file to read.

    Returns:
        A list of filtered documents, one per input line, in file order.
    """
    # Build the lookup set once for the whole file; ``remove_stop_words`` only
    # needs ``in`` on it, and set membership is O(1) instead of a list scan.
    stopwords = frozenset(load_stop_words())
    # Decode explicitly rather than with the platform default encoding.
    # Assumes the input is UTF-8 encoded text.
    with open(file, encoding="utf-8") as in_file:
        return [remove_stop_words(document, stopwords) for document in in_file]
def prepare_prediction(catalog, n_clusters=20, random_state=None):
    """Cluster the documents of one dataset directory and save the labels.

    Reads ``<catalog>/in.tsv`` via ``load_tsv_file``, builds TF-IDF vectors,
    clusters them with k-means and writes one cluster label per line to
    ``<catalog>/out.tsv``.

    Args:
        catalog: Directory that contains ``in.tsv``; ``out.tsv`` is written
            into the same directory.
        n_clusters: Number of k-means clusters. Defaults to 20, the value
            that was previously hard-coded.
        random_state: Seed forwarded to ``KMeans``. ``None`` (the default)
            keeps the previous unseeded behaviour, so labels may differ
            between runs; pass an int for reproducible output.
    """
    documents = load_tsv_file(catalog + "/in.tsv")
    document_vectors = TfidfVectorizer().fit_transform(documents)
    # Author's note: with n_clusters around 20 the score reached about 0.85
    # (as measured by GEval) but fluctuated between 0.7 and 0.85 from run to
    # run, whereas with n_clusters=50 the results were close to each other on
    # every run.
    model = KMeans(n_clusters=n_clusters, max_iter=1000, random_state=random_state)
    predictions = model.fit_predict(document_vectors)
    write_predictions_to_file(predictions, catalog + "/out.tsv")
# Generate out.tsv for each dataset directory, in the original order.
for dataset_dir in ("dev-0", "test-A"):
    prepare_prediction(dataset_dir)