polish-urban-legends-426220/solution.py

42 lines
1.6 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import sklearn.metrics
from sklearn.cluster import KMeans
# Stop word list taken from https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
def remove_stop_words(document, stopwords):
    """Return *document* with every word that appears in *stopwords* removed.

    Words are split on whitespace and re-joined with single spaces, so the
    original spacing (and any trailing newline) is not preserved.

    :param document: the text to filter (a single string)
    :param stopwords: iterable of stop words to drop
    :return: the filtered text as a single space-joined string
    """
    # Build a set once so each membership test is O(1) instead of scanning
    # the whole stopword list for every word of the document.
    stopword_set = set(stopwords)
    return " ".join(word for word in document.split() if word not in stopword_set)
def write_predictions_to_file(predictions, file):
    """Write *predictions* to *file*, one value per line.

    :param predictions: iterable of values (e.g. cluster ids); each is
        converted with str()
    :param file: path of the output file (overwritten if it exists)
    """
    lines = [str(value) + '\n' for value in predictions]
    with open(file, "w") as out_file:
        out_file.writelines(lines)
def load_stop_words():
    """Load the stop word list from 'stopwords.txt' in the current directory.

    :return: list of stop words, one per input line, with surrounding
        whitespace stripped so membership tests match bare words
    """
    with open('stopwords.txt') as stopwords_file:
        # strip() drops the trailing newline (and stray spaces) from each line.
        return [line.strip() for line in stopwords_file]
def load_tsv_file(file):
    """Read documents from *file* (one per line) with stop words removed.

    :param file: path to a TSV/text file containing one document per line
    :return: list of documents, each with stop words stripped out
    """
    stopwords = load_stop_words()
    with open(file) as in_file:
        return [remove_stop_words(line, stopwords) for line in in_file]
def prepare_prediction(catalog):
    """Cluster the documents of one data split and write out the cluster ids.

    Reads ``<catalog>/in.tsv``, vectorizes the documents with TF-IDF,
    clusters them with K-Means, and writes one cluster id per line to
    ``<catalog>/out.tsv``.

    :param catalog: directory name of the split (e.g. "dev-0", "test-A")
    """
    documents = load_tsv_file(catalog + "/in.tsv")
    vectors = TfidfVectorizer().fit_transform(documents)
    # With n_clusters around 20 I got scores near 0.85 (per Geval). With
    # n_clusters = 50 the scores were similar between runs; at n = 20 they
    # fluctuate between 0.7 and 0.85.
    model = KMeans(n_clusters=20, max_iter=1000)
    labels = model.fit_predict(vectors)
    write_predictions_to_file(labels, catalog + "/out.tsv")
# Run predictions for both evaluation splits. The guard keeps the script
# importable (e.g. for testing) without triggering the full clustering run.
if __name__ == "__main__":
    for catalog in ("dev-0", "test-A"):
        prepare_prediction(catalog)