42 lines
1.6 KiB
Python
42 lines
1.6 KiB
Python
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|||
|
import numpy as np
|
|||
|
import sklearn.metrics
|
|||
|
from sklearn.cluster import KMeans
|
|||
|
|
|||
|
# Stop-word list from https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
|
|||
|
def remove_stop_words(document, stopwords):
    """Return *document* with every whitespace-separated token found in *stopwords* removed.

    Tokens are produced by ``str.split()`` and re-joined with single spaces,
    so runs of whitespace (including trailing newlines) are normalized.

    Args:
        document: the text to filter.
        stopwords: iterable of words to drop (e.g. the list from load_stop_words).

    Returns:
        str: the filtered document.
    """
    # Build a set once so each membership test is O(1) instead of scanning
    # the stop-word list for every token of every document.
    stopword_set = set(stopwords)
    return " ".join(word for word in document.split() if word not in stopword_set)
|
|||
|
|
|||
|
def write_predictions_to_file(predictions, file):
    """Write *predictions* to *file*, one per line, overwriting any existing content.

    Each prediction is converted with ``str()`` before being written.
    """
    lines = (str(prediction) + '\n' for prediction in predictions)
    with open(file, "w") as out_file:
        out_file.writelines(lines)
|
|||
|
|
|||
|
def load_stop_words(path='stopwords.txt'):
    """Load stop words, one per line, from *path* (default: ``stopwords.txt``).

    Surrounding whitespace is stripped from each line so that trailing
    newlines or spaces in the file do not prevent later matches.

    Args:
        path: location of the stop-word file; defaults to the original
            hard-coded filename, so existing callers are unaffected.

    Returns:
        list[str]: the stop words in file order (blank lines yield "").
    """
    with open(path) as stopwords_file:
        return [line.strip() for line in stopwords_file]
|
|||
|
|
|||
|
def load_tsv_file(file):
    """Read *file* line by line and return each line with stop words removed.

    The stop-word list is loaded once via load_stop_words() and applied to
    every line with remove_stop_words().
    """
    stopwords = load_stop_words()
    with open(file) as in_file:
        return [remove_stop_words(line, stopwords) for line in in_file]
|
|||
|
|
|||
|
def prepare_prediction(catalog, n_clusters=20, max_iter=1000):
    """Cluster the documents in ``<catalog>/in.tsv`` and write each document's
    cluster label, one per line, to ``<catalog>/out.tsv``.

    Args:
        catalog: directory containing ``in.tsv`` and receiving ``out.tsv``.
        n_clusters: number of KMeans clusters; default 20 preserves the
            original hard-coded value (see tuning note below).
        max_iter: maximum KMeans iterations; default 1000 as before.
    """
    documents = load_tsv_file(catalog + "/in.tsv")
    # TF-IDF turns each document into a sparse weighted term vector.
    document_vectors = TfidfVectorizer().fit_transform(documents)

    # With n_clusters around 20 I obtained scores of about 0.85 (per Geval).
    # However, with n_clusters = 50 the results were similar to each other on
    # every run; with n = 20 they fluctuate between 0.7 and 0.85.
    predictions = KMeans(n_clusters=n_clusters, max_iter=max_iter).fit_predict(document_vectors)
    write_predictions_to_file(predictions, catalog + "/out.tsv")
|
|||
|
|
|||
|
# Script entry point: produce clusterings for both evaluation splits.
# Guarded so importing this module does not trigger the (slow) clustering.
if __name__ == "__main__":
    prepare_prediction("dev-0")
    prepare_prediction("test-A")
|