# polish-urban-legends-public/solution.py

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import sklearn.metrics
from sklearn.cluster import KMeans


def preprocess(document, stopwords):
    """Return *document* with every stopword removed.

    The document is split on whitespace, tokens found in *stopwords* are
    dropped, and the remaining tokens are joined with single spaces (so
    leading/trailing whitespace and runs of whitespace are collapsed).

    Args:
        document: The text to filter.
        stopwords: Collection of tokens to drop. Matching is exact and
            case-sensitive.

    Returns:
        The filtered text as a single space-separated string.
    """
    # A list/tuple makes every membership test a linear scan; hash it once
    # per call. Sets (and any other container) are used as given.
    if isinstance(stopwords, (list, tuple)):
        stopwords = frozenset(stopwords)
    return " ".join(word for word in document.split() if word not in stopwords)

def predict(in_file, out_file, stopwords, *, n_clusters=25, max_iter=1000,
            random_state=None):
    """Cluster the documents in *in_file* and write one label per line.

    Each line of the input file is treated as one document. Stopwords are
    removed, the documents are turned into TF-IDF vectors, and KMeans assigns
    every document to a cluster. The cluster labels are written to
    *out_file*, one per line, in the same order as the input documents.

    Args:
        in_file: Path of the input file, one document per line.
        out_file: Path of the file the cluster labels are written to.
        stopwords: Collection of tokens removed from every document.
        n_clusters: Number of clusters passed to KMeans.
        max_iter: Iteration cap passed to KMeans.
        random_state: Seed passed to KMeans; ``None`` keeps the library's
            default (non-reproducible) initialisation.
    """
    # Explicit UTF-8 so the result does not depend on the platform's default
    # locale encoding. The handle is closed before the clustering starts.
    with open(in_file, encoding="utf-8") as source:
        documents = [preprocess(line, stopwords) for line in source]

    document_vectors = TfidfVectorizer().fit_transform(documents)
    predictions = KMeans(
        n_clusters=n_clusters, max_iter=max_iter, random_state=random_state,
    ).fit_predict(document_vectors)

    # The output is only opened once clustering succeeded, so a failure above
    # does not leave behind a truncated file.
    with open(out_file, "w", encoding="utf-8") as sink:
        sink.writelines(f"{prediction}\n" for prediction in predictions)


def main():
    """Load the stopword list and cluster the dev-0 and test-A inputs."""
    # Explicit UTF-8 keeps non-ASCII stopwords intact regardless of the
    # platform's default encoding; a set gives O(1) membership tests when
    # the documents are filtered.
    with open('stopwords.txt', encoding='utf-8') as stopwords_file:
        stopwords = {stopword.strip() for stopword in stopwords_file}

    predict("dev-0/in.tsv", "dev-0/out.tsv", stopwords)
    predict("test-A/in.tsv", "test-A/out.tsv", stopwords)


# Run the clustering pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()