# polish-urban-legends-public/solution-testA.py

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import sklearn.metrics
from sklearn.cluster import KMeans


def preprocess(document, stopwords):
    """Remove stopwords from a document and collapse its whitespace.

    The document is split on runs of whitespace, every token that is a member
    of ``stopwords`` is dropped, and the remaining tokens are joined back
    together with single spaces. Matching is exact: whole token,
    case-sensitive.

    Args:
        document: Text to filter.
        stopwords: Iterable of tokens to remove. Anything that is not already
            a ``set``/``frozenset`` is copied into a ``frozenset`` once per
            call, so lists get constant-time lookups and one-shot iterators
            are not exhausted by the first membership test.

    Returns:
        The surviving tokens joined by single spaces; an empty string when no
        token survives.
    """
    if not isinstance(stopwords, (set, frozenset)):
        # One O(len(stopwords)) copy instead of a linear scan per token.
        stopwords = frozenset(stopwords)
    return " ".join(word for word in document.split() if word not in stopwords)


def main(stopwords_path='stopwords.txt', in_path="test-A/in.tsv",
         out_path="test-A/out.tsv", n_clusters=25, random_state=None):
    """Cluster the input documents with TF-IDF + k-means and write the labels.

    Each line of ``in_path`` is treated as one document. Stopwords (one per
    line of ``stopwords_path``) are removed with ``preprocess``, the documents
    are vectorised with a default ``TfidfVectorizer`` and clustered with
    ``KMeans``. One cluster label per document is written to ``out_path``,
    one per line, in input order.

    Args:
        stopwords_path: File with one stopword per line.
        in_path: File with one document per line.
        out_path: Destination file for the predicted cluster labels.
        n_clusters: Number of k-means clusters.
        random_state: Seed forwarded to ``KMeans``. The default ``None`` keeps
            the original unseeded behaviour; pass an int for reproducible
            cluster assignments.
    """
    # Explicit UTF-8 so the result does not depend on the platform's locale
    # encoding (assumes the corpus and stopword list are UTF-8 -- confirm).
    with open(stopwords_path, encoding="utf-8") as stopwords_file:
        # A set gives constant-time membership tests inside preprocess().
        stopwords = {stopword.strip() for stopword in stopwords_file}

    with open(in_path, encoding="utf-8") as in_file:
        documents = [preprocess(document, stopwords) for document in in_file]

    # The input file is closed before the expensive vectorise/cluster step.
    document_vectors = TfidfVectorizer().fit_transform(documents)
    predictions = KMeans(
        n_clusters=n_clusters, max_iter=1000, random_state=random_state,
    ).fit_predict(document_vectors)

    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(str(prediction) + '\n'
                            for prediction in predictions)


# Run the clustering pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()