# polish-urban-legends-public/solution-testA.py

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import sklearn.metrics
from sklearn.cluster import KMeans


def preprocess(document, stopwords):
    """Return *document* with every stopword token removed.

    The document is split on whitespace, tokens contained in ``stopwords``
    are dropped, and the remaining tokens are joined with single spaces.
    Matching is exact (case-sensitive, no normalisation of the tokens).

    Args:
        document: Text to filter.
        stopwords: Collection of tokens to drop.

    Returns:
        The filtered text as one space-separated string; empty if no token
        survives.
    """
    # A list/tuple makes each membership test a linear scan. Hash it once so
    # every token lookup is O(1); any other container is used as given.
    if isinstance(stopwords, (list, tuple)):
        stopwords = frozenset(stopwords)
    return " ".join(word for word in document.split() if word not in stopwords)


def main():
    """Cluster the test-A documents and write one cluster id per line.

    Reads stopwords (one per line) from ``stopwords.txt`` and documents (one
    per line) from ``test-A/in.tsv``, removes the stopwords from each
    document, builds TF-IDF vectors, groups them into 25 clusters with
    k-means, and writes each document's predicted cluster id to
    ``test-A/out.tsv`` in input order.
    """
    # Pin the encoding instead of relying on the platform default, which
    # breaks on non-ASCII text on some systems.
    # NOTE(review): assumes the data files are UTF-8 -- confirm.
    with open('stopwords.txt', encoding='utf-8') as stopwords_file:
        # A frozenset gives O(1) membership checks during preprocessing.
        stopwords = frozenset(line.strip() for line in stopwords_file)

    with open("test-A/in.tsv", encoding='utf-8') as in_file:
        documents = [preprocess(document, stopwords) for document in in_file]

    # The input file is no longer needed past this point, so the remaining
    # steps run outside its ``with`` block.
    document_vectors = TfidfVectorizer().fit_transform(documents)

    # NOTE(review): no random_state is passed, so the cluster assignment may
    # differ between runs -- set one if reproducible output is required.
    predictions = KMeans(
        n_clusters=25, max_iter=1000).fit_predict(document_vectors)

    with open("test-A/out.tsv", "w", encoding='utf-8') as out_file:
        out_file.writelines(
            str(prediction) + '\n' for prediction in predictions)


# Run the clustering pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Add first solution 2021-04-15 18:16:41 +02:00			`from sklearn.feature_extraction.text import TfidfVectorizer`
			`import numpy as np`
			`import sklearn.metrics`
			`from sklearn.cluster import KMeans`


Improve the result by around 0.1 geval score by removing polish stopwords 2021-04-15 18:29:53 +02:00			`def preprocess(document, stopwords):`
			`return " ".join([word for word in document.split() if word not in stopwords])`


Add first solution 2021-04-15 18:16:41 +02:00			`def main():`
Improve the result by around 0.1 geval score by removing polish stopwords 2021-04-15 18:29:53 +02:00			`with open('stopwords.txt') as stopwords_file:`
			`stopwords = [stopword.strip()`
			`for stopword in stopwords_file.readlines()]`

Add first solution 2021-04-15 18:16:41 +02:00			`with open("test-A/in.tsv") as in_file:`
Improve the result by around 0.1 geval score by removing polish stopwords 2021-04-15 18:29:53 +02:00			`documents = [preprocess(document, stopwords)`
			`for document in in_file.readlines()]`
Add first solution 2021-04-15 18:16:41 +02:00
			`vectorizer = TfidfVectorizer()`

			`document_vectors = vectorizer.fit_transform(documents)`
Improve the result by around 0.1 geval score by removing polish stopwords 2021-04-15 18:29:53 +02:00			`predictions = KMeans(`
Improve the results to 0.8ish 2021-04-17 14:23:23 +02:00			`n_clusters=25, max_iter=1000).fit_predict(document_vectors)`
Add first solution 2021-04-15 18:16:41 +02:00
			`with open("test-A/out.tsv", "w") as out_file:`
			`for prediction in predictions:`
			`out_file.write(str(prediction) + '\n')`


			`if __name__ == '__main__':`
			`main()`