polish-urban-legends-public/solution.py

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import sklearn.metrics
from sklearn.cluster import KMeans


def preprocess(document, stopwords):
    """Return *document* with every stopword token removed.

    The document is split on runs of whitespace, tokens that appear in
    *stopwords* are dropped, and the surviving tokens are re-joined with
    single spaces. Leading/trailing whitespace (including a trailing
    newline) therefore disappears. Matching is exact and case-sensitive.

    Args:
        document: Text of a single document.
        stopwords: Collection of tokens to drop.

    Returns:
        The filtered text as one space-separated string.
    """
    # Membership in a list/tuple is a linear scan per token; hash the
    # stopwords once so each lookup is O(1). Other containers (set, dict, ...)
    # are used as given.
    if isinstance(stopwords, (list, tuple)):
        stopwords = frozenset(stopwords)
    return " ".join([word for word in document.split() if word not in stopwords])

def predict(in_file, out_file, stopwords, n_clusters=25, max_iter=1000,
            random_state=None):
    """Cluster the documents in *in_file* and write one label per line.

    Every line of *in_file* is treated as one document. Stopwords are
    removed with ``preprocess``, the documents are turned into TF-IDF
    vectors, and KMeans assigns each document a cluster label. The labels
    are written to *out_file* in input order, one per line.

    Args:
        in_file: Path of the input file, one document per line.
        out_file: Path of the file to (over)write with cluster labels.
        stopwords: Collection of tokens removed from each document.
        n_clusters: Number of KMeans clusters.
        max_iter: Maximum number of KMeans iterations.
        random_state: Seed forwarded to KMeans. The default ``None`` keeps
            the original unseeded behavior, so labels may differ between
            runs; pass an int for reproducible output.
    """
    # The corpus is natural-language text, so do not rely on the platform's
    # default encoding. NOTE(review): assumes the data files are UTF-8.
    with open(in_file, encoding="utf-8") as source:
        documents = [preprocess(line, stopwords) for line in source]

    # The input file is closed before the expensive fitting work begins.
    document_vectors = TfidfVectorizer().fit_transform(documents)
    predictions = KMeans(
        n_clusters=n_clusters,
        max_iter=max_iter,
        random_state=random_state,
    ).fit_predict(document_vectors)

    with open(out_file, "w", encoding="utf-8") as sink:
        sink.writelines(str(prediction) + '\n' for prediction in predictions)


def main():
    """Load the stopword list and cluster the dev-0 and test-A inputs.

    Reads ``stopwords.txt`` (one stopword per line) and runs ``predict`` on
    ``dev-0/in.tsv`` and ``test-A/in.tsv``, writing ``out.tsv`` next to each
    input. All paths are relative to the current working directory.
    """
    # Read explicitly as UTF-8 rather than the platform default encoding.
    # NOTE(review): assumes stopwords.txt is UTF-8 encoded.
    with open('stopwords.txt', encoding="utf-8") as stopwords_file:
        # A set gives constant-time membership tests when filtering tokens;
        # duplicates in the file carry no meaning.
        stopwords = {stopword.strip() for stopword in stopwords_file}

    predict("dev-0/in.tsv", "dev-0/out.tsv", stopwords)
    predict("test-A/in.tsv", "test-A/out.tsv", stopwords)


# Run the clustering pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Merge two scripts into one 2021-04-17 14:29:51 +02:00			`from sklearn.feature_extraction.text import TfidfVectorizer`
			`import numpy as np`
			`import sklearn.metrics`
			`from sklearn.cluster import KMeans`


			`def preprocess(document, stopwords):`
			`return " ".join([word for word in document.split() if word not in stopwords])`

			`def predict(in_file, out_file, stopwords):`

			`with open(in_file) as in_file:`
			`documents = [preprocess(document, stopwords)`
			`for document in in_file.readlines()]`

			`vectorizer = TfidfVectorizer()`

			`document_vectors = vectorizer.fit_transform(documents)`
			`predictions = KMeans(`
			`n_clusters=25, max_iter=1000).fit_predict(document_vectors)`

			`with open(out_file, "w") as out_file:`
			`for prediction in predictions:`
			`out_file.write(str(prediction) + '\n')`


			`def main():`
			`with open('stopwords.txt') as stopwords_file:`
			`stopwords = [stopword.strip()`
			`for stopword in stopwords_file.readlines()]`

			`predict("dev-0/in.tsv", "dev-0/out.tsv", stopwords)`
			`predict("test-A/in.tsv", "test-A/out.tsv", stopwords)`


			`if __name__ == '__main__':`
			`main()`