polish-urban-legends-public/solution.py

43 lines
1.6 KiB
Python
Raw Permalink Normal View History

2021-04-26 15:39:52 +02:00
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
def calcDevZero(tfidfVectorizer, stopwords,
                in_path="dev-0/in.tsv", out_path="dev-0/out.tsv",
                n_clusters=20):
    """Cluster the dev-0 documents with KMeans over TF-IDF features.

    Reads one document per line from *in_path*, drops every token found in
    *stopwords*, fits *tfidfVectorizer* on the filtered documents, clusters
    the resulting vectors into *n_clusters* groups, and writes one cluster
    label per line to *out_path*.

    Args:
        tfidfVectorizer: a (possibly unfitted) sklearn TfidfVectorizer.
        stopwords: iterable of tokens to remove before vectorizing.
        in_path: input TSV, one document per line (default: dev-0 set).
        out_path: output file receiving one integer label per line.
        n_clusters: number of KMeans clusters (default 20, as in the task).
    """
    # Set membership is O(1) per token; the original list scan was O(n).
    stopset = set(stopwords)
    with open(in_path, encoding='utf-8') as input_file:
        docs = [" ".join(tok for tok in line.split() if tok not in stopset)
                for line in input_file]
    vectors = tfidfVectorizer.fit_transform(docs)
    # random_state pins centroid initialisation so repeated runs are
    # reproducible; without it each run produced different labelings.
    predictions = KMeans(n_clusters=n_clusters, max_iter=1000,
                         random_state=0).fit_predict(vectors)
    with open(out_path, "w", encoding='utf-8') as output_file:
        output_file.writelines(f"{label}\n" for label in predictions)
def calcTestA(tfidfVectorizer, stopwords,
              in_path="test-A/in.tsv", out_path="test-A/out.tsv",
              n_clusters=20):
    """Cluster the test-A documents with KMeans over TF-IDF features.

    Reads one document per line from *in_path*, drops every token found in
    *stopwords*, fits *tfidfVectorizer* on the filtered documents, clusters
    the resulting vectors into *n_clusters* groups, and writes one cluster
    label per line to *out_path*.

    Args:
        tfidfVectorizer: a (possibly unfitted) sklearn TfidfVectorizer.
        stopwords: iterable of tokens to remove before vectorizing.
        in_path: input TSV, one document per line (default: test-A set).
        out_path: output file receiving one integer label per line.
        n_clusters: number of KMeans clusters (default 20, as in the task).
    """
    # Set membership is O(1) per token; the original list scan was O(n).
    stopset = set(stopwords)
    with open(in_path, encoding='utf-8') as input_file:
        docs = [" ".join(tok for tok in line.split() if tok not in stopset)
                for line in input_file]
    vectors = tfidfVectorizer.fit_transform(docs)
    # random_state pins centroid initialisation so repeated runs are
    # reproducible; without it each run produced different labelings.
    predictions = KMeans(n_clusters=n_clusters, max_iter=1000,
                         random_state=0).fit_predict(vectors)
    with open(out_path, "w", encoding='utf-8') as output_file:
        output_file.writelines(f"{label}\n" for label in predictions)
def setStopWords(filename):
    """Return the stopword list read from *filename* (one word per line).

    Each line is stripped of surrounding whitespace (including the
    trailing newline) before being added to the returned list.
    """
    with open(filename, encoding='utf-8') as handle:
        return [line.strip() for line in handle]
def main():
    """Entry point: vectorize, then cluster both the dev-0 and test-A sets.

    Builds a fresh TF-IDF vectorizer, loads the stopword list from
    ``stopwords.txt``, and runs the clustering pipeline for each split.
    """
    vectorizer = TfidfVectorizer()
    stop_list = setStopWords('stopwords.txt')
    calcDevZero(vectorizer, stop_list)
    calcTestA(vectorizer, stop_list)


if __name__ == '__main__':
    main()