# polish-urban-legends-public/script.py

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


def openFileWithStopwords(filename):
    """Load a stop-word list from a UTF-8 text file, one word per line.

    Leading and trailing whitespace (including the line terminator) is
    stripped from every line. Lines that are empty after stripping are
    skipped, so blank lines in the file do not produce ``''`` entries.

    Args:
        filename: Path of the stop-word file to read.

    Returns:
        A list of the stop words, in the order they appear in the file.

    Raises:
        OSError: If the file cannot be opened (propagated from ``open``).
    """
    with open(filename, encoding='utf-8') as file:
        # Iterate the file lazily instead of materialising it with readlines().
        stripped_lines = (line.strip() for line in file)
        return [word for word in stripped_lines if word]


def calculate(clusters, maxiter, tfidfVectorizer, stopwords, fileIn, fileOut):
    """Cluster the documents in ``fileIn`` and write one label per line to ``fileOut``.

    Every line of ``fileIn`` is treated as one document. Each document is
    split on whitespace, tokens found in ``stopwords`` are dropped, and the
    remaining tokens are re-joined with single spaces. The cleaned documents
    are vectorised with ``tfidfVectorizer.fit_transform`` and clustered with
    ``KMeans``; the predicted label of each document is written to
    ``fileOut`` in input order.

    Args:
        clusters: Number of clusters, passed to ``KMeans`` as ``n_clusters``.
        maxiter: Iteration limit, passed to ``KMeans`` as ``max_iter``.
        tfidfVectorizer: Object providing ``fit_transform(documents)``; it is
            re-fitted on the documents of ``fileIn`` on every call.
        stopwords: Iterable of tokens to remove from the documents.
        fileIn: Path of the UTF-8 input file, one document per line.
        fileOut: Path of the UTF-8 output file; overwritten if it exists.

    Note:
        No ``random_state`` is passed to ``KMeans``, so the labels may differ
        between runs on the same input.
    """
    # Build the lookup set once: the membership test runs for every token of
    # every document, and a list lookup there is linear in the stop-word count.
    stopword_set = set(stopwords)

    with open(fileIn, encoding='utf-8') as infile:
        documents = [
            " ".join(token for token in line.split() if token not in stopword_set)
            for line in infile
        ]

    # The input file is already closed here; clustering only needs the
    # in-memory documents.
    features = tfidfVectorizer.fit_transform(documents)
    labels = KMeans(n_clusters=clusters, max_iter=maxiter).fit_predict(features)

    # The output file is opened only after clustering succeeded, so a failure
    # above does not truncate an existing result file.
    with open(fileOut, "w", encoding='utf-8') as output:
        output.writelines(str(label) + '\n' for label in labels)


def main():
    """Run the clustering pipeline on the dev-0 and test-A data sets.

    Both data sets are processed with the same settings: one shared
    ``TfidfVectorizer`` instance, 20 clusters, at most 1000 KMeans
    iterations, and the stop words read from ``stopwords.txt``.
    """
    tfidf = TfidfVectorizer()
    cluster_count, iteration_limit = 20, 1000
    stop_list = openFileWithStopwords('stopwords.txt')

    # (input path, output path) pairs, processed in this order.
    datasets = (
        ("dev-0/in.tsv", "dev-0/out.tsv"),
        ("test-A/in.tsv", "test-A/out.tsv"),
    )
    for source_path, target_path in datasets:
        calculate(cluster_count, iteration_limit, tfidf, stop_list, source_path, target_path)


# Run the clustering pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
done 2021-04-26 23:38:04 +02:00			`from sklearn.feature_extraction.text import TfidfVectorizer`
			`from sklearn.cluster import KMeans`


			`def openFileWithStopwords(filename):`
			`stopwords = []`
			`with open(filename, encoding='utf-8') as file:`
			`stopwords = [stopWord.strip() for stopWord in file.readlines()]`
			`return stopwords`


			`def calculate(clusters, maxiter, tfidfVectorizer, stopwords, fileIn, fileOut):`
			`with open(fileIn, encoding='utf-8') as infile:`
			`documents = [" ".join([stopword for stopword in document.split() if stopword not in stopwords]) for document in`
			`infile.readlines()]`
			`results = KMeans(n_clusters=clusters, max_iter=maxiter).fit_predict(tfidfVectorizer.fit_transform(documents))`
			`with open(fileOut, "w", encoding='utf-8') as output:`
			`for result in results:`
			`output.write(str(result) + '\n')`


			`def main():`
			`vectorizer = TfidfVectorizer()`
			`clusters = 20`
			`maxiter = 1000`
			`stopwords = openFileWithStopwords('stopwords.txt')`
			`calculate(clusters, maxiter, vectorizer, stopwords, "dev-0/in.tsv", "dev-0/out.tsv")`
			`calculate(clusters, maxiter, vectorizer, stopwords, "test-A/in.tsv", "test-A/out.tsv")`


			`if __name__ == '__main__':`
			`main()`