from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


def calcDevZero(tfidfVectorizer, stopwords):
    # Read the dev-0 documents, strip stopwords, vectorize with TF-IDF,
    # cluster into 20 groups with K-means and write one label per line.
    with open("dev-0/in.tsv", encoding='utf-8') as input_file:
        docs = [" ".join([sword for sword in document.split() if sword not in stopwords])
                for document in input_file.readlines()]
    vectors = tfidfVectorizer.fit_transform(docs)
    predictions = KMeans(n_clusters=20, max_iter=1000).fit_predict(vectors)
    with open("dev-0/out.tsv", "w", encoding='utf-8') as output_file:
        for prediction in predictions:
            output_file.write(str(prediction) + '\n')


def calcTestA(tfidfVectorizer, stopwords):
    # Same pipeline as calcDevZero, applied to the test-A split.
    with open("test-A/in.tsv", encoding='utf-8') as input_file:
        docs = [" ".join([sword for sword in document.split() if sword not in stopwords])
                for document in input_file.readlines()]
    vectors = tfidfVectorizer.fit_transform(docs)
    predictions = KMeans(n_clusters=20, max_iter=1000).fit_predict(vectors)
    with open("test-A/out.tsv", "w", encoding='utf-8') as output_file:
        for prediction in predictions:
            output_file.write(str(prediction) + '\n')


def setStopWords(filename):
    # Load one stopword per line; a set gives O(1) membership checks
    # when filtering document tokens.
    with open(filename, encoding='utf-8') as stopwords_file:
        stopwords = {sWord.strip() for sWord in stopwords_file.readlines()}
    return stopwords


def main():
    tfidfVectorizer = TfidfVectorizer()
    stopwords = setStopWords('stopwords.txt')
    calcDevZero(tfidfVectorizer, stopwords)
    calcTestA(tfidfVectorizer, stopwords)


if __name__ == '__main__':
    main()