#!/usr/bin/env python
# coding: utf-8
"""Cluster the documents of each dataset split with TF-IDF and KMeans.

For every split (``dev-0`` and ``test-A``) the script reads ``in.tsv``
(one document per line), drops Polish stop words, vectorizes the
remaining text with TF-IDF, groups the documents into 25 clusters and
writes one cluster label per input line to ``out.tsv`` in the same
directory.  Each split is vectorized and clustered independently.
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import string
from stop_words import get_stop_words

# A frozenset gives O(1) membership tests; the lookup runs once per token.
stop_words = frozenset(get_stop_words('polish'))


def _preprocess(lines):
    """Return one cleaned document string per input line.

    Each line is split on whitespace, commas are removed from every
    token, and tokens that (after comma removal) are Polish stop words
    are dropped.  Matching is case-sensitive, as in the original script.
    """
    cleaned = []
    for line in lines:
        tokens = (token.replace(",", "") for token in line.split())
        cleaned.append(" ".join(t for t in tokens if t not in stop_words))
    return cleaned


def _cluster_split(in_path, out_path):
    """Cluster the documents in *in_path* and write labels to *out_path*.

    The labels are also printed and returned.  The output file is opened
    only after clustering succeeds, so a failure does not leave behind a
    truncated, empty result file.
    """
    with open(in_path, 'r', encoding="utf-8") as source:
        documents = _preprocess(source)

    features = TfidfVectorizer().fit_transform(documents)
    labels = KMeans(n_clusters=25, max_iter=1000).fit_predict(features)
    print(labels)

    with open(out_path, "w") as sink:
        sink.writelines(str(label) + '\n' for label in labels)
    return labels


# Each split is processed from its OWN input file.  The previous version
# accidentally re-used the dev-0 documents when producing test-A/out.tsv.
predict1 = _cluster_split('dev-0/in.tsv', "dev-0/out.tsv")
predict2 = _cluster_split('test-A/in.tsv', "test-A/out.tsv")