from sklearn.feature_extraction.text import TfidfVectorizer import numpy as np import sklearn.metrics from sklearn.cluster import KMeans stopwords = [] with open('./stopwords.txt') as file: for stopword in file.readlines(): stopwords.append(stopword.strip()) with open("./dev-0/in.tsv") as in_file: content = in_file.readlines() content_clear = [] for string in content: to_add = "" for word in string.split(): word = word.strip().replace(",", "") if word not in stopwords: to_add = to_add + " " + word content_clear.append(to_add) vectorizer = TfidfVectorizer() vectors = vectorizer.fit_transform(content_clear) #wyniki jakie uzyskałam wachały się między 0.72 a 0.74. Sprawdziłam to na 20 próbach i nigdy nie spadły poniżej 0.70 kmeans = KMeans(n_clusters=50).fit(vectors) result=kmeans.labels_ result=np.array2string(result, separator='\n').replace(" ", "").replace("[", "").replace("]", "") with open("./dev-0/out.tsv", "w") as file: file.write(result) with open("./test-A/in.tsv") as in_file: content = in_file.readlines() content_clear = [] for string in content: to_add = "" for word in string.split(): word = word.strip().replace(",", "") if word not in stopwords: to_add = to_add + " " + word content_clear.append(to_add) vectorizer = TfidfVectorizer() vectors = vectorizer.fit_transform(content_clear) kmeans = KMeans(n_clusters=50).fit(vectors) result=kmeans.labels_ result=np.array2string(result, separator='\n').replace(" ", "").replace("[", "") with open("./test-A/out.tsv", "w") as file: file.write(result)