#!/usr/bin/env python
# coding: utf-8

# In[51]:

from sklearn.feature_extraction.text import TfidfVectorizer

# In[52]:

import numpy as np
import sklearn.metrics
from sklearn.cluster import KMeans

# In[53]:

# Load the stop-word list, one word per line.
stopwords = []
with open('./stop_words.txt', encoding='utf-8') as file:
    for stopword in file:
        stopwords.append(stopword.strip())

# In[54]:

dev_docs = []   # cleaned dev-0 documents
test_docs = []  # cleaned test-A documents

# In[55]:

print(stopwords)

# In[56]:

with open("./dev-0/in.tsv", encoding='utf-8') as in_file:
    lines = in_file.readlines()

# In[57]:

# Strip commas and drop stop words from every dev-0 line.
for line in lines:
    kept_words = []
    for word in line.split():
        word = word.replace(",", "")
        if word not in stopwords:
            kept_words.append(word)
    dev_docs.append(" ".join(kept_words))

# Cluster the TF-IDF vectors and write one cluster label per line.
kmeans = KMeans(n_clusters=30).fit(TfidfVectorizer().fit_transform(dev_docs))
out = "\n".join(str(label) for label in kmeans.labels_)
with open("./dev-0/out.tsv", "w") as file:
    file.write(out + "\n")

# In[58]:

with open("./test-A/in.tsv", encoding='utf-8') as in_file:
    lines = in_file.readlines()

# In[59]:

# Same preprocessing and clustering for the test-A set.
for line in lines:
    kept_words = []
    for word in line.split():
        word = word.replace(",", "")
        if word not in stopwords:
            kept_words.append(word)
    test_docs.append(" ".join(kept_words))

kmeans = KMeans(n_clusters=30).fit(TfidfVectorizer().fit_transform(test_docs))
out = "\n".join(str(label) for label in kmeans.labels_)
with open("./test-A/out.tsv", "w") as file:
    file.write(out + "\n")