#!/usr/bin/env python
# coding: utf-8

# In[51]:

from sklearn.feature_extraction.text import TfidfVectorizer

# In[52]:

import numpy as np
import sklearn.metrics
from sklearn.cluster import KMeans

# In[53]:

# Load the stop-word list, one word per line.
stopwords = []
with open('./stop_words.txt', encoding='utf-8') as file:
    for stopword in file:
        stopwords.append(stopword.strip())

# In[54]:

dev_docs = []   # cleaned dev-0 documents
test_docs = []  # cleaned test-A documents

# In[55]:

print(stopwords)

# In[56]:

with open("./dev-0/in.tsv", encoding='utf-8') as in_file:
    lines = in_file.readlines()

# In[57]:

# Strip commas and drop stop words from every dev-0 line.
for line in lines:
    kept_words = []
    for word in line.split():
        word = word.replace(",", "")
        if word not in stopwords:
            kept_words.append(word)
    dev_docs.append(" ".join(kept_words))

# Cluster the TF-IDF vectors and write one cluster label per line.
kmeans = KMeans(n_clusters=30).fit(TfidfVectorizer().fit_transform(dev_docs))
out = "\n".join(str(label) for label in kmeans.labels_)
with open("./dev-0/out.tsv", "w") as file:
    file.write(out + "\n")

# In[58]:

with open("./test-A/in.tsv", encoding='utf-8') as in_file:
    lines = in_file.readlines()

# In[59]:

# Same preprocessing and clustering for the test-A set.
for line in lines:
    kept_words = []
    for word in line.split():
        word = word.replace(",", "")
        if word not in stopwords:
            kept_words.append(word)
    test_docs.append(" ".join(kept_words))

kmeans = KMeans(n_clusters=30).fit(TfidfVectorizer().fit_transform(test_docs))
out = "\n".join(str(label) for label in kmeans.labels_)
with open("./test-A/out.tsv", "w") as file:
    file.write(out + "\n")