#!/usr/bin/env python
# coding: utf-8

# In[51]:


from sklearn.feature_extraction.text import TfidfVectorizer

# In[52]:


import numpy as np
import sklearn.metrics
from sklearn.cluster import KMeans

# In[53]:


# Load the stop-word list, one word per line.
stopwords = []
with open('./stop_words.txt', encoding='utf-8') as file:
    for stopword in file.readlines():
        stopwords.append(stopword.strip())

# In[54]:


# Cleaned documents: b holds dev-0, c holds test-A.
b = []
c = []

# In[55]:


print(stopwords)

# In[56]:


# Read the dev-0 input, one document per line.
with open("./dev-0/in.tsv", encoding='utf-8') as in_file:
    a = in_file.readlines()

# In[57]:


# Remove commas and stop words from every dev-0 document.
for string in a:
    to_add = ""
    for word in string.split():
        word = word.strip().replace(",", "")
        if word not in stopwords:
            to_add = to_add + " " + word
    b.append(to_add)

# Cluster the TF-IDF vectors into 30 groups and write one cluster label per line.
kmeans = KMeans(n_clusters=30).fit(TfidfVectorizer().fit_transform(b))
out = np.array2string(kmeans.labels_, separator='\n').replace(" ", "").replace("[", "").replace("]", "")
with open("./dev-0/out.tsv", "w") as file:
    file.write(out)

# In[58]:


# Read the test-A input, one document per line.
with open("./test-A/in.tsv", encoding='utf-8') as in_file:
    a = in_file.readlines()

# In[59]:


# Remove commas and stop words from every test-A document.
for string in a:
    to_add = ""
    for word in string.split():
        word = word.strip().replace(",", "")
        if word not in stopwords:
            to_add = to_add + " " + word
    c.append(to_add)

# Cluster the TF-IDF vectors of the cleaned test-A documents into 30 groups
# and write one cluster label per line.
kmeans = KMeans(n_clusters=30).fit(TfidfVectorizer().fit_transform(c))
out = np.array2string(kmeans.labels_, separator='\n').replace(" ", "").replace("[", "").replace("]", "")
with open("./test-A/out.tsv", "w") as file:
    file.write(out)


# In[ ]: