# polish-urban-legends-public/Untitled.py
# 93 lines, 1.5 KiB, Python
# Retrieved from repository web view, 2021-04-25 15:39:23 +02:00
#!/usr/bin/env python
# coding: utf-8
# In[51]:
from sklearn.feature_extraction.text import TfidfVectorizer
# In[52]:
import numpy as np
import sklearn.metrics
from sklearn.cluster import KMeans
# In[53]:
# Load the stop-word list: one word per line in ./stop_words.txt.
stopwords = []
with open('./stop_words.txt', encoding='utf-8') as file:
    stopwords = [entry.strip() for entry in file]
# In[54]:
# Cleaned-document buffers: b collects dev-0 texts, c collects test-A texts.
b, c = [], []
# Echo the loaded stop words for a quick sanity check.
print(stopwords)
# In[56]:
# --- dev-0 set: clean the documents, cluster them, write one label per line ---
with open("./dev-0/in.tsv", encoding='utf-8') as in_file:
    a = in_file.readlines()

# Strip commas and drop stop words from every document before vectorising.
for string in a:
    kept = []
    for word in string.split():
        word = word.strip().replace(",", "")
        if word not in stopwords:
            kept.append(word)
    # join() avoids the quadratic `to_add = to_add + " " + word`
    # concatenation (and its stray leading space) of the original.
    b.append(" ".join(kept))

# Cluster TF-IDF vectors of the cleaned documents into 30 groups.
# NOTE(review): KMeans is unseeded, so labels vary run to run — pass
# random_state=... if reproducible output is required.
kmeans = KMeans(n_clusters=30).fit(TfidfVectorizer().fit_transform(b))

# BUG FIX: np.array2string truncates arrays longer than its default
# threshold (1000) to "...", silently corrupting the output file.
# Join the labels directly instead of post-processing its repr.
out = "\n".join(str(label) for label in kmeans.labels_)
with open("./dev-0/out.tsv", "w") as file:
    file.write(out)
# In[58]:
# --- test-A set: same pipeline as dev-0, on the test-A input/output files ---
with open("./test-A/in.tsv", encoding='utf-8') as in_file:
    a = in_file.readlines()

# Strip commas and drop stop words from every document before vectorising.
for string in a:
    kept = []
    for word in string.split():
        word = word.strip().replace(",", "")
        if word not in stopwords:
            kept.append(word)
    c.append(" ".join(kept))

# BUG FIX: the original fitted on `content_clear`, which is never defined
# (NameError) — the cleaned test-A documents are accumulated in `c`.
kmeans = KMeans(n_clusters=30).fit(TfidfVectorizer().fit_transform(c))

# BUG FIX: the original wrote the undefined name `result` (NameError) and,
# even on its intended path, forgot to strip the trailing "]" from the
# np.array2string output. Joining the labels directly fixes both and also
# avoids array2string's "..." truncation on arrays longer than 1000 entries.
out = "\n".join(str(label) for label in kmeans.labels_)
with open("./test-A/out.tsv", "w") as file:
    file.write(out)
# In[ ]: