#!/usr/bin/env python
# coding: utf-8

# In[51]:


from sklearn.feature_extraction.text import TfidfVectorizer

# In[52]:


import numpy as np
import sklearn.metrics
from sklearn.cluster import KMeans

# In[53]:


# Load the stop-word list, one word per line.
stopwords = []
with open('./stop_words.txt', encoding='utf-8') as file:
    for stopword in file.readlines():
        stopwords.append(stopword.strip())

# In[54]:


# Cleaned documents: b holds dev-0, c holds test-A.
b = []
c = []

# In[55]:


print(stopwords)

# In[56]:


# Read the dev-0 input, one document per line.
with open("./dev-0/in.tsv", encoding='utf-8') as in_file:
    a = in_file.readlines()

# In[57]:


# Remove commas and stop words from every dev-0 document.
for string in a:
    to_add = ""
    for word in string.split():
        word = word.strip().replace(",", "")
        if word not in stopwords:
            to_add = to_add + " " + word
    b.append(to_add)

# Cluster the TF-IDF vectors into 30 groups and write one cluster label per line.
kmeans = KMeans(n_clusters=30).fit(TfidfVectorizer().fit_transform(b))
out = np.array2string(kmeans.labels_, separator='\n').replace(" ", "").replace("[", "").replace("]", "")
with open("./dev-0/out.tsv", "w") as file:
    file.write(out)

# In[58]:


# Read the test-A input, one document per line.
with open("./test-A/in.tsv", encoding='utf-8') as in_file:
    a = in_file.readlines()

# In[59]:


# Remove commas and stop words from every test-A document.
for string in a:
    to_add = ""
    for word in string.split():
        word = word.strip().replace(",", "")
        if word not in stopwords:
            to_add = to_add + " " + word
    c.append(to_add)

# Cluster the TF-IDF vectors of the cleaned test-A documents into 30 groups
# and write one cluster label per line.
kmeans = KMeans(n_clusters=30).fit(TfidfVectorizer().fit_transform(c))
out = np.array2string(kmeans.labels_, separator='\n').replace(" ", "").replace("[", "").replace("]", "")
with open("./test-A/out.tsv", "w") as file:
    file.write(out)


# In[ ]: