polish-urban-legends-public/classifier.py

57 lines
1.5 KiB
Python
Raw Normal View History

2021-04-13 19:34:47 +02:00
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.cluster import KMeans
2021-04-13 20:28:41 +02:00
from collections import Counter
import re
2021-04-13 19:34:47 +02:00
# Load the input documents: one sentence/legend per line in dev-0/in.tsv.
# Explicit UTF-8 because the corpus is Polish text with diacritics.
path = 'dev-0/'
with open(path + 'in.tsv', encoding='utf-8') as f:
    sentences = [line.rstrip() for line in f]
2021-04-13 20:28:41 +02:00
# Number of KMeans clusters to produce.
clusters_no = 10
# Stopword list, one word per line; these are stripped from every sentence
# before Doc2Vec training. UTF-8 for Polish diacritics.
with open('stopwords', encoding='utf-8') as f:
    stopwords = [line.rstrip() for line in f]
2021-04-13 19:34:47 +02:00
# Tokenize each sentence: remove stopwords (whole-word match), lowercase,
# split on spaces, and drop empty tokens.
#
# Fixes vs. the previous version:
#  * re.escape each stopword, so a word containing a regex metacharacter
#    cannot corrupt (or crash) the pattern;
#  * one compiled alternation applied once per sentence instead of
#    rebuilding a pattern per (sentence, stopword) pair;
#  * IGNORECASE, so capitalized stopwords (e.g. sentence-initial) are
#    removed too — previously removal ran before lowercasing and was
#    case-sensitive, letting them slip through.
if stopwords:
    stopword_re = re.compile(
        r'\b(?:' + '|'.join(re.escape(w) for w in stopwords) + r')\b',
        re.IGNORECASE,
    )
else:
    stopword_re = None  # no stopwords -> nothing to strip

splited = []
for sentence in sentences:
    if stopword_re is not None:
        sentence = stopword_re.sub('', sentence)
    tokens = [tok.lower() for tok in sentence.split(' ') if tok != '']
    splited.append(tokens)

# Tag each token list with its line index so labels map back to input lines.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(splited)]
2021-04-13 20:59:56 +02:00
model = Doc2Vec(documents, min_count=0, alpha=0.025, epochs=400)
2021-04-13 20:28:41 +02:00
# Per-document embedding matrix learned by Doc2Vec: row i is the vector
# for sentence i (gensim 4.x `dv` accessor). The old commented-out manual
# training loop was dead code and has been removed.
X = model.dv.vectors
2021-04-13 20:59:56 +02:00
# Cluster the document vectors into clusters_no groups.
kmeans = KMeans(n_clusters=clusters_no, max_iter=4000).fit(X)

# One cluster label per input sentence, in input order.
# (The unused `centroids = kmeans.cluster_centers_` assignment was removed.)
labels = kmeans.labels_
print(labels)

# Diagnostic: how many documents landed in each cluster.
ctr = Counter(labels)
print(ctr.values())
2021-04-13 19:34:47 +02:00
# Write one cluster label per line to dev-0/out.tsv, aligned with in.tsv.
# The commented-out nltk KMeansClusterer experiment that followed was
# dead code and has been removed.
with open(path + 'out.tsv', 'w') as file:
    for label in labels:
        file.write("%i\n" % label)