# 2021-04-13 19:34:47 +02:00
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.cluster import KMeans
# 2021-04-13 20:28:41 +02:00
from collections import Counter
import re
# 2021-04-13 19:34:47 +02:00

sentences = []
|
|
|
|
path = 'dev-0/'
|
|
|
|
with open(path + 'in.tsv') as f:
|
|
|
|
sentences = [line.rstrip() for line in f]
|
|
|
|
|
2021-04-13 20:28:41 +02:00
|
|
|
clusters_no = 10
|
|
|
|
|
|
|
|
stopwords = []
|
|
|
|
with open('stopwords') as f:
|
|
|
|
stopwords = [line.rstrip() for line in f]
|
|
|
|
|
2021-04-13 19:34:47 +02:00
|
|
|
splited = []
|
|
|
|
for sentence in sentences:
|
2021-04-13 20:28:41 +02:00
|
|
|
for w in stopwords:
|
|
|
|
pattern = r'\b' + w + r'\b'
|
|
|
|
sentence = re.sub(pattern, '', sentence)
|
|
|
|
spl = sentence.split(' ')
|
|
|
|
spl = list(filter(lambda x: x != '', spl))
|
|
|
|
spl = [x.lower() for x in spl]
|
|
|
|
splited.append(spl)
|
|
|
|
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(splited)]
|
2021-04-13 20:59:56 +02:00
|
|
|
model = Doc2Vec(documents, min_count=0, alpha=0.025, epochs=400)
|
2021-04-13 20:28:41 +02:00
|
|
|
# for epoch in range(10):
|
|
|
|
# print('Training epoch %s', epoch)
|
|
|
|
# model.train()
|
|
|
|
# model.alpha -= 0.002
|
|
|
|
# model.min_alpha = model.alpha
|
|
|
|
|
|
|
|
|
2021-04-13 19:34:47 +02:00
|
|
|
X = model.dv.vectors
|
|
|
|
|
2021-04-13 20:59:56 +02:00
|
|
|
kmeans = KMeans(n_clusters=clusters_no, max_iter=4000).fit(X)
|
2021-04-13 19:34:47 +02:00
|
|
|
labels = kmeans.labels_
|
|
|
|
centroids = kmeans.cluster_centers_
|
|
|
|
|
|
|
|
print(labels)
|
2021-04-13 20:28:41 +02:00
|
|
|
ctr = Counter(labels)
|
|
|
|
print(ctr.values())
|
2021-04-13 19:34:47 +02:00
|
|
|
with open(path + 'out.tsv', 'w') as file:
|
|
|
|
for label in labels:
|
|
|
|
file.write("%i\n" % label)
|
|
|
|
# Alternative clustering approach kept for reference (requires nltk):
# print(model.wv.vectors)
# kclusterer = KMeansClusterer(3, distance=nltk.cluster.util.cosine_distance, repeats=25)
# assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
# print(assigned_clusters)
#
# words = list(model.wv.key_to_index)
# for i, word in enumerate(words):
#     print(word + ":" + str(assigned_clusters[i]))