"""Cluster sentences from dev-0/in.tsv into 10 groups with Doc2Vec + KMeans.

Reads one sentence per line from ``<path>/in.tsv``, removes stopwords
(listed one per line in the ``stopwords`` file), trains a Doc2Vec model
on the tokenized sentences, clusters the document vectors with KMeans,
and writes one cluster label per line to ``<path>/out.tsv``.
"""

from collections import Counter
import re

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.cluster import KMeans

CLUSTERS_NO = 10


def _read_lines(filename):
    """Return the right-stripped lines of *filename* as a list of strings."""
    with open(filename, encoding='utf-8') as f:
        return [line.rstrip() for line in f]


def _tokenize(sentences, stopwords):
    """Lowercase each sentence, remove stopwords, and split into tokens.

    Fixes two defects of the original pass:
    - sentences are lowercased BEFORE stopword removal, so capitalized
      occurrences (e.g. sentence-initial "The") are removed too;
    - stopwords are regex-escaped, so a word containing a regex
      metacharacter can no longer corrupt the pattern or raise re.error.

    One compiled alternation replaces the original one-re.sub-per-word
    loop, which rescanned every sentence once per stopword.
    """
    pattern = None
    if stopwords:
        alternation = '|'.join(re.escape(w.lower()) for w in stopwords)
        pattern = re.compile(r'\b(?:' + alternation + r')\b')

    tokenized = []
    for sentence in sentences:
        sentence = sentence.lower()
        if pattern is not None:
            sentence = pattern.sub('', sentence)
        # str.split() with no argument drops empty tokens, replacing the
        # original split(' ') + filter-out-empty-strings combination.
        tokenized.append(sentence.split())
    return tokenized


def main():
    path = 'dev-0/'
    sentences = _read_lines(path + 'in.tsv')
    stopwords = _read_lines('stopwords')

    splited = _tokenize(sentences, stopwords)

    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(splited)]
    # min_count=0 keeps every token in the vocabulary; 400 epochs
    # compensates for the small corpus.
    model = Doc2Vec(documents, min_count=0, alpha=0.025, epochs=400)

    X = model.dv.vectors
    kmeans = KMeans(n_clusters=CLUSTERS_NO, max_iter=4000).fit(X)
    labels = kmeans.labels_

    print(labels)
    print(Counter(labels).values())

    with open(path + 'out.tsv', 'w', encoding='utf-8') as file:
        file.writelines("%i\n" % label for label in labels)


if __name__ == '__main__':
    main()