# polish-urban-legends-public/classifier.py

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.cluster import KMeansClusterer
from sklearn.cluster import KMeans


# Cluster the lines of dev-0/in.tsv: embed every line with Doc2Vec, run
# k-means over the resulting document vectors, and write one cluster id per
# input line to dev-0/out.tsv.
path = 'dev-0/'

# Decode explicitly instead of relying on the platform's locale default.
# NOTE(review): assumes the input file is UTF-8 encoded -- confirm.
with open(path + 'in.tsv', encoding='utf-8') as f:
    sentences = [line.rstrip() for line in f]

# NOTE(review): one cluster per input line leaves k-means no room to group
# documents together; confirm whether a smaller cluster count was intended.
clusters_no = len(sentences)

# Doc2Vec expects each document as a list of word tokens. Splitting on any
# whitespace (rather than a single ' ') avoids empty-string tokens when a
# line contains consecutive spaces.
splited = [sentence.split() for sentence in sentences]

# Tag each document with its line index so that row i of the document-vector
# matrix corresponds to input line i. The token lists are passed here, not
# the raw strings: a raw string would be iterated character by character.
documents = [TaggedDocument(tokens, [i]) for i, tokens in enumerate(splited)]
model = Doc2Vec(documents, min_count=1)
X = model.dv.vectors

kmeans = KMeans(n_clusters=clusters_no).fit(X)
labels = kmeans.labels_
centroids = kmeans.cluster_centers_

print("Cluster id labels for inputted data")
print(labels)
# One cluster id per line, in the same order as the input lines.
with open(path + 'out.tsv', 'w') as file:
    for label in labels:
        file.write("%i\n" % label)


# print(model.wv.vectors)
# kclusterer = KMeansClusterer(3, distance=nltk.cluster.util.cosine_distance, repeats=25)
# assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
# print(assigned_clusters)
#
# words = list(model.wv.key_to_index)
# for i, word in enumerate(words):
#     print(word + ":" + str(assigned_clusters[i]))