polish-urban-legends-public/classifier.py

29 lines
1.1 KiB
Python

import string
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.cluster import KMeans
# Stop words, one per line, loaded at import time from the file 'stopwords'
# in the current working directory. Trailing whitespace (including the
# newline) is removed from every entry.
# The encoding is given explicitly so decoding does not depend on the
# platform default locale.
# NOTE(review): assumes the file is UTF-8 encoded - confirm.
with open('stopwords', encoding='utf-8') as f:
    stopwords = [line.rstrip() for line in f]
def _tokenize(line, stopword_set, punctuation_table):
    """Lower-case one document line and reduce it to its content tokens.

    The line is split on single spaces, every character listed in
    ``string.punctuation`` is removed from each piece, and pieces that end
    up empty or that appear in ``stopword_set`` are dropped.
    """
    stripped = (word.translate(punctuation_table) for word in line.lower().split(' '))
    return [token for token in stripped if token and token not in stopword_set]


def classify(path, clusters_no):
    """Cluster the documents in ``path + 'in.tsv'`` and write their labels.

    Each line of the input file is treated as one document. The documents
    are tokenized, embedded with Doc2Vec, and the resulting document
    vectors are grouped with KMeans. One integer cluster label per line is
    written to ``path + 'out.tsv'``, in the row order of
    ``model.dv.vectors`` (presumably the input order, since documents are
    tagged 0..n-1 - verify against the gensim documentation).

    Args:
        path: Prefix that is concatenated directly with the file names
            'in.tsv' and 'out.tsv'; a directory must therefore include
            its trailing separator (e.g. 'dev-0/').
        clusters_no: Number of clusters passed to KMeans as ``n_clusters``.
    """
    # Build the lookup structures once per call: a set gives constant-time
    # stop-word tests, and a translation table strips punctuation in a
    # single pass per word.
    stopword_set = set(stopwords)
    punctuation_table = str.maketrans('', '', string.punctuation)

    # NOTE(review): assumes the input is UTF-8 encoded - confirm.
    with open(path + 'in.tsv', encoding='utf-8') as f:
        docs_preprocessed = [
            _tokenize(line.rstrip(), stopword_set, punctuation_table)
            for line in f
        ]

    # Tag every document with its position in the input file.
    tagged_documents = [
        TaggedDocument(doc, [i]) for i, doc in enumerate(docs_preprocessed)
    ]
    # NOTE(review): the fixed seed, workers=1 and random_state=0 are
    # presumably there to make runs reproducible - confirm.
    model = Doc2Vec(tagged_documents, min_count=10, epochs=300, dm=0, seed=10, workers=1)
    k_means = KMeans(n_clusters=clusters_no, n_init=15, random_state=0).fit(model.dv.vectors)

    with open(path + 'out.tsv', 'w', encoding='utf-8') as file:
        file.writelines("%i\n" % label for label in k_means.labels_)
# Cluster each dataset directory into 10 groups; every call reads
# <dir>in.tsv and writes <dir>out.tsv.
for dataset_dir in ('dev-0/', 'test-A/'):
    classify(dataset_dir, 10)