diff --git a/classifier.py b/classifier.py index 816a228..5b9cb50 100644 --- a/classifier.py +++ b/classifier.py @@ -3,11 +3,12 @@ from gensim.models.doc2vec import Doc2Vec, TaggedDocument from sklearn.cluster import KMeans stopwords = [] +# stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt with open('stopwords') as f: stopwords = [line.rstrip() for line in f] -def classify(path, clusters_no): +def classify(path, n_clusters): with open(path + 'in.tsv') as f: docs = [line.rstrip() for line in f] docs_preprocessed = [] @@ -18,12 +19,12 @@ def classify(path, clusters_no): docs_preprocessed.append(doc) tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs_preprocessed)] model = Doc2Vec(tagged_documents, min_count=10, epochs=300, dm=0, seed=10, workers=1) - k_means = KMeans(n_clusters=clusters_no, n_init=15, random_state=0).fit(model.dv.vectors) + k_means = KMeans(n_clusters=n_clusters, n_init=15, random_state=0).fit(model.dv.vectors) labels = k_means.labels_ with open(path + 'out.tsv', 'w') as file: for label in labels: file.write("%i\n" % label) -classify('dev-0/', 10) -classify('test-A/', 10) \ No newline at end of file +classify('dev-0/', n_clusters=10) +classify('test-A/', n_clusters=10) \ No newline at end of file