Add stopwords source

2021-04-17 16:56:37 +02:00 · 2021-04-17 16:56:37 +02:00 · 9bbc8213a9
commit 9bbc8213a9
parent 37351edc0f
1 changed file with 5 additions and 4 deletions
--- a/classifier.py
+++ b/classifier.py
@@ -3,11 +3,12 @@ from gensim.models.doc2vec import Doc2Vec, TaggedDocument
 from sklearn.cluster import KMeans

 stopwords = []
+# stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
 with open('stopwords') as f:
    stopwords = [line.rstrip() for line in f]


-def classify(path, clusters_no):
+def classify(path, n_clusters):
    with open(path + 'in.tsv') as f:
        docs = [line.rstrip() for line in f]
    docs_preprocessed = []
@@ -18,12 +19,12 @@ def classify(path, clusters_no):
        docs_preprocessed.append(doc)
    tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs_preprocessed)]
    model = Doc2Vec(tagged_documents, min_count=10, epochs=300, dm=0, seed=10, workers=1)
-    k_means = KMeans(n_clusters=clusters_no, n_init=15, random_state=0).fit(model.dv.vectors)
+    k_means = KMeans(n_clusters=n_clusters, n_init=15, random_state=0).fit(model.dv.vectors)
    labels = k_means.labels_
    with open(path + 'out.tsv', 'w') as file:
        for label in labels:
            file.write("%i\n" % label)


-classify('dev-0/', 10)
-classify('test-A/', 10)
+classify('dev-0/', n_clusters=10)
+classify('test-A/', n_clusters=10)