Add stopwords source comment and rename clusters_no to n_clusters
This commit is contained in:
parent
37351edc0f
commit
9bbc8213a9
@ -3,11 +3,12 @@ from gensim.models.doc2vec import Doc2Vec, TaggedDocument
|
|||||||
from sklearn.cluster import KMeans
|
from sklearn.cluster import KMeans
|
||||||
|
|
||||||
stopwords = []
|
stopwords = []
|
||||||
|
# Polish stopwords list source: https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
|
||||||
with open('stopwords') as f:
|
with open('stopwords') as f:
|
||||||
stopwords = [line.rstrip() for line in f]
|
stopwords = [line.rstrip() for line in f]
|
||||||
|
|
||||||
|
|
||||||
def classify(path, clusters_no):
|
def classify(path, n_clusters):
|
||||||
with open(path + 'in.tsv') as f:
|
with open(path + 'in.tsv') as f:
|
||||||
docs = [line.rstrip() for line in f]
|
docs = [line.rstrip() for line in f]
|
||||||
docs_preprocessed = []
|
docs_preprocessed = []
|
||||||
@ -18,12 +19,12 @@ def classify(path, clusters_no):
|
|||||||
docs_preprocessed.append(doc)
|
docs_preprocessed.append(doc)
|
||||||
tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs_preprocessed)]
|
tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs_preprocessed)]
|
||||||
model = Doc2Vec(tagged_documents, min_count=10, epochs=300, dm=0, seed=10, workers=1)
|
model = Doc2Vec(tagged_documents, min_count=10, epochs=300, dm=0, seed=10, workers=1)
|
||||||
k_means = KMeans(n_clusters=clusters_no, n_init=15, random_state=0).fit(model.dv.vectors)
|
k_means = KMeans(n_clusters=n_clusters, n_init=15, random_state=0).fit(model.dv.vectors)
|
||||||
labels = k_means.labels_
|
labels = k_means.labels_
|
||||||
with open(path + 'out.tsv', 'w') as file:
|
with open(path + 'out.tsv', 'w') as file:
|
||||||
for label in labels:
|
for label in labels:
|
||||||
file.write("%i\n" % label)
|
file.write("%i\n" % label)
|
||||||
|
|
||||||
|
|
||||||
classify('dev-0/', 10)
|
classify('dev-0/', n_clusters=10)
|
||||||
classify('test-A/', 10)
|
classify('test-A/', n_clusters=10)
|
Loading…
Reference in New Issue
Block a user