From ab4fc228bdf57664ae6ad2fcb85b74385091d1c2 Mon Sep 17 00:00:00 2001 From: kuba Date: Sat, 17 Apr 2021 16:28:56 +0200 Subject: [PATCH] Add preprocessing, set model parameters --- classifier.py | 68 ++--- dev-0/out.tsv | 154 +++++----- test-A/expected.tsv | 691 ++++++++++++++++++++++++++++++++++++++++++++ test-A/out.tsv | 691 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 1479 insertions(+), 125 deletions(-) create mode 100644 test-A/expected.tsv create mode 100644 test-A/out.tsv diff --git a/classifier.py b/classifier.py index 0afbe15..816a228 100644 --- a/classifier.py +++ b/classifier.py @@ -1,57 +1,29 @@ +import string from gensim.models.doc2vec import Doc2Vec, TaggedDocument from sklearn.cluster import KMeans -from collections import Counter -import re - -sentences = [] -path = 'dev-0/' -with open(path + 'in.tsv') as f: - sentences = [line.rstrip() for line in f] - -clusters_no = 10 stopwords = [] with open('stopwords') as f: stopwords = [line.rstrip() for line in f] -splited = [] -for sentence in sentences: - for w in stopwords: - pattern = r'\b' + w + r'\b' - sentence = re.sub(pattern, '', sentence) - spl = sentence.split(' ') - spl = list(filter(lambda x: x != '', spl)) - spl = [x.lower() for x in spl] - splited.append(spl) -documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(splited)] -model = Doc2Vec(documents, min_count=0, alpha=0.025, epochs=400) -# for epoch in range(10): -# print('Training epoch %s', epoch) -# model.train() -# model.alpha -= 0.002 -# model.min_alpha = model.alpha + +def classify(path, clusters_no): + with open(path + 'in.tsv') as f: + docs = [line.rstrip() for line in f] + docs_preprocessed = [] + for doc in docs: + doc = doc.lower().split(' ') + doc = [''.join(char for char in word if char not in string.punctuation) for word in doc] + doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc)) + docs_preprocessed.append(doc) + tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs_preprocessed)] + model = Doc2Vec(tagged_documents, min_count=10, epochs=300, dm=0, seed=10, workers=1) + k_means = KMeans(n_clusters=clusters_no, n_init=15, random_state=0).fit(model.dv.vectors) + labels = k_means.labels_ + with open(path + 'out.tsv', 'w') as file: + for label in labels: + file.write("%i\n" % label) -X = model.dv.vectors - -kmeans = KMeans(n_clusters=clusters_no, max_iter=4000).fit(X) -labels = kmeans.labels_ -centroids = kmeans.cluster_centers_ - -print(labels) -ctr = Counter(labels) -print(ctr.values()) -with open(path + 'out.tsv', 'w') as file: - for label in labels: - file.write("%i\n" % label) - - - -# print(model.wv.vectors) -# kclusterer = KMeansClusterer(3, distance=nltk.cluster.util.cosine_distance, repeats=25) -# assigned_clusters = kclusterer.cluster(X, assign_clusters=True) -# print(assigned_clusters) -# -# words = list(model.wv.key_to_index) -# for i, word in enumerate(words): -# print(word + ":" + str(assigned_clusters[i])) \ No newline at end of file +classify('dev-0/', 10) +classify('test-A/', 10) \ No newline at end of file diff --git a/dev-0/out.tsv b/dev-0/out.tsv index b145ba4..e26aa0e 100644 --- a/dev-0/out.tsv +++ b/dev-0/out.tsv @@ -1,87 +1,87 @@ -5 -2 -3 -3 4 -4 -5 -7 -4 -3 -3 -4 -1 -3 -3 -3 -3 -3 -3 -2 -3 -3 -3 -3 -5 -4 -4 -3 -4 -3 9 -5 -5 -3 -3 -5 -3 -3 -4 -3 -4 -3 -5 -5 -3 -3 -5 +2 +6 +0 +8 1 +3 +8 +2 +3 +8 +5 +6 +6 +7 +6 +6 +2 +6 +6 +6 +0 +2 +3 +0 +1 +0 6 5 1 -3 -5 -3 -3 -3 +7 1 -0 -5 -3 -5 -3 -5 -3 +6 +6 +7 +6 4 -3 -5 -3 -3 -5 -3 -3 -5 -2 -5 -2 1 -3 -5 -1 -3 -4 -3 -2 -4 -3 +6 8 +4 +7 +6 +4 +6 +7 +9 +1 +1 +3 +2 +1 +6 +4 +2 +5 +5 +7 +6 +7 +9 +7 +7 +8 +6 +1 +4 +6 +7 +0 +2 +4 +7 +9 +3 +3 +4 +6 +6 +2 +3 +2 +0 +7 +6 +0 diff --git a/test-A/expected.tsv b/test-A/expected.tsv new file mode 100644 index 0000000..c292bbf --- /dev/null +++ b/test-A/expected.tsv @@ -0,0 +1,691 @@ +41 +39 +6 +29 +34 +27 +41 +32 +54 +28 +36 +19 +46 +3 +57 +6 +3 +19 +3 +57 +3 +3 +53 +54 +3 +3 +15 +3 +36 +3 +36 +32 +24 +6 +51 +12 +19 +13 +6 +26 +15 +16 +6 +46 +29 +29 +11 +25 +6 +57 +6 +15 +61 +38 +49 +38 +28 +3 +54 +54 +29 +6 +3 +3 +2 +6 +3 +33 +51 +3 +36 +51 +1 +1 +15 +40 +54 +54 +15 +36 +36 +4 +6 +3 +28 +12 +11 +54 +28 +15 +48 +49 +51 +34 +3 +28 +41 +36 +36 +0 +3 +10 +3 +36 +29 +11 +54 +54 +32 +12 +44 +6 +2 +6 +54 +34 +28 +12 +52 +29 +47 +32 +3 +11 +29 +6 +28 +38 +44 +6 +49 +51 +11 +38 +3 +51 +54 +2 +6 +3 +47 +3 +47 +3 +51 +29 +19 +3 +29 +38 +3 +28 +28 +57 +48 +19 +3 +54 +1 +46 +48 +29 +11 +47 +49 +29 +6 +48 +51 +51 +12 +6 +36 +49 +1 +6 +36 +36 +11 +12 +4 +47 +54 +3 +3 +51 +32 +6 +9 +49 +41 +52 +19 +11 +51 +3 +29 +6 +11 +41 +15 +58 +41 +36 +3 +3 +6 +47 +28 +41 +2 +3 +28 +36 +3 +19 +46 +33 +12 +2 +57 +11 +49 +3 +6 +51 +48 +3 +52 +19 +11 +15 +38 +15 +29 +49 +51 +36 +36 +29 +41 +2 +6 +3 +3 +12 +36 +49 +19 +25 +11 +50 +4 +46 +15 +6 +22 +29 +48 +25 +38 +32 +3 +28 +54 +3 +11 +28 +15 +36 +29 +15 +49 +36 +3 +30 +29 +44 +47 +6 +36 +36 +0 +3 +46 +47 +29 +3 +3 +24 +32 +11 +54 +2 +12 +49 +3 +47 +3 +3 +3 +35 +12 +42 +11 +36 +29 +15 +28 +21 +28 +55 +34 +5 +51 +20 +49 +47 +3 +3 +15 +6 +3 +28 +15 +15 +28 +15 +3 +36 +6 +6 +29 +38 +11 +3 +54 +34 +57 +36 +28 +11 +46 +45 +14 +47 +6 +19 +11 +6 +3 +3 +49 +32 +3 +36 +49 +3 +36 +3 +32 +29 +3 +36 +49 +3 +6 +46 +25 +48 +28 +15 +19 +3 +32 +51 +38 +3 +48 +3 +3 +29 +3 +54 +7 +46 +36 +19 +36 +33 +3 +6 +54 +6 +19 +12 +41 +34 +3 +11 +3 +29 +38 +36 +38 +2 +54 +3 +49 +46 +15 +47 +32 +54 +38 +36 +47 +49 +3 +3 +54 +3 +36 +53 +31 +15 +6 +34 +15 +57 +0 +6 +25 +49 +4 +11 +16 +46 +15 +24 +12 +6 +15 +36 +3 +12 +1 +4 +32 +29 +47 +15 +41 +3 +36 +8 +10 +25 +15 +36 +52 +11 +3 +36 +26 +3 +3 +11 +25 +15 +15 +41 +15 +51 +1 +36 +15 +6 +3 +12 +31 +6 +15 +54 +36 +3 +11 +36 +15 +28 +6 +6 +47 +11 +44 +32 +46 +3 +34 +25 +36 +6 +12 +29 +36 +43 +46 +1 +46 +36 +3 +37 +3 +12 +36 +19 +12 +3 +28 +25 +46 +3 +48 +3 +3 +28 +36 +36 +3 +46 +4 +15 +15 +6 +52 +46 +3 +57 +49 +3 +6 +3 +34 +36 +36 +24 +36 +46 +59 +46 +34 +15 +15 +3 +51 +4 +41 +17 +51 +6 +24 +3 +54 +9 +6 +3 +38 +6 +32 +51 +6 +6 +49 +12 +15 +19 +3 +6 +19 +36 +38 +54 +3 +24 +15 +60 +4 +38 +54 +49 +12 +54 +3 +25 +10 +28 +41 +36 +6 +6 +36 +57 +49 +57 +38 +15 +51 +51 +36 +3 +19 +46 +41 +36 +19 +52 +47 +25 +57 +28 +3 +46 +49 +6 +12 +36 +3 +54 +3 +36 +19 +25 +3 +3 +15 +41 +6 +48 +12 +29 +57 +41 +3 +38 +36 +48 +56 +38 +3 +12 +49 +36 +4 +19 +2 +11 +1 +27 +19 +3 +3 +33 +18 +12 +6 +46 +36 +23 +3 +3 +10 +28 +3 +48 +41 +51 +3 +29 +52 +24 +12 +51 +54 +48 +32 +6 +57 +36 +41 +15 +48 diff --git a/test-A/out.tsv b/test-A/out.tsv new file mode 100644 index 0000000..c292bbf --- /dev/null +++ b/test-A/out.tsv @@ -0,0 +1,691 @@ +41 +39 +6 +29 +34 +27 +41 +32 +54 +28 +36 +19 +46 +3 +57 +6 +3 +19 +3 +57 +3 +3 +53 +54 +3 +3 +15 +3 +36 +3 +36 +32 +24 +6 +51 +12 +19 +13 +6 +26 +15 +16 +6 +46 +29 +29 +11 +25 +6 +57 +6 +15 +61 +38 +49 +38 +28 +3 +54 +54 +29 +6 +3 +3 +2 +6 +3 +33 +51 +3 +36 +51 +1 +1 +15 +40 +54 +54 +15 +36 +36 +4 +6 +3 +28 +12 +11 +54 +28 +15 +48 +49 +51 +34 +3 +28 +41 +36 +36 +0 +3 +10 +3 +36 +29 +11 +54 +54 +32 +12 +44 +6 +2 +6 +54 +34 +28 +12 +52 +29 +47 +32 +3 +11 +29 +6 +28 +38 +44 +6 +49 +51 +11 +38 +3 +51 +54 +2 +6 +3 +47 +3 +47 +3 +51 +29 +19 +3 +29 +38 +3 +28 +28 +57 +48 +19 +3 +54 +1 +46 +48 +29 +11 +47 +49 +29 +6 +48 +51 +51 +12 +6 +36 +49 +1 +6 +36 +36 +11 +12 +4 +47 +54 +3 +3 +51 +32 +6 +9 +49 +41 +52 +19 +11 +51 +3 +29 +6 +11 +41 +15 +58 +41 +36 +3 +3 +6 +47 +28 +41 +2 +3 +28 +36 +3 +19 +46 +33 +12 +2 +57 +11 +49 +3 +6 +51 +48 +3 +52 +19 +11 +15 +38 +15 +29 +49 +51 +36 +36 +29 +41 +2 +6 +3 +3 +12 +36 +49 +19 +25 +11 +50 +4 +46 +15 +6 +22 +29 +48 +25 +38 +32 +3 +28 +54 +3 +11 +28 +15 +36 +29 +15 +49 +36 +3 +30 +29 +44 +47 +6 +36 +36 +0 +3 +46 +47 +29 +3 +3 +24 +32 +11 +54 +2 +12 +49 +3 +47 +3 +3 +3 +35 +12 +42 +11 +36 +29 +15 +28 +21 +28 +55 +34 +5 +51 +20 +49 +47 +3 +3 +15 +6 +3 +28 +15 +15 +28 +15 +3 +36 +6 +6 +29 +38 +11 +3 +54 +34 +57 +36 +28 +11 +46 +45 +14 +47 +6 +19 +11 +6 +3 +3 +49 +32 +3 +36 +49 +3 +36 +3 +32 +29 +3 +36 +49 +3 +6 +46 +25 +48 +28 +15 +19 +3 +32 +51 +38 +3 +48 +3 +3 +29 +3 +54 +7 +46 +36 +19 +36 +33 +3 +6 +54 +6 +19 +12 +41 +34 +3 +11 +3 +29 +38 +36 +38 +2 +54 +3 +49 +46 +15 +47 +32 +54 +38 +36 +47 +49 +3 +3 +54 +3 +36 +53 +31 +15 +6 +34 +15 +57 +0 +6 +25 +49 +4 +11 +16 +46 +15 +24 +12 +6 +15 +36 +3 +12 +1 +4 +32 +29 +47 +15 +41 +3 +36 +8 +10 +25 +15 +36 +52 +11 +3 +36 +26 +3 +3 +11 +25 +15 +15 +41 +15 +51 +1 +36 +15 +6 +3 +12 +31 +6 +15 +54 +36 +3 +11 +36 +15 +28 +6 +6 +47 +11 +44 +32 +46 +3 +34 +25 +36 +6 +12 +29 +36 +43 +46 +1 +46 +36 +3 +37 +3 +12 +36 +19 +12 +3 +28 +25 +46 +3 +48 +3 +3 +28 +36 +36 +3 +46 +4 +15 +15 +6 +52 +46 +3 +57 +49 +3 +6 +3 +34 +36 +36 +24 +36 +46 +59 +46 +34 +15 +15 +3 +51 +4 +41 +17 +51 +6 +24 +3 +54 +9 +6 +3 +38 +6 +32 +51 +6 +6 +49 +12 +15 +19 +3 +6 +19 +36 +38 +54 +3 +24 +15 +60 +4 +38 +54 +49 +12 +54 +3 +25 +10 +28 +41 +36 +6 +6 +36 +57 +49 +57 +38 +15 +51 +51 +36 +3 +19 +46 +41 +36 +19 +52 +47 +25 +57 +28 +3 +46 +49 +6 +12 +36 +3 +54 +3 +36 +19 +25 +3 +3 +15 +41 +6 +48 +12 +29 +57 +41 +3 +38 +36 +48 +56 +38 +3 +12 +49 +36 +4 +19 +2 +11 +1 +27 +19 +3 +3 +33 +18 +12 +6 +46 +36 +23 +3 +3 +10 +28 +3 +48 +41 +51 +3 +29 +52 +24 +12 +51 +54 +48 +32 +6 +57 +36 +41 +15 +48