From 54ef4f18bff33ffd0d51e4e5fd1ad1b3d03e68f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Kolasin=CC=81ski?=
Date: Tue, 13 Apr 2021 19:34:47 +0200
Subject: [PATCH] Add classifier

---
 .gitignore    |  6 ++++
 classifier.py | 35 ++++++++++++++++++++
 dev-0/out.tsv | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 128 insertions(+)
 create mode 100644 classifier.py
 create mode 100644 dev-0/out.tsv

diff --git a/.gitignore b/.gitignore
index b72f9be..2aec18a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,8 @@
 *~
 *.swp
+.idea/.gitignore
+.idea/misc.xml
+.idea/modules.xml
+.idea/polish-urban-legends-public.iml
+.idea/vcs.xml
+.idea/inspectionProfiles/profiles_settings.xml
diff --git a/classifier.py b/classifier.py
new file mode 100644
index 0000000..0be75eb
--- /dev/null
+++ b/classifier.py
@@ -0,0 +1,35 @@
+"""Cluster the documents in dev-0/in.tsv with Doc2Vec and k-means.
+
+Reads one document per line, embeds every document with gensim's Doc2Vec,
+clusters the embeddings with scikit-learn's KMeans and writes one cluster id
+per input line to dev-0/out.tsv.
+"""
+from gensim.models.doc2vec import Doc2Vec, TaggedDocument
+from nltk.cluster import KMeansClusterer
+from sklearn.cluster import KMeans
+
+
+path = 'dev-0/'
+with open(path + 'in.tsv', encoding='utf-8') as f:
+    sentences = [line.rstrip() for line in f]
+
+# NOTE(review): one cluster per input line gives every document its own
+# cluster; kept from the original, confirm the intended cluster count.
+clusters_no = len(sentences)
+
+# Doc2Vec expects every document as a list of word tokens; a raw string
+# would be consumed character by character instead of word by word.
+documents = [
+    TaggedDocument(sentence.split(), [i])
+    for i, sentence in enumerate(sentences)
+]
+model = Doc2Vec(documents, min_count=1)
+X = model.dv.vectors
+
+kmeans = KMeans(n_clusters=clusters_no).fit(X)
+labels = kmeans.labels_
+
+print("Cluster id labels for inputted data")
+print(labels)
+with open(path + 'out.tsv', 'w', encoding='utf-8') as file:
+    file.writelines(f"{label}\n" for label in labels)
diff --git a/dev-0/out.tsv b/dev-0/out.tsv
new file mode 100644
index 0000000..462d969
--- /dev/null
+++ b/dev-0/out.tsv
@@ -0,0 +1,87 @@
+44
+51
+54
+65
+33
+4
+57
+17
+39
+68
+21
+22
+29
+0
+71
+19
+40
+80
+20
+38
+72
+36
+52
+66
+5
+2
+47
+13
+10
+48
+30
+35
+69
+12
+56
+16
+14
+76
+11
+84
+61
+75
+74
+67
+73
+3
+86
+15
+27
+7
+28
+41
+60
+77
+79
+45
+55
+50
+83
+85
+31
+46
+70
+37
+1
+24
+58
+78
+53
+43
+64
+62
+63
+42
+23
+26
+25
+32
+59
+9
+82
+18
+49
+8
+6
+81
+34