Add classifier

2021-04-13 19:34:47 +02:00 · 2021-04-13 19:34:47 +02:00 · 54ef4f18bf
commit 54ef4f18bf
parent e4adfb04dc
3 changed files with 131 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,8 @@
 *~
 *.swp
 .idea/.gitignore
 .idea/misc.xml
 .idea/modules.xml
 .idea/polish-urban-legends-public.iml
 .idea/vcs.xml
 .idea/inspectionProfiles/profiles_settings.xml
--- a/classifier.py
+++ b/classifier.py
@ -0,0 +1,38 @@
 from gensim.models.doc2vec import Doc2Vec, TaggedDocument
 from nltk.cluster import KMeansClusterer
 from sklearn.cluster import KMeans
 # Cluster the lines of dev-0/in.tsv: train a Doc2Vec model on them, run
 # KMeans over the learned document vectors, and write one cluster id per
 # input line to dev-0/out.tsv.
 path = 'dev-0/'
 # Explicit UTF-8 so reading does not depend on the platform default encoding.
 with open(path + 'in.tsv', encoding='utf-8') as f:
     sentences = [line.rstrip() for line in f]
 # NOTE(review): this requests as many clusters as there are input lines, so
 # KMeans can hand every document its own label -- confirm the intended
 # number of clusters.
 clusters_no = len(sentences)
 # Tokenise each line on single spaces; Doc2Vec consumes lists of tokens.
 splited = [sentence.split(' ') for sentence in sentences]
 # Fix: tag the token lists rather than the raw strings. A raw string is
 # iterated character by character, so the model would be trained on letters
 # instead of words.
 documents = [TaggedDocument(words, [i]) for i, words in enumerate(splited)]
 # min_count=1 keeps every token, including those that occur only once.
 model = Doc2Vec(documents, min_count=1)
 # One learned vector per tagged document, in tag order.
 X = model.dv.vectors
 kmeans = KMeans(n_clusters=clusters_no).fit(X)
 labels = kmeans.labels_
 centroids = kmeans.cluster_centers_
 print("Cluster id labels for inputted data")
 print(labels)
 # Emit one integer cluster id per line, in the same order as the input.
 with open(path + 'out.tsv', 'w', encoding='utf-8') as file:
     file.writelines("%i\n" % label for label in labels)
 # print(model.wv.vectors)
 # kclusterer = KMeansClusterer(3, distance=nltk.cluster.util.cosine_distance, repeats=25)
 # assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
 # print(assigned_clusters)
 #
 # words = list(model.wv.key_to_index)
 # for i, word in enumerate(words):
 #     print(word + ":" + str(assigned_clusters[i]))
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
@ -0,0 +1,87 @@
 44
 51
 54
 65
 33
 4
 57
 17
 39
 68
 21
 22
 29
 0
 71
 19
 40
 80
 20
 38
 72
 36
 52
 66
 5
 2
 47
 13
 10
 48
 30
 35
 69
 12
 56
 16
 14
 76
 11
 84
 61
 75
 74
 67
 73
 3
 86
 15
 27
 7
 28
 41
 60
 77
 79
 45
 55
 50
 83
 85
 31
 46
 70
 37
 1
 24
 58
 78
 53
 43
 64
 62
 63
 42
 23
 26
 25
 32
 59
 9
 82
 18
 49
 8
 6
 81
 34
		`@ -0,0 +1,87 @@`
							`44`
							`51`
							`54`
							`65`
							`33`
							`4`
							`57`
							`17`
							`39`
							`68`
							`21`
							`22`
							`29`
							`0`
							`71`
							`19`
							`40`
							`80`
							`20`
							`38`
							`72`
							`36`
							`52`
							`66`
							`5`
							`2`
							`47`
							`13`
							`10`
							`48`
							`30`
							`35`
							`69`
							`12`
							`56`
							`16`
							`14`
							`76`
							`11`
							`84`
							`61`
							`75`
							`74`
							`67`
							`73`
							`3`
							`86`
							`15`
							`27`
							`7`
							`28`
							`41`
							`60`
							`77`
							`79`
							`45`
							`55`
							`50`
							`83`
							`85`
							`31`
							`46`
							`70`
							`37`
							`1`
							`24`
							`58`
							`78`
							`53`
							`43`
							`64`
							`62`
							`63`
							`42`
							`23`
							`26`
							`25`
							`32`
							`59`
							`9`
							`82`
							`18`
							`49`
							`8`
							`6`
							`81`
							`34`