Add classifier

This commit is contained in:
Jakub Kolasiński 2021-04-13 19:34:47 +02:00
parent e4adfb04dc
commit 54ef4f18bf
3 changed files with 131 additions and 0 deletions

6
.gitignore vendored
View File

@ -1,2 +1,8 @@
*~
*.swp
.idea/.gitignore
.idea/misc.xml
.idea/modules.xml
.idea/polish-urban-legends-public.iml
.idea/vcs.xml
.idea/inspectionProfiles/profiles_settings.xml

38
classifier.py Normal file
View File

@ -0,0 +1,38 @@
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.cluster import KMeansClusterer
from sklearn.cluster import KMeans
# Cluster the documents stored one-per-line in dev-0/in.tsv: embed every line
# with Doc2Vec, group the embeddings with k-means, and write one cluster id
# per input line (in input order) to dev-0/out.tsv.
path = 'dev-0/'

# Explicit encoding so decoding does not depend on the platform locale.
# NOTE(review): presumably the corpus is UTF-8 -- confirm against the data.
with open(path + 'in.tsv', encoding='utf-8') as f:
    sentences = [line.rstrip() for line in f]

# NOTE(review): asking for as many clusters as there are input lines lets
# k-means put every document into its own cluster, which makes the labels
# meaningless as a grouping. The intended number of clusters is not visible
# here, so the value is kept as-is -- replace it with the real target count.
clusters_no = len(sentences)

# Tokenize on any whitespace; split() (unlike split(' ')) never yields
# empty-string tokens for repeated spaces and also separates on tabs.
splited = [sentence.split() for sentence in sentences]

# TaggedDocument expects a list of word tokens. Passing the raw string (as
# the previous version did) makes Doc2Vec iterate it character by character,
# so the model would learn character vectors instead of word vectors.
documents = [TaggedDocument(tokens, [i]) for i, tokens in enumerate(splited)]
model = Doc2Vec(documents, min_count=1)

# One learned vector per tagged document, indexed by the integer tag.
X = model.dv.vectors

kmeans = KMeans(n_clusters=clusters_no).fit(X)
labels = kmeans.labels_
centroids = kmeans.cluster_centers_

print("Cluster id labels for inputted data")
print(labels)

# One label per line, in the same order as the input lines.
with open(path + 'out.tsv', 'w', encoding='utf-8') as file:
    file.writelines("%i\n" % label for label in labels)
# print(model.wv.vectors)
# kclusterer = KMeansClusterer(3, distance=nltk.cluster.util.cosine_distance, repeats=25)
# assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
# print(assigned_clusters)
#
# words = list(model.wv.key_to_index)
# for i, word in enumerate(words):
# print(word + ":" + str(assigned_clusters[i]))

87
dev-0/out.tsv Normal file
View File

@ -0,0 +1,87 @@
44
51
54
65
33
4
57
17
39
68
21
22
29
0
71
19
40
80
20
38
72
36
52
66
5
2
47
13
10
48
30
35
69
12
56
16
14
76
11
84
61
75
74
67
73
3
86
15
27
7
28
41
60
77
79
45
55
50
83
85
31
46
70
37
1
24
58
78
53
43
64
62
63
42
23
26
25
32
59
9
82
18
49
8
6
81
34
1 44
2 51
3 54
4 65
5 33
6 4
7 57
8 17
9 39
10 68
11 21
12 22
13 29
14 0
15 71
16 19
17 40
18 80
19 20
20 38
21 72
22 36
23 52
24 66
25 5
26 2
27 47
28 13
29 10
30 48
31 30
32 35
33 69
34 12
35 56
36 16
37 14
38 76
39 11
40 84
41 61
42 75
43 74
44 67
45 73
46 3
47 86
48 15
49 27
50 7
51 28
52 41
53 60
54 77
55 79
56 45
57 55
58 50
59 83
60 85
61 31
62 46
63 70
64 37
65 1
66 24
67 58
68 78
69 53
70 43
71 64
72 62
73 63
74 42
75 23
76 26
77 25
78 32
79 59
80 9
81 82
82 18
83 49
84 8
85 6
86 81
87 34