test-A, script

This commit is contained in:
bednarco 2021-04-20 19:06:45 +02:00
parent 960a201fb5
commit 6194b5dd46
4 changed files with 912 additions and 247 deletions

File diff suppressed because one or more lines are too long

View File

@ -1,87 +1,87 @@
1
7
0
6
4
8
9
3
5
1
3
4
0
3
0
9
9
5
9
9
4
6
2
0
6
3
0
6
7
4
8
7
2
6
6
7
7
3
4
8
4
4
8
0
4
6
2
9
9
5
5
5
0
9
5
4
4
7
2
2
2
4
7
2
6
6
8
3
8
1
8
0
8
1
9
6
5
7
4
5
9
7
1
6
1
2
9
1
3
2
9
5
7
5
2
0
7
1
1
3
2
7
5
0
4
5
9
4
1
5
9
0
1
1
8
7
7
2
3
2
7
2
2
6
4
2
1
3
2
4
3
1
2
7
0
0
1
9
4
3
4
0
3
4
2
5
7
4
0

1 1 0
2 7 6
3 4
4 8
5 9
6 3 3
7 5
8 1
9 3
10 4
11 0
12 3
13 0
14 9
15 9
16 5
17 9
18 9 9
19 4 4
6
2
20 0 0
21 6 8
3
0
6
7
4
22 7 7
23 2 2
24 6
25 6
26 7 7
27 7 6
28 3 2
29 4 9
30 8 9
31 4 5
32 4 5
33 8 5
34 0 0
35 4 9
36 5 5
4
4
37 7 7
38 2 6
39 2 6
40 2 8
41 4 3
42 7 8
43 2 1
44 8
45 0
46 8
47 1
48 9
49 6
50 5
51 7 7
52 4 4
53 5 5
54 9 7
55 1
56 6 6
1
2
57 9 9
58 1 9
59 3 5
2
60 7 7
61 5 5
62 2 7
63 0 1
64 1
65 3 3
66 2 7
67 5
68 0
69 4 4
70 5
71 9
72 4
73 1
74 5
75 9
76 0
77 1
78 1 1
79 8 8
7
7
2
3
2
7
2
2
6
4
2
1
3
2
4
3
1
2
7
0
0
1
80 9 9
81 4 4
82 3 3
83 4
84 0 0
85 3 5
4
2
86 7 7
87 4 0

75
script.py Normal file
View File

@ -0,0 +1,75 @@
import pandas as pd
from many_stop_words import get_stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from unidecode import unidecode
from nltk.tokenize import word_tokenize
import string
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Both inputs are headerless, tab-separated files; column 0 is the one the
# rest of the script processes.
_TSV_READ_OPTIONS = {'sep': '\t', 'header': None}
data = pd.read_csv('dev-0/in.tsv', **_TSV_READ_OPTIONS)
data_test = pd.read_csv('test-A/in.tsv', **_TSV_READ_OPTIONS)
# Deletion table built once: maps every character of string.punctuation to None.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuations(text):
    """Return *text* with every ASCII punctuation character removed.

    Only the characters listed in ``string.punctuation`` are dropped; letters,
    digits, whitespace and non-ASCII characters are left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string without any ``string.punctuation`` characters.
    """
    # A single translate pass replaces one str.replace call per punctuation mark.
    return text.translate(_PUNCTUATION_TABLE)
stop_words = get_stop_words('pl')
# The corpus text is passed through unidecode below, so the stop words get the
# same treatment; otherwise accented stop words would never match.
uni_stop_words = [unidecode(x) for x in stop_words]
# Constant-time membership test; the list above is kept under its original name.
_uni_stop_word_set = frozenset(uni_stop_words)


def _drop_stop_words(text):
    """Return *text* with whitespace-separated stop words removed."""
    return ' '.join(
        item for item in text.split() if item not in _uni_stop_word_set
    )


def _clean_text_column(column):
    """Lower-case, unidecode, strip punctuation and stop words from a Series.

    The steps run in the same order for every corpus so that both inputs are
    normalised identically.
    """
    column = column.str.lower()
    column = column.apply(unidecode)
    column = column.apply(remove_punctuations)
    return column.apply(_drop_stop_words)


data[0] = _clean_text_column(data[0])
data_test[0] = _clean_text_column(data_test[0])
tf = TfidfVectorizer()
# fit_transform learns the vocabulary anew on each call, so the dev and test
# matrices are built independently; afterwards `tf` holds the fit from the
# last corpus only.
text_tf, text_test_tf = [
    tf.fit_transform(corpus) for corpus in (data[0], data_test[0])
]
def _plot_elbow(features, k_values):
    """Fit KMeans for each k, plot inertia against k, and return the inertias.

    Args:
        features: Feature matrix passed to ``KMeans.fit``.
        k_values: Iterable of cluster counts to try.

    Returns:
        A list with the fitted model's ``inertia_`` for each value in
        *k_values*, in the same order.
    """
    inertias = [
        KMeans(n_clusters=k, max_iter=200, n_init=10).fit(features).inertia_
        for k in k_values
    ]
    plt.plot(k_values, inertias, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum_of_squared_distances')
    plt.title('Elbow Method For Optimal k')
    plt.show()
    return inertias


# Elbow curve for the dev-0 features.
K = range(2, 20)
Sum_of_squared_distances = _plot_elbow(text_tf, K)

# Elbow curve for the test-A features, scanned over a wider range of k.
K = range(2, 30)
Sum_of_squared_distances = _plot_elbow(text_test_tf, K)
# Options shared by both final models; only the number of clusters differs.
_FINAL_KMEANS_OPTIONS = {'init': 'k-means++', 'max_iter': 200, 'n_init': 10}

# Cluster counts are hard-coded -- presumably read off the elbow plots; confirm
# before reusing them on other data.
true_k_dev = 10
model_dev = KMeans(n_clusters=true_k_dev, **_FINAL_KMEANS_OPTIONS).fit(text_tf)
labels_dev = model_dev.labels_
clusters_dev = pd.DataFrame(list(labels_dev), columns=['cluster'])

true_k_test = 28
model_test = KMeans(n_clusters=true_k_test, **_FINAL_KMEANS_OPTIONS).fit(text_test_tf)
labels_test = model_test.labels_
clusters_test = pd.DataFrame(list(labels_test), columns=['cluster'])
# Write one cluster id per line, without index or header. Forward slashes are
# used so the files land inside dev-0/ and test-A/ on every platform; a
# backslash is not a path separator on POSIX and "\o" is an invalid escape.
clusters_dev.to_csv("dev-0/out.tsv", sep="\t", index=False, header=False)
clusters_test.to_csv("test-A/out.tsv", sep="\t", index=False, header=False)

691
test-A/out.tsv Normal file
View File

@ -0,0 +1,691 @@
27
17
3
19
1
7
27
10
19
1
2
20
15
22
12
1
1
11
1
12
10
15
7
22
25
17
19
13
10
1
4
5
7
6
8
2
20
19
3
27
21
23
1
15
25
21
0
11
3
12
3
24
19
22
9
23
19
3
16
24
21
1
25
17
12
6
22
7
0
12
9
8
1
1
11
19
27
12
21
2
9
26
18
2
17
20
19
19
17
21
22
9
8
17
1
1
27
25
27
14
25
15
1
13
20
0
7
20
11
17
15
3
12
3
20
17
17
12
11
19
11
10
16
21
19
3
1
23
15
23
9
8
21
23
16
8
4
19
18
4
27
10
11
4
8
19
17
4
19
23
1
1
17
12
22
20
1
14
1
15
22
17
4
11
9
20
18
22
8
8
2
19
14
20
1
18
19
16
23
2
26
11
5
1
10
10
10
18
10
9
27
8
20
19
14
14
19
3
19
27
21
24
27
25
1
1
3
11
17
27
15
1
12
7
14
20
12
7
16
10
12
0
9
17
18
8
22
13
18
20
0
13
23
9
7
25
8
22
7
19
27
12
6
13
19
16
9
9
21
11
0
2
26
15
24
18
5
1
22
11
23
15
12
13
4
13
4
2
24
11
24
10
9
19
7
1
25
15
11
1
19
9
23
11
15
27
11
3
1
7
27
0
22
2
9
9
1
27
1
13
25
11
12
9
2
16
19
7
17
2
17
9
6
1
18
2
9
4
5
24
21
18
15
17
21
21
17
7
11
25
7
19
19
23
24
3
19
6
12
19
17
21
15
12
22
11
1
20
0
0
22
7
9
15
1
22
9
1
27
1
5
8
20
20
9
4
3
5
11
22
17
21
20
13
10
14
23
1
22
19
24
2
4
25
27
15
25
20
13
7
19
6
12
3
12
2
27
17
1
21
17
19
23
14
22
12
7
10
10
15
21
27
10
20
23
9
11
9
4
5
20
0
20
7
22
24
3
17
13
12
8
22
11
24
26
12
21
15
22
7
16
3
21
14
1
2
1
26
15
13
24
2
27
13
21
23
20
11
21
9
11
0
23
2
27
1
3
19
7
21
21
23
21
10
1
0
24
23
8
16
22
18
21
0
22
25
19
9
24
17
27
3
11
22
15
11
15
4
17
11
25
3
2
13
19
6
15
1
15
25
7
22
7
2
24
20
2
1
2
11
15
10
22
11
17
13
19
18
16
5
26
27
21
3
19
15
24
12
9
0
3
4
1
11
15
7
16
5
20
15
1
21
24
13
8
26
27
27
8
6
7
3
16
10
13
1
23
19
10
8
3
3
9
2
21
20
15
11
20
19
23
13
10
7
24
9
26
23
19
9
2
20
22
7
15
2
27
20
10
24
3
12
9
12
23
2
16
27
21
1
20
5
27
13
20
19
11
11
2
17
25
15
9
3
12
18
25
9
1
25
20
11
8
1
21
27
18
22
16
4
12
27
8
23
10
22
19
22
13
2
9
13
26
20
12
0
1
24
20
22
20
7
1
19
19
15
16
19
8
19
15
1
16
22
27
18
1
16
16
7
16
8
7
22
5
3
12
13
27
10
22
1 27
2 17
3 3
4 19
5 1
6 7
7 27
8 10
9 19
10 1
11 2
12 20
13 15
14 22
15 12
16 1
17 1
18 11
19 1
20 12
21 10
22 15
23 7
24 22
25 25
26 17
27 19
28 13
29 10
30 1
31 4
32 5
33 7
34 6
35 8
36 2
37 20
38 19
39 3
40 27
41 21
42 23
43 1
44 15
45 25
46 21
47 0
48 11
49 3
50 12
51 3
52 24
53 19
54 22
55 9
56 23
57 19
58 3
59 16
60 24
61 21
62 1
63 25
64 17
65 12
66 6
67 22
68 7
69 0
70 12
71 9
72 8
73 1
74 1
75 11
76 19
77 27
78 12
79 21
80 2
81 9
82 26
83 18
84 2
85 17
86 20
87 19
88 19
89 17
90 21
91 22
92 9
93 8
94 17
95 1
96 1
97 27
98 25
99 27
100 14
101 25
102 15
103 1
104 13
105 20
106 0
107 7
108 20
109 11
110 17
111 15
112 3
113 12
114 3
115 20
116 17
117 17
118 12
119 11
120 19
121 11
122 10
123 16
124 21
125 19
126 3
127 1
128 23
129 15
130 23
131 9
132 8
133 21
134 23
135 16
136 8
137 4
138 19
139 18
140 4
141 27
142 10
143 11
144 4
145 8
146 19
147 17
148 4
149 19
150 23
151 1
152 1
153 17
154 12
155 22
156 20
157 1
158 14
159 1
160 15
161 22
162 17
163 4
164 11
165 9
166 20
167 18
168 22
169 8
170 8
171 2
172 19
173 14
174 20
175 1
176 18
177 19
178 16
179 23
180 2
181 26
182 11
183 5
184 1
185 10
186 10
187 10
188 18
189 10
190 9
191 27
192 8
193 20
194 19
195 14
196 14
197 19
198 3
199 19
200 27
201 21
202 24
203 27
204 25
205 1
206 1
207 3
208 11
209 17
210 27
211 15
212 1
213 12
214 7
215 14
216 20
217 12
218 7
219 16
220 10
221 12
222 0
223 9
224 17
225 18
226 8
227 22
228 13
229 18
230 20
231 0
232 13
233 23
234 9
235 7
236 25
237 8
238 22
239 7
240 19
241 27
242 12
243 6
244 13
245 19
246 16
247 9
248 9
249 21
250 11
251 0
252 2
253 26
254 15
255 24
256 18
257 5
258 1
259 22
260 11
261 23
262 15
263 12
264 13
265 4
266 13
267 4
268 2
269 24
270 11
271 24
272 10
273 9
274 19
275 7
276 1
277 25
278 15
279 11
280 1
281 19
282 9
283 23
284 11
285 15
286 27
287 11
288 3
289 1
290 7
291 27
292 0
293 22
294 2
295 9
296 9
297 1
298 27
299 1
300 13
301 25
302 11
303 12
304 9
305 2
306 16
307 19
308 7
309 17
310 2
311 17
312 9
313 6
314 1
315 18
316 2
317 9
318 4
319 5
320 24
321 21
322 18
323 15
324 17
325 21
326 21
327 17
328 7
329 11
330 25
331 7
332 19
333 19
334 23
335 24
336 3
337 19
338 6
339 12
340 19
341 17
342 21
343 15
344 12
345 22
346 11
347 1
348 20
349 0
350 0
351 22
352 7
353 9
354 15
355 1
356 22
357 9
358 1
359 27
360 1
361 5
362 8
363 20
364 20
365 9
366 4
367 3
368 5
369 11
370 22
371 17
372 21
373 20
374 13
375 10
376 14
377 23
378 1
379 22
380 19
381 24
382 2
383 4
384 25
385 27
386 15
387 25
388 20
389 13
390 7
391 19
392 6
393 12
394 3
395 12
396 2
397 27
398 17
399 1
400 21
401 17
402 19
403 23
404 14
405 22
406 12
407 7
408 10
409 10
410 15
411 21
412 27
413 10
414 20
415 23
416 9
417 11
418 9
419 4
420 5
421 20
422 0
423 20
424 7
425 22
426 24
427 3
428 17
429 13
430 12
431 8
432 22
433 11
434 24
435 26
436 12
437 21
438 15
439 22
440 7
441 16
442 3
443 21
444 14
445 1
446 2
447 1
448 26
449 15
450 13
451 24
452 2
453 27
454 13
455 21
456 23
457 20
458 11
459 21
460 9
461 11
462 0
463 23
464 2
465 27
466 1
467 3
468 19
469 7
470 21
471 21
472 23
473 21
474 10
475 1
476 0
477 24
478 23
479 8
480 16
481 22
482 18
483 21
484 0
485 22
486 25
487 19
488 9
489 24
490 17
491 27
492 3
493 11
494 22
495 15
496 11
497 15
498 4
499 17
500 11
501 25
502 3
503 2
504 13
505 19
506 6
507 15
508 1
509 15
510 25
511 7
512 22
513 7
514 2
515 24
516 20
517 2
518 1
519 2
520 11
521 15
522 10
523 22
524 11
525 17
526 13
527 19
528 18
529 16
530 5
531 26
532 27
533 21
534 3
535 19
536 15
537 24
538 12
539 9
540 0
541 3
542 4
543 1
544 11
545 15
546 7
547 16
548 5
549 20
550 15
551 1
552 21
553 24
554 13
555 8
556 26
557 27
558 27
559 8
560 6
561 7
562 3
563 16
564 10
565 13
566 1
567 23
568 19
569 10
570 8
571 3
572 3
573 9
574 2
575 21
576 20
577 15
578 11
579 20
580 19
581 23
582 13
583 10
584 7
585 24
586 9
587 26
588 23
589 19
590 9
591 2
592 20
593 22
594 7
595 15
596 2
597 27
598 20
599 10
600 24
601 3
602 12
603 9
604 12
605 23
606 2
607 16
608 27
609 21
610 1
611 20
612 5
613 27
614 13
615 20
616 19
617 11
618 11
619 2
620 17
621 25
622 15
623 9
624 3
625 12
626 18
627 25
628 9
629 1
630 25
631 20
632 11
633 8
634 1
635 21
636 27
637 18
638 22
639 16
640 4
641 12
642 27
643 8
644 23
645 10
646 22
647 19
648 22
649 13
650 2
651 9
652 13
653 26
654 20
655 12
656 0
657 1
658 24
659 20
660 22
661 20
662 7
663 1
664 19
665 19
666 15
667 16
668 19
669 8
670 19
671 15
672 1
673 16
674 22
675 27
676 18
677 1
678 16
679 16
680 7
681 16
682 8
683 7
684 22
685 5
686 3
687 12
688 13
689 27
690 10
691 22