Compare commits

...

5 Commits

Author     SHA1         Message         Date
bednarco   296fe0638e   final           2021-04-20 19:15:41 +02:00
bednarco   b17760162b   updated         2021-04-20 19:13:31 +02:00
bednarco   b7150f138d   updated         2021-04-20 19:11:45 +02:00
bednarco   6194b5dd46   test-A, script  2021-04-20 19:06:45 +02:00
bednarco   960a201fb5   out             2021-04-20 18:43:03 +02:00
5 changed files with 1399 additions and 0 deletions


@@ -0,0 +1,179 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from nltk.tokenize import RegexpTokenizer\n",
"from stop_words import get_stop_words\n",
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"data=pd.read_csv('dev-0/in.tsv', sep='\\t', header=None)\n",
"expected_data=pd.read_csv('dev-0/expected.tsv', sep='\\t', header=None)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"data[0] = data[0].str.lower()\n",
"filtered_words = [word for word in data[0] if word not in get_stop_words('polish')]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"token = RegexpTokenizer(r'[a-zA-Z0-9]+')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"cv = CountVectorizer(lowercase=True,ngram_range = (1,1),tokenizer = token.tokenize)\n",
"text_counts= cv.fit_transform(data[0])"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<1x5048 sparse matrix of type '<class 'numpy.int64'>'\n",
"\twith 234 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text_counts"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(\n",
" text_counts, expected_data[0], test_size=0.3, random_state=1)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MultinomialNB Accuracy: 0.6296296296296297\n"
]
}
],
"source": [
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn import metrics\n",
"clf = MultinomialNB().fit(X_train, y_train)\n",
"predicted= clf.predict(X_test)\n",
"print(\"MultinomialNB Accuracy:\",metrics.accuracy_score(y_test, predicted))"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"tf=TfidfVectorizer()\n",
"text_tf= tf.fit_transform(filtered_words)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" text_tf, expected_data[0], test_size=0.3, random_state=123)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MultinomialNB Accuracy: 0.2222222222222222\n"
]
}
],
"source": [
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn import metrics\n",
"clf = MultinomialNB().fit(X_train, y_train)\n",
"predicted= clf.predict(X_test)\n",
"print(\"MultinomialNB Accuracy:\",metrics.accuracy_score(y_test, predicted))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
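
The two MultinomialNB cells above differ both in their features (raw counts vs TF-IDF) and in their split seeds, which makes the accuracy drop from 0.63 to 0.22 hard to attribute. Below is a minimal sketch of a like-for-like comparison, fitting each vectorizer on the training rows only and reusing one split for both; the paths are the ones the notebook reads, and the variable names are illustrative, not part of the commit:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Same files the notebook reads: one document per row, labels row-aligned.
docs = pd.read_csv('dev-0/in.tsv', sep='\t', header=None)[0].str.lower()
labels = pd.read_csv('dev-0/expected.tsv', sep='\t', header=None)[0]

# Split once on row indices so both feature sets see identical train/test rows.
train_idx, test_idx = train_test_split(docs.index, test_size=0.3, random_state=1)

for name, vec in [('counts', CountVectorizer()), ('tf-idf', TfidfVectorizer())]:
    X_train = vec.fit_transform(docs[train_idx])  # vocabulary from training rows only
    X_test = vec.transform(docs[test_idx])        # same vocabulary for held-out rows
    clf = MultinomialNB().fit(X_train, labels[train_idx])
    print(name, accuracy_score(labels[test_idx], clf.predict(X_test)))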

367 Untitled.ipynb Normal file

File diff suppressed because one or more lines are too long

87 dev-0/out.tsv Normal file

@@ -0,0 +1,87 @@
1
7
3
9
4
6
2
0
6
3
0
6
7
4
7
2
7
7
3
4
8
4
4
8
0
4
5
4
4
7
2
2
2
4
7
2
7
4
5
9
6
1
2
9
1
3
2
7
5
2
0
3
2
4
1
8
7
7
2
3
2
7
2
2
6
4
2
1
3
2
4
3
1
2
7
0
0
1
9
4
3
0
3
4
2
7
4

75 script.py Normal file

@@ -0,0 +1,75 @@
import pandas as pd
from many_stop_words import get_stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from unidecode import unidecode
import string
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Load the dev and test documents: one document per row, no header.
data = pd.read_csv('dev-0/in.tsv', sep='\t', header=None)
data_test = pd.read_csv('test-A/in.tsv', sep='\t', header=None)
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

# Normalise both splits: lowercase, strip Polish diacritics, drop punctuation and stop words.
data[0] = data[0].str.lower()
data_test[0] = data_test[0].str.lower()
stop_words = get_stop_words('pl')
data[0] = data[0].apply(unidecode)
data_test[0] = data_test[0].apply(unidecode)
# Unidecode the stop words too, so they match the already-transliterated documents.
uni_stop_words = [unidecode(x) for x in stop_words]
data[0] = data[0].apply(remove_punctuations)
data_test[0] = data_test[0].apply(remove_punctuations)
data[0] = data[0].apply(lambda x: ' '.join(item for item in x.split() if item not in uni_stop_words))
data_test[0] = data_test[0].apply(lambda x: ' '.join(item for item in x.split() if item not in uni_stop_words))
# TF-IDF features. Each split is vectorized independently, so the second
# fit_transform refits the vectorizer and the two matrices use different vocabularies.
tf = TfidfVectorizer()
text_tf = tf.fit_transform(data[0])
text_test_tf = tf.fit_transform(data_test[0])
# Elbow method on the dev split.
sum_of_squared_distances = []
K = range(2, 20)
for k in K:
    km = KMeans(n_clusters=k, max_iter=200, n_init=10).fit(text_tf)
    sum_of_squared_distances.append(km.inertia_)
plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum of squared distances')
plt.title('Elbow Method For Optimal k')
plt.show()

# Elbow method on the test split.
sum_of_squared_distances = []
K = range(2, 30)
for k in K:
    km = KMeans(n_clusters=k, max_iter=200, n_init=10).fit(text_test_tf)
    sum_of_squared_distances.append(km.inertia_)
plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum of squared distances')
plt.title('Elbow Method For Optimal k')
plt.show()
# Final clustering with k chosen from the elbow plots.
true_k_dev = 10
model_dev = KMeans(n_clusters=true_k_dev, init='k-means++', max_iter=200, n_init=10)
model_dev.fit(text_tf)
labels_dev = model_dev.labels_
clusters_dev = pd.DataFrame(list(labels_dev), columns=['cluster'])

true_k_test = 28
model_test = KMeans(n_clusters=true_k_test, init='k-means++', max_iter=200, n_init=10)
model_test.fit(text_test_tf)
labels_test = model_test.labels_
clusters_test = pd.DataFrame(list(labels_test), columns=['cluster'])

# Forward slashes keep the output files inside dev-0/ and test-A/ on any OS.
clusters_dev.to_csv("dev-0/out.tsv", sep="\t", index=False, header=False)
clusters_test.to_csv("test-A/out.tsv", sep="\t", index=False, header=False)
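
As a quick sanity check on the chosen k, here is a short follow-up sketch (not part of the commit) that prints the highest-weighted vocabulary terms of each dev cluster. It fits a dedicated vectorizer on the dev split, since tf above was last refit on the test data, and it assumes scikit-learn >= 1.0 for get_feature_names_out:

# Continues script.py: reuses the preprocessed `data` frame and `true_k_dev`.
tf_dev = TfidfVectorizer()
X_dev = tf_dev.fit_transform(data[0])
km = KMeans(n_clusters=true_k_dev, init='k-means++', max_iter=200, n_init=10).fit(X_dev)

# Rank terms by each cluster centre's TF-IDF weight and show the top ten.
terms = tf_dev.get_feature_names_out()
order = km.cluster_centers_.argsort()[:, ::-1]
for i in range(km.n_clusters):
    print(i, [terms[j] for j in order[i, :10]])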

691 test-A/out.tsv Normal file

@@ -0,0 +1,691 @@
27
17
3
19
1
7
27
10
19
1
2
20
15
22
12
1
1
11
1
12
10
15
7
22
25
17
19
13
10
1
4
5
7
6
8
2
20
19
3
27
21
23
1
15
25
21
0
11
3
12
3
24
19
22
9
23
19
3
16
24
21
1
25
17
12
6
22
7
0
12
9
8
1
1
11
19
27
12
21
2
9
26
18
2
17
20
19
19
17
21
22
9
8
17
1
1
27
25
27
14
25
15
1
13
20
0
7
20
11
17
15
3
12
3
20
17
17
12
11
19
11
10
16
21
19
3
1
23
15
23
9
8
21
23
16
8
4
19
18
4
27
10
11
4
8
19
17
4
19
23
1
1
17
12
22
20
1
14
1
15
22
17
4
11
9
20
18
22
8
8
2
19
14
20
1
18
19
16
23
2
26
11
5
1
10
10
10
18
10
9
27
8
20
19
14
14
19
3
19
27
21
24
27
25
1
1
3
11
17
27
15
1
12
7
14
20
12
7
16
10
12
0
9
17
18
8
22
13
18
20
0
13
23
9
7
25
8
22
7
19
27
12
6
13
19
16
9
9
21
11
0
2
26
15
24
18
5
1
22
11
23
15
12
13
4
13
4
2
24
11
24
10
9
19
7
1
25
15
11
1
19
9
23
11
15
27
11
3
1
7
27
0
22
2
9
9
1
27
1
13
25
11
12
9
2
16
19
7
17
2
17
9
6
1
18
2
9
4
5
24
21
18
15
17
21
21
17
7
11
25
7
19
19
23
24
3
19
6
12
19
17
21
15
12
22
11
1
20
0
0
22
7
9
15
1
22
9
1
27
1
5
8
20
20
9
4
3
5
11
22
17
21
20
13
10
14
23
1
22
19
24
2
4
25
27
15
25
20
13
7
19
6
12
3
12
2
27
17
1
21
17
19
23
14
22
12
7
10
10
15
21
27
10
20
23
9
11
9
4
5
20
0
20
7
22
24
3
17
13
12
8
22
11
24
26
12
21
15
22
7
16
3
21
14
1
2
1
26
15
13
24
2
27
13
21
23
20
11
21
9
11
0
23
2
27
1
3
19
7
21
21
23
21
10
1
0
24
23
8
16
22
18
21
0
22
25
19
9
24
17
27
3
11
22
15
11
15
4
17
11
25
3
2
13
19
6
15
1
15
25
7
22
7
2
24
20
2
1
2
11
15
10
22
11
17
13
19
18
16
5
26
27
21
3
19
15
24
12
9
0
3
4
1
11
15
7
16
5
20
15
1
21
24
13
8
26
27
27
8
6
7
3
16
10
13
1
23
19
10
8
3
3
9
2
21
20
15
11
20
19
23
13
10
7
24
9
26
23
19
9
2
20
22
7
15
2
27
20
10
24
3
12
9
12
23
2
16
27
21
1
20
5
27
13
20
19
11
11
2
17
25
15
9
3
12
18
25
9
1
25
20
11
8
1
21
27
18
22
16
4
12
27
8
23
10
22
19
22
13
2
9
13
26
20
12
0
1
24
20
22
20
7
1
19
19
15
16
19
8
19
15
1
16
22
27
18
1
16
16
7
16
8
7
22
5
3
12
13
27
10
22