Fixed bigrams a little

This commit is contained in:
s426135 2020-03-29 19:48:30 +02:00
parent 59132bf9c6
commit a3a146a87c
6 changed files with 138 additions and 134 deletions

Binary file not shown.

View File

@ -156,7 +156,7 @@
P
P
S
P
S
S
S
P
@ -200,7 +200,7 @@
P
P
P
P
S
S
S
S
@ -239,7 +239,7 @@
P
S
S
P
S
P
S
S
@ -290,7 +290,7 @@
S
S
S
P
S
S
P
S
@ -333,7 +333,7 @@
P
S
P
P
S
S
S
S
@ -637,7 +637,7 @@
S
S
S
P
S
S
P
P
@ -699,7 +699,7 @@
P
S
P
P
S
S
S
S
@ -790,7 +790,7 @@
S
S
P
P
S
P
P
P
@ -874,7 +874,7 @@
S
S
P
P
S
P
S
S
@ -925,7 +925,7 @@
S
P
S
P
S
S
S
S
@ -1101,7 +1101,7 @@
S
S
S
P
S
S
S
P
@ -1252,7 +1252,7 @@
P
S
S
P
S
S
S
S
@ -1342,7 +1342,7 @@
S
S
S
P
S
P
S
P
@ -1361,7 +1361,7 @@
S
P
P
P
S
S
S
S
@ -1485,7 +1485,7 @@
S
S
S
P
S
S
P
S
@ -1608,7 +1608,7 @@
S
P
P
P
S
P
S
S
@ -1634,7 +1634,7 @@
S
P
S
P
S
S
S
P
@ -2107,7 +2107,7 @@
P
S
S
P
S
P
S
S
@ -2437,7 +2437,7 @@
P
S
S
P
S
S
P
S
@ -2704,7 +2704,7 @@
P
S
P
P
S
S
P
S
@ -2844,7 +2844,7 @@
P
S
S
P
S
S
S
S
@ -2990,7 +2990,7 @@
S
S
S
P
S
P
S
S
@ -3658,7 +3658,7 @@
P
S
P
P
S
S
S
S
@ -3670,7 +3670,7 @@
S
S
P
P
S
S
P
S
@ -3773,7 +3773,7 @@
S
P
S
P
S
S
S
S
@ -3850,7 +3850,7 @@
S
P
S
P
S
S
S
P
@ -4035,7 +4035,7 @@
S
S
S
P
S
S
S
S
@ -4117,7 +4117,7 @@
P
S
S
P
S
S
S
S
@ -4232,8 +4232,8 @@
S
S
P
S
S
P
P
P
S
S
@ -4578,14 +4578,14 @@
P
S
P
S
P
S
S
P
S
P
S
P
S
P
P
S
@ -4658,8 +4658,8 @@
P
P
S
P
P
S
S
P
S
S
@ -4730,7 +4730,7 @@
S
S
S
S
P
P
P
S
@ -4918,7 +4918,7 @@
S
P
S
S
P
S
S
S

1 S
156 P
157 P
158 S
159 P S
160 S
161 S
162 P
200 P
201 P
202 P
203 P S
204 S
205 S
206 S
239 P
240 S
241 S
242 P S
243 P
244 S
245 S
290 S
291 S
292 S
293 P S
294 S
295 P
296 S
333 P
334 S
335 P
336 P S
337 S
338 S
339 S
637 S
638 S
639 S
640 P S
641 S
642 P
643 P
699 P
700 S
701 P
702 P S
703 S
704 S
705 S
790 S
791 S
792 P
793 P S
794 P
795 P
796 P
874 S
875 S
876 P
877 P S
878 P
879 S
880 S
925 S
926 P
927 S
928 P S
929 S
930 S
931 S
1101 S
1102 S
1103 S
1104 P S
1105 S
1106 S
1107 P
1252 P
1253 S
1254 S
1255 P S
1256 S
1257 S
1258 S
1342 S
1343 S
1344 S
1345 P S
1346 P
1347 S
1348 P
1361 S
1362 P
1363 P
1364 P S
1365 S
1366 S
1367 S
1485 S
1486 S
1487 S
1488 P S
1489 S
1490 P
1491 S
1608 S
1609 P
1610 P
1611 P S
1612 P
1613 S
1614 S
1634 S
1635 P
1636 S
1637 P S
1638 S
1639 S
1640 P
2107 P
2108 S
2109 S
2110 P S
2111 P
2112 S
2113 S
2437 P
2438 S
2439 S
2440 P S
2441 S
2442 P
2443 S
2704 P
2705 S
2706 P
2707 P S
2708 S
2709 P
2710 S
2844 P
2845 S
2846 S
2847 P S
2848 S
2849 S
2850 S
2990 S
2991 S
2992 S
2993 P S
2994 P
2995 S
2996 S
3658 P
3659 S
3660 P
3661 P S
3662 S
3663 S
3664 S
3670 S
3671 S
3672 P
3673 P S
3674 S
3675 P
3676 S
3773 S
3774 P
3775 S
3776 P S
3777 S
3778 S
3779 S
3850 S
3851 P
3852 S
3853 P S
3854 S
3855 S
3856 P
4035 S
4036 S
4037 S
4038 P S
4039 S
4040 S
4041 S
4117 P
4118 S
4119 S
4120 P S
4121 S
4122 S
4123 S
4232 S
4233 S
4234 P
4235 S P
4236 S P
4237 P
4238 S
4239 S
4578 P
4579 S
4580 P
4581 S P
4582 S
4583 S
4584 P
4585 S
4586 P
4587 S
4588 P S
4589 P
4590 P
4591 S
4658 P
4659 P
4660 S
4661 P S
4662 P S
4663 P
4664 S
4665 S
4730 S
4731 S
4732 S
4733 S P
4734 P
4735 P
4736 S
4918 S
4919 P
4920 S
4921 S P
4922 S
4923 S
4924 S

Binary file not shown.

View File

@ -24,16 +24,16 @@ def calc_post_class(post, paranormal_class_logprob, sceptic_class_logprob, bigra
product += sceptic_class_logprob
elif class_ == 'paranormal':
product += paranormal_class_logprob
probs[product] = class_
probs[abs(product)] = class_
#print(probs)
return probs[min(probs.keys())]
return probs[max(probs.keys())]
def clear_post(post):
post = post.replace('\\n', ' ')
post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\-\?\+\%]+(\)|)', '', post)
post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\-\?\+\%]+(\)|)', ' internetlink ', post)
post = re.sub(r'[\.\,\/\~]+', ' ', post)
post = re.sub(r'(&lt|&gt|\@[a-zA-Z0-9]+)','',post)
post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|]+', '', post)
post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\\\!\=\^]+', '', post)
post = re.sub(r'( \- |\-\-+)', ' ', post)
post = re.sub(r' +', ' ', post)
post = post.rstrip(' ')

View File

@ -50,7 +50,7 @@
S
S
P
P
S
S
S
S
@ -174,7 +174,7 @@
S
P
S
S
P
S
P
S
@ -286,7 +286,7 @@
S
S
S
S
P
S
S
S
@ -331,11 +331,11 @@
S
S
P
S
P
S
S
S
S
P
S
S
@ -357,7 +357,7 @@
P
S
S
S
P
P
S
S
@ -462,7 +462,7 @@
S
S
P
P
S
S
S
S
@ -610,7 +610,7 @@
S
S
S
S
P
S
S
P
@ -661,7 +661,7 @@
S
S
S
P
S
S
S
P
@ -711,7 +711,7 @@
P
S
S
S
P
P
P
P
@ -817,7 +817,7 @@
S
P
S
P
S
S
S
S
@ -925,7 +925,7 @@
S
S
S
S
P
S
S
S
@ -982,7 +982,7 @@
P
S
S
S
P
S
S
S
@ -1220,7 +1220,7 @@
S
S
S
P
S
P
S
S
@ -1251,7 +1251,7 @@
S
S
S
P
S
S
P
S
@ -1350,7 +1350,7 @@
S
S
S
S
P
S
P
P
@ -1359,7 +1359,7 @@
P
S
P
S
P
S
P
S
@ -1461,7 +1461,7 @@
S
P
S
S
P
S
S
S
@ -1475,7 +1475,7 @@
S
S
S
P
S
S
P
P
@ -1483,7 +1483,7 @@
P
P
S
P
S
S
S
S
@ -1499,7 +1499,7 @@
S
S
P
P
S
S
S
S
@ -1542,7 +1542,7 @@
P
P
S
P
S
S
S
S
@ -1558,7 +1558,7 @@
P
P
S
S
P
S
S
P
@ -1576,7 +1576,7 @@
P
S
S
S
P
S
S
S
@ -1672,7 +1672,7 @@
S
S
S
P
S
S
S
P
@ -1758,7 +1758,7 @@
S
S
P
P
S
S
S
P
@ -1799,7 +1799,7 @@
S
S
S
S
P
P
S
S
@ -1831,7 +1831,7 @@
S
S
S
S
P
S
P
S
@ -1986,7 +1986,7 @@
S
S
S
P
S
S
S
S
@ -2016,7 +2016,7 @@
S
P
P
P
S
S
S
S
@ -2032,7 +2032,7 @@
S
S
P
S
P
P
S
S
@ -2143,7 +2143,7 @@
S
S
S
S
P
S
P
S
@ -2191,15 +2191,15 @@
S
S
S
P
S
S
P
S
P
S
P
P
P
P
S
S
S
@ -2248,9 +2248,9 @@
S
S
S
P
P
P
S
S
S
S
S
P
@ -2398,7 +2398,7 @@
S
P
S
S
P
P
P
S
@ -2505,7 +2505,7 @@
S
P
S
P
S
S
S
S
@ -2571,7 +2571,6 @@
S
S
S
P
S
S
S
@ -2585,7 +2584,6 @@
S
S
S
P
S
P
S
@ -2600,9 +2598,11 @@
S
S
S
S
S
P
P
P
S
S
S
P
@ -2615,7 +2615,7 @@
S
S
S
P
S
S
S
P
@ -2693,7 +2693,7 @@
S
P
S
P
S
S
P
S
@ -2727,7 +2727,7 @@
S
S
S
S
P
S
P
S
@ -2771,7 +2771,7 @@
S
P
S
P
S
S
S
S
@ -2790,7 +2790,7 @@
S
P
S
P
S
S
S
S
@ -2822,7 +2822,7 @@
S
S
S
P
S
S
S
S
@ -2830,7 +2830,7 @@
P
P
S
P
S
S
P
S
@ -2840,7 +2840,7 @@
S
S
P
S
P
P
S
S
@ -2893,7 +2893,7 @@
S
P
S
P
S
S
S
S
@ -2918,12 +2918,12 @@
S
S
P
P
S
S
S
S
P
P
S
S
S
P
@ -2971,11 +2971,11 @@
S
S
S
S
P
S
S
S
S
P
S
P
@ -3092,7 +3092,7 @@
S
S
P
P
S
S
P
S
@ -3186,6 +3186,7 @@
S
S
S
P
S
S
S
@ -3195,7 +3196,6 @@
S
P
P
P
S
S
S
@ -3385,7 +3385,7 @@
P
P
S
S
P
S
S
S
@ -3432,7 +3432,7 @@
S
S
P
S
P
P
S
S
@ -3455,12 +3455,13 @@
S
S
S
S
P
P
P
P
S
S
S
P
P
S
S
@ -3469,7 +3470,6 @@
P
P
P
P
S
P
S
@ -3573,7 +3573,7 @@
S
S
S
S
P
S
S
S
@ -3591,7 +3591,7 @@
S
S
S
P
S
P
S
S
@ -3786,7 +3786,7 @@
P
S
S
P
S
S
S
P
@ -3817,7 +3817,7 @@
S
S
S
P
S
S
P
P
@ -3881,12 +3881,12 @@
P
S
S
S
P
S
S
S
P
P
S
P
S
P
@ -3934,7 +3934,7 @@
S
S
S
S
P
S
S
S
@ -4010,7 +4010,7 @@
S
S
S
S
P
P
P
S
@ -4032,7 +4032,7 @@
S
S
S
P
S
S
S
P
@ -4100,7 +4100,7 @@
S
P
S
S
P
S
S
S
@ -4131,7 +4131,7 @@
P
S
S
S
P
S
S
S
@ -4142,7 +4142,7 @@
P
S
S
P
S
S
S
S
@ -4163,7 +4163,7 @@
S
P
S
P
S
S
S
S
@ -4299,16 +4299,16 @@
S
S
S
P
S
S
S
S
S
S
P
S
P
S
P
S
S
S
@ -4451,7 +4451,7 @@
S
S
P
P
S
S
P
S
@ -4473,7 +4473,7 @@
P
S
S
P
S
S
P
S
@ -4692,7 +4692,7 @@
S
S
S
S
P
S
S
P
@ -4728,7 +4728,7 @@
P
S
S
P
S
S
P
S
@ -4784,8 +4784,8 @@
P
P
S
P
P
S
S
S
S
S
@ -4808,14 +4808,14 @@
P
P
S
S
P
S
S
P
S
S
P
P
S
S
S
P
S
@ -4862,7 +4862,7 @@
P
P
S
S
P
S
S
S
@ -4895,6 +4895,7 @@
S
S
S
P
S
S
S
@ -4902,7 +4903,6 @@
S
S
S
P
S
S
S
@ -4930,7 +4930,7 @@
P
S
P
P
S
P
P
S
@ -5001,7 +5001,7 @@
P
S
S
P
S
S
P
S
@ -5018,7 +5018,7 @@
S
P
S
S
P
S
S
S
@ -5034,7 +5034,7 @@
S
S
P
S
P
P
S
S

1 P
50 S
51 S
52 P
53 P S
54 S
55 S
56 S
174 S
175 P
176 S
177 S P
178 S
179 P
180 S
286 S
287 S
288 S
289 S P
290 S
291 S
292 S
331 S
332 S
333 P
S
334 P
335 S
336 S
337 S
338 S
339 P
340 S
341 S
357 P
358 S
359 S
360 S P
361 P
362 S
363 S
462 S
463 S
464 P
465 P S
466 S
467 S
468 S
610 S
611 S
612 S
613 S P
614 S
615 S
616 P
661 S
662 S
663 S
664 P S
665 S
666 S
667 P
711 P
712 S
713 S
714 S P
715 P
716 P
717 P
817 S
818 P
819 S
820 P S
821 S
822 S
823 S
925 S
926 S
927 S
928 S P
929 S
930 S
931 S
982 P
983 S
984 S
985 S P
986 S
987 S
988 S
1220 S
1221 S
1222 S
1223 P S
1224 P
1225 S
1226 S
1251 S
1252 S
1253 S
1254 P S
1255 S
1256 P
1257 S
1350 S
1351 S
1352 S
1353 S P
1354 S
1355 P
1356 P
1359 P
1360 S
1361 P
1362 S P
1363 S
1364 P
1365 S
1461 S
1462 P
1463 S
1464 S P
1465 S
1466 S
1467 S
1475 S
1476 S
1477 S
1478 P S
1479 S
1480 P
1481 P
1483 P
1484 P
1485 S
1486 P S
1487 S
1488 S
1489 S
1499 S
1500 S
1501 P
1502 P S
1503 S
1504 S
1505 S
1542 P
1543 P
1544 S
1545 P S
1546 S
1547 S
1548 S
1558 P
1559 P
1560 S
1561 S P
1562 S
1563 S
1564 P
1576 P
1577 S
1578 S
1579 S P
1580 S
1581 S
1582 S
1672 S
1673 S
1674 S
1675 P S
1676 S
1677 S
1678 P
1758 S
1759 S
1760 P
1761 P S
1762 S
1763 S
1764 P
1799 S
1800 S
1801 S
1802 S P
1803 P
1804 S
1805 S
1831 S
1832 S
1833 S
1834 S P
1835 S
1836 P
1837 S
1986 S
1987 S
1988 S
1989 P S
1990 S
1991 S
1992 S
2016 S
2017 P
2018 P
2019 P S
2020 S
2021 S
2022 S
2032 S
2033 S
2034 P
2035 S P
2036 P
2037 S
2038 S
2143 S
2144 S
2145 S
2146 S P
2147 S
2148 P
2149 S
2191 S
2192 S
2193 S
P
2194 S
2195 S
P
2196 S
2197 P
2198 S
2199 P
2200 P
2201 P
2202 P
2203 S
2204 S
2205 S
2248 S
2249 S
2250 S
2251 P S
2252 P S
2253 P S
2254 S
2255 S
2256 P
2398 S
2399 P
2400 S
2401 S P
2402 P
2403 P
2404 S
2505 S
2506 P
2507 S
2508 P S
2509 S
2510 S
2511 S
2571 S
2572 S
2573 S
P
2574 S
2575 S
2576 S
2584 S
2585 S
2586 S
P
2587 S
2588 P
2589 S
2598 S
2599 S
2600 S
2601 S
2602 S
2603 P
2604 P
2605 P
S
2606 S
2607 S
2608 P
2615 S
2616 S
2617 S
2618 P S
2619 S
2620 S
2621 P
2693 S
2694 P
2695 S
2696 P S
2697 S
2698 P
2699 S
2727 S
2728 S
2729 S
2730 S P
2731 S
2732 P
2733 S
2771 S
2772 P
2773 S
2774 P S
2775 S
2776 S
2777 S
2790 S
2791 P
2792 S
2793 P S
2794 S
2795 S
2796 S
2822 S
2823 S
2824 S
2825 P S
2826 S
2827 S
2828 S
2830 P
2831 P
2832 S
2833 P S
2834 S
2835 P
2836 S
2840 S
2841 S
2842 P
2843 S P
2844 P
2845 S
2846 S
2893 S
2894 P
2895 S
2896 P S
2897 S
2898 S
2899 S
2918 S
2919 S
2920 P
2921 P S
2922 S
2923 S
2924 S
2925 P
2926 P S
2927 S
2928 S
2929 P
2971 S
2972 S
2973 S
S
2974 P
2975 S
2976 S
2977 S
2978 S
2979 P
2980 S
2981 P
3092 S
3093 S
3094 P
3095 P S
3096 S
3097 P
3098 S
3186 S
3187 S
3188 S
3189 P
3190 S
3191 S
3192 S
3196 S
3197 P
3198 P
P
3199 S
3200 S
3201 S
3385 P
3386 P
3387 S
3388 S P
3389 S
3390 S
3391 S
3432 S
3433 S
3434 P
3435 S P
3436 P
3437 S
3438 S
3455 S
3456 S
3457 S
3458 S
3459 P
3460 P
P
P
3461 S
3462 S
3463 S
3464 P
3465 P
3466 S
3467 S
3470 P
3471 P
3472 P
P
3473 S
3474 P
3475 S
3573 S
3574 S
3575 S
3576 S P
3577 S
3578 S
3579 S
3591 S
3592 S
3593 S
3594 P S
3595 P
3596 S
3597 S
3786 P
3787 S
3788 S
3789 P S
3790 S
3791 S
3792 P
3817 S
3818 S
3819 S
3820 P S
3821 S
3822 P
3823 P
3881 P
3882 S
3883 S
3884 S P
3885 S
3886 S
3887 S
3888 P
3889 P S
3890 P
3891 S
3892 P
3934 S
3935 S
3936 S
3937 S P
3938 S
3939 S
3940 S
4010 S
4011 S
4012 S
4013 S P
4014 P
4015 P
4016 S
4032 S
4033 S
4034 S
4035 P S
4036 S
4037 S
4038 P
4100 S
4101 P
4102 S
4103 S P
4104 S
4105 S
4106 S
4131 P
4132 S
4133 S
4134 S P
4135 S
4136 S
4137 S
4142 P
4143 S
4144 S
4145 P S
4146 S
4147 S
4148 S
4163 S
4164 P
4165 S
4166 P S
4167 S
4168 S
4169 S
4299 S
4300 S
4301 S
P
4302 S
4303 S
4304 S
4305 S
4306 S
4307 S
P
4308 S
4309 P
4310 S
4311 P
4312 S
4313 S
4314 S
4451 S
4452 S
4453 P
4454 P S
4455 S
4456 P
4457 S
4473 P
4474 S
4475 S
4476 P S
4477 S
4478 P
4479 S
4692 S
4693 S
4694 S
4695 S P
4696 S
4697 S
4698 P
4728 P
4729 S
4730 S
4731 P S
4732 S
4733 P
4734 S
4784 P
4785 P
4786 S
4787 P S
4788 P S
4789 S
4790 S
4791 S
4808 P
4809 P
4810 S
4811 S P
4812 S
4813 S
4814 P
4815 S
4816 S
4817 P S
4818 P S
4819 S
4820 P
4821 S
4862 P
4863 P
4864 S
4865 S P
4866 S
4867 S
4868 S
4895 S
4896 S
4897 S
4898 P
4899 S
4900 S
4901 S
4903 S
4904 S
4905 S
P
4906 S
4907 S
4908 S
4930 P
4931 S
4932 P
4933 P S
4934 P
4935 P
4936 S
5001 P
5002 S
5003 S
5004 P S
5005 S
5006 P
5007 S
5018 S
5019 P
5020 S
5021 S P
5022 S
5023 S
5024 S
5034 S
5035 S
5036 P
5037 S P
5038 P
5039 S
5040 S

View File

@ -25,10 +25,10 @@ def calc_class_logprob(expected_path):
def clear_post(post):
post = post.replace('\\n', ' ')
# delete links
post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', '', post)
post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', post)
post = re.sub(r'[\.\,\/\~]+', ' ', post)
post = re.sub(r'(&lt|&gt|\@[a-zA-Z0-9]+)','',post)
post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|]+', '', post)
post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\\\!\=\^]+', '', post)
post = re.sub(r'( \- |\-\-+)', ' ', post)
post = re.sub(r' +', ' ', post)
post = post.rstrip(' ')
@ -37,6 +37,7 @@ def clear_post(post):
def calc_bigram_count(in_path, expected_path):
bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
with open(in_path) as infile, open(expected_path) as expected_file:
num_of_bigams = 0
for line, exp in zip(infile, expected_file):
class_ = exp.rstrip('\n').replace(' ', '')
text, timestap = line.rstrip('\n').split('\t')
@ -45,12 +46,15 @@ def calc_bigram_count(in_path, expected_path):
for index in range(len(tokens)-1):
# if there is next token we append current and next
bigram = tokens[index] + " " + tokens[index + 1]
print(bigram)
#print(bigram)
#print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
if class_ == 'P':
bigram_counts['paranormal'][bigram] +=1
elif class_ == 'S':
bigram_counts['sceptic'][bigram] +=1
num_of_bigams +=1
#print(f"num of every added bigams with repetitions {num_of_bigams})")
#print(f"num of bigams in paranormal {len(bigram_counts['paranormal'])} and sceptic {len(bigram_counts['sceptic'])}")
return bigram_counts
def calc_bigram_logprobs(bigram_counts):