diff --git a/.predict.py.swp b/.predict.py.swp deleted file mode 100644 index b0cda74..0000000 Binary files a/.predict.py.swp and /dev/null differ diff --git a/.train.py.swp b/.train.py.swp deleted file mode 100644 index 37cc5a8..0000000 Binary files a/.train.py.swp and /dev/null differ diff --git a/dev-0/naive_bigram.pkl b/dev-0/naive_bigram.pkl new file mode 100644 index 0000000..65b8291 Binary files /dev/null and b/dev-0/naive_bigram.pkl differ diff --git a/dev-0/out.tsv b/dev-0/out.tsv index 47a38a1..d2c2dda 100644 --- a/dev-0/out.tsv +++ b/dev-0/out.tsv @@ -9,7 +9,7 @@ S S S - P + S S P S @@ -21,7 +21,7 @@ S S P - P + S P S P @@ -68,7 +68,7 @@ P S P - P + S S S P @@ -79,11 +79,11 @@ S P P - P + S P P S - P + S P P S @@ -95,7 +95,7 @@ S S P - P + S P S S @@ -107,16 +107,16 @@ S S S - P - P S S S S - P S S + P + S S + P S P P @@ -156,7 +156,7 @@ P P S - S + P S S P @@ -171,11 +171,11 @@ S P S + S + S P P - P - P - P + S S S P @@ -213,7 +213,7 @@ P S S - S + P S S S @@ -239,7 +239,7 @@ P S S - S + P P S S @@ -290,11 +290,11 @@ S S S - S P S P S + S P P P @@ -306,11 +306,8 @@ S P S - P - P S P - P S P S @@ -319,6 +316,9 @@ S S S + S + S + S P P S @@ -337,7 +337,7 @@ S S S - P + S S P S @@ -348,15 +348,15 @@ S P S - P + S S S S S P + P S S - P S S S @@ -393,7 +393,7 @@ S S S - P + S S P S @@ -408,7 +408,7 @@ S S P - P + S S S P @@ -423,7 +423,7 @@ S P P - P + S S S P @@ -462,7 +462,6 @@ S P S - P S S S @@ -476,6 +475,7 @@ P S S + S P S P @@ -561,21 +561,21 @@ S P P - P - P + S + S S P P S S S - P - P S S S S - P + S + S + S S S P @@ -584,7 +584,7 @@ S P S - S + P S S S @@ -619,7 +619,7 @@ S S S - P + S S P P @@ -637,12 +637,12 @@ S S S - S - S P + S P P S + S P S P @@ -695,7 +695,7 @@ P P S - P + S P S P @@ -711,7 +711,7 @@ P P S - S + P S P P @@ -724,7 +724,6 @@ S P S - P S S S @@ -734,14 +733,15 @@ S S S - P S P - P + S S P S + P S + P S P S @@ -751,36 +751,36 @@ S P P - P S - P S P S S S + S + S P P - P + S P S S - P S S S - P S P - P - S S S + P S P S + S P + S P + S P S P @@ -796,7 +796,7 @@ P S P - P + S S P P @@ -809,8 +809,8 @@ S P P - P - P + S + S S P S @@ -821,7 +821,7 @@ S S P - P + S P P S @@ -834,14 +834,14 @@ P S P - P + S S S S P + S P - P - P + S P P P @@ -850,7 +850,7 @@ S S P - P + S S S S @@ -858,11 +858,11 @@ P S S - P - P S P S + P + P S S S @@ -872,7 +872,7 @@ P P S - P + S P P P @@ -884,7 +884,7 @@ S P S - P + S S S S @@ -918,13 +918,13 @@ S P S - P + S S S S S P - P + S P S S @@ -937,7 +937,7 @@ P S S - P + S S S P @@ -948,7 +948,7 @@ P S S - P + S P S S @@ -959,7 +959,7 @@ P S P - P + S P P S @@ -972,7 +972,7 @@ P S S - P + S S P S @@ -987,7 +987,7 @@ P P S - P + S S S S @@ -1006,7 +1006,7 @@ S S S - P + S S S P @@ -1044,7 +1044,7 @@ S P P - P + S S P P @@ -1056,7 +1056,7 @@ S S S - P + S S P S @@ -1077,8 +1077,6 @@ P P P - P - P S P S @@ -1087,13 +1085,14 @@ S S S - P S S P S + S P S + P S S S @@ -1102,6 +1101,7 @@ S S S + P S S P @@ -1113,6 +1113,7 @@ P S S + S P P S @@ -1120,12 +1121,11 @@ S S S - S P S - P S - P + S + S P S P @@ -1229,7 +1229,7 @@ S P S - P + S S P S @@ -1245,7 +1245,7 @@ S S S - P + S S S P @@ -1303,7 +1303,7 @@ P S S - P + S S P P @@ -1342,7 +1342,7 @@ S S S - S + P P S P @@ -1355,7 +1355,7 @@ S S S - P + S P S S @@ -1409,7 +1409,7 @@ S S P - P + S P P S @@ -1482,13 +1482,13 @@ P S S - P - P S S S P S + P + S S S S @@ -1520,7 +1520,7 @@ S P S - P + S S S S @@ -1539,7 +1539,7 @@ P S P - P + S P P S @@ -1580,7 +1580,7 @@ S P P - P + S S S S @@ -1608,7 +1608,7 @@ S P P - S + P P S S @@ -1619,24 +1619,24 @@ S S P - P S - P S S S S - P + S S P P + P S S - P S P S P + S + S P S S @@ -1653,7 +1653,7 @@ S P S - S + P S S P @@ -1681,15 +1681,13 @@ S S P - P S S S S S S - P - P + S P P P @@ -1697,6 +1695,8 @@ P S P + S + S P P S @@ -1716,11 +1716,11 @@ P P S - S - S - P P + S P + S + S P S P @@ -1731,7 +1731,7 @@ S S S - P + S P S S @@ -1779,9 +1779,9 @@ P P S + S P - P - P + S S S S @@ -1789,7 +1789,7 @@ P S S - P + S P S S @@ -1820,9 +1820,9 @@ S S S - P S - P + S + S S P S @@ -1832,7 +1832,7 @@ S S S - S + P S P P @@ -1842,20 +1842,20 @@ S P S - P + S P P P S S S + S P S S S S P - P S P S @@ -1877,7 +1877,7 @@ S S S - P + S S S S @@ -1889,7 +1889,7 @@ S S P - S + P S S S @@ -1914,14 +1914,14 @@ S P S - P + S S P S S S S - P + S S S S @@ -1960,7 +1960,7 @@ P S S - P + S S S S @@ -2014,7 +2014,7 @@ S S S - S + P S P P @@ -2041,9 +2041,7 @@ S S S - S P - S P S S @@ -2052,6 +2050,8 @@ S S S + S + S P P P @@ -2072,7 +2072,7 @@ S S S - S + P S P P @@ -2083,7 +2083,7 @@ S S S - P + S P S S @@ -2091,16 +2091,15 @@ S P S - P S - P + S + S P S S S P S - P S S S @@ -2109,15 +2108,14 @@ S S P + P + S S S - P P P P S - P - P S P S @@ -2125,6 +2123,8 @@ S S S + S + S P P S @@ -2152,35 +2152,35 @@ P S P - S + P S P S P P + P S S S - P S - P - P S P - P S S P S + S P + S P + S P P S P S S - S + P S P P @@ -2221,7 +2221,7 @@ P P S - P + S S S S @@ -2245,7 +2245,7 @@ S P S - S + P P S S @@ -2258,11 +2258,11 @@ P P S - P + S P P S - P + S S S P @@ -2281,9 +2281,9 @@ S P S + S P - P - P + S P S S @@ -2320,7 +2320,7 @@ P S S - S + P P S S @@ -2343,7 +2343,7 @@ S P S - P + S S P P @@ -2357,7 +2357,7 @@ P P P - P + S S S P @@ -2407,7 +2407,7 @@ P S S - P + S P S S @@ -2434,19 +2434,19 @@ S S S - S - S - S + P S S P S P + S + S P P S S - P + S P S P @@ -2458,7 +2458,7 @@ S S S - P + S S S S @@ -2466,22 +2466,22 @@ S S S + S P - P - P + S S S S P S S - P + S S S P P S - P + S S S S @@ -2511,16 +2511,16 @@ P P P - S P - S P S + S + S P P S P - P + S P S P @@ -2547,7 +2547,7 @@ P S S - P + S P S S @@ -2558,7 +2558,7 @@ P S S - P + S S S S @@ -2569,7 +2569,7 @@ S P P - P + S P S P @@ -2580,7 +2580,7 @@ S P S - P + S S S S @@ -2594,10 +2594,10 @@ S S S + P + P S S - S - P P P S @@ -2627,7 +2627,7 @@ S S S - P + S P P S @@ -2665,7 +2665,7 @@ S P P - P + S S P S @@ -2699,12 +2699,12 @@ S S P - P S - P S P S + P + P S P S @@ -2725,7 +2725,7 @@ S P S - P + S S S S @@ -2734,8 +2734,8 @@ S P S - P - P + S + S S P P @@ -2744,7 +2744,7 @@ S S S - P + S S P S @@ -2754,7 +2754,7 @@ S S S - P + S S S S @@ -2787,13 +2787,13 @@ S S S - S - S + P + P S S P S - P + S S S P @@ -2817,10 +2817,10 @@ P S P - P S S S + P S P S @@ -2839,31 +2839,31 @@ S S P - P - S - P S S + P S S + P S S S S P P - S P S + P S S S S P + P S S P - P + S S P S @@ -2872,16 +2872,16 @@ S S P + S P P - P - P + S P S P P S - P + S P S S @@ -2894,7 +2894,7 @@ S P P - P + S S P S @@ -2904,7 +2904,7 @@ S P S - P + S P S S @@ -2918,16 +2918,16 @@ S S S + P S + P S P S P P S - P S - P S P S @@ -2976,15 +2976,14 @@ S S S - P S S S S S S - P S + P S S S @@ -2992,6 +2991,7 @@ S S P + P S S P @@ -3000,14 +3000,14 @@ S S S - P + S S P P S S P - P + S P S S @@ -3026,8 +3026,8 @@ P S P - P - P + S + S P P P @@ -3039,7 +3039,7 @@ S P S - P + S S S S @@ -3056,16 +3056,16 @@ S S S - P S S S - P S P + S P S S + S P S S @@ -3108,7 +3108,7 @@ S P P - S + P S P S @@ -3119,7 +3119,7 @@ S S S - P + S S P P @@ -3132,7 +3132,7 @@ P P S - P + S S P P @@ -3175,7 +3175,7 @@ S S S - P + S P S P @@ -3187,7 +3187,7 @@ S P S - P + S S S S @@ -3196,7 +3196,7 @@ P P S - P + S P S S @@ -3207,17 +3207,17 @@ P S S - P + S S S P P - P + S S P P S - P + S P S S @@ -3277,7 +3277,7 @@ S P S - P + S S P S @@ -3305,7 +3305,7 @@ S S S - P + S S S S @@ -3333,7 +3333,7 @@ S S S - P + S S S S @@ -3361,13 +3361,13 @@ S S P - P + S S P S S P - P + S S P P @@ -3383,7 +3383,7 @@ S P P - S + P S P P @@ -3393,7 +3393,7 @@ S P S - P + S S P P @@ -3404,7 +3404,7 @@ S P P - P + S S S S @@ -3432,12 +3432,12 @@ S S S - P - P S S S - P + S + S + S S S P @@ -3445,10 +3445,10 @@ S S S - P - P + S S P + P S S S @@ -3464,7 +3464,7 @@ S S P - P + S S P S @@ -3482,8 +3482,8 @@ S S S - P - P + S + S S S S @@ -3521,7 +3521,7 @@ S S S - S + P S S S @@ -3531,14 +3531,14 @@ S S S - P - P S + P S S S S S + P S S P @@ -3565,7 +3565,7 @@ S P P - P + S S S S @@ -3580,12 +3580,12 @@ S P P - P + S S P P S - P + S S S S @@ -3606,7 +3606,7 @@ S P P - P + S S P P @@ -3618,7 +3618,7 @@ P S P - P + S S S S @@ -3644,7 +3644,7 @@ P S S - P + S P S S @@ -3663,15 +3663,15 @@ S S S + S + S P S S S P - S P S - S P S P @@ -3688,13 +3688,13 @@ P S S - P + S S P P P S - S + P S S S @@ -3706,7 +3706,7 @@ P S S - P + S S S S @@ -3721,7 +3721,7 @@ P S S - P + S S S P @@ -3742,7 +3742,7 @@ S P S - P + S S S S @@ -3753,7 +3753,7 @@ S S S - P + S P P P @@ -3762,7 +3762,7 @@ S P P - P + S S S S @@ -3773,7 +3773,7 @@ S P S - S + P S S S @@ -3795,13 +3795,13 @@ P S S - P - S - P - P S S S + P + P + P + P S S S @@ -3812,7 +3812,7 @@ S S P - P + S S P S @@ -3850,7 +3850,7 @@ S P S - S + P S S P @@ -3882,7 +3882,7 @@ P S S - P + S S S S @@ -3910,7 +3910,7 @@ P P S - P + S S P S @@ -3935,7 +3935,7 @@ S P S - P + S S P S @@ -3949,14 +3949,14 @@ S P S - P - P S - P S S P + S + S P + S P S P @@ -3969,14 +3969,14 @@ S S S - P S S - P S P S + P S + P S S S @@ -3993,7 +3993,7 @@ S S P - P + S S S S @@ -4010,7 +4010,7 @@ S P S - P + S S S S @@ -4029,13 +4029,13 @@ S P P + P S S S S S - S - S + P S S S @@ -4055,7 +4055,7 @@ S S P - P + S S S P @@ -4067,7 +4067,7 @@ S S S - P + S S S S @@ -4087,13 +4087,13 @@ P P S - P + S P S S S P - S + P S S S @@ -4109,7 +4109,7 @@ P P P - P + S P P S @@ -4118,7 +4118,7 @@ S S P - P + S S S S @@ -4146,7 +4146,7 @@ S P P - P + S S S S @@ -4159,7 +4159,7 @@ P P S - P + S P S S @@ -4175,8 +4175,8 @@ S S S - P - P + S + S P P S @@ -4232,8 +4232,8 @@ S S P - P - P + S + S P S S @@ -4253,19 +4253,19 @@ S S S - P S - P S - P + S + S + S P P P P S - P S S + P S P P @@ -4311,7 +4311,7 @@ S S S - P + S S S S @@ -4340,18 +4340,18 @@ P P S - P S S - P S P + S P P P S S S + S P S S @@ -4361,7 +4361,7 @@ S S S - P + S P P P @@ -4385,14 +4385,14 @@ S P S - P S S - P S P S S + S + S P P P @@ -4417,24 +4417,24 @@ P S S - P S - P S S S S S - P - P S - P - P + S + S + S S P P S - P + S + S + S + S P S P @@ -4501,12 +4501,12 @@ S P S - P + S P P P S - P + S P S P @@ -4518,7 +4518,7 @@ P P S - P + S S S P @@ -4532,7 +4532,7 @@ P S S - P + S S P S @@ -4541,8 +4541,8 @@ S S P - P - P + S + S S P S @@ -4571,27 +4571,27 @@ P S S - P + S P P S P S P - P S S - P S P S + P S P + P + P S S P P - P S P P @@ -4603,7 +4603,7 @@ S S P - P + S P P S @@ -4611,7 +4611,7 @@ P S S - P + S S S S @@ -4646,7 +4646,7 @@ S P S - P + S S S S @@ -4669,7 +4669,7 @@ S S S - S + P S S S @@ -4679,12 +4679,12 @@ S P S - P + S S S S P - P + S S S S @@ -4724,13 +4724,13 @@ S P P - P S S S - P S - P + S + S + S P P S @@ -4756,7 +4756,7 @@ P S P - P + S S P S @@ -4767,7 +4767,7 @@ S S S - P + S S S S @@ -4779,7 +4779,7 @@ P S S - P + S S P P @@ -4793,13 +4793,13 @@ S P S - S + P P S P P S - P + S P P S @@ -4807,7 +4807,7 @@ S S S - P + S P S P @@ -4832,7 +4832,7 @@ S S S - P + S P S P @@ -4870,7 +4870,7 @@ S P S - P + S S S S @@ -4881,24 +4881,24 @@ P P S - P + S P P S - P + S P P P S S S - P S S - P S P + S P + S P S P @@ -4913,11 +4913,9 @@ S P P - P - P S P - P + S P S S @@ -4930,6 +4928,8 @@ S S S + S + S P S S @@ -4940,7 +4940,7 @@ P S S - P + S S P S @@ -4971,7 +4971,7 @@ P S S - P + S S S S @@ -4997,7 +4997,7 @@ P S S - P + S S S S @@ -5030,16 +5030,16 @@ P P P - P S S - P - P S S P S S + S + S + S P P S @@ -5047,18 +5047,18 @@ S S P - P S S S S - P - P - P + S + S S P S P + S + S P S S @@ -5073,15 +5073,15 @@ S S P - S - S P S + P S + P S S S - S + P S S S @@ -5153,7 +5153,7 @@ P S P - P + S S S S @@ -5163,7 +5163,7 @@ S P S - S + P S S S @@ -5205,7 +5205,7 @@ P P P - P + S S S P @@ -5213,7 +5213,7 @@ S P S - P + S P P S @@ -5230,13 +5230,13 @@ S S S - P S - P S P + S P P + S P P S @@ -5247,7 +5247,7 @@ P S S - P + S S S S diff --git a/dev-0/out.tsv_baseline b/dev-0/out.tsv_baseline new file mode 100644 index 0000000..47a38a1 --- /dev/null +++ b/dev-0/out.tsv_baseline @@ -0,0 +1,5272 @@ + S + P + P + S + S + S + S + P + S + S + S + P + S + P + S + P + S + S + S + P + S + S + P + P + P + S + P + P + S + P + S + P + S + S + S + S + S + S + P + S + P + S + S + S + S + S + P + S + P + P + P + S + P + S + P + P + S + S + S + S + P + P + S + P + S + P + S + P + S + P + P + S + S + P + S + P + P + S + S + P + P + P + P + P + S + P + P + P + S + P + S + S + S + S + S + S + P + P + P + S + S + P + S + P + S + P + S + S + S + P + P + S + S + S + S + P + S + S + S + S + P + P + S + S + S + P + S + S + S + S + S + P + P + P + S + P + S + S + S + S + S + S + S + P + S + S + S + P + S + P + S + S + S + S + P + P + P + S + S + S + S + P + S + P + S + S + S + S + P + P + S + P + S + P + P + P + P + P + S + S + P + S + P + S + P + S + P + S + S + S + P + P + S + S + S + S + S + S + P + P + P + P + P + S + S + S + P + P + P + P + S + S + P + S + S + S + S + S + S + P + S + P + S + P + S + S + S + P + S + S + S + S + P + P + P + S + S + S + P + S + S + S + P + S + S + P + P + P + S + S + P + S + S + P + P + S + S + S + S + S + S + P + P + S + S + P + S + S + P + P + P + S + S + S + S + S + S + S + S + S + S + S + P + S + P + S + S + P + S + S + S + S + S + P + S + P + S + P + P + P + P + P + P + P + S + S + P + S + P + P + S + P + P + S + P + S + S + S + S + S + S + P + P + S + P + S + S + S + S + P + S + S + P + S + P + P + S + S + S + P + S + P + S + S + P + P + S + S + P + S + P + S + S + S + S + P + S + S + P + S + S + S + S + P + S + S + P + S + S + P + S + S + S + S + P + S + S + S + S + P + S + S + P + S + S + S + P + S + P + S + S + P + S + S + S + P + S + P + S + S + P + S + P + S + S + P + P + S + S + P + P + S + S + P + P + P + P + P + P + S + S + P + S + P + P + P + S + S + P + S + P + S + P + P + S + S + S + P + P + P + S + S + P + S + P + S + S + S + S + S + S + S + S + P + S + S + P + S + S + P + P + S + P + S + P + S + S + S + S + P + P + P + P + P + P + P + S + S + P + S + P + S + S + S + S + S + P + P + S + P + S + P + P + S + S + S + S + S + P + S + S + S + S + S + P + S + S + S + P + S + P + S + S + S + S + P + S + P + S + S + S + P + S + S + S + P + P + P + S + S + P + P + S + S + S + S + P + P + S + P + S + P + S + S + S + S + S + S + P + S + S + P + S + S + P + S + S + S + P + S + S + P + P + P + P + S + P + P + S + S + S + P + P + S + S + S + S + P + S + S + P + P + P + S + P + S + S + S + S + S + P + S + S + S + P + P + S + S + P + P + P + P + P + S + S + S + P + S + S + S + S + P + S + S + P + S + S + S + S + S + S + P + S + P + P + P + S + S + P + S + S + S + P + P + S + S + S + S + S + S + S + P + P + P + S + P + S + P + S + P + S + P + S + S + S + P + P + S + P + P + P + P + S + P + P + P + S + S + S + P + S + S + S + S + S + S + S + S + S + S + S + S + S + S + P + S + S + P + P + S + S + S + S + P + P + P + S + P + P + S + P + P + S + S + S + S + P + S + S + S + P + P + S + S + S + P + P + S + S + P + S + S + S + S + P + S + P + S + S + S + S + S + S + S + S + S + P + S + P + P + S + P + S + S + S + P + S + P + S + S + S + P + P + P + S + P + S + P + S + S + S + P + P + P + P + S + S + P + S + S + S + P + S + P + P + S + S + S + S + P + S + P + P + P + S + P + P + S + P + S + S + P + P + P + P + P + S + P + P + S + P + P + S + P + S + P + P + S + S + P + P + P + P + S + P + S + S + P + S + S + S + S + P + P + P + P + S + S + S + P + S + S + S + P + S + P + P + S + S + S + P + P + P + P + P + P + P + P + S + S + S + P + P + S + S + S + S + P + S + S + P + P + S + P + S + S + S + S + P + S + S + P + P + S + P + P + P + P + S + S + S + S + P + S + P + S + P + S + S + S + P + S + P + P + P + P + P + P + P + S + S + S + P + S + P + P + P + P + S + S + P + S + S + S + P + S + S + S + P + S + P + S + S + S + S + P + P + P + S + S + S + S + S + S + P + S + P + S + S + P + S + S + P + S + S + S + S + P + S + S + P + P + S + S + S + P + S + S + P + S + P + P + P + P + S + S + S + P + S + S + S + P + S + S + P + S + P + S + S + S + P + P + S + P + S + P + P + P + S + P + S + S + S + S + P + S + P + S + P + S + S + S + P + S + S + S + S + S + P + S + S + P + S + S + S + S + P + P + S + S + S + P + P + S + S + P + S + P + S + S + P + S + P + S + S + P + S + S + P + P + P + S + S + S + P + P + P + S + P + P + S + S + S + S + S + S + S + S + P + S + P + S + S + S + S + P + S + P + S + S + P + S + S + P + S + S + P + P + P + P + P + S + P + S + S + S + S + S + S + P + S + S + P + S + P + S + S + S + S + S + S + S + S + S + S + S + P + S + P + S + S + S + P + S + S + P + P + S + S + S + S + S + S + P + S + P + S + P + P + S + P + S + P + P + P + S + S + S + S + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + S + P + S + S + S + S + P + S + P + S + S + S + P + S + P + S + P + S + S + S + P + S + P + P + S + P + P + S + S + S + P + S + S + P + P + S + S + P + S + S + S + P + P + P + P + S + P + P + S + S + S + S + P + S + P + P + S + S + P + S + P + S + S + P + S + S + P + S + P + P + P + P + S + P + S + P + S + S + P + P + P + P + P + P + S + P + S + S + P + S + P + P + S + S + S + P + P + P + S + S + S + P + S + P + P + P + S + S + S + S + S + S + S + S + P + S + P + S + S + S + S + S + S + S + S + S + S + S + P + S + P + S + S + S + P + S + S + S + S + S + P + P + S + S + P + P + P + S + S + S + S + S + P + S + S + S + P + P + P + S + S + S + S + S + S + P + S + S + S + P + P + S + S + P + S + P + S + S + S + P + P + S + S + S + S + P + S + S + S + P + P + S + S + P + P + P + P + S + S + P + S + P + P + P + S + S + S + P + P + P + S + P + P + S + S + S + S + S + S + P + P + S + S + P + P + S + S + P + S + P + S + S + S + S + P + P + P + P + P + P + P + P + P + S + S + P + S + S + S + P + S + S + S + S + S + S + P + P + P + S + S + S + S + S + P + S + S + P + P + S + S + S + P + S + S + S + S + S + P + P + S + S + P + P + S + S + S + S + S + P + P + S + S + P + S + S + S + P + S + S + P + S + S + P + S + P + S + S + S + S + P + P + P + P + P + S + P + S + P + P + S + P + S + P + P + P + P + S + P + S + S + S + S + P + S + S + S + S + S + P + P + S + P + S + S + S + P + S + P + P + S + P + P + P + P + S + P + P + S + S + P + P + S + P + P + P + S + S + S + P + S + P + P + S + P + P + P + S + P + S + S + S + S + S + S + P + P + S + S + S + S + P + P + S + P + S + S + S + S + S + S + S + S + P + P + S + P + S + S + S + S + P + S + P + P + S + S + P + S + P + S + P + P + S + S + S + S + S + S + S + S + P + S + P + P + S + P + S + S + S + S + P + P + S + P + P + S + P + S + S + P + S + P + P + S + P + S + P + S + S + S + S + S + S + S + P + P + S + S + S + S + S + S + P + P + P + P + P + S + P + S + P + P + P + S + P + S + S + S + S + S + S + S + S + S + S + S + S + P + P + S + S + S + P + P + P + P + S + P + S + P + S + S + S + S + S + P + P + S + S + P + S + S + S + P + S + S + P + S + S + S + P + P + P + P + P + P + P + P + P + S + P + P + P + P + S + P + S + S + P + S + S + S + S + S + S + P + S + S + S + P + P + P + S + P + P + P + S + S + S + S + P + S + S + P + P + S + S + S + S + S + S + S + P + S + P + S + S + S + P + S + S + S + S + S + P + S + P + S + S + P + S + S + S + S + P + S + P + S + P + S + P + P + P + S + S + S + S + S + P + P + S + P + P + S + P + S + P + P + P + P + S + S + S + P + S + S + S + S + P + P + S + P + S + P + S + P + P + S + S + P + S + S + S + S + P + S + P + S + S + S + S + P + S + S + S + S + P + S + S + S + S + S + P + S + S + S + S + S + P + S + S + P + P + S + S + S + P + P + S + P + P + P + S + S + S + S + P + S + P + S + P + S + S + S + S + P + S + S + S + P + S + S + S + S + S + P + P + S + P + P + P + P + S + S + S + S + S + S + S + S + S + P + P + P + S + S + S + P + S + P + S + P + S + S + P + S + S + S + S + P + P + P + S + P + S + S + P + S + S + P + P + S + S + S + S + S + P + P + P + P + P + P + P + P + P + P + S + S + P + P + P + S + S + P + P + S + S + P + S + P + P + P + S + S + P + S + S + S + S + S + P + P + P + S + P + S + S + S + P + S + S + S + S + S + S + S + P + S + S + S + P + S + S + S + S + S + P + S + P + S + S + S + S + S + S + S + P + P + P + S + S + S + P + S + P + S + S + S + S + S + S + S + P + S + S + S + S + S + P + P + S + P + P + P + S + S + S + P + P + S + S + P + S + P + S + P + S + P + P + S + S + S + P + S + P + S + S + S + S + P + S + S + P + S + S + P + P + P + P + S + P + P + S + P + S + S + S + S + S + P + P + S + S + P + S + S + S + P + S + S + P + P + P + P + S + S + P + P + S + S + S + P + S + P + S + P + S + S + P + S + P + P + S + S + S + P + S + P + P + S + P + P + S + S + P + S + P + P + P + P + S + P + S + S + S + S + P + P + P + S + S + S + S + S + S + P + S + S + P + P + P + S + S + P + S + P + S + S + S + S + P + S + S + P + P + S + P + P + P + P + P + S + P + P + S + P + S + S + S + P + P + S + S + S + P + S + P + S + S + S + P + P + S + P + S + P + S + P + S + S + P + S + S + S + S + S + P + S + P + P + P + S + P + P + P + S + P + S + S + P + S + S + S + S + S + S + S + S + P + P + S + S + S + P + S + P + P + P + P + S + S + S + P + S + S + P + S + P + P + S + S + S + S + P + S + S + P + P + P + S + S + P + S + S + S + S + S + P + S + S + S + P + S + S + S + P + S + S + P + S + P + S + P + S + S + P + S + S + S + P + S + S + P + P + S + P + S + P + S + P + P + P + P + P + P + P + P + S + P + P + P + P + S + S + P + S + P + P + P + S + S + P + P + S + S + S + S + S + P + P + S + S + S + P + P + S + S + P + S + P + S + P + P + P + P + S + P + P + S + P + S + S + P + S + P + S + S + S + P + S + S + P + P + S + S + P + S + P + S + S + S + P + S + S + S + P + P + P + S + P + S + P + P + P + S + S + S + S + S + S + S + S + S + P + S + P + P + P + S + S + P + P + S + P + S + P + P + P + S + S + S + S + P + S + S + S + P + S + S + S + P + P + P + S + S + S + P + S + S + P + S + S + P + P + S + P + S + S + S + P + S + P + S + P + P + P + S + S + P + P + S + S + P + S + S + S + S + P + P + S + S + S + P + P + P + S + P + S + P + S + P + P + S + P + P + P + S + P + S + S + P + P + S + P + S + P + S + S + P + S + S + S + S + P + S + S + S + S + P + S + S + P + P + S + S + S + P + S + S + P + S + S + P + S + S + S + P + P + S + P + S + P + P + P + P + S + P + S + P + P + S + S + P + S + P + S + S + S + P + S + S + P + P + S + S + S + S + S + S + S + S + P + P + P + S + S + P + S + P + S + S + S + P + P + P + P + S + P + P + S + S + S + S + S + P + P + P + P + S + S + S + P + P + P + S + S + S + S + S + P + P + P + S + S + P + P + S + P + S + S + S + S + P + S + P + S + P + S + P + P + S + S + S + S + P + S + S + P + P + P + S + P + S + S + S + S + P + S + S + S + P + S + S + S + S + P + S + S + S + S + S + S + S + P + P + S + S + S + P + S + S + S + P + P + S + P + S + P + S + S + P + S + S + S + P + S + S + S + S + S + S + S + S + S + P + P + S + P + S + P + S + S + S + P + S + S + P + S + P + P + S + P + P + S + P + S + S + S + P + S + P + S + S + S + S + S + S + S + P + S + S + S + P + S + S + S + S + P + P + P + P + P + P + S + S + P + S + P + P + S + S + P + S + S + P + P + S + S + S + S + S + S + S + S + S + P + S + P + S + S + P + S + P + S + S + P + S + S + S + P + S + S + S + S + S + S + S + P + P + S + P + P + S + S + S + S + P + S + S + P + P + S + P + S + S + P + S + S + P + P + S + S + P + P + S + P + S + S + S + S + S + S + S + S + P + P + S + P + S + S + S + S + S + P + S + S + P + P + S + P + S + S + S + S + S + P + P + P + P + P + P + S + P + P + S + P + P + S + S + S + S + P + P + S + P + S + P + P + P + S + P + S + S + S + P + S + P + S + P + P + S + S + P + P + S + P + S + P + S + S + S + S + S + S + P + S + P + P + S + P + S + P + S + P + S + S + P + S + P + P + S + S + S + P + P + S + S + S + P + P + S + S + P + S + P + P + P + S + S + S + S + P + S + S + S + S + S + S + S + S + S + S + P + S + P + S + P + S + S + S + P + S + S + S + S + S + S + P + S + S + S + S + S + S + S + P + S + S + P + P + S + S + S + S + P + S + P + P + S + S + P + P + P + S + S + P + S + S + S + S + S + P + S + S + S + P + P + P + S + P + P + P + P + P + P + S + P + S + P + S + S + P + S + P + S + S + S + S + S + P + P + P + S + S + P + S + P + S + S + S + P + S + S + S + P + S + P + P + S + S + P + S + S + P + P + S + P + S + P + S + S + S + P + S + S + S + P + P + P + P + S + P + S + S + S + P + S + S + S + P + S + S + S + P + S + P + P + P + P + S + P + P + S + S + P + S + S + P + P + S + S + S + S + P + S + P + P + S + S + S + S + S + S + P + P + S + P + S + P + P + S + S + S + P + S + P + P + S + P + S + S + S + P + S + S + S + S + S + S + P + S + S + S + S + P + P + S + P + S + S + P + S + P + P + S + P + S + S + S + P + P + S + P + P + P + P + S + P + S + P + S + P + S + S + S + P + P + P + P + S + P + P + S + S + P + S + S + P + P + S + S + P + S + S + P + P + P + S + P + P + S + P + P + S + S + P + S + S + S + S + S + S + S + P + S + S + S + P + P + S + S + P + S + S + S + P + P + S + P + P + S + S + S + P + P + S + P + S + P + S + P + S + S + S + S + P + S + S + S + S + P + S + S + S + P + P + S + S + S + P + S + P + S + P + S + S + P + S + S + S + P + S + S + P + S + P + S + S + P + P + S + S + S + P + P + S + S + S + S + P + S + S + S + S + S + P + S + P + S + P + S + S + P + S + P + P + P + P + P + S + S + P + S + S + S + S + S + P + S + S + S + S + P + S + S + S + P + S + P + S + S + S + S + S + P + P + P + S + P + S + P + S + S + S + P + P + S + P + S + S + P + P + S + P + P + S + P + P + S + P + S + P + S + S + S + P + P + S + S + P + P + S + S + P + S + P + S + P + S + P + P + S + S + S + P + S + P + P + P + S + S + S + S + P + S + S + P + S + P + S + P + S + S + S + S + P + S + S + P + S + S + S + P + S + S + S + P + P + S + S + S + P + S + S + P + S + S + S + S + P + P + S + P + S + S + S + S + P + S + S + S + P + P + P + P + S + S + P + P + S + P + S + P + S + P + S + S + P + P + S + S + P + S + S + S + S + P + P + S + S + S + P + S + P + P + S + P + S + S + P + S + S + S + S + P + S + P + S + P + P + P + S + P + S + S + S + P + P + S + P + P + P + S + S + S + S + S + S + S + S + S + P + S + S + S + P + P + S + S + S + S + S + S + S + S + P + S + S + S + P + S + S + P + S + S + P + S + S + S + S + P + S + S + S + S + S + S + P + P + P + S + S + S + P + P + S + S + P + P + S + S + S + P + P + P + S + P + P + S + P + S + S + S + S + S + P + S + P + S + S + P + S + S + S + P + S + S + S + P + P + P + S + P + P + P + S + P + P + S + P + S + P + P + S + S + S + P + S + S + S + P + S + S + S + S + S + P + P + S + S + P + S + P + P + P + P + S + S + P + P + S + S + P + S + S + S + S + S + S + P + S + P + P + S + S + S + S + P + S + S + S + P + S + P + S + S + P + S + P + S + P + S + P + P + S + S + P + P + S + P + S + S + P + S + P + P + P + S + S + S + S + S + P + P + S + P + S + P + S + S + P + S + S + S + S + S + P + S + S + P + S + S + P + S + S + P + S + S + P + P + S + S + P + S + P + S + S + S + S + S + S + S + S + S + P + S + P + S + S + S + P + P + P + S + S + S + S + P + P + P + P + S + P + S + P + P + P + S + S + S + S + S + P + P + S + P + S + S + S + S + S + S + P + S + P + P + S + S + P + P + P + S + P + P + S + S + P + S + S + P + S + P + P + S + S + S + S + S + S + S + S + S + S + S + S + P + P + S + P + S + S + P + P + S + P + S + S + S + S + P + S + S + S + P + P + P + S + S + S + S + S + P + S + S + S + S + P + S + P + S + P + S + P + S + S + S + S + P + S + P + P + S + S + S + S + S + S + S + S + S + S + S + P + S + P + S + S + S + S + S + P + P + S + P + S + S + P + S + S + S + S + P + S + P + P + P + S + S + P + P + P + S + S + S + S + S + S + S + S + S + S + P + P + S + P + S + P + S + S + P + P + S + S + P + S + S + S + S + S + P + P + P + S + P + S + P + S + P + S + P + S + P + S + S + P + S + P + P + P + S + S + P + S + P + P + S + P + S + S + P + P + P + S + P + P + S + S + S + P + S + S + S + S + P + S + S + P + S + P + S + S + S + S + S + S + S + S + S + S + S + S + S + S + P + S + S + P + P + S + S + S + S + S + S + P + P + S + S + P + S + S + S + P + S + P + S + S + S + P + S + S + S + S + S + S + S + S + S + P + P + S + P + P + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + P + P + S + S + S + S + P + S + S + P + P + S + S + P + S + S + P + P + S + S + S + S + P + S + S + S + S + P + S + S + P + S + S + S + S + P + S + P + S + P + P + S + P + P + S + S + S + P + S + S + S + S + S + S + S + S + S + P + P + S + S + P + P + P + P + P + P + S + P + P + S + S + P + P + S + S + S + S + P + S + S + P + S + S + S + P + P + S + P + P + S + S + P + P + S + S + S + S + S + P + P + P + S + S + S + S + P + P + P + S + S + P + P + S + P + P + S + S + S + P + P + S + P + S + S + S + S + S + S + S + P + P + P + P + S + S + S + S + P + S + S + S + S + P + S + S + S + S + P + P + P + P + S + P + P + S + S + P + S + S + S + P + P + P + P + P + S + S + S + S + S + S + S + S + S + S + S + P + P + S + P + S + S + S + S + S + P + P + P + P + S + S + S + P + S + P + S + S + S + P + S + S + P + S + P + S + S + S + P + S + P + S + P + P + P + P + P + S + P + S + S + S + P + P + S + S + S + S + S + P + S + S + S + P + S + S + S + S + S + S + S + S + P + S + S + P + P + P + S + P + S + P + S + P + S + P + P + P + P + S + S + S + S + S + S + S + P + S + S + S + S + P + S + S + S + P + P + P + P + S + S + S + S + P + S + P + S + S + P + P + P + S + P + P + S + P + S + S + P + S + P + P + P + P + S + S + S + P + S + S + S + S + S + S + S + S + P + P + P + P + S + S + S + P + S + S + P + S + S + S + S + S + S + S + S + S + S + S + P + S + P + S + S + P + S + P + S + S + P + P + P + P + P + S + S + S + S + S + S + S + S + S + P + P + S + S + P + P + P + P + S + S + P + S + P + S + S + S + S + S + P + P + S + P + P + S + P + P + S + P + P + S + P + P + P + S + S + S + P + P + P + P + P + P + P + P + S + S + S + P + S + P + S + S + P + S + S + S + S + P + S + P + S + S + S + S + P + P + S + S + P + S + S + S + S + S + S + S + P + P + P + S + P + S + S + S + S + S + S + S + P + S + S + S + P + S + P + P + P + P + S + P + P + S + P + S + S + S + S + P + P + P + S + P + S + S + P + S + S + P + P + P + S + P + P + S + S + P + S + P + S + S + P + S + S + P + P + P + S + P + S + S + S + P + P + S + P + P + P + S + S + P + S + P + S + S + P + S + S + P + S + S + P + P + S + S + P + P + P + S + P + S + P + P + S + S + P + S + P + S + S + P + S + S + P + P + P + S + P + P + S + S + P + S + P + S + S + P + P + P + P + S + S + P + S + S + P + S + S + S + P + S + P + P + S + S + P + S + S + S + S + P + S + S + S + S + P + S + S + S + S + S + P + S + S + P + P + S + S + P + S + P + S + S + S + P + S + S + S + P + P + P + S + P + P + P + S + S + S + P + S + S + S + S + S + S + S + S + S + S + P + S + P + S + P + S + S + S + P + P + S + S + S + P + S + P + P + S + S + S + S + S + S + S + S + S + P + S + P + S + P + S + P + P + P + S + P + S + P + S + S + S + S + S + S + S + S + P + P + P + S + S + S + P + S + P + P + P + S + S + S + P + S + P + P + P + S + S + S + S + S + P + S + S + P + S + S + P + P + S + P + P + S + P + S + S + S + P + S + S + S + S + P + S + S + S + S + P + P + S + S + P + S + S + P + S + P + P + S + S + S + S + S + S + S + S + P + S + S + P + S + P + P + S + P + P + P + S + P + S + S + S + P + P + S + P + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + P + P + S + P + S + P + S + S + S + S + P + P + P + S + S + S + P + P + P + S + P + P + S + P + S + S + P + P + S + P + S + P + P + S + S + S + P + S + P + S + S + S + S + P + S + S + P + P + S + P + P + P + S + P + P + P + P + S + S + S + P + S + S + P + S + P + P + P + S + P + P + S + P + S + P + P + P + S + S + P + P + P + P + S + P + P + P + S + S + S + S + S + S + S + S + S + S + S + P + S + S + P + P + P + P + P + S + S + P + S + P + S + P + P + S + S + S + P + S + P + S + S + P + P + S + S + S + P + P + S + S + S + S + P + S + P + P + S + S + P + S + S + S + P + P + S + P + S + S + P + P + P + S + S + S + S + P + S + P + P + P + S + P + S + S + P + S + S + S + P + S + S + S + S + P + S + P + P + S + S + S + S + S + S + S + P + S + P + P + S + P + P + S + S + S + P + P + P + P + S + S + P + P + S + S + P + S + S + P + P + S + S + S + S + P + P + S + S + S + S + P + P + P + S + P + S + P + P + S + S + S + S + P + S + P + S + P + S + S + S + P + S + S + P + S + S + S + S + S + S + S + S + S + P + S + S + P + S + S + S + S + S + S + P + P + P + S + P + S + P + S + P + P + P + P + P + S + P + S + P + S + S + S + S + P + S + S + P + S + P + S + S + S + P + S + S + S + P + S + P + P + S + S + P + S + S + S + S + P + P + S + P + P + S + S + S + S + P + P + S + P + P + S + S + S + P + S + S + S + P + S + S + S + S + S + S + P + S + S + P + S + S + S + S + P + S + P + S + S + S + P + S + P + S + S + S + S + P + P + P + S + P + S + P + P + S + S + P + S + S + P + P + P + P + S + S + P + P + S + P + S + P + P + P + S + S + S + S + S + P + P + S + S + S + S + S + S + S + P + S + P + S + P + P + P + P + P + S + P + S + S + P + P + S + S + P + S + S + S + S + S + S + S + P + S + S + P + S + P + S + S + S + S + S + P + S + P + S diff --git a/exp b/exp new file mode 100644 index 0000000..25535fd --- /dev/null +++ b/exp @@ -0,0 +1,10 @@ + S + P + P + S + S + S + S + P + S + S diff --git a/in b/in new file mode 100644 index 0000000..532d559 --- /dev/null +++ b/in @@ -0,0 +1,10 @@ +In which case, tell them I'm in work, or dead, or down the shops. They can come back in when they've got Professor Brian Cox with them. If only cause I want to meet him and steal his hair. 1328302967 +Put me down as another for Mysterious Universe. Those dudes are brilliant. 1347836881 +The military of any country would never admit that UFO's have taken down our most powerful weapons. 1331905826 +An example would have been more productive than a downvote, I think. 1315584834 +sorry, but the authors of this article admit that the study is limited and flawed. Also, upon peer review, the study is found to be fallacious. It says this towards the end of the article, a point to which the majority of people usually fail to read. \n\nalso, 463 people is NOT, in any way, compelling evidence for a causal relationship. 1347389166 +"Are you afraid of science in general, or just the kind you read about on the internet?" 1303864529 +Well, I know it can decrease intraocular pressure, but as for *cancer...*\n\nThe only data I'm aware of relating marijuana to cancer is that it's an antiemetic and an appetite increasing drug. Which, while very good for patient health, are not directly related to the cancer itself, as much as a treatment of the adverse reactions of chemotherapy/radiation treatment (unless if the cancer is in the GI system, or is making weird hormones...).\n\nIf you have any information indicating that marijuana helps in cancer/HIV treatment *other* than as an antiemetic/appetite increasing medication, could you please provide it? I mean, I'm all for it's responsible use, but I don't want to go around spreading misinformation, especially if I'm going to one day be in the medical field. 1318558797 +That could be anything. Why even bother... 1285029343 +what was the joke? he deleted it 1337651956 +Its the landing and taking off that/'s hard, and those things will do do that for you as well. The pilots, for the most part, are there in case shit happens.\n\nBTW stalling a Cessna is fun as hell. 1336941346 diff --git a/naive_base_model.pkl b/naive_base_model.pkl_baseline similarity index 100% rename from naive_base_model.pkl rename to naive_base_model.pkl_baseline diff --git a/naive_bigram.pkl b/naive_bigram.pkl new file mode 100644 index 0000000..6498e42 Binary files /dev/null and b/naive_bigram.pkl differ diff --git a/predict.py b/predict.py index 07bff1d..0ed14e7 100755 --- a/predict.py +++ b/predict.py @@ -3,69 +3,61 @@ import pickle import math import re +import sys -def clear_tokens(tokens, is_text=True): - tokens = tokens.replace('\\n', ' ') - return tokens - tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens) - tokens = re.sub(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@\’\>\″\±]+', ' ', tokens) - tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens) - tokens = re.sub(r'[0-9]+', ' ', tokens) - tokens = re.sub(r'œ|·', '', tokens) - if is_text: - tokens = re.sub(r' +', ' ', tokens) - else: - tokens = re.sub(r' +', '', tokens) - return tokens - -def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_logprobs): - # dla kazdego tokenu z danego posta +def calc_post_class(post, paranormal_class_logprob, sceptic_class_logprob, bigrams_logprobs): text, timestap = post.rstrip('\n').split('\t') - text = clear_tokens(text, True) + text = clear_post(text) tokens = text.lower().split(' ') - #probs = {0.0 : 'sceptic', 0.0 : 'paranormal'} probs = {} - for class_ in word_logprobs.keys(): - product = 1 - for token in tokens: - token = clear_tokens(token, False) + for class_ in bigrams_logprobs.keys(): + product = 0 + for index in range(len(tokens)-1): + # we handle bigrams not in models as neutral + bigram = tokens[index] + " " + tokens[index + 1] + #print(bigram) try: - product *= word_logprobs[class_][token] + product += bigrams_logprobs[class_][bigram] except KeyError: - product *= 1 - # tu wzoru uzyj + product +=0 if class_ == 'sceptic': - product *= sceptic_class_logprob + product += sceptic_class_logprob elif class_ == 'paranormal': - product *= paranormal_class_logprob - probs[abs(product)] = class_ - #print(probs) -# mozna jeszcze zrobic aby bralo kluczowe slowa i wtedy decydowalo ze paranormal - if search_for_keywords(text): - return 'paranormal' - return probs[max(probs.keys())] + product += paranormal_class_logprob + probs[product] = class_ + #print(probs) + return probs[min(probs.keys())] -def search_for_keywords(text): - keywords = ['paranormal', 'ufo', 'aliens', 'conspiracy', 'aliens', 'atlantis'] - return any(keyword in text for keyword in keywords) +def clear_post(post): + post = post.replace('\\n', ' ') + post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\-\?\+]+(\)|)', '', post) + post = re.sub(r'[\.\,]+', ' ', post) + post = re.sub(r'(<|>)','',post) + post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%]+', '', post) + post = re.sub(r' \- ', ' ', post) + post = re.sub(r' +', ' ', post) + post = post.rstrip(' ') + return post def main(): - with open('naive_base_model.pkl', 'rb') as f: + if len(sys.argv) != 4: + print("syntax is ./predict.py in.tsv out.tsv model.pkl") + return + in_file = sys.argv[1] + out_file = sys.argv[2] + model = sys.argv[3] + with open(model, 'rb') as f: pickle_list = pickle.load(f) + paranormal_class_logprob = pickle_list[0] sceptic_class_logprob = pickle_list[1] - word_logprobs = pickle_list[2] - in_file = "test-A/in.tsv" - #in_file = "dev-0/in.tsv" - out_file = "test-A/out.tsv" - #out_file = "dev-0/out.tsv" - print (f"in {in_file}") - print (f"out {out_file}") + bigrams_logprobs = pickle_list[2] + with open(in_file) as in_f, open(out_file, 'w') as out_f: - for line in in_f: - hyp = calc_post_prob(line, paranormal_class_logprob, sceptic_class_logprob, word_logprobs) + for line in in_f: + hyp = calc_post_class(line, paranormal_class_logprob, sceptic_class_logprob, bigrams_logprobs) if hyp == 'sceptic': - out_f.write(" S\n") + out_f.write(' S\n') elif hyp == 'paranormal': - out_f.write(' P\n') + out_f.write(' P\n') main() diff --git a/predict_baseline.py b/predict_baseline.py new file mode 100755 index 0000000..07bff1d --- /dev/null +++ b/predict_baseline.py @@ -0,0 +1,71 @@ +#!/usr/bin/python3 + +import pickle +import math +import re + +def clear_tokens(tokens, is_text=True): + tokens = tokens.replace('\\n', ' ') + return tokens + tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens) + tokens = re.sub(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@\’\>\″\±]+', ' ', tokens) + tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens) + tokens = re.sub(r'[0-9]+', ' ', tokens) + tokens = re.sub(r'œ|·', '', tokens) + if is_text: + tokens = re.sub(r' +', ' ', tokens) + else: + tokens = re.sub(r' +', '', tokens) + return tokens + +def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_logprobs): + # dla kazdego tokenu z danego posta + text, timestap = post.rstrip('\n').split('\t') + text = clear_tokens(text, True) + tokens = text.lower().split(' ') + #probs = {0.0 : 'sceptic', 0.0 : 'paranormal'} + probs = {} + for class_ in word_logprobs.keys(): + product = 1 + for token in tokens: + token = clear_tokens(token, False) + try: + product *= word_logprobs[class_][token] + except KeyError: + product *= 1 + # tu wzoru uzyj + if class_ == 'sceptic': + product *= sceptic_class_logprob + elif class_ == 'paranormal': + product *= paranormal_class_logprob + probs[abs(product)] = class_ + #print(probs) +# mozna jeszcze zrobic aby bralo kluczowe slowa i wtedy decydowalo ze paranormal + if search_for_keywords(text): + return 'paranormal' + return probs[max(probs.keys())] + +def search_for_keywords(text): + keywords = ['paranormal', 'ufo', 'aliens', 'conspiracy', 'aliens', 'atlantis'] + return any(keyword in text for keyword in keywords) + +def main(): + with open('naive_base_model.pkl', 'rb') as f: + pickle_list = pickle.load(f) + paranormal_class_logprob = pickle_list[0] + sceptic_class_logprob = pickle_list[1] + word_logprobs = pickle_list[2] + in_file = "test-A/in.tsv" + #in_file = "dev-0/in.tsv" + out_file = "test-A/out.tsv" + #out_file = "dev-0/out.tsv" + print (f"in {in_file}") + print (f"out {out_file}") + with open(in_file) as in_f, open(out_file, 'w') as out_f: + for line in in_f: + hyp = calc_post_prob(line, paranormal_class_logprob, sceptic_class_logprob, word_logprobs) + if hyp == 'sceptic': + out_f.write(" S\n") + elif hyp == 'paranormal': + out_f.write(' P\n') +main() diff --git a/test-A/out.tsv b/test-A/out.tsv index 8508f03..5c4fe57 100644 --- a/test-A/out.tsv +++ b/test-A/out.tsv @@ -1,241 +1,186 @@ P P - S - P - P - P - P P + S P S + P S S S - P - P S - P S - P S P - P S S P P - P S - P S S S - P S S S P - P - P - P - P S - P + S S S P + S P S S S S P - P - P + S + S S S P S S S - P S S S S - P - P - P + S P P S - P S S - P - P S - P S - P S S - P - P - P - P - P - P S P - P - P - P - P - P S S - P - P S P - P - P + S + S P P S - P S S - P - P S - P S - P S - P S S - P - P S S S S - P - P S - P S P P - P - P - P - P - P + S + S + S + S P P P P P S - P - P - P + S + S P S S S + S P S S - P + S + S + S S P P + S P P + S P S S P P S - P S - P - P - P - P + S + S S P - P + S P S P P + S + S + S + S + S P P P + S P P P + S P S S S S - P - P S S - P + S P S S - P S S - P - P - P + S P S S - P - P - P - P S P - P - P - P - P - P S P - P S - P - P S P - P S - P - P - P S - P S P + S P + S + S + S P S P + S + S P S S @@ -243,17 +188,9 @@ S S S - P - P - P - P - P - P S - P - P - P - P + S + S S P S @@ -261,21 +198,12 @@ S S S - P - P - P - P S P - P - P - P S - P S - P - P - P + S + S S S P @@ -284,16 +212,10 @@ S S S - P - P - P S - P S P S - P - P S S P @@ -301,232 +223,178 @@ S P S - P S S P S S + S P + S P S + S + S P S S S S P - P - P S - P - P - P S - P - P + S + S P P P S S S + S P P P P + S + S + S + S + S + S P + S P + S + S P S S S P + S + S P S S S S - P - P S - P S - P - P - P - P S - P - P - P S - P - P - P - P S - P - P S S - P S S S S - P - P S - P S S - P S - P S S - P - P + S + S + S + S + S + S + S + S + S + S S P S + S + S + S + S + S + S P P P P P + S P + S P P + S + S + S + S + S P S P S S + S P S S S P S - P - P - P - P S P S - P S S S - P S - P - P - P - P - P - P - P - P S S - P S - P S P P + S + S P P S - P - P - P - P - P - P - P - P - P - P S - P - P S P S - P S - P - P - P S - P S - P S - P S - P - P S - P S - P - P S - P S - P S S - P - P S - P - P S - P S S S - P - P - P S S - P - P - P - P - P - P - P S - P S - P S - P - P S - P - P + S S S P S S - P S P - P - P - P - P + S S S P @@ -535,394 +403,272 @@ S S S + S + P + S + S P S S S S P - P - P - P - P - P S S + P S + P S S S S - P S P S S - P S S S - P - P - P - P - P S - P - P S - P - P - P S S P - P S - P S - P - P S P P - P - P - P S S P + S P P P P P P - S P S S - P - P S - P - P S - P S S S - P - P - P S S - P - P - P - P - P - P + S S P P S S - P S S - P S P S - P - P S S - P S - P S P S - P - P - P - P - P - P - P - P + S P P S P S + S + S + S + S P P + S + S P + S P S S P + S + S P S S S - P S P S S - P S S S S P - P - P - P - P - P - P - P - P S S S - P - P - P S S - P - P - P - P - P - P - P S - P + S P S S - P - P - P - P - P - P - P S - P - P S S P - P + S S P P S - P S - P S S S S - P S - P S P - P - P - P - P S - P - P + S S P P P S - P S S - P - P - P S - P - P S - P - P S S - P - P S S S S - P - P - P - P - P - P - P - P - P - P - P - P - P - P S S S P S + S + S + S P S S S S - P - P - P S - P - P S S - P S - P S - P - P - P - P S - P S - P S - P - P - P - P - P - P S P - P - P + S P P P P P S + S P + S + S + S + S + S + S + S P - P - P - P - P + S + S S P S S + S + S P + S + S P + S + S + S + S + S + S P S S + S + S P + S + S + S + S + S + S + S P S + S + S + S P P S P - P - P - P + S P S - P - P - P - P - P - P S - P - P - P - P - P S - P - P S - P - P - P - P - P S - P - P - P - P S S - P - P S P S - P - P + S S P S S S - P S - P - P S - P S - P S - P - P S S S - P + S P S + S P S S S S - P - P - P S S P @@ -931,68 +677,48 @@ S S S - P S - P + S P S S P S S - P S - P - P - P - P S - P S - P - P - P S - P S S S P - P - P - P S - P + S S P S S S - P - P S P P S - P S - P - P - P + S + S S P - P - P + S + S + S P P P S S - P - P S - P - P + S S S S @@ -1000,32 +726,32 @@ P P S - P S - P S - P - P S - P + S + S + S + S + S S P + S + S P P + S + S + S P - P - P + S P P P S - P S - P - P S S - P S S P @@ -1033,86 +759,69 @@ S S S + S + S + S P - P + S P P S S - P - P S S S S S - P - P S P - P S P - P S - P S - P - P - P + S + S + S S P + S + S + S + S P P P S P S - P - P S - P - P S - P - P S S S - P S - P - P S - P - P - P S P - P S P S S - P - P - P - P S P S + S + S P + S + S P - P - P + S P S S S - P S - P - P S P P @@ -1120,142 +829,81 @@ P P S - P - P S - P - P - P S - P - P - P S P - P S - P S P S - P - P - P - P - P - P - P - P S S - P S S - P - P - P S P S - P S S P - P - P S S P S - P S S - P S S - P S P - P - P - P - P - P - P S P S - P - P - P S S - P S - P S S - P - P - P - P + S P S S S S - P - P - P - P S - P S - P - P - P - P + S P S S - P - P - P - P - P - P - P - P S P P S - P S - P - P - P S - P - P S S S S S - P S - P S S - P + S + S + S + S + S + S + S S S P - P - P - P - P + S + S + S P P S @@ -1263,127 +911,94 @@ P P S - P S - P S S S S - P - P - P S S - P + S S S P S S - P S - P S S - P - P S S S - P - P S S S P - P - P - P + S S S P P S P - P + S + S P P S S S - P S - P - P - P - P - P - P - P - P - P S S - P - P - P S P S - P - P - P - P - P - P - P - P + S P S S - P - P - P - P - P - P S P - P - P - P S - P + S + S + S + S + S S P P P S P - P S S S - P - P S S P + S P S + S + S + S + S + S + S P + S P + S + S P P S - P S S - P S S - P S S S @@ -1391,10 +1006,16 @@ S P P + S + S + S + S + S P P P P + S P P P @@ -1403,42 +1024,55 @@ P S S - P S - P - P S - P - P - P - P + S S P - P - P - P - P + S S P + S P + S + S + S + S P S + S P P + S P + S P + S + S + S + S P + S + S + S + S + S P + S + S + S P P S S - P + S S S P P S + S + S P S S @@ -1448,23 +1082,25 @@ P S S - P - P + S + S S S P + S + S + S + S P P - P + S P P S S S P - P S - P S S S @@ -1473,154 +1109,101 @@ S S S - P - P - P - P - P - P S - P S P - P - P - P - P - P - P - P S - P S S S S S - P - P - P - P S S S S - P - P - P - P + S S S S S P + S + S + S P S + S P + S P S P S S - P - P + S P P P P S S - P S S - P - P - P - P S - P - P - P - P S - P S - P - P - P S - P - P - P S - P S S - P - P - P S - P S - P - P - P - P - P - P - P - P + S + S + S P S + S P S S S - P - P S - P - P S - P - P - P + S S P - P + S P S S S - P S - P S P - P - P - P + S + S S S S P + S + S P + S P S S S - P S S S - P S P - P - P - P - P - P + S + S + S S S S @@ -1632,36 +1215,51 @@ S P P + S + S + S + S + S P P S - P S - P S P S + S P + S + S + S P P + S + S P S S S - P S - P + S S P P P + S + S + S + S + S P - P - P + S P S P S + S + S + P P P S @@ -1669,38 +1267,27 @@ S P S + S P S S P S S - P S S S - P S - P S S - P S - P - P - P - P S S P - P S S P S - P - P - P S S S @@ -1708,150 +1295,119 @@ P S S - P S - P S - P + S + S + S + S + S P P P S - P + S P P S - P S S + P S P P S S + S P P P P P + S P P P S S S - P - P - P - P S - P - P S - P - P S P P - P - P - P - P - P - P - P - P - P + S P P P S S - P - P - P - P - P - P - P - P - P - P + S + S + S + S + S + S + S + S + S S P P P P P + S P + S + S P S S S - P - P - P - P S - P - P + S P S S - P S - P + S S S P + S + S P S S S - P - P S - P - P S S S - P - P - P - P - P - P - P - P - P S S S S - P S - P S - P - P + S P S P + S P + S P S S - P - P - P S P S S P - P - P + S + S S S P @@ -1859,34 +1415,26 @@ S S S - P S - P S P S S - P - P S P S P P - P - P - P S P - P - P S - P + S + S + S S S P - P - P + S S P P @@ -1896,31 +1444,18 @@ S S S - S P S P P P - P S S - P - P - P - P S - P S P P S - P - P - P - P - P - P S S S @@ -1928,59 +1463,46 @@ S S S - P - P - P - P S - P S - P S S S - P S - P S P S - P - P - P - P S - P - P - P S - P - P - P - P S P S P P + S P P S P - P S S S S S P - P + S P S S + S + S + S + S P P - P + S + S + S S S P @@ -1988,103 +1510,56 @@ P S S - P S - P S S P S S - P - P - P S - P - P - P S P - P S S S - P - P S - P S - P - P - P - P S - P - P - P - P - P - P - P - P - P - P - P S - P - P + S P S S S - P S - P - P S - P S S - P S S - P - P - P - P S S P P - P S P S - P - P - P - P - P S - P - P - P S S - P - P - P - P + S S S P P - P - P + S + S + S P P S - P + S + S S P P @@ -2096,177 +1571,89 @@ P S S + S + P P S S - P S - P S - P - P - P - P S S - P - P - P S S P - P S - P S S - P - P S S - P - P S - P - P - P S S S S - P - P S P S - P + S P S S - P - P - P - P - P S - P - P S - P S P P - P S S S - P - P - P - P + S P S S - P S - P - P - P - P - P - P - P - P - P - P - P - P - P - P S S - P S S P S - P - P - P - P - P - P - P - P - P - P + S P S S - P - P S S - P S - P S S S - P - P S - P - P - P S - P + S S P S S - P S S S - P - P S S P S - P - P - P - P - P - P - P - P - P - P S - P - P - P - P - P - P - P - P - P - P - P S P S - P S S S S P P + P S P S @@ -2275,154 +1662,69 @@ S P S - P S P - P S - P - P S - P + S P S S - P - P - P S - P - P S P - P - P S - P S P P S S - P S - P - P S P - P - S S S P - P - P S P - P S - P + S P P S S - P - P - P S S - P - P S - P - P - P S S - P - P - P - P - P S - P S - P S - P S - P - P - P - P - P - P S - P - P - P - P S - P S - P - P S S - P - P - P - P - P S - P S - P - P - P - P - P - P - P - P - P S S - P - P - P - P S S P S - P S - P - P - P - P - P S S - P S - P S - P - P - P S - P - P S S - P S S - P - P - P S S S @@ -2430,50 +1732,28 @@ S S S - P S - P - P S - P - P - P + S P S S - P - P - P - P S P S - P - P - P + S P S S S P - P - P - P - P - P - P - P - P - P - P S S P - P - P - P S + P S + P S S S @@ -2483,258 +1763,134 @@ S P S - S P S S S S - P - P - P S - P S - P - P S S - P - P - P S S P - P - P S P - P - P - P - P - P - P - P - P - P + S S P + S P S S S S - P + S P S S S - P - P - P - P + S S P P P S - P S - P S - P S P - P - P S - P S - P - P - P S P - P - P - P - P S S - P S S - P S - P S - P - P - P - P - P - P - P S S - P - P - P - P S - P + S P S S S - P - P - P - P - P - P - P - P S - P - P S P - P - P - P S P - P S - P - P S P P - P S - P - P S - P - P S - P - P - P S S P - P - P - P - P S - P - P S - P S S S P S - P - P - P S - P S - P S P - P S - S - P - P - P - P - P - P P S + S P S S S - P - P S - P - P S - P S S S - P S S - P S S P P - P - P - P S S S S - S - P P S S S - P - P - S - P S S S P S S - P - S - P S P S S - P - P - P - P S - P S S S S - P S - P - P - P - P S S S - P - P - P S S S @@ -2742,57 +1898,31 @@ S S P - P - P - P - P - P - P - P S - P S - P - P - P S - P - P S - P - P - P - P - P - P - P S S - P - P - P - P - P S - P S S P P + S P S S P - P + S P P P S S - P - P - P - P + S + S + S P P S @@ -2800,12 +1930,6 @@ S P S - P - P - P - P - P - P S P S @@ -2813,13 +1937,7 @@ S S S - P - P - P S - P - P - P S P S @@ -2829,55 +1947,27 @@ S P S - P - P S P S - S - P P S P - P S S S - P S - P - P - P S - P - P - P S - P S S - P - P - P S - P - P - P - P S P S S - P - P - P - P - P - P S S - P - S S S S @@ -2887,13 +1977,8 @@ P P S - P S - P S - P - P - P S P S @@ -2903,75 +1988,35 @@ S S S - P - P - P - P - P S S - P - P - P - P S - P S S - P S - P S S - P - P - P - P - P - P - P - P S S - P S - P S S P - P - P - P - P S S S - P - P - P - P - P S - P - P - P - P - P - P - P + S S S P P + S + S + S P P P - P - P - P - P - S - P S S S @@ -2979,87 +2024,53 @@ S S P - P S P - P S - P S - P - P - P - P - P S S - P S P S P - P - P - P S - P + S S P - P - P - P - P - P + S P P S S - P - P - P - P - P S - P + S P S S P - P - P - P - P - P S - P S - P - P - P - P - P - P - P S S - P - P S - P S S - P + S + S P P P P S - P S S S P P + S + S + S P S P @@ -3067,14 +2078,10 @@ P S S - P - P S - P S - P - P - P + S + S S S P @@ -3082,45 +2089,24 @@ P S S - P S S - P S S P - P - P - P - P S - P S - P S S S - P - P - P - P - P - P - P - P S S S - P S P S P P - P - P - P - P S S S @@ -3132,106 +2118,66 @@ S P S + S + S P S S S P - P S S S S P - P S - P - P - P - P - P - P - P - P - P - P - P S S + P S S - P S P - P - P S S S S S S - P - P - P - P S P - P - P - P S - P S - P S P - P S - P - P S - P - P - P - P S S - P - P - P S - P - P S S - P - P S S S - P - P S - P - P - P S - P S - P + S P S S - P - P S - P S - P + S S S P + S + S + S + S + S P S P @@ -3239,314 +2185,191 @@ P S S - P S S S - P S S - P S P S S P S + S P P P S - P S - P - P S - P - P - P S S - P S S - P S S S S - P - P S P - P - P - P - P S - P - P S - P - P S S - P S S - P - P - P - P - P - P S - P - P - P - P + S + S P S S S S P - P S - P - P - P - P S S S S - P S S - P - P - P S S - P - P S P - P S P - P - P - P - P - P - P - P - P - P S - P S P P - P S - P - P - P - P - P - P - P - P S - P + S P P P S S P - P - P S - P - P + S S P S S - P - P S S - P - P - P - P + S P S S - P S - P - P - P - P - P - P - P - P - P - P - P - P - P - P - P - P S - P - P S - P - P - P - P S - P - P S - P - P S - P - P - P - P S P S - P S S - P S - P S - P S S P S S S - P - P S P P - P - P - P S P S - P - P S P + S + S P S S S - P - P S - P S - P - P S - P - P - P - P S S - P - P S S - P - P - P S P S P S - P - P S - P S S - P - P - P - P S - P S S P - P S - P - P S S S S - P S P - P - P - P S - P - P S S - P S - P - P - P S - P + S P S S P P - P + S + S + S S P P + S + S P - P - P + S + S P S P + S + S + S + S P + S + S P P + S + S + S + S + S + S + S + S + S P + S P P P @@ -3557,9 +2380,8 @@ S S S - P - P - P + S + S P S P @@ -3567,146 +2389,84 @@ S S S - P S S P - P - P S S S - P - P - P S P S S P P - P - P - P - S S - P - P - P - P - P - P S - P - P - P S - P - P S - P - P - P - P S - P - P S S S - P - P - P S S - P S S P P - P S P S - P S - P S - P S - P - P - P S P S S S S - P S S S S - P - P S - P S P P S - P - P S P - P S - P S - P - P - P - P S - P - P S P - P - P S - P S S - P - P S - P S - P S - P S - P + S S S P - P + S + S + S S P P - S P P P - S P - S P P S - P S S S @@ -3714,72 +2474,55 @@ P S S - P S S P - P - P - P - P + S + S + S + S P P S - P S - P S - P S - P S - P - P S P P S + S + S P + S P P S P P S + S P S P S S - P - P - P S S S - P - P - P S P - P S - P - P - P S P P P - P - P S - P - P - P S - P - P + S + S + S S P P @@ -3787,345 +2530,348 @@ S S S - P S S P S - P - P S - P S P - P S - P - P - P - P - P S - P - P - P - P - P - P - P - P S - P S - P - P S - P S S - P - P - P - P - P - P S - P - P + S S P - P - P - P + S P S S S S - P S - P S P - P - P - P - P S - P - P - P - P S P - P - P - P S S S P P - P - P S P - P - P - P S - P - P S - P - P S S P S S S - P - P - P - P - P - P - P - P - P - P - P - P S - P S - P - P - P S S - P - P - P - P - P - P - P - P - P - P S S - P - P - P - P - P - P S S - P + S + S P S P S S - P - P S - P S - P S - P S S - P - P S - P - P S S - P S S P P - P - P - P - P S - P S - P S P P S - P - P - P - P S P - P S P - P + S S S P + S + S P - P + S + S P S S S - P S - P - P - P - P - P - P S - P S S P S S S - P S - P S P - P S P + S P S + S + S P + S + S + S P P S - P + S S P P S - P S S - P S S - P - P S S S S S - P S S - P S - P S S - P - P - P S - P - P - P - P S S S P P - P - P - P - P - P + S + S + S + S + S + S + S + S + S + S + S + S + S S P + S P + S P + S P S P P + S P + S P S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S P + S + S + S + S + S P S S S P P + P + S + P S S S S + S + P + S + P P P + S + S P S + S + S + S + S + S + S + S P + S + S + S P + S + S + S + S + S P + S P + S + S + S + S + S P S S P + S + P + S + S P + S + S P + S P S S S + S + S + S + S + S + S + S P S S + S + S + S P S S + S + S P S S + S P S P + S + S + S P + S + S + S + S P P S P + S P - P + S + S P P S + S P + S P + S + S + S + S + S P + S P + S + S + S + S + S + S + S + S + S + S P S + S + S + S + S + S + S + S + S + S + P P P S @@ -4133,80 +2879,1708 @@ P S S + S + S + S + S + S + P P S + S + S + S + S P + S P + S + S + S + S + S + S P + S + S + S P + S + S + S + S + S P S + S + S + S + S + S P P S + S + S P P + S + S P + S + S + S + S + S + S + S + S + S + S P S S + P + S S P + S + S + S P P S + S + S + S + S P + S + S P + S + S + S P + S + S + S + S + S + S + S + S + S + S P S S + S P S P + S + S + S P S P + S P P + S + S + S + S + S + S + S + S + S P S + S P S + S + S + S + S + S + S + S + S + S + S P P S + S P + S + S P + S + S + S + S + S + S P + S + S P + S P + S P S P + S + S + S P + S + S + S + S + S + S + S + S + S + S + S + S P S + S P + S + S + S + S + S + S + S + S + S + P + S + S + S + S + S + P + S + S + S + S + S + S + S + S + S + P + S + P + S + P + P + S + S + S + S + P + P + S + P + S + P + S + S + P + S + S + S + S + S + S + S + S + S + S + S + S + S + P + S + S + S + S + S + S + S + P + S + S + S + S + S + S + S + P + S + S + S + S + S + P + S + P + S + S + S + S + S + S + P + P + S + S + P + S + P + S + S + S + S + S + S + S + S + S + S + S + S + P + P + S + S + P + S + S + S + P + P + S + P + S + S + S + S + S + S + P + S + S + S + S + S + S + S + S + S + S + S + P + P + P + S + S + S + S + S + S + S + S + P + S + P + S + S + S + S + S + P + P + S + P + S + S + P + S + P + S + S + S + S + S + S + S + S + S + S + S + S + S + P + S + S + S + S + P + P + S + S + S + P + S + S + S + S + S + S + S + S + P + P + S + S + S + S + S + P + P + P + S + P + P + S + S + S + S + S + S + S + S + S + P + S + S + S + S + P + S + S + S + S + S + S + S + S + S + S + P + S + S + S + S + P + S + P + S + S + S + S + S + P + P + S + S + S + S + S + S + S + S + S + P + S + S + S + S + S + S + S + S + S + S + S + S + S + S + P + P + S + S + S + S + S + P + P + P + P + P + P + P + S + S + S + S + P + S + S + S + S + S + P + S + S + S + P + S + S + S + S + S + S + S + P + P + P + S + P + S + S + S + S + S + S + P + S + S + S + P + P + P + S + S + S + S + S + S + S + P + S + P + P + P + S + S + P + S + S + P + S + S + S + S + S + S + P + S + S + P + S + P + S + P + P + P + S + S + S + S + S + S + S + S + S + S + P + S + S + P + S + P + S + S + S + P + P + S + S + S + S + S + S + S + P + S + P + S + S + S + S + S + S + P + P + P + P + S + S + P + S + S + S + S + P + P + P + P + S + P + S + P + S + S + S + S + P + S + S + P + P + S + S + S + S + S + P + S + S + P + S + P + S + S + S + S + S + P + S + S + S + S + S + S + S + S + P + P + S + S + S + S + P + S + P + S + S + P + S + S + S + S + S + P + P + S + S + S + S + S + S + S + S + S + P + S + S + S + S + S + S + P + S + S + P + S + P + S + S + S + P + P + P + P + S + S + S + S + P + S + S + P + S + S + S + P + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + P + P + P + S + S + S + S + P + P + S + S + S + S + S + P + P + P + S + S + S + S + S + S + S + S + S + S + S + S + P + S + P + S + S + S + P + S + S + S + S + S + S + P + S + S + S + S + S + P + S + S + S + P + S + P + S + S + S + S + S + S + P + S + S + S + S + S + S + S + S + S + S + P + P + S + S + P + P + S + S + P + S + P + P + P + S + P + S + S + P + S + S + S + P + S + P + S + S + S + P + P + P + S + S + S + S + S + S + S + S + P + P + S + S + S + P + S + S + S + P + S + S + S + S + S + P + S + S + S + S + S + P + P + S + S + S + S + S + P + S + P + S + S + S + S + S + S + S + S + S + P + S + P + S + S + S + S + S + P + S + S + S + S + S + S + S + S + P + S + S + P + S + S + P + S + S + S + S + P + S + S + P + S + S + S + S + S + P + P + S + S + S + S + S + S + P + S + S + S + P + S + S + P + S + S + P + S + S + P + P + S + P + S + S + P + S + P + P + S + P + P + S + S + S + S + S + P + S + S + S + S + S + S + P + S + P + P + P + S + S + S + S + S + S + P + S + S + S + P + P + S + P + S + P + S + S + P + S + S + S + S + S + P + P + S + S + P + S + S + S + S + S + S + S + P + S + S + S + P + S + S + P + P + P + P + S + S + P + P + S + P + S + S + S + P + S + S + S + S + S + S + P + P + P + S + P + P + S + S + S + S + P + S + S + P + S + S + S + S + S + S + S + S + P + S + S + P + P + S + P + S + S + S + P + P + S + S + S + S + S + S + S + S + P + S + S + S + S + S + S + S + S + S + S + P + S + S + S + P + S + S + P + P + S + P + S + S + S + S + P + S + P + S + S + S + S + S + P + P + S + S + P + P + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + P + S + S + S + P + S + S + S + S + S + P + S + S + S + S + S + S + P + S + S + S + S + S + S + S + S + S + S + P + P + S + S + S + S + S + S + S + S + S + P + P + S + P + P + S + S + S + S + S + P + S + S + P + P + S + P + S + S + S + S + S + S + P + S + P + S + S + S + S + P + P + S + S + S + S + S + S + S + S + S + S + P + S + S + P + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + P + S + S + S + S + P + P + S + S + S + S + P + S + S + S + S + S + S + S + S + P + P + S + P + P + S + S + S + S + S + P + P + S + S + S + S + S + S + S + S + S + P + S + S + S + S + S + S + S + P + S + P + P + S + S + P + S + S + S + P + P + S + P + S + S + P + S + P + S + S + S + P + S + S + P + S + P + S + S + S + S + S + P + S + P + S + S + S + S + P + S + P + S + P + P + S + S + S + P + S + S + S + S + P + P + S + P + S + S + S + S + S + S + P + S + S + S + S + S + S + S + P + S + P + P + S + S + S + S + S + S + P + P + S + S + S + S + S + S + S + S + P + S + S + S + S + P + S + P + S + S + S + S + S + S + S + S + S + P + S + S + S + S + S + S + S + P + S + S + P + S + S + S + S + S + P + S + S + S + S + S + S + S + S + S + P + S + P + S + S + S + P + P + S + S + S + S + S + S + S + S + S + S + S + S + S + P + S + S + S + S + S + P + S + S + S + S + S + S + P + S + S + S + S + S + S + S + S + S + S + P + S + S + S + S + S + P + S + S + S + S + S + P + P + S + S + P + P + S + S + P + S + S + S + S + S + S + S + P + S + S + S + S + S + P + S + S + S + S + P + S + P + S + S + S + S + P + S + S + S + S + S + S + S + S + P + P + S + S + P + S + S + S + S + P + S + S + P + S + S + S + S + S + S + P + P + S + S + S + S + S + S + S + S + S + S + S + S + S + P + S + S + S + S + P + S + S + P + P + S + S + S + P + P + P + S + S + S + S + S + P + S + P + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + P + S + S + S + S + P + P + S + P + S + S + S + P + S + S + S + S + S + P + S + S + S + P + S + S + P + S + S + P + S + P + S + S + S + S + S + S + P + S + S + S + P + P + S + S + S + P + P + S + S + P + S + P + S + P + P + S P + S + S + S + P + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + S + P + P + P + S + S + S + S + S + S + S + S + S + S + P + S + P + S + P + S + P + S + S + S + S + S + S + S + S + P + S + S P + S + S + S + S + S + S + S + S P S S - P + S P S S - P S - P - P - P - P S S S - P + S + S S P S @@ -4215,395 +4589,209 @@ S S S - P - P S - P - P - P - P - P - P - P - P - P + S + S + S + S P P P S P S - P - P - P S S - P S P - P S - P - P - P S - P S - P S - P - P - P S - P - P + S + S P S S S - P - P S - P - P S S S - P - P - P - P - P S - P - P S - P - P S - P - P - P - P - P - P - P S - P S P - P - P - P S S S - P S + S + S + P P S S S - P S - P S P S S S S - P S - P - P S - P - P S S - P S S - P - P S - P - P - P + S + S + S S S P P P P - P - P + S P S S - P S P - P S - P - P S - P S - P - P - P - P - P S P - P S S - P - P + S S P S + S + S P P S P P S - P S S - P - P S S S - P S S - P - P S - P S - P S - P S P - P S - P S S S S P S - P - P - P + S P S S - P - P - P - P S - P - P S - P S - P S - P - P - P - P S - P - P - P - P S - P S S S - P - P S P - P - P - S S S P - P - P - P - P + S S P - P + S P S S P - P - P - P S P - P S - P S - P - P - P - P - P - P S - P - P - P - P - P - P S S P - P - P S - P S - P - P - P S S - P - P - P S S S P - P - P S - P S - P S - P S - P - P - P - P - P S S S - P - P - P S P P S - P - P - P - P - P S P - P - P - P S - P S S S - P S S S - P - P - P - P - P - P - P S S - P - P - P - P - P - P - P S - P S P - P S P - P - P S S - P - P - P + S P S S - P - P - P S - P - P - P - P - P - P S S P P - P - P - P S P - S P S - P S - P S P S S - P S P S @@ -4611,33 +4799,23 @@ S S P - P - P S - P - P S S - P S S S P + P S S - P - P S S P S - P S P P - P - P - P S P S @@ -4647,37 +4825,34 @@ S S S - P S S S - P - P S S S - P - P - P - P - P + S S S S P S + S + S P P S + S P P P + S P S S S S - P + S P P S @@ -4687,36 +4862,20 @@ P P S - P S S - P - P - P - P S - P - P - P S - P - P - P S - P - P S P - P - P + S + S P S S S - P - P - P - P + S P S S @@ -4724,96 +4883,48 @@ S S P - P - P S - P - P S P + S P S S - P - P - P - P - P - P - P - P - P S - P S - P - P - P S S - P - P - P S S S S - P - P - P - P - P - P - P - P S S S - P + S P S S - P - P S - P - P - P S S - P - P S - P - P - P S - P S S S - P - P S - P S P - P - P - P - P - P - P S - P - P S - P - P S S P + S P + S + S P S P @@ -4824,13 +4935,8 @@ P S S - P - P - P S - P - P - P + S S P S @@ -4839,61 +4945,56 @@ S S S - P S P S - P - P - P S P S - P - P - P S P P P S - P S - P + S P S S S P P - P - P S - P S - P S S - P - P S - P + S + S S P P - P - P + S + S + S + S + S + S + S P S P P S - P + S S P P + S P S + S + P P P P @@ -4903,250 +5004,149 @@ P S P - P S S S S P - P - P + S + S S S P + P S P S S S - P S S - P - P S P S - P S - P S P - P - P S S S - P - P - P - P - P - P - P S S P + S P S S S - P - P - P S S - P S - P - P - P S - P - P S S P - P S - P - P - P - P S P - P S S S P - P - P - P S P S - P - P S P S - P S S P - P - P - P - P S P + S + S P S + S P S S S S - P + S P S S S - P - P S - P S S - P - P - P - P S - P S - P - P S S - P S P P P - P - P - P - P S P S S S + S P S S + P S S - P - P - P S - P - P - P S - P S P - P - P S P - P - P - P - P S - P S - P + S P P P S P S - P - P - P S - P S S - P S - P S P P - P S - P S S P S - P - P S - P S S - P S S - P S - P - P - P - P S - P - P S - P - P - P - P - P - P - P - P - P - P - P - P - P - P S - P S S - P S - P + S S S S P S S - P - P - P - P - P - P + S + S diff --git a/test-A/out.tsv_baseline b/test-A/out.tsv_baseline new file mode 100644 index 0000000..8508f03 --- /dev/null +++ b/test-A/out.tsv_baseline @@ -0,0 +1,5152 @@ + P + P + S + P + P + P + P + P + P + S + S + S + S + P + P + S + P + S + P + S + P + P + S + S + P + P + P + S + P + S + S + S + P + S + S + S + P + P + P + P + P + S + P + S + S + P + P + S + S + S + S + P + P + P + S + S + P + S + S + S + P + S + S + S + S + P + P + P + P + P + S + P + S + S + P + P + S + P + S + P + S + S + P + P + P + P + P + P + S + P + P + P + P + P + P + S + S + P + P + S + P + P + P + P + P + S + P + S + S + P + P + S + P + S + P + S + P + S + S + P + P + S + S + S + S + P + P + S + P + S + P + P + P + P + P + P + P + P + P + P + P + P + S + P + P + P + P + S + S + S + P + S + S + P + S + P + P + P + P + P + S + S + P + P + S + P + S + P + P + P + P + S + P + P + P + S + P + P + P + P + P + P + P + P + P + S + S + S + S + P + P + S + S + P + P + S + S + P + S + S + P + P + P + P + S + S + P + P + P + P + S + P + P + P + P + P + P + S + P + P + S + P + P + S + P + P + S + P + P + P + S + P + S + P + P + P + S + P + P + S + S + S + S + S + S + P + P + P + P + P + P + S + P + P + P + P + S + P + S + S + S + S + S + P + P + P + P + S + P + P + P + P + S + P + S + P + P + P + S + S + P + S + P + S + S + S + P + P + P + S + P + S + P + S + P + P + S + S + P + P + S + P + S + P + S + S + P + S + S + P + P + S + P + S + S + S + S + P + P + P + S + P + P + P + S + P + P + P + P + P + S + S + S + P + P + P + P + P + P + P + S + S + S + P + P + S + S + S + S + P + P + S + P + S + P + P + P + P + S + P + P + P + S + P + P + P + P + S + P + P + S + S + P + S + S + S + S + P + P + S + P + S + S + P + S + P + S + S + P + P + S + P + S + P + P + P + P + P + P + P + P + P + S + P + S + S + P + S + S + S + P + S + P + P + P + P + S + P + S + P + S + S + S + P + S + P + P + P + P + P + P + P + P + S + S + P + S + P + S + P + P + P + P + S + P + P + P + P + P + P + P + P + P + P + S + P + P + S + P + S + P + S + P + P + P + S + P + S + P + S + P + S + P + P + S + P + S + P + P + S + P + S + P + S + S + P + P + S + P + P + S + P + S + S + S + P + P + P + S + S + P + P + P + P + P + P + P + S + P + S + P + S + P + P + S + P + P + S + S + P + S + S + P + S + P + P + P + P + P + S + S + P + S + P + S + S + S + P + S + S + S + S + P + P + P + P + P + P + S + S + S + S + S + S + S + P + S + P + S + S + P + S + S + S + P + P + P + P + P + S + P + P + S + P + P + P + S + S + P + P + S + P + S + P + P + S + P + P + P + P + P + S + S + P + P + P + P + P + P + P + S + P + S + S + P + P + S + P + P + S + P + S + S + S + P + P + P + S + S + P + P + P + P + P + P + S + P + P + S + S + P + S + S + P + S + P + S + P + P + S + S + P + S + P + S + P + S + P + P + P + P + P + P + P + P + P + P + S + P + S + P + P + P + P + S + S + P + P + S + S + S + P + S + P + S + S + P + S + S + S + S + P + P + P + P + P + P + P + P + P + S + S + S + P + P + P + S + S + P + P + P + P + P + P + P + S + P + P + S + S + P + P + P + P + P + P + P + S + P + P + S + S + P + P + S + P + P + S + P + S + P + S + S + S + S + P + S + P + S + P + P + P + P + P + S + P + P + S + P + P + P + S + P + S + S + P + P + P + S + P + P + S + P + P + S + S + P + P + S + S + S + S + P + P + P + P + P + P + P + P + P + P + P + P + P + P + S + S + S + P + S + P + S + S + S + S + P + P + P + S + P + P + S + S + P + S + P + S + P + P + P + P + S + P + S + P + S + P + P + P + P + P + P + S + P + P + P + P + P + P + P + P + S + P + P + P + P + P + P + S + P + S + S + P + P + P + S + S + P + P + S + P + P + S + P + P + P + P + P + S + P + P + P + P + P + P + S + P + P + P + P + P + S + P + P + S + P + P + P + P + P + S + P + P + P + P + S + S + P + P + S + P + S + P + P + S + P + S + S + S + P + S + P + P + S + P + S + P + S + P + P + S + S + S + P + P + S + P + S + S + S + S + P + P + P + S + S + P + S + S + S + S + S + P + S + P + P + S + S + P + S + S + P + S + P + P + P + P + S + P + S + P + P + P + S + P + S + S + S + P + P + P + P + S + P + S + P + S + S + S + P + P + S + P + P + S + P + S + P + P + P + S + P + P + P + P + P + P + S + S + P + P + S + P + P + S + S + S + S + P + P + S + P + S + P + S + P + P + S + P + S + P + P + P + P + P + P + P + P + P + S + P + S + P + P + S + S + P + S + S + P + S + S + S + S + P + P + P + P + S + S + P + P + S + S + S + S + S + P + P + S + P + P + S + P + P + S + P + S + P + P + P + S + P + P + P + P + S + P + S + P + P + S + P + P + S + P + P + S + S + S + P + S + P + P + S + P + P + P + S + P + P + S + P + S + S + P + P + P + P + S + P + S + P + P + P + P + P + S + S + S + P + S + P + P + S + P + P + S + P + P + S + P + P + S + P + P + P + S + P + P + P + S + P + P + S + P + S + P + S + P + P + P + P + P + P + P + P + S + S + P + S + S + P + P + P + S + P + S + P + S + S + P + P + P + S + S + P + S + P + S + S + P + S + S + P + S + P + P + P + P + P + P + P + S + P + S + P + P + P + S + S + P + S + P + S + S + P + P + P + P + P + S + S + S + S + P + P + P + P + S + P + S + P + P + P + P + P + S + S + P + P + P + P + P + P + P + P + S + P + P + S + P + S + P + P + P + S + P + P + S + S + S + S + S + P + S + P + S + S + P + S + S + P + P + P + P + P + P + P + S + S + P + P + S + P + S + P + S + S + S + S + P + P + P + S + S + P + S + S + P + S + S + P + S + P + S + S + P + P + S + S + S + P + P + S + S + S + P + P + P + P + S + S + P + P + S + P + P + P + P + S + S + S + P + S + P + P + P + P + P + P + P + P + P + S + S + P + P + P + S + P + S + P + P + P + P + P + P + P + P + P + S + S + P + P + P + P + P + P + S + P + P + P + P + S + P + S + P + P + P + S + P + P + S + S + S + P + P + S + S + P + P + S + P + P + P + P + S + P + S + S + P + S + S + P + S + S + S + S + S + P + P + P + P + P + P + P + P + P + S + S + P + S + S + P + S + P + P + S + P + P + P + P + S + P + P + P + P + P + S + P + P + P + S + P + P + P + P + P + P + P + P + S + S + P + S + S + P + P + S + P + S + S + S + S + P + P + S + S + P + P + S + S + P + P + P + P + P + P + S + S + S + P + P + S + P + S + S + S + S + S + S + S + S + P + P + P + P + P + P + S + P + S + P + P + P + P + P + P + P + P + S + P + S + S + S + S + S + P + P + P + P + S + S + S + S + P + P + P + P + S + S + S + S + P + P + S + P + P + S + P + S + S + P + P + P + P + P + P + S + S + P + S + S + P + P + P + P + S + P + P + P + P + S + P + S + P + P + P + S + P + P + P + S + P + S + S + P + P + P + S + P + S + P + P + P + P + P + P + P + P + P + S + P + S + S + S + P + P + S + P + P + S + P + P + P + S + P + P + P + S + S + S + P + S + P + S + P + P + P + P + S + S + S + P + P + P + S + S + S + P + S + S + S + P + S + P + P + P + P + P + P + S + S + S + S + S + S + S + S + S + P + P + P + P + S + P + S + P + S + P + S + P + P + P + P + S + S + S + P + S + P + S + P + P + P + P + P + P + P + S + P + S + P + P + S + S + S + P + S + P + S + S + P + S + S + P + S + S + S + P + S + P + S + S + P + S + P + P + P + P + S + S + P + P + S + S + P + S + P + P + P + S + S + S + S + P + S + S + P + S + P + S + P + P + P + P + S + P + P + P + S + P + S + S + S + P + P + S + S + P + P + P + P + P + P + P + P + S + S + S + P + P + P + P + S + P + P + S + P + P + S + P + P + P + P + P + P + P + P + P + P + P + P + P + P + S + S + P + P + P + P + P + P + P + P + P + P + S + P + P + P + P + P + P + P + S + S + S + P + P + P + P + S + P + P + P + S + S + P + S + P + S + S + P + P + S + S + S + P + P + S + P + P + S + S + S + P + P + P + P + P + P + P + P + P + S + S + S + S + P + S + P + S + P + P + P + S + P + P + P + S + S + P + P + P + S + P + S + S + P + P + P + S + S + P + S + S + S + S + P + S + P + S + P + S + S + P + P + S + P + S + P + P + P + P + P + S + P + P + P + S + P + S + S + P + P + P + S + P + P + P + S + S + S + S + S + S + P + S + P + P + P + P + S + S + P + P + P + P + S + P + S + P + P + S + P + P + P + P + P + P + S + S + S + P + S + S + S + P + P + P + P + S + P + S + P + S + S + S + P + S + P + S + P + S + P + P + P + P + S + P + P + P + S + P + P + P + P + S + P + S + P + P + P + P + S + P + P + S + S + S + S + S + P + P + P + S + S + P + P + P + S + S + P + P + P + S + S + P + S + P + S + S + P + S + S + P + P + P + S + P + P + P + S + P + P + S + S + S + P + P + S + P + S + P + P + P + P + S + P + P + P + P + P + P + P + P + P + P + P + S + P + P + P + S + S + S + P + S + P + P + S + P + S + S + P + S + S + P + P + P + P + S + S + P + P + P + S + P + S + P + P + P + P + P + S + P + P + P + S + S + P + P + P + P + S + S + P + P + P + P + P + P + S + P + S + P + P + P + S + S + S + S + P + S + S + P + S + S + P + S + P + S + P + P + P + P + S + S + P + P + P + S + S + P + P + S + P + S + S + P + P + S + S + P + P + S + P + P + P + S + S + S + S + P + P + S + P + S + P + P + S + S + P + P + P + P + P + S + P + P + S + P + S + P + P + P + S + S + S + P + P + P + P + P + S + S + P + S + P + P + P + P + P + P + P + P + P + P + P + P + P + P + S + S + P + S + S + P + S + P + P + P + P + P + P + P + P + P + P + P + S + S + P + P + S + S + P + S + P + S + S + S + P + P + S + P + P + P + S + P + S + P + S + S + P + S + S + S + P + P + S + S + P + S + P + P + P + P + P + P + P + P + P + P + S + P + P + P + P + P + P + P + P + P + P + P + S + P + S + P + S + S + S + S + P + P + S + P + S + P + S + S + P + S + P + S + P + P + S + P + P + S + P + P + S + S + P + P + P + S + P + P + S + P + P + P + S + P + S + P + P + S + S + P + S + P + P + S + P + P + S + S + S + P + P + P + S + P + P + S + P + P + P + S + S + P + P + P + S + S + P + P + S + P + P + P + S + S + P + P + P + P + P + S + P + S + P + S + P + S + P + P + P + P + P + P + S + P + P + P + P + S + P + S + P + P + S + S + P + P + P + P + P + S + P + S + P + P + P + P + P + P + P + P + P + S + S + P + P + P + P + S + S + P + S + P + S + P + P + P + P + P + S + S + P + S + P + S + P + P + P + S + P + P + S + S + P + S + S + P + P + P + S + S + S + S + S + S + S + P + S + P + P + S + P + P + P + P + S + S + P + P + P + P + S + P + S + P + P + P + P + S + S + S + P + P + P + P + P + P + P + P + P + P + P + S + S + P + P + P + P + S + S + S + S + S + P + P + S + S + P + S + S + P + S + S + S + S + P + P + P + S + P + S + P + P + S + S + P + P + P + S + S + P + P + P + S + P + P + P + P + P + P + P + P + P + P + S + P + P + S + S + S + S + P + P + S + S + S + P + P + P + P + S + P + P + P + S + P + S + P + S + P + S + P + P + P + S + P + S + P + P + P + S + P + P + P + P + P + S + S + P + S + S + P + S + P + S + P + P + P + P + P + P + P + S + S + P + P + P + P + S + P + P + S + S + S + P + P + P + P + P + P + P + P + S + P + P + S + P + P + P + P + S + P + P + S + P + P + S + P + P + P + S + P + P + S + P + P + S + P + P + P + S + S + P + P + P + P + P + S + P + P + S + P + S + S + S + P + S + P + P + P + S + P + S + P + S + P + P + S + S + P + P + P + P + P + P + P + S + P + S + S + S + P + P + S + P + P + S + P + S + S + S + P + S + S + P + S + S + P + P + P + P + P + S + S + S + S + S + P + P + S + S + S + P + P + S + P + S + S + S + P + S + S + P + S + P + S + P + S + S + P + P + P + P + S + P + S + S + S + S + P + S + P + P + P + P + S + S + S + P + P + P + S + S + S + S + S + S + P + P + P + P + P + P + P + P + S + P + S + P + P + P + S + P + P + S + P + P + P + P + P + P + P + S + S + P + P + P + P + P + S + P + S + S + P + P + P + S + S + P + P + P + P + P + S + S + P + P + P + P + P + P + S + S + S + P + S + P + P + P + P + P + P + S + P + S + S + S + S + S + P + P + P + S + P + P + P + S + P + S + S + S + P + S + P + S + P + P + S + P + S + S + P + P + S + P + P + S + S + S + P + S + P + P + P + S + P + P + P + S + P + S + S + P + P + P + S + P + P + P + P + S + P + S + S + P + P + P + P + P + P + S + S + P + S + S + S + S + S + S + S + P + P + S + P + S + P + S + P + P + P + S + P + S + S + S + S + S + S + S + P + P + P + P + P + S + S + P + P + P + P + S + P + S + S + P + S + P + S + S + P + P + P + P + P + P + P + P + S + S + P + S + P + S + S + P + P + P + P + P + S + S + S + P + P + P + P + P + S + P + P + P + P + P + P + P + S + S + P + P + P + P + P + P + P + P + P + S + P + S + S + S + S + S + S + P + P + S + P + P + S + P + S + P + P + P + P + P + S + S + P + S + P + S + P + P + P + P + S + P + S + P + P + P + P + P + P + P + P + S + S + P + P + P + P + P + S + P + P + S + S + P + P + P + P + P + P + S + P + S + P + P + P + P + P + P + P + S + S + P + P + S + P + S + S + P + P + P + P + P + S + P + S + S + S + P + P + P + S + P + S + P + S + S + P + P + S + P + S + P + P + P + S + S + P + S + P + S + S + P + S + S + P + S + S + P + P + P + P + P + S + P + S + P + S + S + S + P + P + P + P + P + P + P + P + S + S + S + P + S + P + S + P + P + P + P + P + P + S + S + S + S + S + S + P + S + S + P + S + P + S + S + S + P + P + S + S + S + S + P + P + S + P + P + P + P + P + P + P + P + P + P + P + S + S + S + S + P + S + P + P + P + S + S + S + S + S + S + P + P + P + P + S + P + P + P + P + S + P + S + P + S + P + P + S + P + P + S + P + P + P + P + S + S + P + P + P + S + P + P + S + S + P + P + S + S + S + P + P + S + P + P + P + S + P + S + P + P + S + S + P + P + S + P + S + P + S + S + P + P + S + P + S + P + S + S + P + S + S + S + P + S + S + P + S + P + S + S + P + S + P + P + P + S + P + S + P + P + S + P + P + P + S + S + P + S + S + P + S + S + S + S + P + P + S + P + P + P + P + P + S + P + P + S + P + P + S + S + P + S + S + P + P + P + P + P + P + S + P + P + P + P + P + S + S + S + S + P + P + S + P + P + P + P + S + S + S + S + P + S + S + P + P + P + S + S + P + P + S + P + P + S + P + P + P + P + P + P + P + P + P + P + S + P + S + P + P + P + S + P + P + P + P + P + P + P + P + S + P + P + P + P + S + S + P + P + P + S + P + P + S + P + S + S + P + P + S + S + P + P + P + P + P + S + S + P + S + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + P + S + P + P + S + P + P + P + P + S + P + P + S + P + P + S + P + P + P + P + S + P + S + P + S + S + P + S + P + S + P + S + S + P + S + S + S + P + P + S + P + P + P + P + P + S + P + S + P + P + S + P + P + S + S + S + P + P + S + P + S + P + P + S + P + P + P + P + S + S + P + P + S + S + P + P + P + S + P + S + P + S + P + P + S + P + S + S + P + P + P + P + S + P + S + S + P + P + S + P + P + S + S + S + S + P + S + P + P + P + P + S + P + P + S + S + P + S + P + P + P + S + P + P + S + S + P + P + P + S + P + P + P + P + P + P + S + P + P + P + P + P + P + P + P + P + S + S + S + S + S + S + P + P + P + P + S + P + P + S + S + S + P + S + S + P + P + P + S + S + S + P + P + P + S + P + S + S + P + P + P + P + P + S + S + P + P + P + P + P + P + S + P + P + P + S + P + P + S + P + P + P + P + S + P + P + S + S + S + P + P + P + S + S + P + S + S + P + P + P + S + P + S + P + S + P + S + P + S + P + P + P + S + P + S + S + S + S + P + S + S + S + S + P + P + S + P + S + P + P + S + P + P + S + P + P + S + P + S + P + P + P + P + S + P + P + S + P + P + P + S + P + S + S + P + P + S + P + S + P + S + P + S + P + S + S + P + P + S + P + P + S + P + P + P + S + P + S + P + P + S + P + S + S + S + S + P + S + S + P + S + S + P + P + P + P + P + P + P + S + P + S + P + S + P + S + P + S + P + P + S + P + P + S + P + P + P + S + P + P + S + P + S + P + S + S + P + P + P + S + S + S + P + P + P + S + P + P + S + P + P + P + S + P + P + P + P + P + S + P + P + P + S + P + P + S + P + P + P + S + S + S + P + S + S + P + S + P + P + S + P + S + P + P + S + P + P + P + P + P + S + P + P + P + P + P + P + P + P + S + P + S + P + P + S + P + S + S + P + P + P + P + P + P + S + P + P + S + P + P + P + P + P + S + S + S + S + P + S + P + S + P + P + P + P + P + S + P + P + P + P + S + P + P + P + P + S + S + S + P + P + P + P + S + P + P + P + P + S + P + P + S + P + P + S + S + P + S + S + S + P + P + P + P + P + P + P + P + P + P + P + P + S + P + S + P + P + P + S + S + P + P + P + P + P + P + P + P + P + P + S + S + P + P + P + P + P + P + S + S + P + P + S + P + S + S + P + P + S + P + S + P + S + P + S + S + P + P + S + P + P + S + S + P + S + S + P + P + P + P + P + P + S + P + S + P + S + P + P + S + P + P + P + P + S + P + P + S + P + P + S + S + P + P + P + P + S + S + S + P + S + P + P + P + P + P + P + S + P + S + S + P + S + S + S + P + S + P + S + P + P + S + P + P + S + P + P + P + S + P + S + P + P + S + P + S + S + P + S + S + P + P + S + S + S + S + S + P + S + S + P + S + P + S + S + P + P + P + S + P + P + P + P + S + S + S + P + P + P + P + P + P + P + S + P + P + P + P + S + P + P + P + P + S + P + P + S + S + S + P + P + S + S + S + S + P + P + P + S + P + P + P + P + P + S + S + P + P + P + P + S + S + S + P + S + S + P + S + S + P + S + S + P + S + P + P + P + P + S + P + P + P + P + P + S + P + P + P + P + P + S + P + P + S + S + P + S + S + P + S + P + P + P + P + P + S + P + P + S + P + P + P + P + S + S + S + P + P + P + S + P + P + P + P + S + S + P + S + P + P + S + P + P + P + P + S + P + S + P + P + S + P + P + P + P + P + P + S + P + P + P + S + P + P + P + P + S + S + P + P + S + S + P + S + P + P + P + P + S + S + S + P + S + P + S + P + P + S + S + S + P + P + S + P + P + P + P + P + P + P + P + P + P + P + P + S + P + S + P + P + P + S + S + P + S + P + P + S + P + P + P + S + P + S + P + S + P + P + P + S + P + P + P + S + S + S + P + P + S + P + P + S + S + S + P + P + P + P + P + S + P + P + S + P + P + S + P + P + P + P + P + P + P + S + P + S + P + P + P + P + S + S + S + P + S + P + S + S + S + P + S + P + S + P + S + S + S + S + P + S + P + P + S + P + P + S + S + P + S + S + P + P + S + P + P + P + S + S + P + P + P + P + P + P + P + S + S + P + S + P + P + S + P + P + S + P + S + P + P + P + P + P + S + P + P + S + S + P + P + S + P + S + P + P + S + P + P + S + P + S + S + P + P + S + S + S + P + S + S + P + P + S + P + S + P + S + P + S + P + P + S + P + S + S + S + S + P + S + P + P + P + P + S + S + P + P + P + P + S + P + P + S + P + S + P + S + P + P + P + P + S + P + P + P + P + S + P + S + S + S + P + P + S + P + P + P + S + S + S + P + P + P + P + P + S + P + P + P + S + S + P + P + P + P + S + P + P + S + P + S + P + P + P + P + P + P + S + P + P + P + P + P + P + S + S + P + P + P + S + P + S + P + P + P + S + S + P + P + P + S + S + S + P + P + P + S + P + S + P + S + P + S + P + P + P + P + P + S + S + S + P + P + P + S + P + P + S + P + P + P + P + P + S + P + P + P + P + S + P + S + S + S + P + S + S + S + P + P + P + P + P + P + P + S + S + P + P + P + P + P + P + P + S + P + S + P + P + S + P + P + P + S + S + P + P + P + P + S + S + P + P + P + S + P + P + P + P + P + P + S + S + P + P + P + P + P + S + P + S + P + S + P + S + P + S + P + S + S + P + S + P + S + S + S + S + P + P + P + S + P + P + S + S + P + S + S + S + P + S + S + P + P + S + S + P + S + P + S + P + P + P + P + P + S + P + S + S + S + S + S + S + S + P + S + S + S + P + P + S + S + S + P + P + P + P + P + S + S + S + P + S + P + P + S + P + P + P + P + S + S + S + S + P + P + P + S + S + P + S + P + P + S + P + S + S + P + P + P + P + S + P + P + P + S + P + P + P + S + P + P + S + P + P + P + P + S + S + S + P + P + P + P + P + S + S + S + S + S + P + P + P + S + P + P + S + P + P + S + S + P + P + P + P + P + P + P + P + P + S + P + S + P + P + P + S + S + P + P + P + S + S + S + S + P + P + P + P + P + P + P + P + S + S + S + P + P + S + S + P + P + S + P + P + P + S + S + P + P + S + P + P + P + S + P + S + S + S + P + P + S + P + S + P + P + P + P + P + P + P + S + P + P + S + P + P + S + S + P + P + P + S + P + S + P + P + P + P + S + S + P + P + P + S + P + P + P + S + P + S + S + S + S + S + S + P + S + P + S + P + P + P + S + P + S + P + P + P + S + P + P + P + S + P + S + P + P + S + S + S + P + P + P + P + S + P + S + P + S + S + P + P + S + P + S + P + P + P + P + P + S + P + P + S + P + S + P + P + P + S + P + P + P + P + S + S + P + S + P + P + S + S + S + S + P + P + P + S + S + P + S + P + S + S + S + P + S + S + P + P + S + P + S + P + S + P + S + P + P + P + S + S + S + P + P + P + P + P + P + P + S + S + P + P + S + S + S + P + P + P + S + S + P + S + P + P + P + S + P + P + S + S + P + P + S + P + P + P + P + S + P + P + S + S + S + P + P + P + P + S + P + S + P + P + S + P + S + P + S + S + P + P + P + P + P + S + P + P + S + P + S + S + S + S + P + P + S + S + S + P + P + S + P + S + S + P + P + P + P + S + P + S + P + P + S + S + P + S + P + P + P + P + P + P + P + S + P + S + S + S + P + S + S + S + S + P + P + P + S + P + P + P + S + P + S + P + P + P + S + P + P + P + P + P + S + P + S + P + P + P + P + S + P + S + P + P + P + S + P + S + S + P + S + P + S + P + P + P + S + P + S + S + P + S + P + P + S + P + S + S + P + S + S + P + S + P + P + P + P + S + P + P + S + P + P + P + P + P + P + P + P + P + P + P + P + P + P + S + P + S + S + P + S + P + S + S + S + P + S + S + P + P + P + P + P + P diff --git a/train.py b/train.py index ea1c148..d9ea143 100755 --- a/train.py +++ b/train.py @@ -3,89 +3,81 @@ from collections import defaultdict import math import pickle import re +import sys -# in expected.tsv def calc_class_logprob(expected_path): - paranolal_classcount=0 - sceptic_classcount=0 + paranormal_classcount = 0 + sceptic_classcount = 0 + with open(expected_path) as f: for line in f: line = line.rstrip('\n').replace(' ','') if 'P' in line: - paranolal_classcount +=1 + paranormal_classcount +=1 elif 'S' in line: sceptic_classcount +=1 - paranol_prob = paranolal_classcount / (paranolal_classcount + sceptic_classcount) - sceptic_prob = sceptic_classcount / (paranolal_classcount + sceptic_classcount) + paranol_prob = paranormal_classcount / (paranormal_classcount + sceptic_classcount) + sceptic_prob = sceptic_classcount / (paranormal_classcount + sceptic_classcount) return math.log(paranol_prob), math.log(sceptic_prob) -def clear_tokens(tokens, is_text=True): - tokens = tokens.replace('\\n', ' ') - return tokens - # delete links, special characters, kropki, and \n - tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens) - tokens = re.sub(r'(|\-|\_)([a-z]+(\-|\_))+[a-z]+(|\-|\_)', ' ', tokens) - tokens = re.sub(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@\’\>\″\±]+', ' ', tokens) - tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens) - tokens = re.sub(r'[0-9]+', ' ', tokens) - tokens = re.sub(r'œ|·', '', tokens) - if is_text: - tokens = re.sub(r' +', ' ', tokens) - else: - tokens = re.sub(r' +', '', tokens) - return tokens +def clear_post(post): + post = post.replace('\\n', ' ') + # delete links + post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+]+(\)|)', '', post) + post = re.sub(r'[\.\,\/]+', ' ', post) + post = re.sub(r'(<|>)','',post) + post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%]+', '', post) + post = re.sub(r' \- ', ' ', post) + post = re.sub(r' +', ' ', post) + post = post.rstrip(' ') + return post -# ile razy slowo wystepuje w dokumentach w danej klasie -def calc_word_count(in_path, expected_path): - word_counts = {'paranormal':defaultdict(int), 'sceptic': defaultdict(int)} # dzienik zawierajacy slownik w ktorym s slowa i ile razy wystepuja - with open(in_path) as infile, open(expected_path) as expectedfile: - for line, exp in zip(infile, expectedfile): - class_ = exp.rstrip('\n').replace(' ','') - text, timestap =line.rstrip('\n').split('\t') - #print(f"text {type(text)}") - text = clear_tokens(text, True) +def calc_bigram_count(in_path, expected_path): + bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)} + with open(in_path) as infile, open(expected_path) as expected_file: + for line, exp in zip(infile, expected_file): + class_ = exp.rstrip('\n').replace(' ', '') + text, timestap = line.rstrip('\n').split('\t') + text = clear_post(text) tokens = text.lower().split(' ') - #print(f"tokens {type(tokens)}") - for token in tokens: - clear_tokens(token,False) + for index in range(len(tokens)-1): + # if there is next token we append current and next + bigram = tokens[index] + " " + tokens[index + 1] + #print(bigram) + #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;") if class_ == 'P': - word_counts['paranormal'][token] += 1 + bigram_counts['paranormal'][bigram] +=1 elif class_ == 'S': - word_counts['sceptic'][token]+=1 + bigram_counts['sceptic'][bigram] +=1 + return bigram_counts - return word_counts +def calc_bigram_logprobs(bigram_counts): + total_sceptic = sum(bigram_counts['sceptic'].values()) + len(bigram_counts['sceptic'].keys()) + total_paranormal = sum(bigram_counts['paranormal'].values()) + len(bigram_counts['paranormal'].keys()) + bigram_logprobs = {'paranormal' : {}, 'sceptic' : {}} + for class_ in bigram_counts.keys(): + for bigram, value in bigram_counts[class_].items(): + if class_ == "sceptic": + bigram_prob = (value + 1) / total_sceptic + elif class_ == "paranormal": + bigram_prob = (value + 1) / total_paranormal -def calc_word_logprobs(word_counts): - total_skeptic = sum(word_counts['sceptic'].values()) + len(word_counts['sceptic'].keys()) - total_paranormal = sum(word_counts['paranormal'].values())+ len(word_counts['paranormal'].keys()) - word_logprobs= {'paranormal': {}, 'sceptic': {}} - for class_ in word_counts.keys(): # sceptic paranormal - for token, value in word_counts[class_].items(): - if class_ == 'sceptic': - word_prob = (value +1)/ total_skeptic - elif class_ == 'paranormal': - word_prob = (value+1)/ total_paranormal + bigram_logprobs[class_][bigram] = math.log(bigram_prob) - #print (token) - word_logprobs[class_][token] = math.log(word_prob) - - return word_logprobs + return bigram_logprobs def main(): - expected = './train/expected.tsv' - #expected = './dev-0/expected.tsv' - in_f = './train/in.tsv' - #in_f = './dev-0/in.tsv' - print (f"expected {expected}") - print (f"in {in_f}") - paranormal_class_lgprob, skeptic_class_logprob = calc_class_logprob(expected) - wordcounts =calc_word_count(in_f,expected) - - word_logprobs = calc_word_logprobs(wordcounts) - with open('naive_base_model.pkl', 'wb') as f: - pickle.dump([paranormal_class_lgprob, skeptic_class_logprob, word_logprobs], f) - # w predict.py bierzemy ten wzor argmax P(w) iloczynP(w|c) - + if len(sys.argv) != 4: + print("syntax is ./train.py expected.tsv in.tsv model.pkl") + return + expected_file = str(sys.argv[1]) + in_file = str(sys.argv[2]) + model = str(sys.argv[3]) + paranormal_class_logprob, sceptic_class_logprob = calc_class_logprob(expected_file) + bigrams_count = calc_bigram_count(in_file, expected_file) + bigram_logprobs = calc_bigram_logprobs(bigrams_count) + with open(model, 'wb') as f: + pickle.dump([paranormal_class_logprob, sceptic_class_logprob, bigram_logprobs],f) main() diff --git a/train.pyc b/train.pyc deleted file mode 100644 index 9e816dc..0000000 Binary files a/train.pyc and /dev/null differ diff --git a/train_baseline.py b/train_baseline.py new file mode 100755 index 0000000..ea1c148 --- /dev/null +++ b/train_baseline.py @@ -0,0 +1,91 @@ +#!/usr/bin/python3 +from collections import defaultdict +import math +import pickle +import re + +# in expected.tsv +def calc_class_logprob(expected_path): + paranolal_classcount=0 + sceptic_classcount=0 + with open(expected_path) as f: + for line in f: + line = line.rstrip('\n').replace(' ','') + if 'P' in line: + paranolal_classcount +=1 + elif 'S' in line: + sceptic_classcount +=1 + + paranol_prob = paranolal_classcount / (paranolal_classcount + sceptic_classcount) + sceptic_prob = sceptic_classcount / (paranolal_classcount + sceptic_classcount) + + return math.log(paranol_prob), math.log(sceptic_prob) + +def clear_tokens(tokens, is_text=True): + tokens = tokens.replace('\\n', ' ') + return tokens + # delete links, special characters, kropki, and \n + tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens) + tokens = re.sub(r'(|\-|\_)([a-z]+(\-|\_))+[a-z]+(|\-|\_)', ' ', tokens) + tokens = re.sub(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@\’\>\″\±]+', ' ', tokens) + tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens) + tokens = re.sub(r'[0-9]+', ' ', tokens) + tokens = re.sub(r'œ|·', '', tokens) + if is_text: + tokens = re.sub(r' +', ' ', tokens) + else: + tokens = re.sub(r' +', '', tokens) + return tokens + +# ile razy slowo wystepuje w dokumentach w danej klasie +def calc_word_count(in_path, expected_path): + word_counts = {'paranormal':defaultdict(int), 'sceptic': defaultdict(int)} # dzienik zawierajacy slownik w ktorym s slowa i ile razy wystepuja + with open(in_path) as infile, open(expected_path) as expectedfile: + for line, exp in zip(infile, expectedfile): + class_ = exp.rstrip('\n').replace(' ','') + text, timestap =line.rstrip('\n').split('\t') + #print(f"text {type(text)}") + text = clear_tokens(text, True) + tokens = text.lower().split(' ') + #print(f"tokens {type(tokens)}") + for token in tokens: + clear_tokens(token,False) + if class_ == 'P': + word_counts['paranormal'][token] += 1 + elif class_ == 'S': + word_counts['sceptic'][token]+=1 + + return word_counts + +def calc_word_logprobs(word_counts): + total_skeptic = sum(word_counts['sceptic'].values()) + len(word_counts['sceptic'].keys()) + total_paranormal = sum(word_counts['paranormal'].values())+ len(word_counts['paranormal'].keys()) + word_logprobs= {'paranormal': {}, 'sceptic': {}} + for class_ in word_counts.keys(): # sceptic paranormal + for token, value in word_counts[class_].items(): + if class_ == 'sceptic': + word_prob = (value +1)/ total_skeptic + elif class_ == 'paranormal': + word_prob = (value+1)/ total_paranormal + + #print (token) + word_logprobs[class_][token] = math.log(word_prob) + + return word_logprobs + +def main(): + expected = './train/expected.tsv' + #expected = './dev-0/expected.tsv' + in_f = './train/in.tsv' + #in_f = './dev-0/in.tsv' + print (f"expected {expected}") + print (f"in {in_f}") + paranormal_class_lgprob, skeptic_class_logprob = calc_class_logprob(expected) + wordcounts =calc_word_count(in_f,expected) + + word_logprobs = calc_word_logprobs(wordcounts) + with open('naive_base_model.pkl', 'wb') as f: + pickle.dump([paranormal_class_lgprob, skeptic_class_logprob, word_logprobs], f) + # w predict.py bierzemy ten wzor argmax P(w) iloczynP(w|c) + +main()