Update bigram with new regex

This commit is contained in:
s426135 2020-03-29 14:28:07 +02:00
parent 0b9f952661
commit 59132bf9c6
5 changed files with 44 additions and 44 deletions

Binary file not shown.

View File

@ -116,7 +116,7 @@
P P
S S
S S
P S
S S
P P
P P
@ -1850,7 +1850,7 @@
S S
S S
S S
P S
S S
S S
S S
@ -2511,7 +2511,7 @@
P P
P P
P P
P S
P P
S S
S S
@ -2788,7 +2788,7 @@
S S
S S
P P
P S
S S
S S
P P
@ -2924,7 +2924,7 @@
S S
P P
S S
P S
P P
S S
S S
@ -3521,7 +3521,7 @@
S S
S S
S S
P S
S S
S S
S S

1 S
116 P
117 S
118 S
119 P S
120 S
121 P
122 P
1850 S
1851 S
1852 S
1853 P S
1854 S
1855 S
1856 S
2511 P
2512 P
2513 P
2514 P S
2515 P
2516 S
2517 S
2788 S
2789 S
2790 P
2791 P S
2792 S
2793 S
2794 P
2924 S
2925 P
2926 S
2927 P S
2928 P
2929 S
2930 S
3521 S
3522 S
3523 S
3524 P S
3525 S
3526 S
3527 S

View File

@ -30,11 +30,11 @@ def calc_post_class(post, paranormal_class_logprob, sceptic_class_logprob, bigra
def clear_post(post): def clear_post(post):
post = post.replace('\\n', ' ') post = post.replace('\\n', ' ')
post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\-\?\+]+(\)|)', '', post) post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\-\?\+\%]+(\)|)', '', post)
post = re.sub(r'[\.\,]+', ' ', post) post = re.sub(r'[\.\,\/\~]+', ' ', post)
post = re.sub(r'(&lt|&gt)','',post) post = re.sub(r'(&lt|&gt|\@[a-zA-Z0-9]+)','',post)
post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%]+', '', post) post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|]+', '', post)
post = re.sub(r' \- ', ' ', post) post = re.sub(r'( \- |\-\-+)', ' ', post)
post = re.sub(r' +', ' ', post) post = re.sub(r' +', ' ', post)
post = post.rstrip(' ') post = post.rstrip(' ')
return post return post

View File

@ -235,7 +235,7 @@
S S
S S
S S
P S
S S
S S
S S
@ -343,7 +343,7 @@
P P
S S
S S
P S
S S
S S
S S
@ -357,7 +357,7 @@
P P
S S
S S
P S
P P
S S
S S
@ -425,7 +425,7 @@
S S
P P
S S
S P
S S
S S
S S
@ -441,7 +441,7 @@
P P
S S
S S
P S
S S
P P
P P
@ -456,7 +456,7 @@
S S
S S
S S
S P
S S
S S
S S
@ -694,7 +694,7 @@
S S
P P
S S
S P
S S
P P
S S
@ -844,7 +844,7 @@
S S
P P
S S
S P
S S
P P
S S
@ -998,7 +998,7 @@
S S
S S
S S
S P
S S
S S
S S
@ -1268,7 +1268,7 @@
P P
S S
S S
P S
S S
S S
P P
@ -1383,7 +1383,7 @@
S S
S S
S S
S P
S S
S S
S S
@ -1582,7 +1582,7 @@
S S
S S
S S
P S
S S
S S
S S
@ -1788,7 +1788,7 @@
S S
S S
P P
S P
S S
S S
S S
@ -1855,7 +1855,7 @@
S S
S S
S S
S P
S S
S S
S S
@ -1986,7 +1986,7 @@
S S
S S
S S
S P
S S
S S
S S
@ -2217,7 +2217,7 @@
S S
S S
S S
S P
S S
S S
S S
@ -2227,7 +2227,7 @@
S S
S S
S S
P S
S S
S S
S S
@ -3282,7 +3282,7 @@
S S
P P
S S
S P
S S
S S
S S
@ -3378,7 +3378,7 @@
S S
S S
P P
S P
S S
S S
P P
@ -3724,7 +3724,7 @@
S S
P P
S S
P S
S S
S S
S S
@ -3765,7 +3765,7 @@
P P
S S
S S
P S
S S
S S
S S
@ -4111,7 +4111,7 @@
P P
S S
P P
P S
S S
S S
S S
@ -4190,7 +4190,7 @@
S S
S S
S S
P S
P P
S S
P P
@ -4344,7 +4344,7 @@
S S
S S
S S
S P
P P
S S
S S
@ -4421,19 +4421,19 @@
S S
P P
P P
P
S S
S S
S S
S S
S S
P
S S
P P
S S
P
S S
S S
S S
P
S S
S S
S S
@ -4763,7 +4763,7 @@
S S
S S
S S
S P
S S
S S
S S

1 P
235 S
236 S
237 S
238 P S
239 S
240 S
241 S
343 P
344 S
345 S
346 P S
347 S
348 S
349 S
357 P
358 S
359 S
360 P S
361 P
362 S
363 S
425 S
426 P
427 S
428 S P
429 S
430 S
431 S
441 P
442 S
443 S
444 P S
445 S
446 P
447 P
456 S
457 S
458 S
459 S P
460 S
461 S
462 S
694 S
695 P
696 S
697 S P
698 S
699 P
700 S
844 S
845 P
846 S
847 S P
848 S
849 P
850 S
998 S
999 S
1000 S
1001 S P
1002 S
1003 S
1004 S
1268 P
1269 S
1270 S
1271 P S
1272 S
1273 S
1274 P
1383 S
1384 S
1385 S
1386 S P
1387 S
1388 S
1389 S
1582 S
1583 S
1584 S
1585 P S
1586 S
1587 S
1588 S
1788 S
1789 S
1790 P
1791 S P
1792 S
1793 S
1794 S
1855 S
1856 S
1857 S
1858 S P
1859 S
1860 S
1861 S
1986 S
1987 S
1988 S
1989 S P
1990 S
1991 S
1992 S
2217 S
2218 S
2219 S
2220 S P
2221 S
2222 S
2223 S
2227 S
2228 S
2229 S
2230 P S
2231 S
2232 S
2233 S
3282 S
3283 P
3284 S
3285 S P
3286 S
3287 S
3288 S
3378 S
3379 S
3380 P
3381 S P
3382 S
3383 S
3384 P
3724 S
3725 P
3726 S
3727 P S
3728 S
3729 S
3730 S
3765 P
3766 S
3767 S
3768 P S
3769 S
3770 S
3771 S
4111 P
4112 S
4113 P
4114 P S
4115 S
4116 S
4117 S
4190 S
4191 S
4192 S
4193 P S
4194 P
4195 S
4196 P
4344 S
4345 S
4346 S
4347 S P
4348 P
4349 S
4350 S
4421 S
4422 P
4423 P
P
4424 S
4425 S
4426 S
4427 S
4428 S
P
4429 S
4430 P
4431 S
4432 P
4433 S
4434 S
4435 S
4436 P
4437 S
4438 S
4439 S
4763 S
4764 S
4765 S
4766 S P
4767 S
4768 S
4769 S

View File

@ -25,11 +25,11 @@ def calc_class_logprob(expected_path):
def clear_post(post): def clear_post(post):
post = post.replace('\\n', ' ') post = post.replace('\\n', ' ')
# delete links # delete links
post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+]+(\)|)', '', post) post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', '', post)
post = re.sub(r'[\.\,\/]+', ' ', post) post = re.sub(r'[\.\,\/\~]+', ' ', post)
post = re.sub(r'(&lt|&gt)','',post) post = re.sub(r'(&lt|&gt|\@[a-zA-Z0-9]+)','',post)
post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%]+', '', post) post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|]+', '', post)
post = re.sub(r' \- ', ' ', post) post = re.sub(r'( \- |\-\-+)', ' ', post)
post = re.sub(r' +', ' ', post) post = re.sub(r' +', ' ', post)
post = post.rstrip(' ') post = post.rstrip(' ')
return post return post
@ -45,7 +45,7 @@ def calc_bigram_count(in_path, expected_path):
for index in range(len(tokens)-1): for index in range(len(tokens)-1):
# if there is next token we append current and next # if there is next token we append current and next
bigram = tokens[index] + " " + tokens[index + 1] bigram = tokens[index] + " " + tokens[index + 1]
#print(bigram) print(bigram)
#print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;") #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
if class_ == 'P': if class_ == 'P':
bigram_counts['paranormal'][bigram] +=1 bigram_counts['paranormal'][bigram] +=1