Added some regex cleanup and fixed logprobs

This commit is contained in:
s426135 2020-03-22 11:59:07 +01:00
parent eb6ba923a4
commit fa155b7a7e
9 changed files with 11693 additions and 1131 deletions

BIN
.predict.py.swp Normal file

Binary file not shown.

BIN
.train.py.swp Normal file

Binary file not shown.

5272
dev-0/in.tsv Normal file

File diff suppressed because one or more lines are too long

Binary file not shown.

5272
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -2,10 +2,15 @@
import pickle import pickle
import math import math
import re
def clear_tokens(tokens): def clear_tokens(tokens):
tokens = tokens.replace('\n', ' ') tokens = tokens.replace('\\n', ' ')
tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens)
tokens = re.sub(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@]+', ' ', tokens)
tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
tokens = re.sub(r'[0-9]+', ' ', tokens)
tokens = re.sub(r' +', ' ', tokens)
return tokens return tokens
def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_logprobs): def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_logprobs):
@ -18,16 +23,16 @@ def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_l
product = 1 product = 1
for token in tokens: for token in tokens:
try: try:
product *= word_logprobs[class_][token] product += word_logprobs[class_][token]
except KeyError: except KeyError:
pass pass
# use the formula here # use the formula here
if class_ == 'sceptic': if class_ == 'sceptic':
product *= sceptic_class_logprob product += sceptic_class_logprob
elif class_ == 'paranormal': elif class_ == 'paranormal':
product *= paranormal_class_logprob product += paranormal_class_logprob
probs[abs(product)] = class_ probs[abs(product)] = class_
print(probs) #print(probs)
return probs[max(probs.keys())] return probs[max(probs.keys())]
@ -38,7 +43,11 @@ def main():
paranormal_class_logprob = pickle_list[0] paranormal_class_logprob = pickle_list[0]
sceptic_class_logprob = pickle_list[1] sceptic_class_logprob = pickle_list[1]
word_logprobs = pickle_list[2] word_logprobs = pickle_list[2]
with open('test-A/in.tsv') as in_f, open('test-A/out.tsv', 'w') as out_f: in_file = "test-A/in.tsv"
#in_file = "dev-0/in.tsv"
out_file = "test-A/out.tsv"
#out_file = "dev-0/out.tsv"
with open(in_file) as in_f, open(out_file, 'w') as out_f:
for line in in_f: for line in in_f:
hyp = calc_post_prob(line, paranormal_class_logprob, sceptic_class_logprob, word_logprobs) hyp = calc_post_prob(line, paranormal_class_logprob, sceptic_class_logprob, word_logprobs)
if hyp == 'sceptic': if hyp == 'sceptic':

File diff suppressed because it is too large Load Diff

View File

@ -2,6 +2,7 @@
from collections import defaultdict from collections import defaultdict
import math import math
import pickle import pickle
import re
# in expected.tsv # in expected.tsv
def calc_class_logprob(expected_path): def calc_class_logprob(expected_path):
@ -20,9 +21,13 @@ def calc_class_logprob(expected_path):
return math.log(paranol_prob), math.log(sceptic_prob) return math.log(paranol_prob), math.log(sceptic_prob)
def clear_tokens(tokens): def clear_tokens(tokens):
tokens = tokens.replace('\n', ' ') tokens = tokens.replace('\\n', ' ')
# delete links, special characters, dots, and \n # delete links, special characters, dots, and \n
tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens)
tokens = re.sub(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@]+', ' ', tokens)
tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
tokens = re.sub(r'[0-9]+', ' ', tokens)
tokens = re.sub(r' +', ' ', tokens)
return tokens return tokens
# how many times a word occurs in the documents of a given class # how many times a word occurs in the documents of a given class
@ -55,14 +60,18 @@ def calc_word_logprobs(word_counts):
elif class_ == 'paranormal': elif class_ == 'paranormal':
word_prob = (value+1)/ total_paranormal word_prob = (value+1)/ total_paranormal
print (token) #print (token)
word_logprobs[class_][token] = math.log(word_prob) word_logprobs[class_][token] = math.log(word_prob)
return word_logprobs return word_logprobs
def main(): def main():
paranormal_class_lgprob, skeptic_class_logprob = calc_class_logprob('./train/expected.tsv') expected = './train/expected.tsv'
wordcounts =calc_word_count('./train/in.tsv','./train/expected.tsv') #expected = './dev-0/expected.tsv'
in_f = './train/in.tsv'
#in_f = './dev-0/in.tsv'
paranormal_class_lgprob, skeptic_class_logprob = calc_class_logprob(expected)
wordcounts =calc_word_count(in_f,expected)
word_logprobs = calc_word_logprobs(wordcounts) word_logprobs = calc_word_logprobs(wordcounts)
with open('naive_base_model.pkl', 'wb') as f: with open('naive_base_model.pkl', 'wb') as f: