Added some regexes and fixed logprobs
This commit is contained in:
parent
eb6ba923a4
commit
fa155b7a7e
BIN
.predict.py.swp
Normal file
BIN
.predict.py.swp
Normal file
Binary file not shown.
BIN
.train.py.swp
Normal file
BIN
.train.py.swp
Normal file
Binary file not shown.
5272
dev-0/in.tsv
Normal file
5272
dev-0/in.tsv
Normal file
File diff suppressed because one or more lines are too long
BIN
dev-0/in.tsv.xz
BIN
dev-0/in.tsv.xz
Binary file not shown.
5272
dev-0/out.tsv
Normal file
5272
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Binary file not shown.
23
predict.py
23
predict.py
@ -2,10 +2,15 @@
|
|||||||
|
|
||||||
import pickle
|
import pickle
|
||||||
import math
|
import math
|
||||||
|
import re
|
||||||
|
|
||||||
def clear_tokens(tokens):
|
def clear_tokens(tokens):
|
||||||
tokens = tokens.replace('\n', ' ')
|
tokens = tokens.replace('\\n', ' ')
|
||||||
|
tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens)
|
||||||
|
tokens = re.sub(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@]+', ' ', tokens)
|
||||||
|
tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
|
||||||
|
tokens = re.sub(r'[0-9]+', ' ', tokens)
|
||||||
|
tokens = re.sub(r' +', ' ', tokens)
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_logprobs):
|
def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_logprobs):
|
||||||
@ -18,16 +23,16 @@ def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_l
|
|||||||
product = 1
|
product = 1
|
||||||
for token in tokens:
|
for token in tokens:
|
||||||
try:
|
try:
|
||||||
product *= word_logprobs[class_][token]
|
product += word_logprobs[class_][token]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
# use the formula here
|
# use the formula here
|
||||||
if class_ == 'sceptic':
|
if class_ == 'sceptic':
|
||||||
product *= sceptic_class_logprob
|
product += sceptic_class_logprob
|
||||||
elif class_ == 'paranormal':
|
elif class_ == 'paranormal':
|
||||||
product *= paranormal_class_logprob
|
product += paranormal_class_logprob
|
||||||
probs[abs(product)] = class_
|
probs[abs(product)] = class_
|
||||||
print(probs)
|
#print(probs)
|
||||||
|
|
||||||
return probs[max(probs.keys())]
|
return probs[max(probs.keys())]
|
||||||
|
|
||||||
@ -38,7 +43,11 @@ def main():
|
|||||||
paranormal_class_logprob = pickle_list[0]
|
paranormal_class_logprob = pickle_list[0]
|
||||||
sceptic_class_logprob = pickle_list[1]
|
sceptic_class_logprob = pickle_list[1]
|
||||||
word_logprobs = pickle_list[2]
|
word_logprobs = pickle_list[2]
|
||||||
with open('test-A/in.tsv') as in_f, open('test-A/out.tsv', 'w') as out_f:
|
in_file = "test-A/in.tsv"
|
||||||
|
#in_file = "dev-0/in.tsv"
|
||||||
|
out_file = "test-A/out.tsv"
|
||||||
|
#out_file = "dev-0/out.tsv"
|
||||||
|
with open(in_file) as in_f, open(out_file, 'w') as out_f:
|
||||||
for line in in_f:
|
for line in in_f:
|
||||||
hyp = calc_post_prob(line, paranormal_class_logprob, sceptic_class_logprob, word_logprobs)
|
hyp = calc_post_prob(line, paranormal_class_logprob, sceptic_class_logprob, word_logprobs)
|
||||||
if hyp == 'sceptic':
|
if hyp == 'sceptic':
|
||||||
|
2238
test-A/out.tsv
2238
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
19
train.py
19
train.py
@ -2,6 +2,7 @@
|
|||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
import math
|
import math
|
||||||
import pickle
|
import pickle
|
||||||
|
import re
|
||||||
|
|
||||||
# in expected.tsv
|
# in expected.tsv
|
||||||
def calc_class_logprob(expected_path):
|
def calc_class_logprob(expected_path):
|
||||||
@ -20,9 +21,13 @@ def calc_class_logprob(expected_path):
|
|||||||
return math.log(paranol_prob), math.log(sceptic_prob)
|
return math.log(paranol_prob), math.log(sceptic_prob)
|
||||||
|
|
||||||
def clear_tokens(tokens):
|
def clear_tokens(tokens):
|
||||||
tokens = tokens.replace('\n', ' ')
|
tokens = tokens.replace('\\n', ' ')
|
||||||
# delete links, special characters, dots, and \n
|
# delete links, special characters, dots, and \n
|
||||||
|
tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens)
|
||||||
|
tokens = re.sub(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@]+', ' ', tokens)
|
||||||
|
tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
|
||||||
|
tokens = re.sub(r'[0-9]+', ' ', tokens)
|
||||||
|
tokens = re.sub(r' +', ' ', tokens)
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
# how many times a word occurs in the documents of a given class
|
# how many times a word occurs in the documents of a given class
|
||||||
@ -55,14 +60,18 @@ def calc_word_logprobs(word_counts):
|
|||||||
elif class_ == 'paranormal':
|
elif class_ == 'paranormal':
|
||||||
word_prob = (value+1)/ total_paranormal
|
word_prob = (value+1)/ total_paranormal
|
||||||
|
|
||||||
print (token)
|
#print (token)
|
||||||
word_logprobs[class_][token] = math.log(word_prob)
|
word_logprobs[class_][token] = math.log(word_prob)
|
||||||
|
|
||||||
return word_logprobs
|
return word_logprobs
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
paranormal_class_lgprob, skeptic_class_logprob = calc_class_logprob('./train/expected.tsv')
|
expected = './train/expected.tsv'
|
||||||
wordcounts =calc_word_count('./train/in.tsv','./train/expected.tsv')
|
#expected = './dev-0/expected.tsv'
|
||||||
|
in_f = './train/in.tsv'
|
||||||
|
#in_f = './dev-0/in.tsv'
|
||||||
|
paranormal_class_lgprob, skeptic_class_logprob = calc_class_logprob(expected)
|
||||||
|
wordcounts =calc_word_count(in_f,expected)
|
||||||
|
|
||||||
word_logprobs = calc_word_logprobs(wordcounts)
|
word_logprobs = calc_word_logprobs(wordcounts)
|
||||||
with open('naive_base_model.pkl', 'wb') as f:
|
with open('naive_base_model.pkl', 'wb') as f:
|
||||||
|
Loading…
Reference in New Issue
Block a user