paranormal-or-skeptic/train.py

92 lines
3.5 KiB
Python
Raw Normal View History

2020-03-22 10:15:36 +01:00
#!/usr/bin/python3
from collections import defaultdict
import math
import pickle
2020-03-22 11:59:07 +01:00
import re
2020-03-22 10:15:36 +01:00
def calc_class_logprob(expected_path):
    """Return the log prior of each class, estimated from the label file.

    Every line of ``expected_path`` (e.g. expected.tsv) is stripped of its
    trailing newline and of spaces. A line containing ``'P'`` counts as
    paranormal; otherwise a line containing ``'S'`` counts as sceptic; any
    other line is ignored.

    Returns:
        A ``(paranormal_logprob, sceptic_logprob)`` tuple of natural logs of
        the class frequencies.

    Raises:
        ZeroDivisionError: if no line matched either class.
        ValueError: if exactly one class never occurs (log of zero).
    """
    tally = {'P': 0, 'S': 0}
    with open(expected_path) as labels:
        for raw in labels:
            label = raw.rstrip('\n').replace(' ', '')
            # 'P' takes precedence when both letters appear on one line.
            if 'P' in label:
                tally['P'] += 1
            elif 'S' in label:
                tally['S'] += 1
    total = tally['P'] + tally['S']
    return math.log(tally['P'] / total), math.log(tally['S'] / total)
2020-03-22 13:32:09 +01:00
def clear_tokens(tokens, is_text=True, *, full_clean=False):
    """Normalise a raw text fragment before it is split into tokens.

    By default only the literal two-character sequence "backslash followed
    by ``n``" is replaced with a space. This is exactly what the function
    did before: an unconditional early ``return`` made the regex cleanup
    below it unreachable, so ``is_text`` had no effect. That cleanup is
    now kept behind the opt-in ``full_clean`` flag instead of being dead code.

    Args:
        tokens: The string to clean.
        is_text: Only used when ``full_clean`` is true. If true, runs of
            spaces are collapsed to a single space; if false, all spaces
            are removed.
        full_clean: When true, additionally strip links, hyphen/underscore
            joined words, punctuation, digits and a few stray symbols.

    Returns:
        The cleaned string.
    """
    tokens = tokens.replace('\\n', ' ')
    if not full_clean:
        return tokens

    # Parenthesised http/https links ending in .com/.net/.jpg/.html.
    tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens)
    # Lower-case words joined by hyphens or underscores.
    tokens = re.sub(r'(|\-|\_)([a-z]+(\-|\_))+[a-z]+(|\-|\_)', ' ', tokens)
    # Runs of punctuation / special characters (including real newlines).
    tokens = re.sub(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@\\\\±]+', ' ', tokens)
    # Runs of two or more dots or hyphens.
    tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
    # Digit sequences.
    tokens = re.sub(r'[0-9]+', ' ', tokens)
    # Stray symbols are dropped outright rather than replaced by a space.
    tokens = re.sub(r'œ|·', '', tokens)

    if is_text:
        tokens = re.sub(r' +', ' ', tokens)
    else:
        tokens = re.sub(r' +', '', tokens)
    return tokens
def calc_word_count(in_path, expected_path):
    """Count how many times each word occurs in the documents of each class.

    ``in_path`` and ``expected_path`` are read in lockstep. Each input line
    must consist of exactly two tab-separated fields (text, timestamp); the
    matching label line selects the class (``'P'`` or ``'S'`` after removing
    spaces). Lines with any other label contribute nothing.

    The text is cleaned with ``clear_tokens``, lower-cased and split on single
    spaces, so consecutive spaces yield empty-string tokens that are counted
    like any other token.

    Returns:
        A dict with keys ``'paranormal'`` and ``'sceptic'``, each mapping to a
        ``defaultdict(int)`` of token -> occurrence count.

    Raises:
        ValueError: if an input line does not have exactly two tab-separated
            fields.
    """
    # Outer dict maps a class name to a dict of word -> number of occurrences.
    word_counts = {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}
    class_keys = {'P': 'paranormal', 'S': 'sceptic'}
    with open(in_path) as infile, open(expected_path) as expectedfile:
        for line, exp in zip(infile, expectedfile):
            class_ = exp.rstrip('\n').replace(' ', '')
            # The timestamp column is not used for training.
            text, _timestamp = line.rstrip('\n').split('\t')
            text = clear_tokens(text, True)
            counts = word_counts.get(class_keys.get(class_))
            if counts is None:
                continue
            for token in text.lower().split(' '):
                # Strings are immutable: the cleaned token must be rebound,
                # otherwise the per-token cleanup is silently discarded.
                token = clear_tokens(token, False)
                counts[token] += 1
    return word_counts
def calc_word_logprobs(word_counts):
    """Convert per-class word counts into add-one smoothed log-probabilities.

    For each class the denominator is the sum of that class's counts plus the
    number of distinct tokens seen in that class; each token's probability is
    ``(count + 1) / denominator``.

    Args:
        word_counts: Dict with keys ``'paranormal'`` and ``'sceptic'``, each
            mapping token -> count.

    Returns:
        A dict with the same two keys, each mapping token -> natural log of
        its smoothed probability.
    """
    denominators = {
        'sceptic': sum(word_counts['sceptic'].values()) + len(word_counts['sceptic']),
        'paranormal': sum(word_counts['paranormal'].values()) + len(word_counts['paranormal']),
    }
    word_logprobs = {'paranormal': {}, 'sceptic': {}}
    for label, counts in word_counts.items():
        for token, count in counts.items():
            smoothed = (count + 1) / denominators[label]
            word_logprobs[label][token] = math.log(smoothed)
    return word_logprobs
def main():
    """Train on the ./train split and pickle the model to naive_base_model.pkl.

    The pickle holds a three-element list:
    ``[paranormal_class_logprob, sceptic_class_logprob, word_logprobs]``.
    """
    # Point both paths at './dev-0/...' to build the model from the dev split.
    expected = './train/expected.tsv'
    in_f = './train/in.tsv'
    print (f"expected {expected}")
    print (f"in {in_f}")

    class_logprobs = calc_class_logprob(expected)
    paranormal_prior, sceptic_prior = class_logprobs
    counts_by_class = calc_word_count(in_f, expected)
    word_logprobs = calc_word_logprobs(counts_by_class)

    model = [paranormal_prior, sceptic_prior, word_logprobs]
    with open('naive_base_model.pkl', 'wb') as out:
        pickle.dump(model, out)
# Per the original note: predict.py uses the formula argmax P(w) * product of P(w|c)
# (presumably P(c), the class prior, was meant -- confirm against predict.py).
# NOTE(review): training runs on import as well as on direct execution;
# consider an `if __name__ == "__main__":` guard if this module is ever imported.
main()