2020-03-22 10:15:36 +01:00
|
|
|
|
#!/usr/bin/python3
|
|
|
|
|
from collections import defaultdict
|
|
|
|
|
import math
|
|
|
|
|
import pickle
|
2020-03-22 11:59:07 +01:00
|
|
|
|
import re
|
2020-03-22 10:15:36 +01:00
|
|
|
|
|
|
|
|
|
# class log-priors are computed from the labels in expected.tsv
|
|
|
|
|
def calc_class_logprob(expected_path):
    """Compute the log prior probability of each class from a label file.

    Every line of ``expected_path`` has its trailing newline and all spaces
    removed.  A line that then contains ``P`` counts towards the paranormal
    class; otherwise a line that contains ``S`` counts towards the sceptic
    class.  Lines containing neither are ignored.

    Args:
        expected_path: Path of the label file, one label per line.

    Returns:
        A ``(paranormal_logprob, sceptic_logprob)`` tuple of natural
        logarithms of the class shares.  A class that never occurs gets
        ``-math.inf`` (the logarithm of a zero probability).

    Raises:
        ValueError: If the file contains no ``P`` or ``S`` label at all.
    """
    paranormal_count = 0
    sceptic_count = 0
    with open(expected_path) as f:
        for line in f:
            label = line.rstrip('\n').replace(' ', '')
            if 'P' in label:
                paranormal_count += 1
            elif 'S' in label:
                sceptic_count += 1

    total = paranormal_count + sceptic_count
    if total == 0:
        # Without this guard the division below fails with a bare
        # ZeroDivisionError that does not say which file was at fault.
        raise ValueError(f"no 'P' or 'S' labels found in {expected_path!r}")

    def _log_share(count):
        # math.log(0) raises ValueError, so an absent class maps to -inf.
        return math.log(count / total) if count else -math.inf

    return _log_share(paranormal_count), _log_share(sceptic_count)
|
|
|
|
|
|
2020-03-22 13:32:09 +01:00
|
|
|
|
def clear_tokens(tokens, is_text=True):
    """Strip links, punctuation, digits and similar noise from a string.

    The passes run in a fixed order:

    1. the two-character sequence backslash + ``n`` becomes a space;
    2. a parenthesised ``http``/``https`` link ending in ``.com``, ``.net``,
       ``.jpg`` or ``.html`` becomes a space;
    3. lower-case words joined by ``-`` or ``_`` become a space;
    4. runs of the listed punctuation characters (including a real newline)
       become a space;
    5. runs of two or more dots/hyphens become a space;
    6. runs of digits become a space;
    7. the characters ``œ`` and ``·`` are deleted;
    8. runs of spaces are collapsed to one space, or removed entirely.

    Args:
        tokens: The string to clean (a single ``str``, despite the name).
        is_text: When true, runs of spaces are collapsed to a single space;
            when false, all spaces are removed.

    Returns:
        The cleaned string.
    """
    # The last pass is the only one that depends on ``is_text``.
    space_replacement = ' ' if is_text else ''

    # Ordered (pattern, replacement) pairs; the order matters because later
    # patterns operate on the output of earlier ones.
    substitutions = (
        (r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)', " "),
        (r'(|\-|\_)([a-z]+(\-|\_))+[a-z]+(|\-|\_)', ' '),
        (r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@\’\>\″\±]+', ' '),
        (r'[\.\-][\.\-]+', ' '),
        (r'[0-9]+', ' '),
        (r'œ|·', ''),
        (r' +', space_replacement),
    )

    cleaned = tokens.replace('\\n', ' ')
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
|
|
|
|
|
|
|
|
|
|
# how many times each word occurs in the documents of a given class
|
|
|
|
|
def calc_word_count(in_path, expected_path):
    """Count how often each token occurs in the documents of each class.

    The two files are read in lockstep: line *i* of ``in_path`` is a
    document and line *i* of ``expected_path`` is its label.  Reading stops
    at the end of the shorter file.  Each document line must consist of
    exactly two tab-separated fields; only the first (the text) is used.

    The text is cleaned with ``clear_tokens``, lower-cased and split on
    single spaces.  Each resulting token is cleaned once more and counted
    under ``'paranormal'`` when the label is ``P`` or under ``'sceptic'``
    when the label is ``S``.  Documents with any other label are skipped.

    Args:
        in_path: Path of the tab-separated document file.
        expected_path: Path of the label file, one label per line.

    Returns:
        ``{'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}``
        mapping each token to its number of occurrences in that class.

    Raises:
        ValueError: If a document line does not have exactly two
            tab-separated fields.
    """
    # Per-class dictionary of token -> number of occurrences.
    word_counts = {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}
    counts_by_label = {
        'P': word_counts['paranormal'],
        'S': word_counts['sceptic'],
    }

    with open(in_path) as infile, open(expected_path) as expectedfile:
        for line, exp in zip(infile, expectedfile):
            label = exp.rstrip('\n').replace(' ', '')
            text, _timestamp = line.rstrip('\n').split('\t')

            counts = counts_by_label.get(label)
            if counts is None:
                continue  # unknown label: the document contributes nothing

            for token in clear_tokens(text, True).lower().split(' '):
                # str is immutable, so the cleaned value has to be
                # re-assigned; calling clear_tokens() and dropping the
                # result has no effect.
                token = clear_tokens(token, False)
                # Leading/trailing spaces make split(' ') yield '' and the
                # second cleaning pass can empty a token; '' is not a word.
                if token:
                    counts[token] += 1

    return word_counts
|
|
|
|
|
|
|
|
|
|
def calc_word_logprobs(word_counts):
    """Turn per-class token counts into add-one smoothed log-probabilities.

    For every class the probability of a token is
    ``(count + 1) / (sum of all counts in the class + number of distinct
    tokens in the class)``, and the natural logarithm of that value is
    stored.

    Args:
        word_counts: Mapping of class name to a mapping of token to count,
            e.g. ``{'paranormal': {...}, 'sceptic': {...}}``.  Any set of
            class names is accepted.

    Returns:
        A dict with the same class names as ``word_counts``, each mapping
        token to its log-probability within that class.
    """
    word_logprobs = {}
    for class_, counts in word_counts.items():
        # Smoothing denominator: every distinct token of the class gets one
        # extra pseudo-occurrence, so the probabilities of the class sum to 1.
        total = sum(counts.values()) + len(counts)
        word_logprobs[class_] = {
            token: math.log((value + 1) / total)
            for token, value in counts.items()
        }
    return word_logprobs
|
|
|
|
|
|
|
|
|
|
def main(expected='./train/expected.tsv', in_f='./train/in.tsv',
         model_path='naive_base_model.pkl'):
    """Train the classifier and pickle the resulting model.

    Computes the class log-priors and the per-class word log-probabilities
    from the given files and writes them to ``model_path``.

    Args:
        expected: Path of the label file (one label per line).
        in_f: Path of the tab-separated document file.
        model_path: Path of the pickle file to write.

    The pickle holds a list
    ``[paranormal_class_logprob, sceptic_class_logprob, word_logprobs]``;
    the order of the entries is the on-disk format and must not change.
    """
    print (f"expected {expected}")
    print (f"in {in_f}")

    paranormal_class_logprob, sceptic_class_logprob = calc_class_logprob(expected)
    word_counts = calc_word_count(in_f, expected)
    word_logprobs = calc_word_logprobs(word_counts)

    with open(model_path, 'wb') as f:
        pickle.dump(
            [paranormal_class_logprob, sceptic_class_logprob, word_logprobs],
            f,
        )
    # Original note: predict.py takes the formula
    # argmax P(w) * product of P(w|c).
|
|
|
|
|
|
|
|
|
|
# Train only when executed as a script; importing this module (for example
# to reuse its functions) must not read the training files or overwrite
# the pickled model.
if __name__ == '__main__':
    main()
|