paranormal-or-skeptic/predict_baseline.py
2020-03-29 13:39:47 +02:00

72 lines
2.5 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/python3
import pickle
import math
import re
def clear_tokens(tokens, is_text=True):
    """Replace escaped newline markers in ``tokens`` with single spaces.

    Only the literal two-character sequence "backslash followed by n" is
    replaced. Real newline characters, letter case, punctuation, digits and
    URLs are returned untouched.

    The regex-based clean-up that used to follow the ``return`` statement was
    unreachable and has been removed, so the function body now states exactly
    what runs.

    Args:
        tokens: Text to normalise (a whole post or a single token).
        is_text: Kept so existing two-argument calls keep working; it has no
            effect on the result.

    Returns:
        ``tokens`` with every escaped newline marker replaced by one space.
    """
    return tokens.replace('\\n', ' ')
def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_logprobs):
    """Classify one input line as ``'paranormal'`` or ``'sceptic'``.

    Only the text before the first tab of ``post`` is used; anything after it
    (the input files carry a timestamp column there) is ignored.

    If ``search_for_keywords`` matches the text, the post is labelled
    ``'paranormal'`` outright. Otherwise every class in ``word_logprobs`` gets
    a naive Bayes score computed in log space: the class log-probability plus
    the sum of the token log-probabilities. Log-probabilities are added, not
    multiplied, because adding logs corresponds to multiplying the underlying
    probabilities.

    NOTE(review): the parameter names indicate that all model values are
    log-probabilities -- confirm against the training script.

    Args:
        post: One raw line of the input file.
        paranormal_class_logprob: Log-probability of the 'paranormal' class.
        sceptic_class_logprob: Log-probability of the 'sceptic' class.
        word_logprobs: Mapping of class name to a ``{token: log-probability}``
            mapping.

    Returns:
        ``'paranormal'`` when a keyword matched, otherwise the key of
        ``word_logprobs`` with the highest score.

    Raises:
        ValueError: If ``word_logprobs`` is empty and no keyword matched.
    """
    # Keep the text column only; lines without a tab are tolerated.
    text = post.rstrip('\n').partition('\t')[0]
    text = clear_tokens(text, True)

    # The keyword override wins regardless of the model, so test it before
    # doing any scoring work.
    if search_for_keywords(text):
        return 'paranormal'

    tokens = [clear_tokens(token, False) for token in text.lower().split(' ')]

    # Score a token only when every class has an estimate for it. Skipping a
    # token for just one class would amount to giving it probability 1 there
    # and would skew the comparison toward the class that never saw it.
    tables = list(word_logprobs.values())
    scored_tokens = [
        token for token in tokens
        if all(token in table for table in tables)
    ]

    class_logprobs = {
        'sceptic': sceptic_class_logprob,
        'paranormal': paranormal_class_logprob,
    }
    # Keyed by class name (not by score) so tied classes cannot overwrite
    # each other.
    scores = {}
    for class_, table in word_logprobs.items():
        scores[class_] = class_logprobs.get(class_, 0.0) + sum(
            table[token] for token in scored_tokens
        )
    return max(scores, key=scores.get)
def search_for_keywords(text):
    """Return True if ``text`` mentions one of the paranormal keywords.

    The test is a case-insensitive substring match, so spellings such as
    "UFO" or "Atlantis" are detected as well as their lower-case forms.

    Args:
        text: Post text in any letter case.

    Returns:
        True when at least one keyword occurs in ``text``, otherwise False.
    """
    keywords = ('paranormal', 'ufo', 'aliens', 'conspiracy', 'atlantis')
    # The keywords are lower-case, so fold the text once before matching.
    lowered = text.lower()
    return any(keyword in lowered for keyword in keywords)
def main():
    """Label every post of the input file and write one label per line.

    Loads the model from ``naive_base_model.pkl`` (a sequence holding the
    paranormal class log-probability, the sceptic class log-probability and
    the per-class token log-probabilities, in that order). Each line of the
    input file is classified with ``calc_post_prob``, and `` S`` (sceptic) or
    `` P`` (paranormal) is written to the output file.

    Raises:
        ValueError: If the classifier returns a label other than 'sceptic' or
            'paranormal'. Skipping the line silently would leave the output
            file out of step with the input file.
    """
    # SECURITY: pickle.load can execute arbitrary code stored in the file;
    # only load model files produced by a trusted training run.
    with open('naive_base_model.pkl', 'rb') as f:
        pickle_list = pickle.load(f)
    paranormal_class_logprob = pickle_list[0]
    sceptic_class_logprob = pickle_list[1]
    word_logprobs = pickle_list[2]

    # Point both paths at "dev-0/" instead to run on the development set.
    in_file = "test-A/in.tsv"
    out_file = "test-A/out.tsv"
    print(f"in {in_file}")
    print(f"out {out_file}")

    output_lines = {'sceptic': " S\n", 'paranormal': ' P\n'}
    # Explicit encoding: the default depends on the platform locale.
    # NOTE(review): assumes the TSV files are UTF-8 -- confirm.
    with open(in_file, encoding='utf-8') as in_f, \
            open(out_file, 'w', encoding='utf-8') as out_f:
        for line in in_f:
            hyp = calc_post_prob(
                line, paranormal_class_logprob, sceptic_class_logprob, word_logprobs
            )
            if hyp not in output_lines:
                raise ValueError(f"unexpected class label: {hyp!r}")
            out_f.write(output_lines[hyp])
# Run the prediction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()