#!/usr/bin/python3
"""Label posts as sceptic or paranormal using a pickled naive Bayes model.

Each input line holds a post text followed by a tab-separated timestamp.
The text is cleaned, tokenised and scored against the per-class token
log-probabilities from the model file; one label line is written per post.
"""
import pickle
import math
import re

# Cleaning patterns, compiled once and applied in this order by clear_tokens().
_URL_RE = re.compile(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)')
_PUNCT_RE = re.compile(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@]+')
_DOT_DASH_RE = re.compile(r'[\.\-][\.\-]+')
_DIGITS_RE = re.compile(r'[0-9]+')
_SPACES_RE = re.compile(r' +')


def clear_tokens(tokens):
    """Return the post text *tokens* with noise replaced by single spaces.

    Applied in order: literal backslash-n sequences, parenthesised http(s)
    links ending in .com/.net/.jpg/.html, punctuation runs, runs of two or
    more dots/dashes and digit runs each become one space; finally runs of
    spaces are collapsed. Letter case is kept and leading/trailing spaces
    are not stripped.
    """
    # The data contains the two characters backslash + 'n', not real newlines.
    tokens = tokens.replace('\\n', ' ')
    for pattern in (_URL_RE, _PUNCT_RE, _DOT_DASH_RE, _DIGITS_RE):
        tokens = pattern.sub(' ', tokens)
    return _SPACES_RE.sub(' ', tokens)


def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_logprobs):
    """Return the most likely class label for one input line.

    Args:
        post: One line of the input file. The text is the first
            tab-separated field; anything after it (the timestamp) is
            ignored, and a line without a tab is treated as text only.
        paranormal_class_logprob: Log prior added to the 'paranormal' score.
        sceptic_class_logprob: Log prior added to the 'sceptic' score.
        word_logprobs: Mapping of class label to a mapping of token to
            log-probability.

    Returns:
        The class label from ``word_logprobs`` with the highest score, or
        'paranormal' when ``word_logprobs`` is empty.

    Tokens that are missing from a class's table are skipped for that class.
    """
    text = post.rstrip('\n').split('\t', 1)[0]
    # Same tokenisation as before (split on single spaces, empty strings kept)
    # so lookups keep matching the keys stored in the model.
    tokens = clear_tokens(text).lower().split(' ')

    class_logprobs = {
        'sceptic': sceptic_class_logprob,
        'paranormal': paranormal_class_logprob,
    }

    # Scores are sums of log-probabilities (assuming the model stores logs, as
    # its names say), so the largest score marks the most likely class. The
    # previous code compared absolute values, which selects the least likely.
    best_class = 'paranormal'
    best_score = None
    for class_, token_logprobs in word_logprobs.items():
        score = class_logprobs.get(class_, 0.0)
        # For each token of the post, add its log-probability under this class.
        score += sum(token_logprobs[token] for token in tokens if token in token_logprobs)
        # ">=" lets a later class win an exact tie, as it did before.
        if best_score is None or score >= best_score:
            best_class, best_score = class_, score
    return best_class


def main(model_path='naive_base_model.pkl', in_file="test-A/in.tsv", out_file="test-A/out.tsv"):
    """Classify every line of *in_file* and write the labels to *out_file*.

    Args:
        model_path: Pickle file holding a sequence whose first three items
            are the paranormal log prior, the sceptic log prior and the
            per-class token log-probability mapping.
        in_file: Tab-separated input, one post per line. Pass
            "dev-0/in.tsv" to score the development set instead.
        out_file: Destination for the labels (" S" or " P" per line). Pass
            "dev-0/out.tsv" for the development set.
    """
    # NOTE: pickle.load can execute arbitrary code from the file; only load
    # model files that come from a trusted source.
    with open(model_path, 'rb') as f:
        pickle_list = pickle.load(f)
    paranormal_class_logprob = pickle_list[0]
    sceptic_class_logprob = pickle_list[1]
    word_logprobs = pickle_list[2]

    output_lines = {'sceptic': " S\n", 'paranormal': ' P\n'}
    with open(in_file) as in_f, open(out_file, 'w') as out_f:
        for line in in_f:
            hyp = calc_post_prob(line, paranormal_class_logprob, sceptic_class_logprob, word_logprobs)
            # As before, a label other than the two known ones writes nothing.
            label_line = output_lines.get(hyp)
            if label_line is not None:
                out_f.write(label_line)


if __name__ == "__main__":
    main()