#!/usr/bin/python3
"""Label posts as sceptic ('S') or paranormal ('P') using a pickled n-gram model.

Usage: ./predict.py in.tsv out.tsv model.pkl

Each input line is expected to hold the post text, optionally followed by a
tab and further fields. One label line is written per classified post.
"""
import functools
import math
import pickle
import re
import sys

import nltk
from nltk.corpus import stopwords

# Hand-tuned weights: every known bigram log-probability is multiplied by
# _BIGRAM_WEIGHT and every known word log-probability is divided by
# _WORD_DIVISOR before being added to a class score.
_BIGRAM_WEIGHT = 4
_WORD_DIVISOR = 7

# Text written to the output file for each predicted class.
_OUTPUT_LABELS = {'sceptic': ' S\n', 'paranormal': ' P\n'}

# Normalisation patterns, applied in exactly this order by clear_post().
# NOTE(review): a model is presumably trained on text normalised the same
# way, so the patterns are kept verbatim -- confirm before editing any.
_LINK_RE = re.compile(
    r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\-\?\+\%]+(\)|)')
_SEPARATOR_RE = re.compile(r'[\.\,\/\~]+')
_MARKUP_RE = re.compile(r'(<|>|\@[a-zA-Z0-9]+)')
_NOISE_RE = re.compile(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\–\”\!\=\^]+')
_DASH_RE = re.compile(r'( \- |\-\-+)')
_SPACES_RE = re.compile(r' +')


@functools.lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, loaded once."""
    return frozenset(stopwords.words('english'))


def calc_post_class(post, paranormal_class_logprob, sceptic_class_logprob,
                    bigrams_logprobs, words_logprobs,
                    total_sceptic_bigram, total_paranormal_bigram,
                    total_sceptic_word, total_paranormal_word):
    """Return the class name predicted for one raw input line.

    Args:
        post: One input line. The text to classify is everything before the
            first tab; anything after it (e.g. a timestamp) is ignored.
        paranormal_class_logprob: Value added to the 'paranormal' score.
        sceptic_class_logprob: Value added to the 'sceptic' score.
        bigrams_logprobs: Mapping class name -> {"w1 w2": log-probability}.
            Its keys define the set of candidate classes.
        words_logprobs: Mapping class name -> {token: log-probability}.
        total_sceptic_bigram, total_paranormal_bigram, total_sceptic_word,
        total_paranormal_word: Unused; kept so existing callers keep working.

    Returns:
        The class whose accumulated score has the largest absolute value.
        When several classes tie, the one iterated last wins.

    Raises:
        ValueError: If ``bigrams_logprobs`` contains no classes.
    """
    # partition() tolerates lines with no tab or with extra tab-separated
    # fields, where a two-name unpack of split('\t') would raise.
    text, _, _ = post.rstrip('\n').partition('\t')
    tokens = clear_post(text)
    bigrams = [left + " " + right for left, right in zip(tokens, tokens[1:])]
    class_priors = {
        'sceptic': sceptic_class_logprob,
        'paranormal': paranormal_class_logprob,
    }

    best_class = None
    best_magnitude = None
    for class_ in bigrams_logprobs:
        score = 0

        # N-grams missing from the model are treated as neutral: they add
        # nothing to the score instead of a smoothed penalty.
        class_bigrams = bigrams_logprobs[class_]
        for bigram in bigrams:
            logprob = class_bigrams.get(bigram)
            if logprob is not None:
                score += logprob * _BIGRAM_WEIGHT

        # A class absent from words_logprobs simply contributes no word terms.
        class_words = words_logprobs.get(class_, {})
        for token in tokens:
            logprob = class_words.get(token)
            if logprob is not None:
                score += logprob / _WORD_DIVISOR

        if class_ in class_priors:
            score += class_priors[class_]

        # Selection is by magnitude, not by signed value.
        # NOTE(review): if the stored values are log-probabilities (<= 0),
        # this picks the most negative total, i.e. effectively the class with
        # the most matched n-grams -- confirm this is the intended rule.
        magnitude = abs(score)
        # '>=' so that on a tie the later class wins, as it always has.
        if best_magnitude is None or magnitude >= best_magnitude:
            best_magnitude = magnitude
            best_class = class_

    if best_magnitude is None:
        raise ValueError("model contains no classes to choose from")
    return best_class


def clear_post(post):
    """Normalise raw post text and return its non-stop-word tokens.

    Steps, in order: replace literal backslash-n sequences with spaces,
    lowercase, replace URLs with the token ``internetlink``, turn
    ``. , / ~`` runs into spaces, drop ``<``, ``>`` and ``@mentions``, drop
    digits and assorted punctuation, turn spaced or repeated dashes into
    spaces, squeeze runs of spaces, strip trailing spaces, split on single
    spaces and drop NLTK English stop words.

    Only trailing spaces are stripped, so text that is empty or starts with a
    space yields an empty-string token; this is left as is on purpose.

    Args:
        post: Raw text of a single post.

    Returns:
        A list of token strings.
    """
    cleaned = post.replace('\\n', ' ').lower()
    cleaned = _LINK_RE.sub(' internetlink ', cleaned)
    cleaned = _SEPARATOR_RE.sub(' ', cleaned)
    cleaned = _MARKUP_RE.sub('', cleaned)
    cleaned = _NOISE_RE.sub('', cleaned)
    cleaned = _DASH_RE.sub(' ', cleaned)
    cleaned = _SPACES_RE.sub(' ', cleaned)
    words = cleaned.rstrip(' ').split(' ')

    stop_words = _english_stop_words()
    return [word for word in words if word not in stop_words]


def main():
    """Classify every line of in.tsv and write one label line per post.

    Reads three command-line arguments (input TSV, output file, model
    pickle). With any other argument count it prints a usage line and returns.
    """
    if len(sys.argv) != 4:
        print("syntax is ./predict.py in.tsv out.tsv model.pkl")
        return
    in_file = sys.argv[1]
    out_file = sys.argv[2]
    model = sys.argv[3]

    # SECURITY: pickle.load can execute arbitrary code while unpickling.
    # Only pass model files that come from a trusted source.
    with open(model, 'rb') as f:
        pickle_list = pickle.load(f)

    paranormal_class_logprob = pickle_list[0]
    sceptic_class_logprob = pickle_list[1]
    bigrams_logprobs = pickle_list[2]
    words_logprobs = pickle_list[3]
    total_sceptic_bigram = pickle_list[4]
    total_paranormal_bigram = pickle_list[5]
    total_sceptic_word = pickle_list[6]
    total_paranormal_word = pickle_list[7]

    # Diagnostic output: log(1/N) for each of the four totals in the model.
    print(math.log(1/total_sceptic_bigram))
    print(math.log(1/total_paranormal_bigram))
    print(math.log(1/total_sceptic_word))
    print(math.log(1/total_paranormal_word))

    with open(in_file) as in_f, open(out_file, 'w') as out_f:
        for line in in_f:
            hyp = calc_post_class(
                line,
                paranormal_class_logprob,
                sceptic_class_logprob,
                bigrams_logprobs,
                words_logprobs,
                total_sceptic_bigram,
                total_paranormal_bigram,
                total_sceptic_word,
                total_paranormal_word,
            )
            # A prediction outside the two known classes writes nothing.
            label = _OUTPUT_LABELS.get(hyp)
            if label is not None:
                out_f.write(label)


if __name__ == "__main__":
    main()