#!/usr/bin/python3
"""Label posts as sceptic ('S') or paranormal ('P') with a pickled n-gram model.

Usage: ./predict.py in.tsv out.tsv model.pkl
"""
import functools
import math
import pickle
import re
import sys

import nltk
from nltk.corpus import stopwords

# Patterns used by clear_post(), compiled once instead of on every post.
_URL_RE = re.compile(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\-\?\+\%]+(\)|)')
_SEPARATOR_RE = re.compile(r'[\.\,\/\~]+')
_MARKUP_RE = re.compile(r'(<|>|\@[a-zA-Z0-9]+)')
_PUNCT_DIGIT_RE = re.compile(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\–\”\!\=\^]+')
_DASH_RE = re.compile(r'( \- |\-\-+)')
_SPACES_RE = re.compile(r' +')


@functools.lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop-word set, loading the corpus only once."""
    return frozenset(stopwords.words('english'))


def calc_post_class(post, paranormal_class_logprob, sceptic_class_logprob,
                    bigrams_logprobs, words_logprobs):
    """Pick a class label for one input line.

    Args:
        post: One raw input line. Only the text before the first tab is
            scored; anything after it (and the trailing newline) is ignored.
        paranormal_class_logprob: Value added to the 'paranormal' score.
        sceptic_class_logprob: Value added to the 'sceptic' score.
        bigrams_logprobs: Mapping class -> {"w1 w2": value}. Its keys define
            the set of candidate classes.
        words_logprobs: Mapping class -> {token: value}.

    Returns:
        The class whose accumulated score has the largest absolute value.
        When several classes tie, the one iterated last wins.

    Raises:
        ValueError: If ``bigrams_logprobs`` contains no classes.
    """
    # Take only the first field so a missing or extra tab no longer aborts
    # the whole run with an unpacking error.
    text = post.rstrip('\n').split('\t', 1)[0]
    tokens = clear_post(text)

    class_priors = {
        'sceptic': sceptic_class_logprob,
        'paranormal': paranormal_class_logprob,
    }

    best_class = None
    best_magnitude = None
    for class_, bigram_model in bigrams_logprobs.items():
        # A class absent from the unigram table contributes nothing.
        word_model = words_logprobs.get(class_, {})

        # Bigrams and words missing from the model are treated as neutral (0).
        score = 0
        for left, right in zip(tokens, tokens[1:]):
            score += bigram_model.get(left + " " + right, 0)
        for token in tokens:
            score += word_model.get(token, 0)
        score += class_priors.get(class_, 0)

        # NOTE(review): the winner is the score with the largest magnitude.
        # For genuine (negative) log-probabilities that is the least likely
        # class; kept as-is because it interacts with the "unseen = 0" rule
        # above and with the trained model - confirm against the trainer
        # before changing.
        magnitude = abs(score)
        if best_magnitude is None or magnitude >= best_magnitude:
            best_class = class_
            best_magnitude = magnitude

    if best_magnitude is None:
        raise ValueError("model contains no classes")
    return best_class


def clear_post(post):
    """Normalise a post's text and return its tokens without stop words.

    The steps are: turn literal backslash-n sequences into spaces, lowercase,
    replace URLs with the token ``internetlink``, turn ``. , / ~`` runs into
    spaces, drop ``<``, ``>`` and @mentions, drop digits and assorted
    punctuation, turn spaced/double dashes into spaces, collapse runs of
    spaces, strip trailing spaces, split on single spaces and remove NLTK
    English stop words.

    Leading spaces are not stripped, so the first token can be an empty
    string. That is left unchanged on purpose: the tokens are used as lookup
    keys into the model (presumably built with the same preprocessing -
    verify against the training script).

    Args:
        post: Text of a single post.

    Returns:
        List of token strings.
    """
    post = post.replace('\\n', ' ')
    post = post.lower()
    post = _URL_RE.sub(' internetlink ', post)
    post = _SEPARATOR_RE.sub(' ', post)
    post = _MARKUP_RE.sub('', post)
    post = _PUNCT_DIGIT_RE.sub('', post)
    post = _DASH_RE.sub(' ', post)
    post = _SPACES_RE.sub(' ', post)
    post = post.rstrip(' ')

    stop_words = _english_stop_words()
    return [word for word in post.split(' ') if word not in stop_words]


def main():
    """Read posts from a TSV file and write one label line per post.

    Expects three command-line arguments: input TSV path, output path and
    pickled model path. The pickle must hold a sequence of
    ``[paranormal_prior, sceptic_prior, bigram_table, word_table]``.
    """
    if len(sys.argv) != 4:
        print("syntax is ./predict.py in.tsv out.tsv model.pkl")
        return

    in_file, out_file, model = sys.argv[1:4]

    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only point this script at model files from a trusted source.
    with open(model, 'rb') as f:
        pickle_list = pickle.load(f)

    paranormal_class_logprob = pickle_list[0]
    sceptic_class_logprob = pickle_list[1]
    bigrams_logprobs = pickle_list[2]
    words_logprobs = pickle_list[3]

    with open(in_file) as in_f, open(out_file, 'w') as out_f:
        for line in in_f:
            hyp = calc_post_class(line, paranormal_class_logprob,
                                  sceptic_class_logprob, bigrams_logprobs,
                                  words_logprobs)
            # Any other label produces no output line for this post.
            if hyp == 'sceptic':
                out_f.write(' S\n')
            elif hyp == 'paranormal':
                out_f.write(' P\n')


if __name__ == "__main__":
    main()