paranormal-or-skeptic/predict_bigram.py

#!/usr/bin/python3

import pickle
import math
import re
import sys
import nltk
from nltk.corpus import stopwords

def _logprob_or_zero(table, class_, key):
    """Return ``table[class_][key]``, or 0 when a lookup raises KeyError.

    Plain subscripting (not ``dict.get``) is used on purpose so the lookup
    behaves the same for whatever mapping type the model was pickled with.
    """
    try:
        return table[class_][key]
    except KeyError:
        return 0


def calc_post_class(post, paranormal_class_logprob, sceptic_class_logprob, bigrams_logprobs, words_logprobs, total_sceptic_bigram, total_paranormal_bigram, total_sceptic_word, total_paranormal_word):
    """Choose a class label for one input line from bigram and word scores.

    Args:
        post: One raw input line. Only the first tab-separated column is
            scored; any further columns (e.g. a timestamp) are ignored.
        paranormal_class_logprob: Value added to the 'paranormal' score.
        sceptic_class_logprob: Value added to the 'sceptic' score.
        bigrams_logprobs: Mapping ``class -> {"tok1 tok2": value}``. Its keys
            define the set of candidate classes.
        words_logprobs: Mapping ``class -> {token: value}``.
        total_sceptic_bigram, total_paranormal_bigram, total_sceptic_word,
        total_paranormal_word: Currently unused; kept so existing callers
            keep working.

    Returns:
        The class key whose accumulated score has the largest absolute value.
        When several classes tie, the one iterated last wins.

    Raises:
        ValueError: If ``bigrams_logprobs`` contains no classes.
    """
    # Take the text column without insisting on exactly two columns, so a
    # blank or oddly shaped line no longer aborts the whole run.
    text = post.rstrip('\n').split('\t', 1)[0]
    tokens = clear_post(text)
    bigrams = [left + " " + right for left, right in zip(tokens, tokens[1:])]
    class_priors = {
        'sceptic': sceptic_class_logprob,
        'paranormal': paranormal_class_logprob,
    }

    best_class = None
    best_magnitude = None
    for class_ in bigrams_logprobs:
        # Terms are accumulated one by one, in the original order, so the
        # floating-point result is exactly what it has always been.
        score = 0
        for bigram in bigrams:
            # Bigrams missing from the model are neutral; known ones weigh x4.
            score += _logprob_or_zero(bigrams_logprobs, class_, bigram) * 4
        for token in tokens:
            # Words missing from the model are neutral; known ones weigh 1/7.
            score += _logprob_or_zero(words_logprobs, class_, token) / 7
        if class_ in class_priors:
            score += class_priors[class_]

        # NOTE(review): the winner is the class with the largest *absolute*
        # score. If the model stores negative log-probabilities this selects
        # the most negative total rather than the most probable class.
        # Presumably intentional, given that unseen n-grams add nothing --
        # confirm before changing, as it affects every prediction.
        magnitude = abs(score)
        if best_magnitude is None or magnitude >= best_magnitude:
            best_magnitude = magnitude
            best_class = class_

    if best_magnitude is None:
        raise ValueError("bigrams_logprobs contains no classes to choose from")
    return best_class

_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a set, loading them only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def clear_post(post):
    """Normalise a post's text and split it into stop-word-free tokens.

    The steps run in this order: literal ``\\n`` sequences become spaces, the
    text is lower-cased, URL-like runs are replaced by the placeholder token
    ``internetlink``, some punctuation becomes spaces, ``&lt``/``&gt`` and
    ``@name`` mentions are dropped, further punctuation and digits are
    deleted, dash separators become spaces, and runs of spaces collapse.

    Args:
        post: The raw post text.

    Returns:
        A list of tokens with English stop words removed. Only trailing
        spaces are stripped before splitting, so the list may contain empty
        strings (e.g. for empty input or text that starts with a space).
    """
    # NOTE(review): the patterns and their order are left untouched;
    # presumably the tokens must match those the model was built from -- confirm.
    post = post.replace('\\n', ' ')
    post = post.lower()
    post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\-\?\+\%]+(\)|)', ' internetlink ', post)
    post = re.sub(r'[\.\,\/\~]+', ' ', post)
    post = re.sub(r'(&lt|&gt|\@[a-zA-Z0-9]+)', '', post)
    post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\–\”\!\=\^]+', '', post)
    post = re.sub(r'( \- |\-\-+)', ' ', post)
    post = re.sub(r' +', ' ', post)
    tokens = post.rstrip(' ').split(' ')

    # The stop-word list is cached: this function runs once per input line
    # and the list used to be re-read and rebuilt into a set on every call.
    stop_words = _english_stop_words()
    return [token for token in tokens if token not in stop_words]

def main():
    """Classify every line of an input TSV and write one label per line.

    Command line: ``in.tsv out.tsv model.pkl``. The model file is a pickled
    sequence holding, in order: the paranormal class value, the sceptic class
    value, the bigram table, the word table, and the four totals (sceptic
    bigrams, paranormal bigrams, sceptic words, paranormal words).

    For each input line ``' S'`` is written for a 'sceptic' prediction and
    ``' P'`` for a 'paranormal' one; any other prediction writes nothing.
    With the wrong number of arguments a usage message is printed and the
    function returns without touching any file.
    """
    if len(sys.argv) != 4:
        print("syntax is ./predict.py in.tsv out.tsv model.pkl")
        return
    in_file, out_file, model = sys.argv[1:4]

    # NOTE(security): unpickling can execute arbitrary code. Only load model
    # files that come from a trusted source.
    with open(model, 'rb') as f:
        pickle_list = pickle.load(f)

    paranormal_class_logprob = pickle_list[0]
    sceptic_class_logprob = pickle_list[1]
    bigrams_logprobs = pickle_list[2]
    words_logprobs = pickle_list[3]
    total_sceptic_bigram = pickle_list[4]
    total_paranormal_bigram = pickle_list[5]
    total_sceptic_word = pickle_list[6]
    total_paranormal_word = pickle_list[7]

    # Diagnostic output: log(1/total) for each of the four model totals.
    for total in (total_sceptic_bigram, total_paranormal_bigram,
                  total_sceptic_word, total_paranormal_word):
        print(math.log(1/total))

    output_for_class = {'sceptic': ' S\n', 'paranormal': ' P\n'}
    with open(in_file) as in_f, open(out_file, 'w') as out_f:
        for line in in_f:
            hyp = calc_post_class(line, paranormal_class_logprob, sceptic_class_logprob, bigrams_logprobs, words_logprobs, total_sceptic_bigram, total_paranormal_bigram, total_sceptic_word, total_paranormal_word)
            label = output_for_class.get(hyp)
            # As before, an unrecognised class label writes nothing.
            if label is not None:
                out_f.write(label)


# Run only when executed as a script, so importing this module has no side
# effects.
if __name__ == "__main__":
    main()