diff --git a/code_prediction.py b/code_prediction.py index 276591d..41728d5 100644 --- a/code_prediction.py +++ b/code_prediction.py @@ -3,9 +3,6 @@ import math import pickle import re -open_file = open('naive_base_model.pkl', 'rb') -pickle_loaded = pickle.load(open_file) -paranomal_class_logprob, skeptic_class_logprob, word_logprobs = pickle_loaded def prediction(input,output): output_file = open(output,'w') pickle_load = pickle.load(open('naive_base_model.pkl', 'rb')) @@ -15,20 +12,21 @@ def prediction(input,output): temp_paranormal_logprob = paranomal_class_logprob temp_skeptic_logprob = skeptic_class_logprob text, timestamp = line.rstrip('\n').split('\t') - text = re.sub(r'\\n+', " ", text) + text = text.lower() text = re.sub(r'http\S+', " ", text) + text = re.sub(r'\\n+', " ", text) text = re.sub(r'\/[a-z]\/', " ", text) text = re.sub(r'[^a-z]', " ", text) text = re.sub(r'\s{2,}', " ", text) text = re.sub(r'(\s+|\\n)', ' ', text) text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text) text = re.sub(r'^\s', "", text) - tokens = text.lower().split(' ') + tokens = text.split(' ') for token in tokens: if token not in word_logprobs['paranormal']: - word_logprobs['paranormal'][token] = -15.6 + word_logprobs['paranormal'][token] = -14.78 if token not in word_logprobs['skeptic']: - word_logprobs['skeptic'][token] = -14.78 + word_logprobs['skeptic'][token] = -15.6 temp_paranormal_logprob += paranomal_class_logprob + word_logprobs['paranormal'][token] temp_skeptic_logprob += skeptic_class_logprob + word_logprobs['skeptic'][token]