paranormal-or-skeptic/predict.py

#!/usr/bin/python3

import pickle
import math
import re

def clear_tokens(tokens, is_text=True):
    r"""Replace escaped newlines in a post (or a single token) with spaces.

    The only normalization performed is turning every literal two-character
    sequence ``\n`` (a backslash followed by ``n``) into a single space.
    Real newline characters, punctuation, digits and URLs are left as-is;
    no regex-based cleanup is applied.

    Args:
        tokens: The string to normalize.
        is_text: Accepted so existing call sites keep working; it has no
            effect on the result.

    Returns:
        ``tokens`` with each literal ``\n`` sequence replaced by ``' '``.
    """
    return tokens.replace('\\n', ' ')

def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_logprobs):
    """Pick a class label for one line of the input TSV.

    The line is expected to look like ``<text>\\t<timestamp>``. The text is
    cleaned with ``clear_tokens``, lower-cased and split on single spaces.
    If the text contains one of the hard-coded keywords the post is labelled
    ``'paranormal'`` straight away; otherwise every class in
    ``word_logprobs`` is scored and the class with the largest absolute
    score is returned.

    Args:
        post: One raw line from the input file (trailing newline allowed).
        paranormal_class_logprob: Class-level factor applied to the
            ``'paranormal'`` score.
        sceptic_class_logprob: Class-level factor applied to the
            ``'sceptic'`` score.
        word_logprobs: Mapping ``class name -> {token: value}``.

    Returns:
        The winning key of ``word_logprobs``, or ``'paranormal'`` when the
        keyword rule fires.

    Raises:
        ValueError: If ``word_logprobs`` is empty and the keyword rule did
            not fire, so there is nothing to choose from.
    """
    # Everything before the last tab is the text; the trailing field (the
    # timestamp) is not used. rsplit tolerates lines with no tab or with
    # extra tabs inside the text instead of failing on tuple unpacking.
    text = post.rstrip('\n').rsplit('\t', 1)[0]
    text = clear_tokens(text, True)
    lowered = text.lower()

    # Rule-based shortcut: keyword hits decide 'paranormal' outright. The
    # keywords are lowercase, so they are matched against lower-cased text.
    # Checking first avoids scoring work whose result would be discarded.
    if search_for_keywords(lowered):
        return 'paranormal'

    # Clean each token once rather than once per class.
    tokens = [clear_tokens(token, False) for token in lowered.split(' ')]

    class_factors = {
        'sceptic': sceptic_class_logprob,
        'paranormal': paranormal_class_logprob,
    }

    best_class = None
    best_score = None
    for class_, token_values in word_logprobs.items():
        # NOTE(review): the names say "logprob", yet the values are
        # multiplied (not summed) and the absolute value is compared.
        # Kept as-is because the pickled model's contents are not visible
        # here -- confirm against the training script.
        score = 1
        for token in tokens:
            try:
                score *= token_values[token]
            except KeyError:
                # Token unknown for this class: leave the score unchanged.
                continue
        score = abs(score * class_factors.get(class_, 1))
        # Track the best class directly instead of keying a dict by score,
        # where equal scores would overwrite each other. ``>=`` keeps the
        # previous tie behaviour: the class seen last wins.
        if best_class is None or score >= best_score:
            best_class, best_score = class_, score

    if best_class is None:
        raise ValueError('word_logprobs contains no classes to score')
    return best_class

def search_for_keywords(text):
    """Return True if ``text`` contains any hard-coded paranormal keyword.

    Matching is a plain substring test and is case-insensitive: the
    keywords are lowercase, so the text is lower-cased before comparing
    (otherwise e.g. ``"UFO"`` would never match ``'ufo'``).

    Args:
        text: The post text to inspect.

    Returns:
        True when at least one keyword occurs in ``text``, else False.
    """
    keywords = ('paranormal', 'ufo', 'aliens', 'conspiracy', 'atlantis')
    lowered = text.lower()
    return any(keyword in lowered for keyword in keywords)

def main(in_file="test-A/in.tsv", out_file="test-A/out.tsv",
         model_file='naive_base_model.pkl'):
    """Label every line of ``in_file`` and write the labels to ``out_file``.

    The model pickle is expected to be a sequence whose first three items
    are the paranormal class factor, the sceptic class factor and the
    per-class token table, in that order. One output line is written per
    input line: ``" S"`` for ``'sceptic'`` and ``" P"`` for ``'paranormal'``.

    Args:
        in_file: Path of the input TSV (for the dev set pass
            ``"dev-0/in.tsv"``).
        out_file: Path of the predictions file to (over)write (for the dev
            set pass ``"dev-0/out.tsv"``).
        model_file: Path of the pickled model.

    Raises:
        ValueError: If the classifier returns a label other than
            ``'sceptic'`` or ``'paranormal'``; silently skipping the line
            would shift every following prediction against the input.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in
    # the file -- only load model files from a trusted source.
    with open(model_file, 'rb') as f:
        pickle_list = pickle.load(f)
    paranormal_class_logprob = pickle_list[0]
    sceptic_class_logprob = pickle_list[1]
    word_logprobs = pickle_list[2]

    print(f"in {in_file}")
    print(f"out {out_file}")

    output_for_label = {'sceptic': " S\n", 'paranormal': ' P\n'}
    # Explicit UTF-8 so results do not depend on the platform's locale.
    with open(in_file, encoding='utf-8') as in_f, \
            open(out_file, 'w', encoding='utf-8') as out_f:
        for line_no, line in enumerate(in_f, start=1):
            hyp = calc_post_prob(line, paranormal_class_logprob,
                                 sceptic_class_logprob, word_logprobs)
            if hyp not in output_for_label:
                raise ValueError(
                    f"unexpected label {hyp!r} for input line {line_no}")
            out_f.write(output_for_label[hyp])
# Run the prediction only when executed as a script, so that importing this
# module (e.g. to reuse its functions) does not trigger file I/O.
if __name__ == "__main__":
    main()
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
+								#!/usr/bin/python3
 								import pickle
 								import math
-												Added some regex and fix logprobs

											
										
										
											2020-03-22 11:59:07 +01:00
+								import re
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
-												little rule base added

											
										
										
											2020-03-22 13:32:09 +01:00
+								def clear_tokens(tokens, is_text=True):
-												Added some regex and fix logprobs

											
										
										
											2020-03-22 11:59:07 +01:00
+								    tokens = tokens.replace('\\n', ' ')
-												old way

											
										
										
											2020-03-22 13:58:35 +01:00
+								    return tokens
-												fix predict.py

											
										
										
											2020-03-22 12:56:42 +01:00
+								    tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens)
 								    tokens = re.sub(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@\’\＞\″\±]+', ' ', tokens)
-												Added some regex and fix logprobs

											
										
										
											2020-03-22 11:59:07 +01:00
+								    tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
-												little rule base added

											
										
										
											2020-03-22 13:32:09 +01:00
+								    tokens = re.sub(r'[0-9]+', ' ', tokens)
-												fix predict.py

											
										
										
											2020-03-22 12:56:42 +01:00
+								    tokens = re.sub(r'œ|·', '', tokens)
-												little rule base added

											
										
										
											2020-03-22 13:32:09 +01:00
+								    if is_text:
 								        tokens = re.sub(r' +', ' ', tokens)
 								    else:
 								        tokens = re.sub(r' +', '', tokens)
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
+								    return tokens
 								def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_logprobs):
 								    # dla kazdego tokenu z danego posta
 								    text, timestap = post.rstrip('\n').split('\t')
-												little rule base added

											
										
										
											2020-03-22 13:32:09 +01:00
+								    text =  clear_tokens(text, True)
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
+								    tokens = text.lower().split(' ')
-												old way

											
										
										
											2020-03-22 13:58:35 +01:00
+								    #probs = {0.0 : 'sceptic', 0.0 : 'paranormal'}
 								    probs = {}
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
+								    for class_ in word_logprobs.keys():
 								        product = 1
 								        for token in tokens:
-												little rule base added

											
										
										
											2020-03-22 13:32:09 +01:00
+								            token = clear_tokens(token, False)
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
+								            try:
-												Change regexes

											
										
										
											2020-03-22 14:32:24 +01:00
+								                product *= word_logprobs[class_][token]
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
+								            except KeyError:
-												Change regexes

											
										
										
											2020-03-22 14:32:24 +01:00
+								                product *= 1
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
+								            # tu wzoru uzyj
 								        if class_ == 'sceptic':
-												Change regexes

											
										
										
											2020-03-22 14:32:24 +01:00
+								            product *=  sceptic_class_logprob
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
+								        elif class_ == 'paranormal':
-												Change regexes

											
										
										
											2020-03-22 14:32:24 +01:00
+								            product *= paranormal_class_logprob
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
+								        probs[abs(product)] = class_
-												Added some regex and fix logprobs

											
										
										
											2020-03-22 11:59:07 +01:00
+								        #print(probs)
-												fix predict.py

											
										
										
											2020-03-22 12:56:42 +01:00
+								# mozna jeszcze zrobic aby bralo kluczowe slowa i wtedy decydowalo ze paranormal
-												little rule base added

											
										
										
											2020-03-22 13:32:09 +01:00
+								    if search_for_keywords(text):
 								        return 'paranormal'
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
+								    return probs[max(probs.keys())]
-												little rule base added

											
										
										
											2020-03-22 13:32:09 +01:00
+								def search_for_keywords(text):
-												Change regexes

											
										
										
											2020-03-22 14:32:24 +01:00
+								    keywords = ['paranormal', 'ufo', 'aliens', 'conspiracy', 'aliens', 'atlantis']
-												little rule base added

											
										
										
											2020-03-22 13:32:09 +01:00
+								    return any(keyword in text for keyword in keywords)
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
 								def main():
 								    with open('naive_base_model.pkl', 'rb') as f:
 								        pickle_list = pickle.load(f)
 								    paranormal_class_logprob = pickle_list[0]
 								    sceptic_class_logprob = pickle_list[1]
 								    word_logprobs = pickle_list[2]
-												Change regexes

											
										
										
											2020-03-22 14:32:24 +01:00
+								    in_file = "test-A/in.tsv"
 								    #in_file = "dev-0/in.tsv"
 								    out_file = "test-A/out.tsv"
 								    #out_file = "dev-0/out.tsv"
-												Fix model a little

											
										
										
											2020-03-22 12:14:52 +01:00
+								    print (f"in {in_file}")
 								    print (f"out {out_file}")
-												Added some regex and fix logprobs

											
										
										
											2020-03-22 11:59:07 +01:00
+								    with open(in_file) as in_f, open(out_file, 'w') as out_f:
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
+								        for line in in_f:
 								            hyp = calc_post_prob(line, paranormal_class_logprob, sceptic_class_logprob, word_logprobs)
 								            if hyp == 'sceptic':
 								                out_f.write(" S\n")
 								            elif hyp == 'paranormal':
 								                 out_f.write(' P\n')
 								main()