paranormal-or-skeptic/train.py

#!/usr/bin/python3
from collections import defaultdict
import math
import pickle
import re
import sys
import nltk
from nltk.corpus import stopwords

def calc_class_logprob(expected_path):
    """Compute the log prior probability of each class from the label file.

    The file is read line by line. After stripping the newline and removing
    spaces, a line containing ``'P'`` is counted as paranormal; otherwise a
    line containing ``'S'`` is counted as sceptic. Lines containing neither
    letter are ignored.

    Args:
        expected_path: Path to the file holding one class label per line.

    Returns:
        A tuple ``(paranormal_logprob, sceptic_logprob)`` with the natural
        logarithm of each class's relative frequency.

    Raises:
        ValueError: If either class has no examples (this includes an empty
            file), because the log prior would be undefined.
    """
    paranormal_classcount = 0
    sceptic_classcount = 0

    with open(expected_path) as f:
        for line in f:
            label = line.rstrip('\n').replace(' ', '')
            if 'P' in label:
                paranormal_classcount += 1
            elif 'S' in label:
                sceptic_classcount += 1

    # Fail early with a readable message instead of an opaque
    # ZeroDivisionError (no labels) or "math domain error" (log of zero).
    if paranormal_classcount == 0 or sceptic_classcount == 0:
        raise ValueError(
            "cannot compute class priors from %r: found %d paranormal and "
            "%d sceptic labels, need at least one of each"
            % (expected_path, paranormal_classcount, sceptic_classcount)
        )

    total = paranormal_classcount + sceptic_classcount
    paranormal_prob = paranormal_classcount / total
    sceptic_prob = sceptic_classcount / total

    return math.log(paranormal_prob), math.log(sceptic_prob)

# Lazily-built cache of the NLTK English stop words (filled on first use).
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a set, loading them only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def clear_post(post):
    """Normalise a raw post and split it into tokens without stop words.

    Steps, in order: literal ``\\n`` sequences become spaces, the text is
    lower-cased, links are replaced with the placeholder ``internetlink``,
    punctuation, digits and ``@mentions`` are removed or turned into spaces,
    runs of spaces are collapsed, and the result is split on single spaces.
    English stop words are then dropped.

    Args:
        post: The post text as a single string.

    Returns:
        A list of token strings in their original order.

    NOTE(review): only trailing spaces are stripped before splitting, so a
    post that starts with a space after cleaning yields an empty first token
    and an empty post yields ``['']``. This is kept as-is so the tokens stay
    identical to what this function produced before; confirm against the
    prediction code before changing it.
    """
    # The input carries newlines as the two characters backslash + 'n'.
    post = post.replace('\\n', ' ')
    post = post.lower()
    # Replace links (optionally wrapped in parentheses) with a placeholder.
    post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', post)
    # Separator-like punctuation becomes a space so neighbours do not merge.
    post = re.sub(r'[\.\,\/\~]+', ' ', post)
    # Drop '&lt' / '&gt' entity heads and @mentions; the entity's trailing
    # ';' is removed by the next substitution.
    post = re.sub(r'(&lt|&gt|\@[a-zA-Z0-9]+)','',post)
    # Remove remaining punctuation and digits outright.
    post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\–\”\!\=\^]+', '', post)
    # Free-standing dashes and runs of dashes become a space; a single
    # hyphen inside a word is kept.
    post = re.sub(r'( \- |\-\-+)', ' ', post)
    post = re.sub(r' +', ' ', post)
    post = post.rstrip(' ')
    tokens = post.split(' ')
    # The stop-word set is loaded once instead of once per post.
    stop_words = _english_stop_words()
    return [w for w in tokens if w not in stop_words]

#def calc_bigram_count(in_path, expected_path):
#    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
#    with open(in_path) as infile, open(expected_path) as expected_file:
#        num_of_bigams = 0
#        for line, exp in zip(infile, expected_file):
#            class_ = exp.rstrip('\n').replace(' ', '')
#            text, timestap = line.rstrip('\n').split('\t')
#            tokens = clear_post(text)
#            #tokens = text.lower().split(' ')
#            for index in range(len(tokens)-1):
#                # if there is next token we append current and next
#                bigram = tokens[index] + " " + tokens[index + 1]
#                #print(bigram)
#                #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
#                if class_ == 'P':
#                    bigram_counts['paranormal'][bigram] +=1
#                elif class_ == 'S':
#                    bigram_counts['sceptic'][bigram] +=1
#                num_of_bigams +=1
#    #print(f"num of every added bigams with repetitions {num_of_bigams})")
#    #print(f"num of bigams in paranormal {len(bigram_counts['paranormal'])} and sceptic {len(bigram_counts['sceptic'])}")
#    return bigram_counts

def calc_bigram_logprobs(bigram_counts):
    """Turn per-class bigram counts into add-one smoothed log probabilities.

    For each class the denominator is the sum of that class's bigram counts
    plus the number of distinct bigrams seen in that class; each bigram's
    probability is ``(count + 1) / denominator``.

    Args:
        bigram_counts: Mapping with the keys ``'paranormal'`` and
            ``'sceptic'``, each mapping a bigram string to its count.

    Returns:
        A dict with the keys ``'paranormal'`` and ``'sceptic'``, each mapping
        a bigram to the natural log of its smoothed probability.
    """
    denominators = {
        label: sum(bigram_counts[label].values()) + len(bigram_counts[label].keys())
        for label in ('sceptic', 'paranormal')
    }
    logprobs = {'paranormal': {}, 'sceptic': {}}

    for label, counts in bigram_counts.items():
        for bigram, count in counts.items():
            smoothed = (count + 1) / denominators[label]
            logprobs[label][bigram] = math.log(smoothed)

    return logprobs

#def calc_word_count(in_path, expected_path):
#    word_counts = {'paranormal':defaultdict(int), 'sceptic': defaultdict(int)} # per-class dictionary mapping each word to the number of times it occurs
#    with open(in_path) as infile, open(expected_path)  as expectedfile:
#        for line, exp in zip(infile, expectedfile):
#            class_ = exp.rstrip('\n').replace(' ','')
#            text, timestap =line.rstrip('\n').split('\t')
#            #print(f"text  {type(text)}")
#            text = clear_tokens(text, True)
#            tokens = text.lower().split(' ')
#            #print(f"tokens {type(tokens)}")
#            for token in tokens:
#                clear_tokens(token,False)
#                if class_ == 'P':
#                    word_counts['paranormal'][token] += 1
#                elif class_ == 'S':
#                    word_counts['sceptic'][token]+=1
#
#    return word_counts

def calc_word_logprobs(word_counts):
    """Convert per-class word counts into add-one smoothed log probabilities.

    Each class is normalised by the sum of its word counts plus the number
    of distinct words seen in that class, and every word's probability is
    ``(count + 1)`` divided by that value.

    Args:
        word_counts: Mapping with the keys ``'paranormal'`` and ``'sceptic'``,
            each mapping a token to the number of times it was seen.

    Returns:
        A dict with the keys ``'paranormal'`` and ``'sceptic'``, each mapping
        a token to the natural log of its smoothed probability.
    """
    sceptic_seen = word_counts['sceptic']
    paranormal_seen = word_counts['paranormal']
    smoothed_totals = {
        'sceptic': sum(sceptic_seen.values()) + len(sceptic_seen.keys()),
        'paranormal': sum(paranormal_seen.values()) + len(paranormal_seen.keys()),
    }

    word_logprobs = {'paranormal': {}, 'sceptic': {}}
    for label in word_counts:
        for token in word_counts[label]:
            occurrences = word_counts[label][token]
            probability = (occurrences + 1) / smoothed_totals[label]
            word_logprobs[label][token] = math.log(probability)

    return word_logprobs

def launch_bigrams_and_words(in_path, expected_path):
    """Count words and bigrams per class over the training data.

    The two files are read in lockstep: line *i* of ``in_path`` is the post
    and line *i* of ``expected_path`` is its label (``P`` for paranormal,
    ``S`` for sceptic, surrounding spaces ignored). Each post is tokenised
    with ``clear_post``. Every token is counted as a word and every pair of
    adjacent tokens, joined by a single space, is counted as a bigram. Posts
    with any other label contribute nothing.

    Args:
        in_path: Path to the input file; each line must be
            ``text<TAB>timestamp``.
        expected_path: Path to the file with one class label per line.

    Returns:
        A tuple ``(bigram_counts, word_counts)``. Each is a dict with the
        keys ``'paranormal'`` and ``'sceptic'`` mapping to a
        ``defaultdict(int)`` of counts.

    Raises:
        ValueError: If an input line does not consist of exactly two
            tab-separated fields.

    NOTE(review): both files are opened with the platform default encoding,
    and ``zip`` stops silently at the shorter file if their lengths differ.
    """
    class_names = {'P': 'paranormal', 'S': 'sceptic'}
    word_counts = {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}
    bigram_counts = {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}

    with open(in_path) as infile, open(expected_path) as expected_file:
        for line, exp in zip(infile, expected_file):
            # The second field (timestamp) is not used for training.
            text, _ = line.rstrip('\n').split('\t')
            class_name = class_names.get(exp.rstrip('\n').replace(' ', ''))
            if class_name is None:
                continue

            tokens = clear_post(text)

            # Count every token, including the last one of the post. Counting
            # words inside the bigram loop would skip the final token and
            # ignore single-token posts entirely.
            class_words = word_counts[class_name]
            for token in tokens:
                class_words[token] += 1

            class_bigrams = bigram_counts[class_name]
            for first, second in zip(tokens, tokens[1:]):
                class_bigrams[first + " " + second] += 1

    return bigram_counts, word_counts

def main():
    """Train the classifier from the command line and pickle the model.

    Expects exactly three arguments in ``sys.argv``: the label file, the
    input file and the output model path. The model is written as a pickled
    list ``[paranormal_class_logprob, sceptic_class_logprob,
    bigram_logprobs, word_logprobs]``.

    Returns:
        0 on success, 1 when the wrong number of arguments was given.
    """
    if len(sys.argv) != 4:
        # Usage errors go to stderr and produce a non-zero exit status so
        # that calling scripts can detect the failure.
        print("syntax is ./train.py expected.tsv in.tsv model.pkl", file=sys.stderr)
        return 1

    expected_file, in_file, model = sys.argv[1:4]

    paranormal_class_logprob, sceptic_class_logprob = calc_class_logprob(expected_file)
    bigrams_count, words_count = launch_bigrams_and_words(in_file, expected_file)
    bigram_logprobs = calc_bigram_logprobs(bigrams_count)
    word_logprobs = calc_word_logprobs(words_count)

    with open(model, 'wb') as f:
        # The order of this list is the model file format; keep it stable.
        pickle.dump([paranormal_class_logprob, sceptic_class_logprob, bigram_logprobs, word_logprobs], f)
    return 0


# Run training only when executed as a script, not when imported.
if __name__ == "__main__":
    sys.exit(main())
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
+								#!/usr/bin/python3
 								from collections import defaultdict
 								import math
 								import pickle
-												Added some regex and fix logprobs

											
										
										
											2020-03-22 11:59:07 +01:00
+								import re
-												Bigram implemented

											
										
										
											2020-03-29 13:39:47 +02:00
+								import sys
-												Updated with stopwords

											
										
										
											2020-03-29 23:29:19 +02:00
+								import nltk
 								from nltk.corpus import stopwords
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
 								def calc_class_logprob(expected_path):
-												Bigram implemented

											
										
										
											2020-03-29 13:39:47 +02:00
+								    paranormal_classcount = 0
 								    sceptic_classcount = 0
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
+								    with open(expected_path) as f:
 								        for line in f:
-												Fix model a little

											
										
										
											2020-03-22 12:14:52 +01:00
+								            line = line.rstrip('\n').replace(' ','')
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
+								            if 'P' in line:
-												Bigram implemented

											
										
										
											2020-03-29 13:39:47 +02:00
+								                paranormal_classcount +=1
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
+								            elif 'S' in line:
 								                sceptic_classcount +=1
-												Bigram implemented

											
										
										
											2020-03-29 13:39:47 +02:00
+								    paranol_prob = paranormal_classcount / (paranormal_classcount + sceptic_classcount)
 								    sceptic_prob = sceptic_classcount / (paranormal_classcount + sceptic_classcount)
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
 								    return math.log(paranol_prob), math.log(sceptic_prob)
-												Bigram implemented

											
										
										
											2020-03-29 13:39:47 +02:00
+								def clear_post(post):
 								    post = post.replace('\\n', ' ')
-												Updated with stopwords

											
										
										
											2020-03-29 23:29:19 +02:00
+								    post = post.lower()
-												Bigram implemented

											
										
										
											2020-03-29 13:39:47 +02:00
+								    # delete links
-												Fixed bigrams a little

											
										
										
											2020-03-29 19:48:30 +02:00
+								    post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', post)
-												Update bigram with new regex

											
										
										
											2020-03-29 14:28:07 +02:00
+								    post = re.sub(r'[\.\,\/\~]+', ' ', post)
 								    post = re.sub(r'(&lt|&gt|\@[a-zA-Z0-9]+)','',post)
-												Fixed bigrams a little

											
										
										
											2020-03-29 19:48:30 +02:00
+								    post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\–\”\!\=\^]+', '', post)
-												Update bigram with new regex

											
										
										
											2020-03-29 14:28:07 +02:00
+								    post = re.sub(r'( \- |\-\-+)', ' ', post)
-												Bigram implemented

											
										
										
											2020-03-29 13:39:47 +02:00
+								    post = re.sub(r' +', ' ', post)
 								    post = post.rstrip(' ')
-												Updated with stopwords

											
										
										
											2020-03-29 23:29:19 +02:00
+								    post = post.split(' ')
 								    stop_words = set(stopwords.words('english'))
 								    post_no_stop = [w for w in post if not w in stop_words]
 								    return post_no_stop
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
-												Updated with stopwords

											
										
										
											2020-03-29 23:29:19 +02:00
+								#def calc_bigram_count(in_path, expected_path):
 								#    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
 								#    with open(in_path) as infile, open(expected_path) as expected_file:
 								#        num_of_bigams = 0
 								#        for line, exp in zip(infile, expected_file):
 								#            class_ = exp.rstrip('\n').replace(' ', '')
 								#            text, timestap = line.rstrip('\n').split('\t')
 								#            tokens = clear_post(text)
 								#            #tokens = text.lower().split(' ')
 								#            for index in range(len(tokens)-1):
 								#                # if there is next token we append current and next
 								#                bigram = tokens[index] + " " + tokens[index + 1]
 								#                #print(bigram)
 								#                #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
 								#                if class_ == 'P':
 								#                    bigram_counts['paranormal'][bigram] +=1
 								#                elif class_ == 'S':
 								#                    bigram_counts['sceptic'][bigram] +=1
 								#                num_of_bigams +=1
 								#    #print(f"num of every added bigams with repetitions {num_of_bigams})")
 								#    #print(f"num of bigams in paranormal {len(bigram_counts['paranormal'])} and sceptic {len(bigram_counts['sceptic'])}")
 								#    return bigram_counts
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
-												Bigram implemented

											
										
										
											2020-03-29 13:39:47 +02:00
+								def calc_bigram_logprobs(bigram_counts):
 								    total_sceptic = sum(bigram_counts['sceptic'].values()) + len(bigram_counts['sceptic'].keys())
 								    total_paranormal = sum(bigram_counts['paranormal'].values()) + len(bigram_counts['paranormal'].keys())
 								    bigram_logprobs = {'paranormal' : {}, 'sceptic' : {}}
 								    for class_ in bigram_counts.keys():
 								        for bigram, value in bigram_counts[class_].items():
 								            if class_ == "sceptic":
 								                bigram_prob = (value + 1) / total_sceptic
 								            elif class_ == "paranormal":
 								                bigram_prob = (value + 1) / total_paranormal
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
-												Bigram implemented

											
										
										
											2020-03-29 13:39:47 +02:00
+								            bigram_logprobs[class_][bigram] = math.log(bigram_prob)
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
-												Bigram implemented

											
										
										
											2020-03-29 13:39:47 +02:00
+								    return bigram_logprobs
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
-												Updated with stopwords

											
										
										
											2020-03-29 23:29:19 +02:00
+								#def calc_word_count(in_path, expected_path):
 								#    word_counts = {'paranormal':defaultdict(int), 'sceptic': defaultdict(int)} # dzienik zawierajacy slownik w ktorym s slowa i ile razy wystepuja
 								#    with open(in_path) as infile, open(expected_path)  as expectedfile:
 								#        for line, exp in zip(infile, expectedfile):
 								#            class_ = exp.rstrip('\n').replace(' ','')
 								#            text, timestap =line.rstrip('\n').split('\t')
 								#            #print(f"text  {type(text)}")
 								#            text = clear_tokens(text, True)
 								#            tokens = text.lower().split(' ')
 								#            #print(f"tokens {type(tokens)}")
 								#            for token in tokens:
 								#                clear_tokens(token,False)
 								#                if class_ == 'P':
 								#                    word_counts['paranormal'][token] += 1
 								#                elif class_ == 'S':
 								#                    word_counts['sceptic'][token]+=1
 								#
 								#    return word_counts
 								def calc_word_logprobs(word_counts):
 								    total_skeptic = sum(word_counts['sceptic'].values()) + len(word_counts['sceptic'].keys())
 								    total_paranormal = sum(word_counts['paranormal'].values())+ len(word_counts['paranormal'].keys())
 								    word_logprobs= {'paranormal': {}, 'sceptic': {}}
 								    for class_ in word_counts.keys(): # sceptic paranormal
 								        for token, value in word_counts[class_].items():
 								            if class_ == 'sceptic':
 								                word_prob = (value +1)/ total_skeptic
 								            elif class_ == 'paranormal':
 								                word_prob = (value+1)/ total_paranormal
 								            #print (token)
 								            word_logprobs[class_][token] = math.log(word_prob)
 								    return word_logprobs
 								def launch_bigrams_and_words(in_path, expected_path):
 								    word_counts = {'paranormal':defaultdict(int), 'sceptic': defaultdict(int)}
 								    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
 								    with open(in_path) as infile, open(expected_path) as expected_file:
 								        for line, exp in zip(infile, expected_file):
 								            class_ = exp.rstrip('\n').replace(' ', '')
 								            text, timestap = line.rstrip('\n').split('\t')
 								            tokens = clear_post(text)
 								            for index in range(len(tokens)-1):
 								                # if there is next token we append current and next
 								                bigram = tokens[index] + " " + tokens[index + 1]
 								                #print(bigram)
 								                #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
 								                if class_ == 'P':
 								                    bigram_counts['paranormal'][bigram] +=1
 								                    word_counts['paranormal'][tokens[index]] +=1
 								                elif class_ == 'S':
 								                    bigram_counts['sceptic'][bigram] +=1
 								                    word_counts['sceptic'][tokens[index]] +=1
 								    return bigram_counts, word_counts
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
+								def main():
-												Bigram implemented

											
										
										
											2020-03-29 13:39:47 +02:00
+								    if len(sys.argv) != 4:
 								        print("syntax is ./train.py expected.tsv in.tsv model.pkl")
 								        return
 								    expected_file = str(sys.argv[1])
 								    in_file = str(sys.argv[2])
 								    model = str(sys.argv[3])
 								    paranormal_class_logprob, sceptic_class_logprob = calc_class_logprob(expected_file)
-												Updated with stopwords

											
										
										
											2020-03-29 23:29:19 +02:00
+								    #bigrams_count = calc_bigram_count(in_file, expected_file)
 								    bigrams_count, words_count = launch_bigrams_and_words(in_file, expected_file)
-												Bigram implemented

											
										
										
											2020-03-29 13:39:47 +02:00
+								    bigram_logprobs = calc_bigram_logprobs(bigrams_count)
-												Updated with stopwords

											
										
										
											2020-03-29 23:29:19 +02:00
+								    word_logprobs = calc_word_logprobs(words_count)
-												Bigram implemented

											
										
										
											2020-03-29 13:39:47 +02:00
+								    with open(model, 'wb') as f:
-												Updated with stopwords

											
										
										
											2020-03-29 23:29:19 +02:00
+								        pickle.dump([paranormal_class_logprob, sceptic_class_logprob, bigram_logprobs, word_logprobs],f)
-												Updated basline

											
										
										
											2020-03-22 10:15:36 +01:00
+								main()
-												Updated with stopwords

											
										
										
											2020-03-29 23:29:19 +02:00