paranormal-or-skeptic/solution.py

import csv
from collections import defaultdict
import math
import pickle
import os
from pathlib import Path


def tokenize(text):
    """Normalise raw post text through a fixed sequence of substring rewrites.

    The rewrites run strictly in the order listed, each one applied to the
    output of the previous one.  The function returns a single string; it
    does not lower-case the text or split it into tokens.
    """
    rewrites = (
        # Expand contractions before the bare-apostrophe rule can touch them.
        ("n't", " not"),
        ("'s", " is"),
        ("'ve", " have"),
        # These characters are turned into a space.
        ("'", " "),
        ("(", " "),
        (")", " "),
        ("/", " "),
        # Two backslash-n sequences spelled out literally (four characters),
        # not real newline characters; removed without leaving a space.
        ("\\n\\n", ""),
        # These punctuation marks are deleted outright.
        (".", ""),
        ("?", ""),
        (",", ""),
        ("!", ""),
        ('"', ''),
        # Stop words are dropped only where a space sits on both sides.
        (" a ", " "),
        (" on ", " "),
        (" the ", " "),
        (" of ", " "),
        (" an ", " "),
        (" to ", " "),
    )
    for needle, substitute in rewrites:
        text = text.replace(needle, substitute)
    return text


def calc_class_logprob(expected_path):
    """Estimate the class priors P(c) from a file of expected labels.

    A line counts as paranormal when it contains the character "1"; otherwise
    it counts as skeptic when it contains "0".  Lines with neither character
    are ignored.

    Returns a ``(paranormal, skeptic)`` tuple of plain probabilities (each
    count divided by the number of counted lines) -- despite the function
    name, no logarithm is taken here.  Raises ``ZeroDivisionError`` when no
    line is counted.
    """
    paranormal_total = 0
    skeptic_total = 0
    with open(expected_path) as label_file:
        for label_line in label_file:
            is_paranormal = "1" in label_line
            # The "0" test only applies to lines that did not match "1".
            is_skeptic = (not is_paranormal) and ("0" in label_line)
            paranormal_total += int(is_paranormal)
            skeptic_total += int(is_skeptic)

    counted = paranormal_total + skeptic_total
    return paranormal_total / counted, skeptic_total / counted

def calc_word_counts(in_path, expected_path):
    """Count token occurrences per class over a labelled training set.

    ``in_path`` holds one ``text<TAB>timestamp`` record per line and
    ``expected_path`` the matching label per line; the two files are read in
    lockstep.  Each text is passed through ``tokenize``, lower-cased and split
    on single spaces, so runs of spaces produce empty-string tokens that are
    counted like any other token.

    Returns ``{'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}``.
    Records whose label (after removing spaces) is neither '1' nor '0' add
    nothing to either table.  Raises ``ValueError`` if an input line does not
    split into exactly two tab-separated fields.
    """
    with open(in_path) as posts_file, open(expected_path) as labels_file:
        word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
        bucket_for_label = {
            '1': word_counts['paranormal'],
            '0': word_counts['skeptic'],
        }
        for post_line, label_line in zip(posts_file, labels_file):
            label = label_line.rstrip('\n').replace(" ", "")
            raw_text, _timestamp = post_line.rstrip('\n').split('\t')
            tokens = tokenize(raw_text).lower().split(' ')
            bucket = bucket_for_label.get(label)
            if bucket is None:
                # Unrecognised label: the record is parsed but not counted.
                continue
            for token in tokens:
                bucket[token] += 1

    return word_counts


def calc_word_logprobs(word_counts):
    """Turn per-class token counts into add-one smoothed probabilities.

    For each class the denominator is the sum of all its token counts plus
    the number of distinct tokens seen in that class, and every token gets
    ``(count + 1) / denominator``.

    Returns ``{'paranormal': {...}, 'skeptic': {...}}``.  Despite the function
    name the stored values are plain probabilities, not logarithms.
    """
    denominators = {
        label: sum(word_counts[label].values()) + len(word_counts[label])
        for label in ('skeptic', 'paranormal')
    }
    return {
        label: {
            term: (frequency + 1) / denominators[label]
            for term, frequency in word_counts[label].items()
        }
        for label in ('paranormal', 'skeptic')
    }

# Train the model at import time from the files under train/.
# NOTE: despite the *_logprob names, calc_class_logprob returns plain
# probabilities (count / total); the logarithm is taken later, inside
# predict_post_class.
paranormal_class_logprob, skeptic_class_logprob = calc_class_logprob("train/expected.tsv")

# Per-class token frequency tables built from the training posts.
word_counts = calc_word_counts('train/in.tsv','train/expected.tsv')

# Per-class smoothed token probabilities (again not logarithms, despite the name).
word_logprobs = calc_word_logprobs(word_counts)

# Leftover debug probe; the value in the trailing comment looks like a log
# probability, presumably from an earlier version -- TODO confirm or remove.
#print(word_logprobs['skeptic']["hair."]) #-12.166205308815476

# Next steps: 1. fetch a post 2. split it into terms 3. compute the probability of each term 4. add them together 5. compare paranormal with skeptic

def get_test_posts(path):
    """Read the text column of a ``text<TAB>timestamp`` file.

    Returns the post texts in file order as a list of strings; the timestamp
    column is parsed and discarded.  Raises ``ValueError`` if a line does not
    split into exactly two tab-separated fields.
    """
    def _text_column(raw_line):
        # The two-name unpack enforces the exactly-two-fields layout.
        body, _timestamp = raw_line.rstrip('\n').split('\t')
        return body

    with open(path) as source:
        return [_text_column(raw_line) for raw_line in source]


def predict_post_class(posts, sprob, pprob, word_logprobs):
    """Score every post under both classes and keep the larger score.

    Parameters:
        posts: iterable of raw post strings.
        sprob: prior probability of the skeptic class (its log is taken here).
        pprob: prior probability of the paranormal class (its log is taken here).
        word_logprobs: ``{'skeptic': {...}, 'paranormal': {...}}`` mapping
            tokens to the per-class values stored by the training step.

    For each post a class score starts at ``log(prior)`` and then adds, for
    every token, ``log(stored value + floor)``; a token missing from the
    class table contributes ``log(floor)`` alone.

    Returns a list with one float per post: the larger of the two log scores
    (the skeptic score on a tie).

    NOTE(review): despite the name, the result holds scores, not class
    labels -- confirm this is what callers expect.
    NOTE(review): the two floors use different denominators
    (skeptic: 2*|skeptic| + |paranormal|; paranormal: |skeptic| +
    2*|paranormal|) -- confirm the asymmetry is intended.
    """
    winning_scores = []

    for post in posts:
        skeptic_score = math.log(sprob)
        paranormal_score = math.log(pprob)
        terms = tokenize(post).lower().split(' ')

        # Additive floors depend only on the table sizes, so they are
        # computed once per post instead of once per token.
        skeptic_table = word_logprobs['skeptic']
        paranormal_table = word_logprobs['paranormal']
        skeptic_floor = 1/(len(skeptic_table) + len(skeptic_table) + len(paranormal_table))
        paranormal_floor = 1/(len(paranormal_table) + len(skeptic_table) + len(paranormal_table))

        for term in terms:
            if term in skeptic_table:
                skeptic_term_prob = skeptic_table[term] + skeptic_floor
            else:
                skeptic_term_prob = skeptic_floor
            if term in paranormal_table:
                paranormal_term_prob = paranormal_table[term] + paranormal_floor
            else:
                paranormal_term_prob = paranormal_floor
            skeptic_score += math.log(skeptic_term_prob)
            paranormal_score += math.log(paranormal_term_prob)

        # Strict comparison: the skeptic score is kept when the two are equal.
        winning_scores.append(
            paranormal_score if paranormal_score > skeptic_score else skeptic_score
        )

    return winning_scores


def predict_posts(path):
    """Score the posts in ``path + '/in.tsv'`` and write ``path + '/out.tsv'``.

    Uses the module-level ``skeptic_class_logprob``,
    ``paranormal_class_logprob`` and ``word_logprobs`` as the model.  Each
    output row has a single column: the negated score returned by
    ``predict_post_class`` for the corresponding post.

    Parameters:
        path: directory containing ``in.tsv``; ``out.tsv`` is created or
            overwritten in the same directory.
    """
    posts = get_test_posts(path+'/in.tsv')
    classes = predict_post_class(posts, skeptic_class_logprob, paranormal_class_logprob, word_logprobs)
    # newline='' lets the csv writer control row terminators itself; without
    # it, platforms that translate newlines emit an extra '\r' per row.
    with open(path+"/out.tsv", 'wt', newline='') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        tsv_writer.writerows([-score] for score in classes)

# Write predictions for the development and test splits.
predict_posts("dev-0")
predict_posts("test-A")

# Report the fraction of dev-0 output lines that are string-identical to the
# corresponding expected lines.
# NOTE(review): predict_posts writes negated scores, so exact string equality
# with the expected lines may rarely hold -- confirm the intended metric.
with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
    counter = 0
    positive = 0
    for counter, (out_line, exp_line) in enumerate(zip(out_file, exp_file), start=1):
        positive += int(out_line == exp_line)
    print(positive/counter)
first solution 2020-03-09 14:37:26 +01:00			`import csv`
test 2020-03-09 18:30:02 +01:00			`from collections import defaultdict`
			`import math`
naive-bayess solution 2020-03-29 21:03:04 +02:00			`import pickle`
			`import os`
			`from pathlib import Path`
first solution 2020-03-09 14:37:26 +01:00
add tokenizer 2020-05-02 13:40:22 +02:00
			`def tokenize(text):`
			`text = text.replace("n't", " not")`
			`text = text.replace("'s", " is")`
			`text = text.replace("'ve", " have")`
			`text = text.replace("'", " ")`
			`text = text.replace("(", " ")`
			`text = text.replace(")", " ")`
			`text = text.replace("/", " ")`
			`text = text.replace("\\n\\n", "")`
			`text = text.replace(".", "")`
			`text = text.replace("?", "")`
			`text = text.replace(",", "")`
			`text = text.replace("!", "")`
			`text = text.replace('"', '')`
			`text = text.replace(" a ", " ")`
			`text = text.replace(" on ", " ")`
			`text = text.replace(" the ", " ")`
			`text = text.replace(" of ", " ")`
			`text = text.replace(" an ", " ")`
			`text = text.replace(" to ", " ")`
			`#text = text.replace("a", "")`
			`return text`




naive-bayess solution 2020-03-29 21:03:04 +02:00			`def calc_class_logprob(expected_path): #zliczamy ogólne prawdopodobieństwo dla klasy (P(c))`
test 2020-03-09 18:30:02 +01:00			`paranoarmal_class_count = 0`
			`skeptic_class_count = 0`
			`with open(expected_path) as f:`
			`for line in f:`
add tokenizer 2020-05-02 13:40:22 +02:00			`if "1" in line:`
test 2020-03-09 18:30:02 +01:00			`paranoarmal_class_count +=1`
add tokenizer 2020-05-02 13:40:22 +02:00			`elif "0" in line:`
test 2020-03-09 18:30:02 +01:00			`skeptic_class_count +=1`

			`paranormal_class_prob = paranoarmal_class_count / (paranoarmal_class_count + skeptic_class_count)`
			`skeptic_class_prob = skeptic_class_count / (paranoarmal_class_count + skeptic_class_count)`

change output 2020-05-02 19:29:47 +02:00			`return paranormal_class_prob, skeptic_class_prob`
test 2020-03-09 18:30:02 +01:00
			`def calc_word_counts(in_path, expected_path):`
naive-bayess solution 2020-03-29 21:03:04 +02:00			`with open(in_path) as in_file, open(expected_path) as exp_file:`
test 2020-03-09 18:30:02 +01:00			`word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}`
			`for in_line, exp_line in zip(in_file, exp_file):`
naive-bayess solution 2020-03-29 21:03:04 +02:00			`class_ = exp_line.rstrip('\n').replace(" ", "")`
			`text, timestamp = in_line.rstrip('\n').split('\t')`
add tokenizer 2020-05-02 13:40:22 +02:00			`text = tokenize(text)`
naive-bayess solution 2020-03-29 21:03:04 +02:00			`tokens = text.lower().split(' ')`
			`for token in tokens:`
add tokenizer 2020-05-02 13:40:22 +02:00			`if class_ == '1':`
naive-bayess solution 2020-03-29 21:03:04 +02:00			`word_counts['paranormal'][token] += 1`
add tokenizer 2020-05-02 13:40:22 +02:00			`elif class_ == '0':`
naive-bayess solution 2020-03-29 21:03:04 +02:00			`word_counts['skeptic'][token] += 1`
test 2020-03-09 18:30:02 +01:00
			`return word_counts`


naive-bayess solution 2020-03-29 21:03:04 +02:00			`def calc_word_logprobs(word_counts):`
			`total_skeptic = sum(word_counts['skeptic'].values()) + len(word_counts['skeptic'].keys())`
			`total_paranormal = sum(word_counts['paranormal'].values())+ len(word_counts['paranormal'].keys())`
			`word_logprobs = {'paranormal': {}, 'skeptic':{}}`
			`for class_ in word_logprobs.keys():`
			`for token, value in word_counts[class_].items():`
			`if class_ == 'skeptic':`
			`word_prob = (value + 1)/ total_skeptic`
			`else:`
			`word_prob = (value + 1)/total_paranormal`
change output 2020-05-02 19:29:47 +02:00			`word_logprobs[class_][token] = word_prob`
naive-bayess solution 2020-03-29 21:03:04 +02:00			`return word_logprobs`
test 2020-03-09 18:30:02 +01:00
naive-bayess solution 2020-03-29 21:03:04 +02:00			`paranormal_class_logprob, skeptic_class_logprob = calc_class_logprob("train/expected.tsv")`
first solution 2020-03-09 14:37:26 +01:00
naive-bayess solution 2020-03-29 21:03:04 +02:00			`word_counts = calc_word_counts('train/in.tsv','train/expected.tsv')`
first solution 2020-03-09 14:37:26 +01:00
naive-bayess solution 2020-03-29 21:03:04 +02:00			`word_logprobs = calc_word_logprobs(word_counts)`

add tokenizer 2020-05-02 13:40:22 +02:00			`#print(word_logprobs['skeptic']["hair."]) #-12.166205308815476`
naive-bayess solution 2020-03-29 21:03:04 +02:00
			`#trzeba teraz 1. pobrac post 2. podzielić go na termy 3 policzyć prawdopodibeństwo każdego termu 4. dodać je do siebie 5 porwonac paranormal ze sceptic`

			`def get_test_posts(path):`
			`posts = []`
			`with open(path) as f:`
			`for line in f:`
			`text, timestamp = line.rstrip('\n').split('\t')`
			`posts.append(text)`
			`return posts`


			`def predict_post_class(posts, sprob, pprob, word_logprobs):`
			`out_classes = []`

			`for post in posts:`
change output 2020-05-02 19:29:47 +02:00			`total_s_prob = math.log(sprob)`
			`total_p_prob = math.log(pprob)`
add tokenizer 2020-05-02 13:40:22 +02:00			`post = tokenize(post)`
naive-bayess solution 2020-03-29 21:03:04 +02:00			`tokens = post.lower().split(' ')`
			`for token in tokens:`
			`#dlasceptic`
			`if (token in word_logprobs['skeptic'].keys()):`
change output 2020-05-02 19:29:47 +02:00			`sceptic_prob = word_logprobs['skeptic'][token]+1/(len(word_logprobs['skeptic']) + len(word_logprobs['skeptic']) + len(word_logprobs['paranormal']))`
naive-bayess solution 2020-03-29 21:03:04 +02:00			`else:`
change output 2020-05-02 19:29:47 +02:00			`sceptic_prob = 1/(len(word_logprobs['skeptic']) + len(word_logprobs['skeptic']) + len(word_logprobs['paranormal']))`
naive-bayess solution 2020-03-29 21:03:04 +02:00			`#dlaparanormal`
			`if (token in word_logprobs['paranormal'].keys()):`
change output 2020-05-02 19:29:47 +02:00			`paranormal_prob = word_logprobs['paranormal'][token]+1/(len(word_logprobs['paranormal']) + len(word_logprobs['skeptic']) + len(word_logprobs['paranormal']))`
naive-bayess solution 2020-03-29 21:03:04 +02:00			`else:`
change output 2020-05-02 19:29:47 +02:00			`paranormal_prob = 1/(len(word_logprobs['paranormal']) + len(word_logprobs['skeptic']) + len(word_logprobs['paranormal']))`
			`total_s_prob += math.log(sceptic_prob)`
			`total_p_prob += math.log(paranormal_prob)`
naive-bayess solution 2020-03-29 21:03:04 +02:00
			`#print(total_p_prob)`
			`#print(total_s_prob)`
			`if total_p_prob > total_s_prob:`
change output 2020-05-02 19:29:47 +02:00			`out_classes.append(total_p_prob)`
naive-bayess solution 2020-03-29 21:03:04 +02:00			`else:`
change output 2020-05-02 19:29:47 +02:00			`out_classes.append(total_s_prob)`
naive-bayess solution 2020-03-29 21:03:04 +02:00
			`return out_classes`


			`def predict_posts(path):`
			`posts = get_test_posts(path+'/in.tsv')`
			`classes = predict_post_class(posts, skeptic_class_logprob, paranormal_class_logprob, word_logprobs)`
			`with open(path+"/out.tsv", 'wt') as tsvfile:`
			`tsv_writer = csv.writer(tsvfile, delimiter='\t')`
change output 2020-05-02 19:29:47 +02:00			`# for i in classes:`
			`# tsv_writer.writerow(i)`
			`tsv_writer.writerows(map(lambda x: [-x], classes))`
naive-bayess solution 2020-03-29 21:03:04 +02:00
			`predict_posts("dev-0")`
			`predict_posts("test-A")`
naive-bayess solution 2020-03-29 21:22:20 +02:00
			`with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:`
			`counter = 0`
			`positive = 0`
			`for out_line, exp_line in zip(out_file, exp_file):`
			`counter+=1`
add tokenizer 2020-05-02 13:40:22 +02:00			`if out_line == exp_line:`
naive-bayess solution 2020-03-29 21:22:20 +02:00			`positive += 1`
			`print(positive/counter)`