# naive_bayes/Bayes.py

import spacy
from tqdm import tqdm
import re
import math

# Load spaCy's small English pipeline once, at import time.
# NOTE(review): `nlp` is not referenced anywhere else in the visible code --
# confirm whether it is still needed (tokenization below is regex-based).
nlp = spacy.load("en_core_web_sm")

# Number of training comments seen per class: index 0 is class 0 and
# index 1 is class 1. Updated in place by train(), read by classify().
number_of_comments_per_class = [0] * 2

# Maps each token to a two-element list
# [comments of class 0 containing it, comments of class 1 containing it].
lexicon = {}

# Matches a run of separator characters: a fixed set of punctuation/symbols
# plus space, tab and newline. Characters outside this class (for example
# quotes, colons or semicolons) are NOT separators and stay attached to
# their token.
obfuscator = re.compile('[\\[?.,!()\\]*&^%$#@{}|\\\\/~\\- \t\n]+')


def tokenize(txt):
    """Return the set of distinct lower-cased tokens found in *txt*.

    Every run of separator characters (see ``obfuscator``) is collapsed to a
    single space, the text is split on whitespace and each piece is
    lower-cased. Because the result is a set, a token that occurs several
    times in *txt* is reported only once.
    """
    cleaned = obfuscator.sub(' ', txt)
    return {piece.lower() for piece in cleaned.split()}


def train(data_dir='train'):
    """Accumulate per-class word counts from a labelled data directory.

    Reads ``<data_dir>/in.tsv`` and ``<data_dir>/expected.tsv`` in lockstep.
    The text before the first tab of each input line is treated as the
    comment; the matching line of ``expected.tsv`` is parsed with ``int`` and
    used as the class index (0 or 1).

    For every distinct token of the comment, the counter for that class in
    the module-level ``lexicon`` is incremented, and the per-class comment
    total in ``number_of_comments_per_class`` is incremented once. Counts are
    added to whatever is already stored, so calling this repeatedly
    accumulates.

    Args:
        data_dir: Directory holding ``in.tsv`` and ``expected.tsv``.
            Defaults to ``'train'``, the location previously hard-coded.

    Raises:
        ValueError: If a label line cannot be parsed as an integer.
        IndexError: If a parsed label is not a valid index into the
            two-element count lists.
    """
    in_path = data_dir + '/in.tsv'
    expected_path = data_dir + '/expected.tsv'
    # Explicit encoding so the result does not depend on the platform locale.
    # NOTE(review): assumes the TSV files are UTF-8 -- confirm for your data.
    with open(in_path, encoding='utf-8') as fd, \
            open(expected_path, encoding='utf-8') as ex:
        for line, result in tqdm(zip(fd, ex), desc="training"):
            label = int(result)
            # Take everything before the first tab. Unlike a strict
            # two-value unpack this does not fail on lines that carry a
            # different number of columns.
            comment = line.partition('\t')[0]
            for lemma in tokenize(comment):
                lexicon.setdefault(lemma, [0, 0])[label] += 1
            number_of_comments_per_class[label] += 1


def classify(comment):
    """Return the more likely class (0 or 1) for *comment*.

    Scores are computed in log space. Each class starts from the log of its
    prior (its share of the training comments). For every distinct token of
    the comment that is present in ``lexicon``, the score gains
    ``log((comments of that class containing the token + 1) /
    (comments of that class + 2))``, i.e. an add-one smoothed estimate.
    Tokens not in ``lexicon`` are skipped.

    A class with no training comments gets a log-prior of ``-inf`` (instead
    of failing on ``log(0)``), so it can never win against a class that was
    actually observed.

    Returns:
        ``0`` if class 0 scores strictly higher, otherwise ``1`` (ties go to
        class 1).

    Raises:
        ZeroDivisionError: If no training comments have been counted at all.
    """
    class_totals = (number_of_comments_per_class[0],
                    number_of_comments_per_class[1])
    number_of_comments = class_totals[0] + class_totals[1]
    # Kept as a plain division so that an untrained model still raises
    # ZeroDivisionError, exactly as before.
    priors = [total / number_of_comments for total in class_totals]
    # log(0) is a math domain error; a zero prior means "impossible class".
    log_p_words_and_class = [
        math.log(prior) if prior > 0 else -math.inf for prior in priors
    ]

    for lemma in tokenize(comment):
        frequencies = lexicon.get(lemma)
        if not frequencies:
            continue
        for cls in (0, 1):
            log_p_word_given_class = (math.log(frequencies[cls] + 1)
                                      - math.log(class_totals[cls] + 2))
            log_p_words_and_class[cls] += log_p_word_given_class

    # Normalising by P(words) is unnecessary: the denominator is the same for
    # both classes, and only the ordering of the two scores matters here.
    if log_p_words_and_class[0] > log_p_words_and_class[1]:
        return 0
    return 1


def test(data_dir='train'):
    """Classify every comment in a labelled directory and print the score.

    Reads ``<data_dir>/in.tsv`` and ``<data_dir>/expected.tsv`` in lockstep,
    runs ``classify`` on the text before the first tab of each input line and
    compares the prediction with the integer label.

    Prints ``"<correct> / <incorrect>  ->  <accuracy>"``, where accuracy is
    ``correct / (correct + incorrect)``. If no lines were read, accuracy is
    reported as ``nan`` instead of raising ZeroDivisionError.

    Args:
        data_dir: Directory holding ``in.tsv`` and ``expected.tsv``.
            Defaults to ``'train'``, the location previously hard-coded.
            With that default the model is scored on its own training data,
            which measures fit rather than generalisation.
    """
    correct = 0
    incorrect = 0
    # NOTE(review): assumes the TSV files are UTF-8 -- confirm for your data.
    with open(data_dir + '/in.tsv', encoding='utf-8') as fd, \
            open(data_dir + '/expected.tsv', encoding='utf-8') as ex:
        for line, result in tqdm(zip(fd, ex), desc="testing"):
            # Text before the first tab; tolerant of extra or missing columns.
            comment = line.partition('\t')[0]
            predicted = classify(comment)
            if predicted == int(result):
                correct += 1
            else:
                incorrect += 1

    total = correct + incorrect
    accuracy = correct / total if total else float('nan')
    print(str(correct) + " / " + str(incorrect), " -> ", str(accuracy))


def infer(data_dir):
    """Write one predicted class per input line to ``<data_dir>/out.tsv``.

    Reads ``<data_dir>/in.tsv``, classifies the text before the first tab of
    each line with ``classify`` and writes the resulting ``0``/``1`` followed
    by a newline. Any existing ``out.tsv`` is overwritten.

    Args:
        data_dir: Directory containing ``in.tsv``; ``out.tsv`` is created in
            the same directory.
    """
    # NOTE(review): assumes in.tsv is UTF-8 -- confirm for your data.
    # The output is opened write-only: it is never read back here.
    with open(data_dir + '/in.tsv', encoding='utf-8') as fd, \
            open(data_dir + '/out.tsv', 'w', encoding='utf-8') as ex:
        for line in tqdm(fd, desc="inferring "+data_dir):
            # Text before the first tab. A strict two-column unpack would
            # abort on a malformed line and leave out.tsv truncated.
            comment = line.partition('\t')[0]
            predicted = classify(comment)
            ex.write(str(predicted) + '\n')


# Script body. These calls run whenever the module is executed or imported,
# and their order matters: the counts must be collected before classifying.
# 1) Build the word counts from the training files.
train()
# 2) Classify every line of train/in.tsv and write the labels to train/out.tsv.
infer('train')
#  P(0 | lemma)