naive_bayes/Bayes.py

import spacy
from tqdm import tqdm
import re
import math

# Loads the spaCy English pipeline when the module is imported. `nlp` is not
# referenced by any other code visible in this file.
nlp = spacy.load("en_core_web_sm")

# Maps a token to a two-element list:
# [class-0 comments containing the token, class-1 comments containing the token].
lexicon = {}
# [number of class-0 training comments, number of class-1 training comments].
number_of_comments_per_class = [0, 0]

# Any run of the punctuation and whitespace characters listed in this class is
# treated as a single token separator.
obfuscator = re.compile('[\\[?.,!()\\]*&^%$#@{}|\\\\/~\\- \t\n]+')


def tokenize(txt):
    """Return the set of distinct lower-cased tokens found in *txt*.

    Separator runs matched by ``obfuscator`` are collapsed to a single space
    before the text is split on whitespace, so each token appears at most
    once in the result no matter how often it occurs in the text.
    """
    cleaned = obfuscator.sub(' ', txt)
    return {word.lower() for word in cleaned.split()}


def train():
    """Accumulate word and comment counts from the training files.

    Reads ``train/in.tsv`` and ``train/expected.tsv`` in lockstep. Each input
    line must split into exactly two tab-separated fields, of which the first
    is the comment text; the matching expected line is parsed as the integer
    class label. For every distinct token of the comment the label's slot in
    ``lexicon[token]`` is incremented, and the label's slot in
    ``number_of_comments_per_class`` is incremented once per comment.
    """
    with open('train/in.tsv') as fd, open('train/expected.tsv') as ex:
        for row, label_text in tqdm(zip(fd, ex), desc="training"):
            label = int(label_text)
            comment, _ = row.split('\t')
            for lemma in tokenize(comment):
                # First sighting of a token creates its [class0, class1] counter.
                lexicon.setdefault(lemma, [0, 0])[label] += 1
            number_of_comments_per_class[label] += 1


def classify(comment):
    """Predict the class (0 or 1) of a comment from the trained counts.

    Each class score starts at the log of its prior (its share of the counted
    training comments). For every distinct token of ``comment`` that is
    present in ``lexicon``, the log of the add-one smoothed fraction of that
    class's comments containing the token is added. Tokens missing from
    ``lexicon`` are ignored.

    Args:
        comment: Raw comment text; it is split into tokens with ``tokenize``.

    Returns:
        0 if class 0 has the strictly greater score, otherwise 1 (ties go to
        class 1).

    Raises:
        ZeroDivisionError: If no training comments have been counted yet.
    """
    class0_total = number_of_comments_per_class[0]
    class1_total = number_of_comments_per_class[1]
    number_of_comments = class0_total + class1_total
    p_of_class0_a_priori = class0_total / number_of_comments
    p_of_class1_a_priori = class1_total / number_of_comments

    # math.log(0) raises ValueError. A class with no training comments has
    # prior 0, i.e. log-prior -inf, so it can never win the comparison below.
    minus_infinity = float('-inf')
    log_p_words_and_class0 = (
        math.log(p_of_class0_a_priori) if p_of_class0_a_priori > 0 else minus_infinity)
    log_p_words_and_class1 = (
        math.log(p_of_class1_a_priori) if p_of_class1_a_priori > 0 else minus_infinity)

    # The smoothing denominators do not depend on the token, so compute them once.
    log_class0_denominator = math.log(class0_total + 2)
    log_class1_denominator = math.log(class1_total + 2)

    for lemma in tokenize(comment):
        frequencies = lexicon.get(lemma)
        if frequencies:
            class0_comments_with_word, class1_comments_with_word = frequencies
            log_p_words_and_class0 += (
                math.log(class0_comments_with_word + 1) - log_class0_denominator)
            log_p_words_and_class1 += (
                math.log(class1_comments_with_word + 1) - log_class1_denominator)

    # The scores are unnormalised joint log-probabilities. Dividing both by
    # P(words) would give the actual posteriors, but the denominator is the
    # same for both classes and cannot change which one is larger, so it is
    # never computed.
    return 0 if log_p_words_and_class0 > log_p_words_and_class1 else 1


def test(data_dir='train'):
    """Print how many comments in ``data_dir`` are classified correctly.

    Reads ``<data_dir>/in.tsv`` and ``<data_dir>/expected.tsv`` in lockstep,
    classifies the first tab-separated field of every input line and compares
    the prediction with the integer on the matching expected line. Prints
    ``<correct> / <incorrect>  ->  <accuracy>``.

    Args:
        data_dir: Directory holding ``in.tsv`` and ``expected.tsv``. Defaults
            to ``'train'``, i.e. the data the counts were built from, so the
            default figure is accuracy on already-seen data.
    """
    correct = 0
    incorrect = 0
    with open(data_dir + '/in.tsv') as fd, open(data_dir + '/expected.tsv') as ex:
        for line, result in tqdm(zip(fd, ex), desc="testing"):
            comment, _ = line.split('\t')
            predicted = classify(comment)
            if predicted == int(result):
                correct += 1
            else:
                incorrect += 1

    total = correct + incorrect
    # An empty evaluation set has no defined accuracy; report that instead of
    # failing with ZeroDivisionError.
    accuracy = str(correct / total) if total else "n/a"
    print(str(correct) + " / " + str(incorrect), " -> ", accuracy)


def infer(data_dir):
    """Write one predicted class per input line to ``<data_dir>/out.tsv``.

    Reads ``<data_dir>/in.tsv``. Each line must split into exactly two
    tab-separated fields, and the first one is classified. The output file is
    created or truncated and receives the predicted label followed by a
    newline for every input line, in input order.
    """
    source_path = data_dir + '/in.tsv'
    target_path = data_dir + '/out.tsv'
    with open(source_path) as fd, open(target_path, 'w+') as ex:
        for row in tqdm(fd, desc="inferring "+data_dir):
            comment, _ = row.split('\t')
            label = classify(comment)
            print(label, file=ex)


# Script body: build the counts from train/, then write predictions for the
# same train/ directory to train/out.tsv. There is no __main__ guard, so both
# calls also run whenever this module is imported.
train()
infer('train')
#  P(0 | lemma)
e 2021-05-09 16:42:53 +02:00			`import spacy`
			`from tqdm import tqdm`
			`import re`
			`import math`

			`nlp = spacy.load("en_core_web_sm")`

			`lexicon = {}`
			`number_of_comments_per_class = [0, 0]`

			`obfuscator = re.compile('[\\[?.,!()\\]*&^%$#@{}\|\\\\/~\\- \t\n]+')`


			`def tokenize(txt):`
			`return set([token.lower() for token in obfuscator.sub(' ', txt).split()])`


			`def train():`
			`with open('train/in.tsv') as fd, open('train/expected.tsv') as ex:`
			`for line, result in tqdm(zip(fd, ex), desc="training"):`
			`result = int(result)`
			`comment, _ = line.split('\t')`
			`for lemma in tokenize(comment):`
			`results = lexicon.get(lemma)`
			`if not results:`
			`results = [0, 0]`
			`lexicon[lemma] = results`
			`results[result] += 1`
			`number_of_comments_per_class[result] += 1`


			`def classify(comment):`
			`number_of_comments = number_of_comments_per_class[0] + number_of_comments_per_class[1]`
			`p_of_class0_a_priori = number_of_comments_per_class[0] / number_of_comments`
			`p_of_class1_a_priori = number_of_comments_per_class[1] / number_of_comments`
			`log_p_words_and_class0 = math.log(p_of_class0_a_priori)`
			`log_p_words_and_class1 = math.log(p_of_class1_a_priori)`
			`for lemma in tokenize(comment):`
			`frequencies = lexicon.get(lemma)`
			`if frequencies:`
			`class0_comments_with_word, class1_comments_with_word = frequencies`
			`log_p_word_given_class0 = math.log(class0_comments_with_word + 1) - math.log(`
			`number_of_comments_per_class[0] + 2)`
			`log_p_word_given_class1 = math.log(class1_comments_with_word + 1) - math.log(`
			`number_of_comments_per_class[1] + 2)`
			`log_p_words_and_class0 += log_p_word_given_class0`
			`log_p_words_and_class1 += log_p_word_given_class1`
			`# The following is not actually necessary to compute, becasue we don't`
			`# care about the exact probabilities. We only need to find which is greater.`
			`# However, the denominator is the same for both, so the division is actually`
			`# redundant`
			`# p_words = exp(log_p_words_and_class0) + exp(log_p_words_and_class1)`
			`# probability_of_class0 = exp(log_p_words_and_class0) / p_words`
			`# probability_of_class1 = exp(log_p_words_and_class1) / p_words`
			`return 0 if log_p_words_and_class0 > log_p_words_and_class1 else 1`


			`def test():`
			`correct = 0`
			`incorrect = 0`
			`with open('train/in.tsv') as fd, open('train/expected.tsv') as ex:`
			`for line, result in tqdm(zip(fd, ex), desc="testing"):`
			`comment, _ = line.split('\t')`
			`predicetd = classify(comment)`
			`if predicetd == int(result):`
			`correct += 1`
			`else:`
			`incorrect += 1`

			`print(str(correct) + " / " + str(incorrect), " -> ", str(correct / (correct + incorrect)))`


			`def infer(data_dir):`
			`with open(data_dir + '/in.tsv') as fd, open(data_dir + '/out.tsv', 'w+') as ex:`
			`for line in tqdm(fd, desc="inferring "+data_dir):`
			`comment, _ = line.split('\t')`
			`predicetd = classify(comment)`
			`ex.write(str(predicetd) + '\n')`


			`train()`
			`infer('train')`
			`# P(0 \| lemma)`