naive_bayes/Bayes.py

84 lines
3.1 KiB
Python
Raw Permalink Normal View History

2021-05-09 16:42:53 +02:00
import spacy
from tqdm import tqdm
import re
import math
# spaCy's small English pipeline, loaded once when the module is imported.
# NOTE(review): `nlp` is not referenced by any function visible in this file;
# confirm nothing else uses it before removing the (slow) load.
nlp = spacy.load("en_core_web_sm")
# token -> [class-0 count, class-1 count]: how many training comments of each
# class contained the token. Filled by train(), read by classify().
lexicon = {}
# [number of class-0 comments, number of class-1 comments] counted by train().
number_of_comments_per_class = [0, 0]
# Matches a run of one or more separator characters: the punctuation
# [ ] ? . , ! ( ) * & ^ % $ # @ { } | \ / ~ - plus space, tab and newline.
# tokenize() replaces every such run with a single space before splitting.
obfuscator = re.compile('[\\[?.,!()\\]*&^%$#@{}|\\\\/~\\- \t\n]+')
def tokenize(txt):
    """Return the set of distinct, lower-cased tokens found in ``txt``.

    Runs of separator characters (as matched by the module-level
    ``obfuscator`` pattern) are collapsed to a single space, the text is
    split on whitespace, and every piece is lower-cased. Because the result
    is a set, a token that occurs several times in ``txt`` appears once.
    """
    cleaned = obfuscator.sub(' ', txt)
    return {word.lower() for word in cleaned.split()}
def train():
    """Accumulate per-class token and comment counts from the training set.

    Reads ``train/in.tsv`` and ``train/expected.tsv`` in lockstep. Each input
    line must consist of exactly two tab-separated fields, of which only the
    first (the comment text) is used; the matching line of the expected file
    is parsed as the integer class index.

    Side effects:
        * ``lexicon[token][label]`` is incremented once for every distinct
          token of every comment.
        * ``number_of_comments_per_class[label]`` is incremented once per
          comment.
    """
    with open('train/in.tsv') as comments_file, open('train/expected.tsv') as labels_file:
        for raw_line, raw_label in tqdm(zip(comments_file, labels_file), desc="training"):
            label = int(raw_label)
            text, _ = raw_line.split('\t')
            for word in tokenize(text):
                counts = lexicon.get(word)
                if not counts:
                    # First sighting of this token: register a fresh
                    # [class-0, class-1] counter pair.
                    counts = lexicon[word] = [0, 0]
                counts[label] += 1
            number_of_comments_per_class[label] += 1
def classify(comment):
    """Predict the class (0 or 1) of ``comment`` from the trained counts.

    For each class the score is the log prior (share of training comments in
    that class) plus, for every distinct token of ``comment`` that is present
    in ``lexicon``, the log of the add-one smoothed fraction of that class's
    training comments containing the token:
    ``(count + 1) / (class_total + 2)``. Tokens never seen during training
    are skipped.

    Only the comparison of the two joint log scores matters: normalising by
    P(words) would divide both by the same value, so it is not computed.

    Args:
        comment: Raw comment text; it is split with ``tokenize``.

    Returns:
        0 if class 0 has the strictly greater score, otherwise 1 (ties go to
        class 1).

    Raises:
        ZeroDivisionError: If no training comments have been counted yet,
            i.e. ``train()`` has not populated the module-level counters.
    """
    class0_total = number_of_comments_per_class[0]
    class1_total = number_of_comments_per_class[1]
    number_of_comments = class0_total + class1_total

    p_of_class0_a_priori = class0_total / number_of_comments
    p_of_class1_a_priori = class1_total / number_of_comments

    # A class with no training comments has prior 0, and math.log(0) raises
    # ValueError. Use -inf instead so that class simply can never win.
    impossible = float("-inf")
    log_p_words_and_class0 = (
        math.log(p_of_class0_a_priori) if class0_total > 0 else impossible
    )
    log_p_words_and_class1 = (
        math.log(p_of_class1_a_priori) if class1_total > 0 else impossible
    )

    # The smoothing denominators do not depend on the token; compute once.
    log_class0_denominator = math.log(class0_total + 2)
    log_class1_denominator = math.log(class1_total + 2)

    for lemma in tokenize(comment):
        frequencies = lexicon.get(lemma)
        if not frequencies:
            # Token unseen in training: it carries no evidence either way.
            continue
        class0_comments_with_word, class1_comments_with_word = frequencies
        log_p_words_and_class0 += (
            math.log(class0_comments_with_word + 1) - log_class0_denominator
        )
        log_p_words_and_class1 += (
            math.log(class1_comments_with_word + 1) - log_class1_denominator
        )

    return 0 if log_p_words_and_class0 > log_p_words_and_class1 else 1
def test():
    """Score ``classify`` against labelled data and print the tally.

    Reads ``train/in.tsv`` and ``train/expected.tsv`` in lockstep (the same
    files ``train`` reads), classifies the first tab-separated field of each
    input line, and compares the prediction with the integer on the matching
    expected line. Prints ``<correct> / <incorrect>  ->  <accuracy>``.
    """
    hits = 0
    misses = 0
    with open('train/in.tsv') as comments_file, open('train/expected.tsv') as labels_file:
        for raw_line, raw_label in tqdm(zip(comments_file, labels_file), desc="testing"):
            text, _ = raw_line.split('\t')
            if classify(text) == int(raw_label):
                hits += 1
                continue
            misses += 1
    accuracy = hits / (hits + misses)
    print(str(hits) + " / " + str(misses), " -> ", str(accuracy))
def infer(data_dir):
    """Write one predicted class per line of ``<data_dir>/in.tsv``.

    Each input line must consist of exactly two tab-separated fields; the
    first is classified with ``classify`` and the resulting label is written,
    followed by a newline, to ``<data_dir>/out.tsv`` (opened in ``'w+'`` mode,
    so an existing file is truncated).

    Args:
        data_dir: Directory path (string) containing ``in.tsv``.
    """
    source_path = data_dir + '/in.tsv'
    target_path = data_dir + '/out.tsv'
    with open(source_path) as source, open(target_path, 'w+') as sink:
        for raw_line in tqdm(source, desc="inferring "+data_dir):
            text, _ = raw_line.split('\t')
            sink.write(str(classify(text)) + '\n')
# Script body: build the token/comment counts from the train/ directory, then
# write predictions for train/in.tsv to train/out.tsv. These calls sit at
# module level, so they also run whenever this file is imported.
train()
infer('train')
# P(0 | lemma)
# NOTE(review): the line above looks like a leftover note about a per-token
# posterior; no code follows it in this file.