"""Binary Naive Bayes comment classifier.

Training counts, for every token, how many comments of each class (0 or 1)
contain it.  Classification compares the log joint probabilities
log P(words, class) of the two classes and picks the larger one.

Running this module trains on ``train/`` and writes predictions for the
same directory to ``train/out.tsv``.
"""
import math
import re

import spacy
from tqdm import tqdm

# NOTE(review): `nlp` is not referenced by any code visible in this file;
# it is kept because other code may rely on the module-level name.
nlp = spacy.load("en_core_web_sm")

# token -> [number of class-0 comments containing it,
#           number of class-1 comments containing it]
lexicon = {}
# [number of class-0 training comments, number of class-1 training comments]
number_of_comments_per_class = [0, 0]

# Runs of punctuation / whitespace that separate tokens.
obfuscator = re.compile('[\\[?.,!()\\]*&^%$#@{}|\\\\/~\\- \t\n]+')


def tokenize(txt):
    """Return the set of distinct lower-cased tokens found in *txt*.

    Every run of separator characters matched by ``obfuscator`` is replaced
    by a single space before splitting, so each token is counted at most
    once per text.
    """
    return {token.lower() for token in obfuscator.sub(' ', txt).split()}


def train(data_dir='train'):
    """Accumulate per-token and per-class comment counts from *data_dir*.

    Reads ``<data_dir>/in.tsv`` (two tab-separated fields per line, the
    first being the comment) in lockstep with ``<data_dir>/expected.tsv``
    (one integer class label per line) and updates the module-level
    ``lexicon`` and ``number_of_comments_per_class``.

    Raises:
        ValueError: if a label is not an integer or an input line does not
            consist of exactly two tab-separated fields.
    """
    with open(data_dir + '/in.tsv') as fd, \
            open(data_dir + '/expected.tsv') as ex:
        for line, result in tqdm(zip(fd, ex), desc="training"):
            label = int(result)
            comment, _ = line.split('\t')
            for lemma in tokenize(comment):
                # Create the [class0, class1] counter on first sight.
                lexicon.setdefault(lemma, [0, 0])[label] += 1
            number_of_comments_per_class[label] += 1


def _log_or_neg_inf(probability):
    """Return log(probability), or -inf for a zero probability."""
    return math.log(probability) if probability > 0 else -math.inf


def classify(comment):
    """Return the more probable class (0 or 1) for *comment*.

    Tokens that were never seen during training are ignored.  Per-token
    likelihoods use add-one smoothing:
    (comments of the class containing the token + 1) / (comments of the
    class + 2).  Ties are resolved in favour of class 1.

    If one class has no training comments at all, its prior is zero and the
    other class is returned (previously this raised a math domain error).

    Raises:
        ZeroDivisionError: if no comments have been trained yet.
    """
    class0_total, class1_total = number_of_comments_per_class
    number_of_comments = class0_total + class1_total

    # Start from the log priors; a class without any training comment gets
    # -inf so that it can never win the comparison below.
    log_p_words_and_class0 = _log_or_neg_inf(class0_total / number_of_comments)
    log_p_words_and_class1 = _log_or_neg_inf(class1_total / number_of_comments)

    # The smoothing denominators do not depend on the token.
    log_denominator0 = math.log(class0_total + 2)
    log_denominator1 = math.log(class1_total + 2)

    for lemma in tokenize(comment):
        frequencies = lexicon.get(lemma)
        if not frequencies:
            continue
        class0_comments_with_word, class1_comments_with_word = frequencies
        log_p_words_and_class0 += (
            math.log(class0_comments_with_word + 1) - log_denominator0)
        log_p_words_and_class1 += (
            math.log(class1_comments_with_word + 1) - log_denominator1)

    # Normalising is not necessary, because we don't care about the exact
    # probabilities; we only need to find which one is greater.  The
    # denominator is the same for both classes, so the division is redundant:
    #   p_words = exp(log_p_words_and_class0) + exp(log_p_words_and_class1)
    #   probability_of_class0 = exp(log_p_words_and_class0) / p_words
    #   probability_of_class1 = exp(log_p_words_and_class1) / p_words
    return 0 if log_p_words_and_class0 > log_p_words_and_class1 else 1


def test(data_dir='train'):
    """Classify every comment in *data_dir* and print the accuracy.

    Prints ``<correct> / <incorrect>  ->  <accuracy>``.  For an empty data
    set the accuracy is reported as ``nan`` instead of dividing by zero.
    """
    correct = 0
    incorrect = 0
    with open(data_dir + '/in.tsv') as fd, \
            open(data_dir + '/expected.tsv') as ex:
        for line, result in tqdm(zip(fd, ex), desc="testing"):
            comment, _ = line.split('\t')
            predicted = classify(comment)
            if predicted == int(result):
                correct += 1
            else:
                incorrect += 1
    total = correct + incorrect
    accuracy = correct / total if total else float('nan')
    print(str(correct) + " / " + str(incorrect), " -> ", str(accuracy))


def infer(data_dir):
    """Write one predicted class per line to ``<data_dir>/out.tsv``.

    Comments are read from the first tab-separated field of each line of
    ``<data_dir>/in.tsv``; an existing ``out.tsv`` is overwritten.
    """
    with open(data_dir + '/in.tsv') as fd, \
            open(data_dir + '/out.tsv', 'w+') as out:
        for line in tqdm(fd, desc="inferring "+data_dir):
            comment, _ = line.split('\t')
            predicted = classify(comment)
            out.write(str(predicted) + '\n')


train()
infer('train')

# P(0 | lemma)