84 lines
3.1 KiB
Python
84 lines
3.1 KiB
Python
|
import spacy
|
||
|
from tqdm import tqdm
|
||
|
import re
|
||
|
import math
|
||
|
|
||
|
# Load the spaCy pipeline named "en_core_web_sm" when the module is imported.
# NOTE(review): `nlp` is not referenced by any other code visible in this file —
# confirm whether the load is still needed.
nlp = spacy.load("en_core_web_sm")
|
||
|
|
||
|
# Maps each token to a two-element list [count for class 0, count for class 1];
# train() adds one to the matching slot for every training comment that
# contains the token.
lexicon = {}
|
||
|
# Number of training comments counted so far, indexed by class label (0 or 1).
number_of_comments_per_class = [0, 0]
|
||
|
|
||
|
# Matches a run of one or more of these characters:
#   [ ] ? . , ! ( ) * & ^ % $ # @ { } | \ / ~ - space, tab, newline
# tokenize() replaces every such run with a single space before splitting.
obfuscator = re.compile('[\\[?.,!()\\]*&^%$#@{}|\\\\/~\\- \t\n]+')
|
||
|
|
||
|
|
||
|
def tokenize(txt):
    """Return the set of distinct lower-cased tokens found in *txt*.

    Every run of characters matched by the module-level ``obfuscator``
    pattern is replaced by a single space, the result is split on
    whitespace, and each piece is lower-cased. Because a set is returned,
    a token that occurs several times in *txt* is reported only once.
    """
    cleaned = obfuscator.sub(' ', txt)
    return {word.lower() for word in cleaned.split()}
|
||
|
|
||
|
|
||
|
def train():
    """Accumulate per-class token and comment counts from the training files.

    Reads 'train/in.tsv' and 'train/expected.tsv' in lockstep. Each input
    line must consist of exactly two tab-separated fields, the first being
    the comment text; the matching line of the expected file is parsed as
    the integer class label. For every distinct token of the comment the
    label's slot in ``lexicon[token]`` is incremented, and the label's slot
    in ``number_of_comments_per_class`` is incremented once per comment.

    The counts are added to the existing module-level state; nothing is
    reset first.

    Raises:
        ValueError: If a label is not an integer or an input line does not
            split into exactly two tab-separated fields.
    """
    with open('train/in.tsv') as comments_file, open('train/expected.tsv') as labels_file:
        pairs = zip(comments_file, labels_file)
        for row, raw_label in tqdm(pairs, desc="training"):
            label = int(raw_label)
            comment, _ = row.split('\t')
            for token in tokenize(comment):
                # First sighting of a token creates its [class 0, class 1] counter.
                lexicon.setdefault(token, [0, 0])[label] += 1
            number_of_comments_per_class[label] += 1
|
||
|
|
||
|
|
||
|
def classify(comment):
    """Predict the class label (0 or 1) of *comment* with naive Bayes.

    The score of a class is log P(class) plus, for every token of the
    comment that is present in ``lexicon``, log P(token | class) estimated
    with add-one smoothing as

        (comments of the class containing the token + 1)
        / (comments of the class + 2).

    Tokens that are absent from ``lexicon`` are ignored.

    Args:
        comment: Raw comment text; it is split into tokens by ``tokenize``.

    Returns:
        0 if class 0 has the strictly greater score, otherwise 1 (so a tie
        resolves to 1).

    Raises:
        ZeroDivisionError: If no training comments have been counted yet.
    """
    class0_total = number_of_comments_per_class[0]
    class1_total = number_of_comments_per_class[1]
    number_of_comments = class0_total + class1_total
    if number_of_comments == 0:
        raise ZeroDivisionError(
            "classify() needs at least one counted training comment")

    # A class without any training comments has prior probability 0, and
    # math.log(0) raises ValueError. Score such a class as -inf instead so
    # that the other class always wins the comparison below.
    if class0_total:
        log_p_words_and_class0 = math.log(class0_total / number_of_comments)
    else:
        log_p_words_and_class0 = -math.inf
    if class1_total:
        log_p_words_and_class1 = math.log(class1_total / number_of_comments)
    else:
        log_p_words_and_class1 = -math.inf

    for lemma in tokenize(comment):
        frequencies = lexicon.get(lemma)
        if not frequencies:
            # Token never seen during training: it contributes to neither score.
            continue
        class0_comments_with_word, class1_comments_with_word = frequencies
        log_p_words_and_class0 += (
            math.log(class0_comments_with_word + 1) - math.log(class0_total + 2))
        log_p_words_and_class1 += (
            math.log(class1_comments_with_word + 1) - math.log(class1_total + 2))

    # The scores are joint log-probabilities, not posteriors. Turning them
    # into posteriors would divide both by the same
    #   p_words = exp(log_p_words_and_class0) + exp(log_p_words_and_class1)
    # which cannot change which one is greater, so the division is skipped.
    return 0 if log_p_words_and_class0 > log_p_words_and_class1 else 1
|
||
|
|
||
|
|
||
|
def test():
    """Print how many comments of 'train/in.tsv' classify() labels correctly.

    Reads 'train/in.tsv' and 'train/expected.tsv' in lockstep (the same
    files train() reads), classifies the first tab-separated field of each
    input line and compares the prediction with the integer label on the
    matching line of the expected file.

    The printed line has the form ``<correct> / <incorrect>  ->  <ratio>``
    where ratio is correct / (correct + incorrect). When no lines were
    evaluated the ratio is printed as ``nan``.

    Raises:
        ValueError: If a label is not an integer or an input line does not
            split into exactly two tab-separated fields.
    """
    correct = 0
    incorrect = 0
    with open('train/in.tsv') as fd, open('train/expected.tsv') as ex:
        for line, result in tqdm(zip(fd, ex), desc="testing"):
            comment, _ = line.split('\t')
            if classify(comment) == int(result):
                correct += 1
            else:
                incorrect += 1

    total = correct + incorrect
    # With nothing evaluated the ratio is undefined; report nan instead of
    # letting the division raise ZeroDivisionError.
    accuracy = correct / total if total else float('nan')
    print(str(correct) + " / " + str(incorrect), " -> ", str(accuracy))
|
||
|
|
||
|
|
||
|
def infer(data_dir):
    """Classify every line of ``<data_dir>/in.tsv`` into ``<data_dir>/out.tsv``.

    Each input line must consist of exactly two tab-separated fields; the
    first one is passed to classify(). One predicted label is written per
    input line, each followed by a newline. The output file is opened in
    'w+' mode, so an existing file is truncated.

    Args:
        data_dir: Directory path (string) that contains ``in.tsv``.

    Raises:
        ValueError: If an input line does not split into exactly two
            tab-separated fields.
    """
    source_path = data_dir + '/in.tsv'
    target_path = data_dir + '/out.tsv'
    with open(source_path) as source, open(target_path, 'w+') as sink:
        for row in tqdm(source, desc="inferring "+data_dir):
            comment, _ = row.split('\t')
            sink.write(str(classify(comment)) + '\n')
|
||
|
|
||
|
|
||
|
# Runs whenever this module is executed or imported: fill the module-level
# counts from train/in.tsv and train/expected.tsv.
train()
|
||
|
# Classify every line of train/in.tsv and write the predictions to train/out.tsv.
infer('train')
|
||
|
# P(0 | lemma)
|