This commit is contained in:
Aleksander Mendoza 2021-05-09 16:42:53 +02:00
commit d27ecb30e5
5 changed files with 10521 additions and 0 deletions

83
Bayes.py Normal file
View File

@ -0,0 +1,83 @@
import spacy
from tqdm import tqdm
import re
import math
# --- Module-level model state (populated by train(), read by classify()) ---
# NOTE(review): the original code loaded spacy here (`nlp =
# spacy.load("en_core_web_sm")`) but `nlp` was never used anywhere in this
# file; the load is removed to cut several seconds of pointless startup cost.
# Maps token -> [class0_count, class1_count]: the number of training comments
# of each class that contained the token at least once (document frequency).
lexicon = {}
# Number of training comments seen per class (index = class label, 0 or 1).
number_of_comments_per_class = [0, 0]
# Matches runs of punctuation/whitespace; everything it matches is treated as
# token-separating noise.
obfuscator = re.compile('[\\[?.,!()\\]*&^%$#@{}|\\\\/~\\- \t\n]+')
def tokenize(txt):
    """Return the set of distinct, lower-cased tokens occurring in *txt*."""
    cleaned = obfuscator.sub(' ', txt)
    return {word.lower() for word in cleaned.split()}
def train():
    """Build the naive-Bayes model from the training data.

    Reads train/in.tsv (tab-separated, comment text in the first column) in
    lockstep with train/expected.tsv (one 0/1 class label per line).

    Side effects: fills the module-level `lexicon` with per-class document
    frequencies and `number_of_comments_per_class` with per-class totals.
    """
    with open('train/in.tsv') as fd, open('train/expected.tsv') as ex:
        for line, result in tqdm(zip(fd, ex), desc="training"):
            result = int(result)  # int() tolerates the trailing newline
            # Keep only the first column; maxsplit=1 tolerates extra tabs
            # inside later columns (the original required exactly two fields).
            comment, _ = line.split('\t', 1)
            for lemma in tokenize(comment):
                # tokenize() returns a set, so each distinct token is counted
                # at most once per comment (document frequency, not term freq).
                lexicon.setdefault(lemma, [0, 0])[result] += 1
            number_of_comments_per_class[result] += 1
def classify(comment):
    """Classify *comment* with naive Bayes; return 0 or 1 (the more likely
    class given the trained `lexicon` / `number_of_comments_per_class`)."""
    totals = number_of_comments_per_class
    n = totals[0] + totals[1]
    # Start each score from the log prior of its class.
    log_scores = [math.log(totals[0] / n), math.log(totals[1] / n)]
    for lemma in tokenize(comment):
        counts = lexicon.get(lemma)
        if not counts:
            # Word never seen in training: contributes nothing.
            continue
        # Add-one (Laplace) smoothed per-class word log-likelihood:
        # log((seen_in_class + 1) / (comments_in_class + 2)).
        for cls in (0, 1):
            log_scores[cls] += math.log(counts[cls] + 1) - math.log(totals[cls] + 2)
    # The evidence term P(words) is identical for both classes, so comparing
    # the (log) joint scores directly picks the same winner — no need to
    # normalize into actual probabilities.
    return 0 if log_scores[0] > log_scores[1] else 1
def test():
    """Evaluate the classifier on the training data itself and print raw
    counts plus accuracy.

    NOTE: this reads the same train/ files that train() reads, so it measures
    training-set fit, not generalization.
    """
    correct = 0
    incorrect = 0
    with open('train/in.tsv') as fd, open('train/expected.tsv') as ex:
        for line, result in tqdm(zip(fd, ex), desc="testing"):
            comment, _ = line.split('\t', 1)
            predicted = classify(comment)  # fixed 'predicetd' typo
            if predicted == int(result):
                correct += 1
            else:
                incorrect += 1
    total = correct + incorrect
    # Guard: empty input files previously raised ZeroDivisionError here.
    accuracy = correct / total if total else 0.0
    print(f"{correct} / {incorrect}  ->  {accuracy}")
def infer(data_dir):
    """Classify every comment in <data_dir>/in.tsv and write one 0/1 label
    per line to <data_dir>/out.tsv (created or truncated).

    :param data_dir: directory holding in.tsv, e.g. 'train', 'dev-0', 'test-A'
    """
    # 'w' instead of the original 'w+': the handle is never read from.
    with open(data_dir + '/in.tsv') as fd, open(data_dir + '/out.tsv', 'w') as out:
        for line in tqdm(fd, desc="inferring " + data_dir):
            # maxsplit=1 for consistency with train(): only the first column
            # is the comment text.
            comment, _ = line.split('\t', 1)
            predicted = classify(comment)  # fixed 'predicetd' typo
            out.write(str(predicted) + '\n')
# Script entry: build the model from train/, then write predictions for the
# training set itself to train/out.tsv.  (The committed dev-0/out.tsv and
# test-A/out.tsv were presumably produced by infer('dev-0') / infer('test-A')
# runs — TODO confirm; only infer('train') is invoked here.)
train()
infer('train')
# lexicon stores the per-class document frequencies behind P(class | lemma).

13
README.md Normal file
View File

@ -0,0 +1,13 @@
Skeptic vs paranormal subreddits
================================
Classify a Reddit comment as coming either from the Skeptic subreddit or one of the
"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
Glitch-in-the-Matrix, conspiracytheories).
Output label is the probability of a paranormal subreddit.
Sources
-------
Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.

1
config.txt Normal file
View File

@ -0,0 +1 @@
--metric Likelihood --metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv

5272
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

5152
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff