e
This commit is contained in:
commit
d27ecb30e5
83
Bayes.py
Normal file
83
Bayes.py
Normal file
@ -0,0 +1,83 @@
|
||||
import spacy
|
||||
from tqdm import tqdm
|
||||
import re
|
||||
import math
|
||||
|
||||
# spaCy English pipeline, loaded once at import time.
# NOTE(review): none of the functions visible in this file reference `nlp`.
nlp = spacy.load("en_core_web_sm")

# Training state filled in by train():
#   lexicon maps a token to [comments in class 0, comments in class 1] that
#   contain the token at least once.
lexicon = {}
#   number_of_comments_per_class[c] is the number of training comments
#   labelled c.
number_of_comments_per_class = [0, 0]

# Matches a run of punctuation / whitespace characters; tokenize() collapses
# every such run into a single space before splitting.
obfuscator = re.compile('[\\[?.,!()\\]*&^%$#@{}|\\\\/~\\- \t\n]+')
|
||||
|
||||
|
||||
def tokenize(txt):
    """Return the set of distinct lowercase tokens found in *txt*.

    Every run of characters matched by ``obfuscator`` is replaced by a single
    space, the text is split on whitespace, and each piece is lowercased.
    Because the result is a set, token order and repetition are discarded.
    """
    cleaned = obfuscator.sub(' ', txt)
    return {word.lower() for word in cleaned.split()}
|
||||
|
||||
|
||||
def train():
    """Accumulate token and class counts from the training set.

    Reads ``train/in.tsv`` and ``train/expected.tsv`` in lockstep. The text
    before the first tab of each input line is tokenized, and for every
    distinct token the counter of the line's label is incremented in
    ``lexicon``; ``number_of_comments_per_class`` is incremented once per
    line. Counts are added to whatever the module-level state already holds.

    Raises:
        ValueError: if a label line cannot be parsed as an integer.
        IndexError: if a label is not a valid index into the two counters.
    """
    # Explicit encoding keeps the result independent of the platform locale.
    with open('train/in.tsv', encoding='utf-8') as fd, \
            open('train/expected.tsv', encoding='utf-8') as ex:
        for line, result in tqdm(zip(fd, ex), desc="training"):
            result = int(result)
            # Only the first column is the comment; taking it by index avoids
            # a crash on lines that do not have exactly two fields.
            comment = line.split('\t', 1)[0]
            for lemma in tokenize(comment):
                lexicon.setdefault(lemma, [0, 0])[result] += 1
            number_of_comments_per_class[result] += 1
|
||||
|
||||
|
||||
def classify(comment):
    """Return the more likely class (0 or 1) for *comment*.

    The score of each class is the log of its prior (its share of the
    training comments) plus, for every token of the comment that occurs in
    ``lexicon``, the log of the add-one smoothed fraction of that class's
    comments containing the token: ``(count + 1) / (class_size + 2)``.
    Tokens absent from ``lexicon`` are ignored. Ties are resolved as class 1.

    A class with no training comments gets a log prior of ``-inf`` instead of
    making ``math.log(0)`` raise, so the other class is always chosen.

    Raises:
        ZeroDivisionError: if no training comments have been counted at all.
    """
    number_of_comments = sum(number_of_comments_per_class)

    log_scores = []
    for class_size in number_of_comments_per_class:
        prior = class_size / number_of_comments
        log_scores.append(math.log(prior) if prior > 0 else float('-inf'))

    for lemma in tokenize(comment):
        frequencies = lexicon.get(lemma)
        if not frequencies:
            continue  # token never seen during training
        for label in (0, 1):
            log_scores[label] += (
                math.log(frequencies[label] + 1)
                - math.log(number_of_comments_per_class[label] + 2)
            )

    # Normalising by P(words) is unnecessary: the denominator is the same for
    # both classes, so comparing the joint log-probabilities is enough.
    return 0 if log_scores[0] > log_scores[1] else 1
|
||||
|
||||
|
||||
def test(data_dir='train'):
    """Print how many labelled comments in *data_dir* are classified correctly.

    Reads ``<data_dir>/in.tsv`` and ``<data_dir>/expected.tsv`` in lockstep,
    classifies the text before the first tab of every input line and compares
    the prediction with the expected label.

    Args:
        data_dir: directory holding ``in.tsv`` and ``expected.tsv``. Defaults
            to ``'train'``, the directory that was previously hard-coded.

    Prints ``<correct> / <incorrect>  ->  <accuracy>``; the accuracy is
    ``nan`` when the input is empty instead of raising ZeroDivisionError.
    """
    correct = 0
    incorrect = 0
    with open(data_dir + '/in.tsv', encoding='utf-8') as fd, \
            open(data_dir + '/expected.tsv', encoding='utf-8') as ex:
        for line, result in tqdm(zip(fd, ex), desc="testing"):
            # First column only; tolerant of lines with extra or missing tabs.
            comment = line.split('\t', 1)[0]
            if classify(comment) == int(result):
                correct += 1
            else:
                incorrect += 1

    total = correct + incorrect
    accuracy = correct / total if total else float('nan')
    print(str(correct) + " / " + str(incorrect), " -> ", str(accuracy))
|
||||
|
||||
|
||||
def infer(data_dir):
    """Classify every line of ``<data_dir>/in.tsv`` into ``<data_dir>/out.tsv``.

    One predicted label (``0`` or ``1``) is written per input line, in input
    order. An existing ``out.tsv`` is overwritten.

    Args:
        data_dir: directory containing ``in.tsv``; ``out.tsv`` is created in
            the same directory.
    """
    # The output is only written, never read back, so plain 'w' is enough.
    with open(data_dir + '/in.tsv', encoding='utf-8') as fd, \
            open(data_dir + '/out.tsv', 'w', encoding='utf-8') as ex:
        for line in tqdm(fd, desc="inferring "+data_dir):
            # First column only: a line with an unexpected number of tabs must
            # not abort the run and leave a truncated output file.
            comment = line.split('\t', 1)[0]
            ex.write(str(classify(comment)) + '\n')
|
||||
|
||||
|
||||
# Script body: runs on import/execution. Build the counts from train/, then
# write predictions for that same directory to train/out.tsv.
train()
infer('train')
|
||||
# P(0 | lemma)
|
13
README.md
Normal file
13
README.md
Normal file
@ -0,0 +1,13 @@
|
||||
Skeptic vs paranormal subreddits
|
||||
================================
|
||||
|
||||
Classify a Reddit comment as coming either from the Skeptic subreddit or
from one of the "paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere,
Ghosts, Glitch-in-the-Matrix, conspiracytheories).
|
||||
|
||||
The output label is the probability that the comment comes from a paranormal subreddit.
|
||||
|
||||
Sources
|
||||
-------
|
||||
|
||||
Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.
|
1
config.txt
Normal file
1
config.txt
Normal file
@ -0,0 +1 @@
|
||||
--metric Likelihood --metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv
|
5272
dev-0/out.tsv
Normal file
5272
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5152
test-A/out.tsv
Normal file
5152
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user