This commit is contained in:
Aleksander Mendoza 2021-05-09 16:42:53 +02:00
commit d27ecb30e5
5 changed files with 10521 additions and 0 deletions

83
Bayes.py Normal file
View File

@ -0,0 +1,83 @@
import spacy
from tqdm import tqdm
import re
import math
# --- Module-level model state (populated by train(), read by classify()) ---
# NOTE(review): the original code loaded spacy here (`nlp =
# spacy.load("en_core_web_sm")`) but `nlp` was never used anywhere in this
# file; the load is removed to cut several seconds of pointless startup cost.
# Maps token -> [class0_count, class1_count]: the number of training comments
# of each class that contained the token at least once (document frequency).
lexicon = {}
# Number of training comments seen per class (index = class label, 0 or 1).
number_of_comments_per_class = [0, 0]
# Matches runs of punctuation/whitespace; everything it matches is treated as
# token-separating noise.
obfuscator = re.compile('[\\[?.,!()\\]*&^%$#@{}|\\\\/~\\- \t\n]+')
def tokenize(txt):
    """Return the set of distinct, lower-cased tokens occurring in *txt*."""
    cleaned = obfuscator.sub(' ', txt)
    return {word.lower() for word in cleaned.split()}
def train():
    """Build the naive-Bayes model from the training data.

    Reads train/in.tsv (tab-separated, comment text in the first column) in
    lockstep with train/expected.tsv (one 0/1 class label per line).

    Side effects: fills the module-level `lexicon` with per-class document
    frequencies and `number_of_comments_per_class` with per-class totals.
    """
    with open('train/in.tsv') as fd, open('train/expected.tsv') as ex:
        for line, result in tqdm(zip(fd, ex), desc="training"):
            result = int(result)  # int() tolerates the trailing newline
            # Keep only the first column; maxsplit=1 tolerates extra tabs
            # inside later columns (the original required exactly two fields).
            comment, _ = line.split('\t', 1)
            for lemma in tokenize(comment):
                # tokenize() returns a set, so each distinct token is counted
                # at most once per comment (document frequency, not term freq).
                lexicon.setdefault(lemma, [0, 0])[result] += 1
            number_of_comments_per_class[result] += 1
def classify(comment):
    """Classify *comment* with naive Bayes; return 0 or 1 (the more likely
    class given the trained `lexicon` / `number_of_comments_per_class`)."""
    totals = number_of_comments_per_class
    n = totals[0] + totals[1]
    # Start each score from the log prior of its class.
    log_scores = [math.log(totals[0] / n), math.log(totals[1] / n)]
    for lemma in tokenize(comment):
        counts = lexicon.get(lemma)
        if not counts:
            # Word never seen in training: contributes nothing.
            continue
        # Add-one (Laplace) smoothed per-class word log-likelihood:
        # log((seen_in_class + 1) / (comments_in_class + 2)).
        for cls in (0, 1):
            log_scores[cls] += math.log(counts[cls] + 1) - math.log(totals[cls] + 2)
    # The evidence term P(words) is identical for both classes, so comparing
    # the (log) joint scores directly picks the same winner — no need to
    # normalize into actual probabilities.
    return 0 if log_scores[0] > log_scores[1] else 1
def test():
    """Evaluate the classifier on the training data itself and print raw
    counts plus accuracy.

    NOTE: this reads the same train/ files that train() reads, so it measures
    training-set fit, not generalization.
    """
    correct = 0
    incorrect = 0
    with open('train/in.tsv') as fd, open('train/expected.tsv') as ex:
        for line, result in tqdm(zip(fd, ex), desc="testing"):
            comment, _ = line.split('\t', 1)
            predicted = classify(comment)  # fixed 'predicetd' typo
            if predicted == int(result):
                correct += 1
            else:
                incorrect += 1
    total = correct + incorrect
    # Guard: empty input files previously raised ZeroDivisionError here.
    accuracy = correct / total if total else 0.0
    print(f"{correct} / {incorrect}  ->  {accuracy}")
def infer(data_dir):
    """Classify every comment in <data_dir>/in.tsv and write one 0/1 label
    per line to <data_dir>/out.tsv (created or truncated).

    :param data_dir: directory holding in.tsv, e.g. 'train', 'dev-0', 'test-A'
    """
    # 'w' instead of the original 'w+': the handle is never read from.
    with open(data_dir + '/in.tsv') as fd, open(data_dir + '/out.tsv', 'w') as out:
        for line in tqdm(fd, desc="inferring " + data_dir):
            # maxsplit=1 for consistency with train(): only the first column
            # is the comment text.
            comment, _ = line.split('\t', 1)
            predicted = classify(comment)  # fixed 'predicetd' typo
            out.write(str(predicted) + '\n')
# Script entry: build the model from train/, then write predictions for the
# training set itself to train/out.tsv.  (The committed dev-0/out.tsv and
# test-A/out.tsv were presumably produced by infer('dev-0') / infer('test-A')
# runs — TODO confirm; only infer('train') is invoked here.)
train()
infer('train')
# lexicon stores the per-class document frequencies behind P(class | lemma).

13
README.md Normal file
View File

@ -0,0 +1,13 @@
Skeptic vs paranormal subreddits
================================
Classify a Reddit comment as coming either from the Skeptic subreddit or one of the
"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
Glitch-in-the-Matrix, conspiracytheories).
Output label is the probability of a paranormal subreddit.
Sources
-------
Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.

1
config.txt Normal file
View File

@ -0,0 +1 @@
--metric Likelihood --metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv

5272
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

5152
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff