Updated baseline

2020-03-22 10:15:36 +01:00 · 2020-03-22 10:15:36 +01:00 · eb6ba923a4
commit eb6ba923a4
15 changed files with 884449 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -0,0 +1,13 @@
+Skeptic vs paranormal subreddits
+================================
+
+Classify a Reddit post as coming either from the Skeptic subreddit or from
+one of the "paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
+Glitch-in-the-Matrix, conspiracytheories).
+
+The output label is either `S` (Skeptic) or `P` (paranormal).
+
+Sources
+-------
+
+Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.
--- a/config.txt
+++ b/config.txt
@ -0,0 +1 @@
+--metric Accuracy --precision 4 --in-header in-header.tsv --out-header out-header.tsv
--- a/dev-0/expected.tsv
+++ b/dev-0/expected.tsv
--- a/dev-0/in.tsv.xz
+++ b/dev-0/in.tsv.xz
--- a/in-header.tsv
+++ b/in-header.tsv
@ -0,0 +1 @@
+PostText	Timestamp
--- a/naive_base_model.pkl
+++ b/naive_base_model.pkl
--- a/out-header.tsv
+++ b/out-header.tsv
@ -0,0 +1 @@
+Label
--- a/predict.py
+++ b/predict.py
@ -0,0 +1,48 @@
+#!/usr/bin/python3
+
+import pickle
+import math
+
+def clear_tokens(tokens):
+    """Return ``tokens`` with every newline character replaced by a space.
+
+    Despite its name, ``tokens`` is a single string of raw post text; the
+    caller splits it into tokens afterwards.
+    """
+    return tokens.replace('\n', ' ')
+
+def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_logprobs):
+    """Pick the most likely class for one post with multinomial Naive Bayes.
+
+    The score of a class is ``log P(class) + sum(log P(token | class))`` over
+    the tokens of the post. All model values are logarithms, so they are
+    added, never multiplied.
+
+    Args:
+        post: One input line: post text, a tab, then a timestamp. Only the
+            text field is used.
+        paranormal_class_logprob: Log prior of the ``'paranormal'`` class.
+        sceptic_class_logprob: Log prior of the ``'sceptic'`` class.
+        word_logprobs: Mapping ``class name -> {token: log P(token | class)}``.
+
+    Returns:
+        The name of the best-scoring class. On a tie, the class that comes
+        first in ``word_logprobs`` wins.
+
+    Raises:
+        ValueError: If ``word_logprobs`` contains no classes.
+    """
+    # Only the first tab-separated field is the post text; partition() also
+    # copes with a missing timestamp column or stray tabs in the line.
+    text = post.rstrip('\n').partition('\t')[0]
+    tokens = clear_tokens(text).lower().split(' ')
+
+    class_logpriors = {
+        'sceptic': sceptic_class_logprob,
+        'paranormal': paranormal_class_logprob,
+    }
+
+    # A token that no class has ever seen carries no evidence, so it is
+    # dropped for every class alike.
+    known_tokens = [
+        token for token in tokens
+        if any(token in logprobs for logprobs in word_logprobs.values())
+    ]
+
+    scores = {}
+    for class_, token_logprobs in word_logprobs.items():
+        score = class_logpriors.get(class_, 0.0)
+        unseen_logprob = None
+        for token in known_tokens:
+            logprob = token_logprobs.get(token)
+            if logprob is None:
+                # The token was seen in training, but never in this class.
+                # Skipping it would favour this class, so charge the smallest
+                # log-probability the class knows as an approximation of the
+                # smoothed "unseen" value. Computed lazily, once per class.
+                if unseen_logprob is None:
+                    unseen_logprob = min(token_logprobs.values(), default=0.0)
+                logprob = unseen_logprob
+            score += logprob
+        scores[class_] = score
+
+    if not scores:
+        raise ValueError('word_logprobs contains no classes')
+    return max(scores, key=scores.get)
+
+
+def main():
+    """Label every post in test-A/in.tsv and write the labels to test-A/out.tsv.
+
+    Loads the list ``[paranormal log prior, sceptic log prior, word_logprobs]``
+    from naive_base_model.pkl, classifies each input line with
+    ``calc_post_prob`` and writes exactly one label line per input line:
+    ``" S"`` for sceptic and ``" P"`` for paranormal.
+
+    Raises:
+        KeyError: If the classifier returns a class with no output label.
+            Failing loudly keeps out.tsv aligned line-by-line with in.tsv.
+    """
+    # NOTE: pickle.load can execute arbitrary code. Only load a model file
+    # that was produced locally; never one from an untrusted source.
+    with open('naive_base_model.pkl', 'rb') as f:
+        pickle_list = pickle.load(f)
+    paranormal_class_logprob = pickle_list[0]
+    sceptic_class_logprob = pickle_list[1]
+    word_logprobs = pickle_list[2]
+
+    label_lines = {'sceptic': ' S\n', 'paranormal': ' P\n'}
+    # Explicit UTF-8 so the result does not depend on the platform locale.
+    with open('test-A/in.tsv', encoding='utf-8') as in_f, \
+            open('test-A/out.tsv', 'w', encoding='utf-8') as out_f:
+        for line in in_f:
+            hyp = calc_post_prob(line, paranormal_class_logprob, sceptic_class_logprob, word_logprobs)
+            out_f.write(label_lines[hyp])
+
+
+# Run only when executed as a script, so importing the module has no side effects.
+if __name__ == '__main__':
+    main()
--- a/test-A/in.tsv
+++ b/test-A/in.tsv
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/train.py
+++ b/train.py
@ -0,0 +1,72 @@
+#!/usr/bin/python3
+from collections import defaultdict
+import math
+import pickle
+
+# Class priors are estimated from the label file (expected.tsv).
+def calc_class_logprob(expected_path):
+    """Estimate the log prior of each class from a label file.
+
+    Args:
+        expected_path: Path to a text file with one label per line, ``P``
+            (paranormal) or ``S`` (sceptic). Whitespace around the label is
+            ignored; lines with any other content are not counted.
+
+    Returns:
+        Tuple ``(log P(paranormal), log P(sceptic))``.
+
+    Raises:
+        ValueError: If either class has no examples (including an empty
+            file), because the log prior would be undefined.
+    """
+    counts = {'P': 0, 'S': 0}
+    with open(expected_path) as f:
+        for line in f:
+            # Drop all whitespace so " P", "P" and "P\r" are the same label.
+            label = ''.join(line.split())
+            if label in counts:
+                counts[label] += 1
+
+    missing = [label for label, count in counts.items() if count == 0]
+    if missing:
+        raise ValueError(
+            f"no examples for class label(s) {missing} in {expected_path!r}"
+        )
+
+    total = counts['P'] + counts['S']
+    return math.log(counts['P'] / total), math.log(counts['S'] / total)
+
+def clear_tokens(tokens):
+    """Return ``tokens`` with every newline character replaced by a space.
+
+    Despite its name, ``tokens`` is a single string of raw post text.
+    """
+    # TODO: also delete links, special characters and periods.
+    cleaned = tokens.replace('\n', ' ')
+    return cleaned
+
+# How many times each word occurs in the documents of each class.
+def calc_word_count(in_path, expected_path):
+    """Count token occurrences per class over the training set.
+
+    The two files are read in lockstep: line *i* of ``expected_path`` is the
+    label of line *i* of ``in_path``.
+
+    Args:
+        in_path: Path to the input file; each line is post text, a tab, then
+            a timestamp. Only the text field is used.
+        expected_path: Path to the label file with ``P`` or ``S`` per line
+            (spaces are ignored).
+
+    Returns:
+        ``{'paranormal': counts, 'sceptic': counts}`` where each ``counts`` is
+        a ``defaultdict(int)`` mapping a lower-cased token to its frequency.
+    """
+    class_names = {'P': 'paranormal', 'S': 'sceptic'}
+    # Outer dict: class name; inner dict: token -> number of occurrences.
+    word_counts = {'paranormal': defaultdict(int), 'sceptic': defaultdict(int)}
+    # Explicit UTF-8 so the result does not depend on the platform locale.
+    with open(in_path, encoding='utf-8') as infile, open(expected_path) as expectedfile:
+        for line, exp in zip(infile, expectedfile):
+            class_name = class_names.get(exp.rstrip('\n').replace(' ', ''))
+            if class_name is None:
+                # Unrecognised label: the post belongs to neither class.
+                continue
+            # Only the first tab-separated field is the post text; partition()
+            # also copes with a missing timestamp column or stray tabs.
+            text = line.rstrip('\n').partition('\t')[0]
+            class_counts = word_counts[class_name]
+            for token in clear_tokens(text).lower().split(' '):
+                class_counts[token] += 1
+
+    return word_counts
+
+def calc_word_logprobs(word_counts):
+    """Turn per-class token counts into add-one smoothed log-probabilities.
+
+    Smoothing uses the vocabulary shared by all classes:
+    ``P(token | class) = (count + 1) / (class token total + vocabulary size)``.
+    Every class gets an entry for every vocabulary token, so a word seen in
+    only one class still has a (small) probability in the others and each
+    class distribution sums to 1.
+
+    Args:
+        word_counts: Mapping ``class name -> {token: count}``.
+
+    Returns:
+        Mapping ``class name -> {token: log P(token | class)}`` with the same
+        class names, in the same order, as ``word_counts``.
+    """
+    vocabulary = set()
+    for counts in word_counts.values():
+        vocabulary.update(counts)
+    vocabulary_size = len(vocabulary)
+    # Sorted so the resulting model does not depend on set iteration order.
+    ordered_vocabulary = sorted(vocabulary)
+
+    word_logprobs = {}
+    for class_, counts in word_counts.items():
+        denominator = sum(counts.values()) + vocabulary_size
+        # .get() rather than indexing: counts may be a defaultdict, and
+        # indexing would insert the missing tokens into the caller's data.
+        word_logprobs[class_] = {
+            token: math.log((counts.get(token, 0) + 1) / denominator)
+            for token in ordered_vocabulary
+        }
+
+    return word_logprobs
+
+def main():
+    """Train the Naive Bayes model on ./train and pickle it.
+
+    Writes naive_base_model.pkl containing the list
+    ``[paranormal log prior, sceptic log prior, word_logprobs]``.
+    """
+    paranormal_class_logprob, sceptic_class_logprob = calc_class_logprob('./train/expected.tsv')
+    word_counts = calc_word_count('./train/in.tsv', './train/expected.tsv')
+    word_logprobs = calc_word_logprobs(word_counts)
+
+    # predict.py reads this list by position and is meant to apply
+    # argmax over classes c of P(c) * product of P(w | c).
+    with open('naive_base_model.pkl', 'wb') as f:
+        pickle.dump([paranormal_class_logprob, sceptic_class_logprob, word_logprobs], f)
+
+
+# Run only when executed as a script, so importing the module has no side effects.
+if __name__ == '__main__':
+    main()
--- a/train.pyc
+++ b/train.pyc
--- a/train/expected.tsv
+++ b/train/expected.tsv
--- a/train/expected.tsv_
+++ b/train/expected.tsv_
--- a/train/in.tsv
+++ b/train/in.tsv
				`@ -0,0 +1 @@`
				`--metric Accuracy --precision 4 --in-header in-header.tsv --out-header out-header.tsv`