Updated with stopwords

s426135 2020-03-29 23:29:19 +02:00
parent a3a146a87c
commit d1ca0a2ea8
17 changed files with 873 additions and 310901 deletions

3
.gitignore vendored Normal file

@@ -0,0 +1,3 @@
+dev-0/in.tsv
+train/in.tsv
+test/in.tsv

File diff suppressed because one or more lines are too long

BIN
dev-0/in.tsv.xz Normal file

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long


@@ -1 +0,0 @@
-e412b617206095df98ac606360b222d0 naive_base_model.pkl

Binary file not shown.


@@ -4,11 +4,13 @@ import pickle
 import math
 import re
 import sys
+import nltk
+from nltk.corpus import stopwords
 
-def calc_post_class(post, paranormal_class_logprob, sceptic_class_logprob, bigrams_logprobs):
+def calc_post_class(post, paranormal_class_logprob, sceptic_class_logprob, bigrams_logprobs, words_logprobs):
     text, timestap = post.rstrip('\n').split('\t')
-    text = clear_post(text)
-    tokens = text.lower().split(' ')
+    tokens = clear_post(text)
+    #tokens = text.lower().split(' ')
     probs = {}
     for class_ in bigrams_logprobs.keys():
         product = 0
@@ -20,16 +22,23 @@ def calc_post_class(post, paranormal_class_logprob, sceptic_class_logprob, bigra
                 product += bigrams_logprobs[class_][bigram]
             except KeyError:
                 product += 0
+        for token in tokens:
+            try:
+                product += words_logprobs[class_][token]
+            except KeyError:
+                product += 0
         if class_ == 'sceptic':
             product += sceptic_class_logprob
         elif class_ == 'paranormal':
             product += paranormal_class_logprob
         probs[abs(product)] = class_
     #print(probs)
     return probs[max(probs.keys())]
 
 def clear_post(post):
     post = post.replace('\\n', ' ')
     post = post.lower()
     post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\-\?\+\%]+(\)|)', ' internetlink ', post)
     post = re.sub(r'[\.\,\/\~]+', ' ', post)
     post = re.sub(r'(&lt|&gt|\@[a-zA-Z0-9]+)','',post)
@@ -37,7 +46,10 @@ def clear_post(post):
     post = re.sub(r'( \- |\-\-+)', ' ', post)
     post = re.sub(r' +', ' ', post)
     post = post.rstrip(' ')
-    return post
+    post = post.split(' ')
+    stop_words = set(stopwords.words('english'))
+    post_no_stop = [w for w in post if not w in stop_words]
+    return post_no_stop
 
 def main():
     if len(sys.argv) != 4:
@@ -52,10 +64,11 @@ def main():
     paranormal_class_logprob = pickle_list[0]
     sceptic_class_logprob = pickle_list[1]
     bigrams_logprobs = pickle_list[2]
+    words_logprobs = pickle_list[3]
     with open(in_file) as in_f, open(out_file, 'w') as out_f:
         for line in in_f:
-            hyp = calc_post_class(line, paranormal_class_logprob, sceptic_class_logprob, bigrams_logprobs)
+            hyp = calc_post_class(line, paranormal_class_logprob, sceptic_class_logprob, bigrams_logprobs, words_logprobs)
             if hyp == 'sceptic':
                 out_f.write(' S\n')
             elif hyp == 'paranormal':
                 out_f.write(' P\n')
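For context, the change above filters NLTK English stopwords out of every post before scoring. A minimal standalone sketch of that step, assuming the stopword corpus has already been fetched once with nltk.download('stopwords'):

import nltk
from nltk.corpus import stopwords

# one-time download, if the corpus is not already present:
# nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def drop_stopwords(tokens):
    # keep only tokens that are not in the English stopword list
    return [w for w in tokens if w not in stop_words]

print(drop_stopwords(['the', 'ghost', 'was', 'in', 'the', 'basement']))
# ['ghost', 'basement']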


@@ -69,3 +69,4 @@ def main():
             elif hyp == 'paranormal':
                 out_f.write(' P\n')
 main()
+c

File diff suppressed because one or more lines are too long

BIN
test-A/in.tsv.xz Normal file

Binary file not shown.

File diff suppressed because it is too large Load Diff

116
train.py

@@ -4,6 +4,8 @@ import math
 import pickle
 import re
 import sys
+import nltk
+from nltk.corpus import stopwords
 
 def calc_class_logprob(expected_path):
     paranormal_classcount = 0
@@ -24,6 +26,7 @@ def calc_class_logprob(expected_path):
 
 def clear_post(post):
     post = post.replace('\\n', ' ')
     post = post.lower()
+    # delete links
     post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', post)
     post = re.sub(r'[\.\,\/\~]+', ' ', post)
@@ -32,30 +35,33 @@ def clear_post(post):
     post = re.sub(r'( \- |\-\-+)', ' ', post)
     post = re.sub(r' +', ' ', post)
     post = post.rstrip(' ')
-    return post
+    post = post.split(' ')
+    stop_words = set(stopwords.words('english'))
+    post_no_stop = [w for w in post if not w in stop_words]
+    return post_no_stop
 
-def calc_bigram_count(in_path, expected_path):
-    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
-    with open(in_path) as infile, open(expected_path) as expected_file:
-        num_of_bigams = 0
-        for line, exp in zip(infile, expected_file):
-            class_ = exp.rstrip('\n').replace(' ', '')
-            text, timestap = line.rstrip('\n').split('\t')
-            text = clear_post(text)
-            tokens = text.lower().split(' ')
-            for index in range(len(tokens)-1):
-                # if there is next token we append current and next
-                bigram = tokens[index] + " " + tokens[index + 1]
-                #print(bigram)
-                #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
-                if class_ == 'P':
-                    bigram_counts['paranormal'][bigram] +=1
-                elif class_ == 'S':
-                    bigram_counts['sceptic'][bigram] +=1
-                num_of_bigams +=1
-        #print(f"num of every added bigams with repetitions {num_of_bigams})")
-        #print(f"num of bigams in paranormal {len(bigram_counts['paranormal'])} and sceptic {len(bigram_counts['sceptic'])}")
-    return bigram_counts
+#def calc_bigram_count(in_path, expected_path):
+#    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
+#    with open(in_path) as infile, open(expected_path) as expected_file:
+#        num_of_bigams = 0
+#        for line, exp in zip(infile, expected_file):
+#            class_ = exp.rstrip('\n').replace(' ', '')
+#            text, timestap = line.rstrip('\n').split('\t')
+#            tokens = clear_post(text)
+#            #tokens = text.lower().split(' ')
+#            for index in range(len(tokens)-1):
+#                # if there is next token we append current and next
+#                bigram = tokens[index] + " " + tokens[index + 1]
+#                #print(bigram)
+#                #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
+#                if class_ == 'P':
+#                    bigram_counts['paranormal'][bigram] +=1
+#                elif class_ == 'S':
+#                    bigram_counts['sceptic'][bigram] +=1
+#                num_of_bigams +=1
+#        #print(f"num of every added bigams with repetitions {num_of_bigams})")
+#        #print(f"num of bigams in paranormal {len(bigram_counts['paranormal'])} and sceptic {len(bigram_counts['sceptic'])}")
+#    return bigram_counts
 
 def calc_bigram_logprobs(bigram_counts):
     total_sceptic = sum(bigram_counts['sceptic'].values()) + len(bigram_counts['sceptic'].keys())
@@ -72,6 +78,63 @@ def calc_bigram_logprobs(bigram_counts):
     return bigram_logprobs
 
+#def calc_word_count(in_path, expected_path):
+#    word_counts = {'paranormal':defaultdict(int), 'sceptic': defaultdict(int)} # dict holding, per class, a dictionary of words and how many times they occur
+#    with open(in_path) as infile, open(expected_path) as expectedfile:
+#        for line, exp in zip(infile, expectedfile):
+#            class_ = exp.rstrip('\n').replace(' ','')
+#            text, timestap =line.rstrip('\n').split('\t')
+#            #print(f"text {type(text)}")
+#            text = clear_tokens(text, True)
+#            tokens = text.lower().split(' ')
+#            #print(f"tokens {type(tokens)}")
+#            for token in tokens:
+#                clear_tokens(token,False)
+#                if class_ == 'P':
+#                    word_counts['paranormal'][token] += 1
+#                elif class_ == 'S':
+#                    word_counts['sceptic'][token]+=1
+#
+#    return word_counts
+
+def calc_word_logprobs(word_counts):
+    total_skeptic = sum(word_counts['sceptic'].values()) + len(word_counts['sceptic'].keys())
+    total_paranormal = sum(word_counts['paranormal'].values())+ len(word_counts['paranormal'].keys())
+    word_logprobs= {'paranormal': {}, 'sceptic': {}}
+    for class_ in word_counts.keys(): # sceptic paranormal
+        for token, value in word_counts[class_].items():
+            if class_ == 'sceptic':
+                word_prob = (value +1)/ total_skeptic
+            elif class_ == 'paranormal':
+                word_prob = (value+1)/ total_paranormal
+            #print (token)
+            word_logprobs[class_][token] = math.log(word_prob)
+    return word_logprobs
+
+def launch_bigrams_and_words(in_path, expected_path):
+    word_counts = {'paranormal':defaultdict(int), 'sceptic': defaultdict(int)}
+    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
+    with open(in_path) as infile, open(expected_path) as expected_file:
+        for line, exp in zip(infile, expected_file):
+            class_ = exp.rstrip('\n').replace(' ', '')
+            text, timestap = line.rstrip('\n').split('\t')
+            tokens = clear_post(text)
+            for index in range(len(tokens)-1):
+                # if there is next token we append current and next
+                bigram = tokens[index] + " " + tokens[index + 1]
+                #print(bigram)
+                #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
+                if class_ == 'P':
+                    bigram_counts['paranormal'][bigram] +=1
+                    word_counts['paranormal'][tokens[index]] +=1
+                elif class_ == 'S':
+                    bigram_counts['sceptic'][bigram] +=1
+                    word_counts['sceptic'][tokens[index]] +=1
+    return bigram_counts, word_counts
+
 def main():
     if len(sys.argv) != 4:
         print("syntax is ./train.py expected.tsv in.tsv model.pkl")
@@ -80,8 +143,11 @@ def main():
     in_file = str(sys.argv[2])
     model = str(sys.argv[3])
     paranormal_class_logprob, sceptic_class_logprob = calc_class_logprob(expected_file)
-    bigrams_count = calc_bigram_count(in_file, expected_file)
+    #bigrams_count = calc_bigram_count(in_file, expected_file)
+    bigrams_count, words_count = launch_bigrams_and_words(in_file, expected_file)
     bigram_logprobs = calc_bigram_logprobs(bigrams_count)
+    word_logprobs = calc_word_logprobs(words_count)
     with open(model, 'wb') as f:
-        pickle.dump([paranormal_class_logprob, sceptic_class_logprob, bigram_logprobs],f)
+        pickle.dump([paranormal_class_logprob, sceptic_class_logprob, bigram_logprobs, word_logprobs],f)
 main()
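For orientation, the model written by train.py is now a four-element list rather than three. A minimal sketch of loading and unpacking it on the predict side, mirroring the pickle_list indices used in the prediction script above (the path model.pkl stands in for whatever third argument was passed to train.py):

import pickle

# order matches the pickle.dump call in train.py above
with open('model.pkl', 'rb') as f:
    pickle_list = pickle.load(f)

paranormal_class_logprob = pickle_list[0]
sceptic_class_logprob = pickle_list[1]
bigrams_logprobs = pickle_list[2]
words_logprobs = pickle_list[3]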

94
train.py_only_bi Executable file

@@ -0,0 +1,94 @@
+#!/usr/bin/python3
+from collections import defaultdict
+import math
+import pickle
+import re
+import sys
+import nltk
+from nltk.corpus import stopwords
+
+def calc_class_logprob(expected_path):
+    paranormal_classcount = 0
+    sceptic_classcount = 0
+    with open(expected_path) as f:
+        for line in f:
+            line = line.rstrip('\n').replace(' ','')
+            if 'P' in line:
+                paranormal_classcount +=1
+            elif 'S' in line:
+                sceptic_classcount +=1
+    paranol_prob = paranormal_classcount / (paranormal_classcount + sceptic_classcount)
+    sceptic_prob = sceptic_classcount / (paranormal_classcount + sceptic_classcount)
+    return math.log(paranol_prob), math.log(sceptic_prob)
+
+def clear_post(post):
+    post = post.replace('\\n', ' ')
+    post = post.lower()
+    # delete links
+    post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', post)
+    post = re.sub(r'[\.\,\/\~]+', ' ', post)
+    post = re.sub(r'(&lt|&gt|\@[a-zA-Z0-9]+)','',post)
+    post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\\”\!\=\^]+', '', post)
+    post = re.sub(r'( \- |\-\-+)', ' ', post)
+    post = re.sub(r' +', ' ', post)
+    post = post.rstrip(' ')
+    post = post.split(' ')
+    stop_words = set(stopwords.words('english'))
+    post_no_stop = [w for w in post if not w in stop_words]
+    return post_no_stop
+
+def calc_bigram_count(in_path, expected_path):
+    bigram_counts = {'paranormal' : defaultdict(int), 'sceptic' : defaultdict(int)}
+    with open(in_path) as infile, open(expected_path) as expected_file:
+        num_of_bigams = 0
+        for line, exp in zip(infile, expected_file):
+            class_ = exp.rstrip('\n').replace(' ', '')
+            text, timestap = line.rstrip('\n').split('\t')
+            tokens = clear_post(text)
+            #tokens = text.lower().split(' ')
+            for index in range(len(tokens)-1):
+                # if there is next token we append current and next
+                bigram = tokens[index] + " " + tokens[index + 1]
+                #print(bigram)
+                #print (f"bigram constructed from ;;;;{tokens[index]}:{tokens[index+1]};;;;;;;")
+                if class_ == 'P':
+                    bigram_counts['paranormal'][bigram] +=1
+                elif class_ == 'S':
+                    bigram_counts['sceptic'][bigram] +=1
+                num_of_bigams +=1
+        #print(f"num of every added bigams with repetitions {num_of_bigams})")
+        #print(f"num of bigams in paranormal {len(bigram_counts['paranormal'])} and sceptic {len(bigram_counts['sceptic'])}")
+    return bigram_counts
+
+def calc_bigram_logprobs(bigram_counts):
+    total_sceptic = sum(bigram_counts['sceptic'].values()) + len(bigram_counts['sceptic'].keys())
+    total_paranormal = sum(bigram_counts['paranormal'].values()) + len(bigram_counts['paranormal'].keys())
+    bigram_logprobs = {'paranormal' : {}, 'sceptic' : {}}
+    for class_ in bigram_counts.keys():
+        for bigram, value in bigram_counts[class_].items():
+            if class_ == "sceptic":
+                bigram_prob = (value + 1) / total_sceptic
+            elif class_ == "paranormal":
+                bigram_prob = (value + 1) / total_paranormal
+            bigram_logprobs[class_][bigram] = math.log(bigram_prob)
+    return bigram_logprobs
+
+def main():
+    if len(sys.argv) != 4:
+        print("syntax is ./train.py expected.tsv in.tsv model.pkl")
+        return
+    expected_file = str(sys.argv[1])
+    in_file = str(sys.argv[2])
+    model = str(sys.argv[3])
+    paranormal_class_logprob, sceptic_class_logprob = calc_class_logprob(expected_file)
+    bigrams_count = calc_bigram_count(in_file, expected_file)
+    bigram_logprobs = calc_bigram_logprobs(bigrams_count)
+    with open(model, 'wb') as f:
+        pickle.dump([paranormal_class_logprob, sceptic_class_logprob, bigram_logprobs],f)
+
+main()
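As a sanity check on calc_bigram_logprobs (shared by both training scripts), the add-one smoothing works out to log((count + 1) / (total occurrences + number of distinct bigrams)) per class. A tiny example with made-up counts, purely illustrative and not taken from the training data:

import math
from collections import defaultdict

# hypothetical bigram counts for one class
bigram_counts = defaultdict(int, {'i think': 3, 'think not': 1})

# denominator: total occurrences plus vocabulary size, as in the script
total_sceptic = sum(bigram_counts.values()) + len(bigram_counts)   # (3 + 1) + 2 = 6

# smoothed log-probability calc_bigram_logprobs would store for 'i think'
print(math.log((3 + 1) / total_sceptic))   # log(4/6)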

289579
train/in.tsv

File diff suppressed because one or more lines are too long