little rule base added

2020-03-22 13:32:09 +01:00 · 2020-03-22 13:32:09 +01:00 · c8cb346bf5
commit c8cb346bf5
parent 773683e7d4
7 changed files with 255 additions and 4110 deletions
--- a/.predict.py.swp
+++ b/.predict.py.swp
--- a/.train.py.swp
+++ b/.train.py.swp
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/naive_base_model.pkl
+++ b/naive_base_model.pkl
--- a/predict.py
+++ b/predict.py
@ -4,24 +4,29 @@ import pickle
 import math
 import re

-def clear_tokens(tokens):
+def clear_tokens(tokens, is_text=True):
    tokens = tokens.replace('\\n', ' ')
    tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens)
    tokens = re.sub(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@\’\＞\″\±]+', ' ', tokens)
    tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
+    tokens = re.sub(r'[0-9]+', ' ', tokens)
    tokens = re.sub(r'œ|·', '', tokens)
+    if is_text:
+        tokens = re.sub(r' +', ' ', tokens)
+    else:
        tokens = re.sub(r' +', '', tokens)
    return tokens

 def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_logprobs):
    # for each token of the given post
    text, timestap = post.rstrip('\n').split('\t')
-    text =  clear_tokens(text)
+    text =  clear_tokens(text, True)
    tokens = text.lower().split(' ')
    probs = {0.0 : 'sceptic', 0.0 : 'paranormal'}
    for class_ in word_logprobs.keys():
        product = 1
        for token in tokens:
+            token = clear_tokens(token, False)
            try:
                product += word_logprobs[class_][token]
            except KeyError:
@ -34,8 +39,13 @@ def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_l
        probs[abs(product)] = class_
        #print(probs)
 # additionally, look for keywords and, if any is found, decide the post is paranormal
+    if search_for_keywords(text):
+        return 'paranormal'
    return probs[max(probs.keys())]

+def search_for_keywords(text):
+    keywords = ['paranormal', 'ufo', 'aliens', 'conspiracy', 'aliens']
+    return any(keyword in text for keyword in keywords)

 def main():
    with open('naive_base_model.pkl', 'rb') as f:
@ -43,10 +53,10 @@ def main():
    paranormal_class_logprob = pickle_list[0]
    sceptic_class_logprob = pickle_list[1]
    word_logprobs = pickle_list[2]
-    #in_file = "test-A/in.tsv"
-    in_file = "dev-0/in.tsv"
-    #out_file = "test-A/out.tsv"
-    out_file = "dev-0/out.tsv"
+    in_file = "test-A/in.tsv"
+    #in_file = "dev-0/in.tsv"
+    out_file = "test-A/out.tsv"
+    #out_file = "dev-0/out.tsv"
    print (f"in {in_file}")
    print (f"out {out_file}")
    with open(in_file) as in_f, open(out_file, 'w') as out_f:
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/train.py
+++ b/train.py
@ -21,7 +21,7 @@ def calc_class_logprob(expected_path):

    return math.log(paranol_prob), math.log(sceptic_prob)

-def clear_tokens(tokens):
+def clear_tokens(tokens, is_text=True):
    tokens = tokens.replace('\\n', ' ')
    # delete links, special characters, dots, and \n
    tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens)
@ -30,6 +30,9 @@ def clear_tokens(tokens):
    tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
    tokens = re.sub(r'[0-9]+', ' ', tokens)
    tokens = re.sub(r'œ|·', '', tokens)
+    if is_text:
+        tokens = re.sub(r' +', ' ', tokens)
+    else:
        tokens = re.sub(r' +', '', tokens)
    return tokens

@ -41,10 +44,11 @@ def calc_word_count(in_path, expected_path):
            class_ = exp.rstrip('\n').replace(' ','')
            text, timestap =line.rstrip('\n').split('\t')
            #print(f"text  {type(text)}")
-            text = clear_tokens(text)
+            text = clear_tokens(text, True)
            tokens = text.lower().split(' ')
            #print(f"tokens {type(tokens)}")
            for token in tokens:
+                clear_tokens(token,False)
                if class_ == 'P':
                    word_counts['paranormal'][token] += 1
                elif class_ == 'S':
@ -69,10 +73,10 @@ def calc_word_logprobs(word_counts):
    return word_logprobs

 def main():
-    #expected = './train/expected.tsv'
-    expected = './dev-0/expected.tsv'
-    #in_f = './train/in.tsv'
-    in_f = './dev-0/in.tsv'
+    expected = './train/expected.tsv'
+    #expected = './dev-0/expected.tsv'
+    in_f = './train/in.tsv'
+    #in_f = './dev-0/in.tsv'
    print (f"expected {expected}")
    print (f"in {in_f}")
    paranormal_class_lgprob, skeptic_class_logprob = calc_class_logprob(expected)