little rule base added

This commit is contained in:
s426135 2020-03-22 13:32:09 +01:00
parent 773683e7d4
commit c8cb346bf5
7 changed files with 255 additions and 4110 deletions

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -4,24 +4,29 @@ import pickle
import math
import re
def clear_tokens(tokens, is_text=True):
    """Strip links, punctuation, digits and stray symbols from a string.

    The substitutions run in a fixed order:

    1. literal backslash-n sequences (the two characters ``\\`` and ``n``)
       become a space;
    2. parenthesised http/https links ending in ``.com``, ``.net``,
       ``.jpg`` or ``.html`` become a space;
    3. runs of newlines and the listed punctuation/special characters
       become a space;
    4. runs of two or more dots/dashes become a space;
    5. runs of digits become a space;
    6. the characters ``œ`` and ``·`` are deleted;
    7. spaces are normalised according to ``is_text``.

    Args:
        tokens: The string to clean (a whole post or a single token).
        is_text: When true (the default), runs of spaces are collapsed to a
            single space so the result can still be split on ``' '``.
            When false, every space is removed, which suits cleaning one
            already-split token.

    Returns:
        The cleaned string. Leading/trailing spaces are not stripped.
    """
    tokens = tokens.replace('\\n', ' ')
    # NOTE: ``.*`` is greedy, so everything between the first "(http" and
    # the last matching "<ext>)" in the string is removed in one go.
    tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)', " ", tokens)
    tokens = re.sub(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@\\\\±]+', ' ', tokens)
    tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
    tokens = re.sub(r'[0-9]+', ' ', tokens)
    tokens = re.sub(r'œ|·', '', tokens)
    if is_text:
        tokens = re.sub(r' +', ' ', tokens)
    else:
        tokens = re.sub(r' +', '', tokens)
    return tokens
def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_logprobs):
# for each token of the given post
text, timestap = post.rstrip('\n').split('\t')
text = clear_tokens(text)
text = clear_tokens(text, True)
tokens = text.lower().split(' ')
probs = {0.0 : 'sceptic', 0.0 : 'paranormal'}
for class_ in word_logprobs.keys():
product = 1
for token in tokens:
token = clear_tokens(token, False)
try:
product += word_logprobs[class_][token]
except KeyError:
@ -34,8 +39,13 @@ def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_l
probs[abs(product)] = class_
#print(probs)
# additionally, look for key words and, if any is present, decide the post is paranormal
if search_for_keywords(text):
return 'paranormal'
return probs[max(probs.keys())]
def search_for_keywords(text):
    """Return True when ``text`` contains any paranormal-themed keyword.

    Matching is a case-insensitive substring test: the keywords are stored
    in lowercase and ``text`` is lowercased before the comparison, so
    "UFO" and "Aliens" are detected as well as "ufo" and "aliens".

    Args:
        text: The post text to scan.

    Returns:
        True if at least one keyword occurs anywhere in ``text``,
        otherwise False (including for an empty string).
    """
    keywords = ('paranormal', 'ufo', 'aliens', 'conspiracy')
    # The keywords are lowercase, so the text must be lowercased too;
    # otherwise capitalised mentions would never match.
    lowered = text.lower()
    return any(keyword in lowered for keyword in keywords)
def main():
with open('naive_base_model.pkl', 'rb') as f:
@ -43,10 +53,10 @@ def main():
paranormal_class_logprob = pickle_list[0]
sceptic_class_logprob = pickle_list[1]
word_logprobs = pickle_list[2]
#in_file = "test-A/in.tsv"
in_file = "dev-0/in.tsv"
#out_file = "test-A/out.tsv"
out_file = "dev-0/out.tsv"
in_file = "test-A/in.tsv"
#in_file = "dev-0/in.tsv"
out_file = "test-A/out.tsv"
#out_file = "dev-0/out.tsv"
print (f"in {in_file}")
print (f"out {out_file}")
with open(in_file) as in_f, open(out_file, 'w') as out_f:

File diff suppressed because it is too large Load Diff

View File

@ -21,7 +21,7 @@ def calc_class_logprob(expected_path):
return math.log(paranol_prob), math.log(sceptic_prob)
def clear_tokens(tokens):
def clear_tokens(tokens, is_text=True):
tokens = tokens.replace('\\n', ' ')
# delete links, special characters, dots, and \n
tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens)
@ -30,7 +30,10 @@ def clear_tokens(tokens):
tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
tokens = re.sub(r'[0-9]+', ' ', tokens)
tokens = re.sub(r'œ|·', '', tokens)
tokens = re.sub(r' +', ' ', tokens)
if is_text:
tokens = re.sub(r' +', ' ', tokens)
else:
tokens = re.sub(r' +', '', tokens)
return tokens
# how many times a word occurs in the documents of a given class
@ -41,10 +44,11 @@ def calc_word_count(in_path, expected_path):
class_ = exp.rstrip('\n').replace(' ','')
text, timestap =line.rstrip('\n').split('\t')
#print(f"text {type(text)}")
text = clear_tokens(text)
text = clear_tokens(text, True)
tokens = text.lower().split(' ')
#print(f"tokens {type(tokens)}")
for token in tokens:
clear_tokens(token,False)
if class_ == 'P':
word_counts['paranormal'][token] += 1
elif class_ == 'S':
@ -69,10 +73,10 @@ def calc_word_logprobs(word_counts):
return word_logprobs
def main():
#expected = './train/expected.tsv'
expected = './dev-0/expected.tsv'
#in_f = './train/in.tsv'
in_f = './dev-0/in.tsv'
expected = './train/expected.tsv'
#expected = './dev-0/expected.tsv'
in_f = './train/in.tsv'
#in_f = './dev-0/in.tsv'
print (f"expected {expected}")
print (f"in {in_f}")
paranormal_class_lgprob, skeptic_class_logprob = calc_class_logprob(expected)