little rule base added

This commit is contained in:
s426135 2020-03-22 13:32:09 +01:00
parent 773683e7d4
commit c8cb346bf5
7 changed files with 255 additions and 4110 deletions

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -4,24 +4,29 @@ import pickle
import math import math
import re import re
def clear_tokens(tokens, is_text=True):
    """Strip links, punctuation and digits from a piece of text.

    Args:
        tokens: The raw string to clean; either a whole post or a single
            token taken from it.
        is_text: When True (default) every run of spaces is collapsed to a
            single space, so the result can still be split into words. When
            False all spaces are removed, which suits a single token.

    Returns:
        The cleaned string.
    """
    # Replace the literal two-character sequence backslash + n with a space.
    tokens = tokens.replace('\\n', ' ')
    # Drop parenthesised links ending in .com/.net/.jpg/.html. The match is
    # confined to one pair of parentheses: a greedy `.*` here would also
    # delete all the text between the first and the last link of a post.
    # NOTE(review): the training script carries its own copy of this function
    # with the greedy pattern; keep the two in sync so that training and
    # prediction tokenize alike.
    tokens = re.sub(r'\(https?[^()\s]*(?:\.com|\.net|\.jpg|\.html)\)', ' ', tokens)
    # Turn runs of punctuation / special characters into a single space.
    tokens = re.sub(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@\\\\±]+', ' ', tokens)
    # Runs of two or more dots/hyphens (e.g. "--", "---") become a space;
    # a single hyphen inside a word is kept.
    tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
    # Numbers are replaced by a space.
    tokens = re.sub(r'[0-9]+', ' ', tokens)
    tokens = re.sub(r'œ|·', '', tokens)
    # Whole texts keep single spaces as word separators; single tokens lose
    # every space.
    replacement = ' ' if is_text else ''
    return re.sub(r' +', replacement, tokens)
def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_logprobs): def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_logprobs):
# dla kazdego tokenu z danego posta # dla kazdego tokenu z danego posta
text, timestap = post.rstrip('\n').split('\t') text, timestap = post.rstrip('\n').split('\t')
text = clear_tokens(text) text = clear_tokens(text, True)
tokens = text.lower().split(' ') tokens = text.lower().split(' ')
probs = {0.0 : 'sceptic', 0.0 : 'paranormal'} probs = {0.0 : 'sceptic', 0.0 : 'paranormal'}
for class_ in word_logprobs.keys(): for class_ in word_logprobs.keys():
product = 1 product = 1
for token in tokens: for token in tokens:
token = clear_tokens(token, False)
try: try:
product += word_logprobs[class_][token] product += word_logprobs[class_][token]
except KeyError: except KeyError:
@ -34,8 +39,13 @@ def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_l
probs[abs(product)] = class_ probs[abs(product)] = class_
#print(probs) #print(probs)
# mozna jeszcze zrobic aby bralo kluczowe slowa i wtedy decydowalo ze paranormal # mozna jeszcze zrobic aby bralo kluczowe slowa i wtedy decydowalo ze paranormal
if search_for_keywords(text):
return 'paranormal'
return probs[max(probs.keys())] return probs[max(probs.keys())]
def search_for_keywords(text):
    """Return True if *text* mentions any of the paranormal keywords.

    The check is a plain substring test, so a keyword also matches inside a
    longer word. Matching is case-insensitive: the keywords are lowercase and
    the text is lowercased before the comparison, so "UFO" counts as a hit
    even when the caller passes text that has not been lowercased.

    Args:
        text: The post text to scan.

    Returns:
        True when at least one keyword occurs in the text, otherwise False.
    """
    # 'aliens' was listed twice in the original list; once is enough.
    keywords = ('paranormal', 'ufo', 'aliens', 'conspiracy')
    lowered = text.lower()
    return any(keyword in lowered for keyword in keywords)
def main(): def main():
with open('naive_base_model.pkl', 'rb') as f: with open('naive_base_model.pkl', 'rb') as f:
@ -43,10 +53,10 @@ def main():
paranormal_class_logprob = pickle_list[0] paranormal_class_logprob = pickle_list[0]
sceptic_class_logprob = pickle_list[1] sceptic_class_logprob = pickle_list[1]
word_logprobs = pickle_list[2] word_logprobs = pickle_list[2]
#in_file = "test-A/in.tsv" in_file = "test-A/in.tsv"
in_file = "dev-0/in.tsv" #in_file = "dev-0/in.tsv"
#out_file = "test-A/out.tsv" out_file = "test-A/out.tsv"
out_file = "dev-0/out.tsv" #out_file = "dev-0/out.tsv"
print (f"in {in_file}") print (f"in {in_file}")
print (f"out {out_file}") print (f"out {out_file}")
with open(in_file) as in_f, open(out_file, 'w') as out_f: with open(in_file) as in_f, open(out_file, 'w') as out_f:

File diff suppressed because it is too large Load Diff

View File

@ -21,7 +21,7 @@ def calc_class_logprob(expected_path):
return math.log(paranol_prob), math.log(sceptic_prob) return math.log(paranol_prob), math.log(sceptic_prob)
def clear_tokens(tokens): def clear_tokens(tokens, is_text=True):
tokens = tokens.replace('\\n', ' ') tokens = tokens.replace('\\n', ' ')
# delete links, special characters, kropki, and \n # delete links, special characters, kropki, and \n
tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens) tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens)
@ -30,7 +30,10 @@ def clear_tokens(tokens):
tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens) tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
tokens = re.sub(r'[0-9]+', ' ', tokens) tokens = re.sub(r'[0-9]+', ' ', tokens)
tokens = re.sub(r'œ|·', '', tokens) tokens = re.sub(r'œ|·', '', tokens)
tokens = re.sub(r' +', ' ', tokens) if is_text:
tokens = re.sub(r' +', ' ', tokens)
else:
tokens = re.sub(r' +', '', tokens)
return tokens return tokens
# ile razy slowo wystepuje w dokumentach w danej klasie # ile razy slowo wystepuje w dokumentach w danej klasie
@ -41,10 +44,11 @@ def calc_word_count(in_path, expected_path):
class_ = exp.rstrip('\n').replace(' ','') class_ = exp.rstrip('\n').replace(' ','')
text, timestap =line.rstrip('\n').split('\t') text, timestap =line.rstrip('\n').split('\t')
#print(f"text {type(text)}") #print(f"text {type(text)}")
text = clear_tokens(text) text = clear_tokens(text, True)
tokens = text.lower().split(' ') tokens = text.lower().split(' ')
#print(f"tokens {type(tokens)}") #print(f"tokens {type(tokens)}")
for token in tokens: for token in tokens:
clear_tokens(token,False)
if class_ == 'P': if class_ == 'P':
word_counts['paranormal'][token] += 1 word_counts['paranormal'][token] += 1
elif class_ == 'S': elif class_ == 'S':
@ -69,10 +73,10 @@ def calc_word_logprobs(word_counts):
return word_logprobs return word_logprobs
def main(): def main():
#expected = './train/expected.tsv' expected = './train/expected.tsv'
expected = './dev-0/expected.tsv' #expected = './dev-0/expected.tsv'
#in_f = './train/in.tsv' in_f = './train/in.tsv'
in_f = './dev-0/in.tsv' #in_f = './dev-0/in.tsv'
print (f"expected {expected}") print (f"expected {expected}")
print (f"in {in_f}") print (f"in {in_f}")
paranormal_class_lgprob, skeptic_class_logprob = calc_class_logprob(expected) paranormal_class_lgprob, skeptic_class_logprob = calc_class_logprob(expected)