little rule base added
This commit is contained in:
parent
773683e7d4
commit
c8cb346bf5
BIN
.predict.py.swp
BIN
.predict.py.swp
Binary file not shown.
BIN
.train.py.swp
BIN
.train.py.swp
Binary file not shown.
3929
dev-0/out.tsv
3929
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
Binary file not shown.
24
predict.py
24
predict.py
@ -4,24 +4,29 @@ import pickle
|
||||
import math
|
||||
import re
|
||||
|
||||
def clear_tokens(tokens):
|
||||
def clear_tokens(tokens, is_text=True):
|
||||
tokens = tokens.replace('\\n', ' ')
|
||||
tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens)
|
||||
tokens = re.sub(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@\’\>\″\±]+', ' ', tokens)
|
||||
tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
|
||||
tokens = re.sub(r'[0-9]+', ' ', tokens)
|
||||
tokens = re.sub(r'œ|·', '', tokens)
|
||||
tokens = re.sub(r' +', ' ', tokens)
|
||||
if is_text:
|
||||
tokens = re.sub(r' +', ' ', tokens)
|
||||
else:
|
||||
tokens = re.sub(r' +', '', tokens)
|
||||
return tokens
|
||||
|
||||
def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_logprobs):
|
||||
# for each token in the given post
|
||||
text, timestap = post.rstrip('\n').split('\t')
|
||||
text = clear_tokens(text)
|
||||
text = clear_tokens(text, True)
|
||||
tokens = text.lower().split(' ')
|
||||
probs = {0.0 : 'sceptic', 0.0 : 'paranormal'}
|
||||
for class_ in word_logprobs.keys():
|
||||
product = 1
|
||||
for token in tokens:
|
||||
token = clear_tokens(token, False)
|
||||
try:
|
||||
product += word_logprobs[class_][token]
|
||||
except KeyError:
|
||||
@ -34,8 +39,13 @@ def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_l
|
||||
probs[abs(product)] = class_
|
||||
#print(probs)
|
||||
# could additionally look for keywords and, when one is found, decide the post is paranormal
|
||||
if search_for_keywords(text):
|
||||
return 'paranormal'
|
||||
return probs[max(probs.keys())]
|
||||
|
||||
def search_for_keywords(text):
    """Return True when *text* mentions any paranormal-related keyword.

    The check is a case-insensitive substring search, so "UFO" and "Ufo"
    are matched the same way as "ufo".

    Args:
        text: Post text to scan.

    Returns:
        bool: True if at least one keyword occurs in ``text``, else False.
    """
    # The text may arrive with its original casing, so normalise it here;
    # otherwise upper-case spellings such as "UFO" would never match.
    lowered = text.lower()
    # Each keyword is listed once; a repeated entry only costs an extra scan.
    keywords = ('paranormal', 'ufo', 'aliens', 'conspiracy')
    return any(keyword in lowered for keyword in keywords)
|
||||
|
||||
def main():
|
||||
with open('naive_base_model.pkl', 'rb') as f:
|
||||
@ -43,10 +53,10 @@ def main():
|
||||
paranormal_class_logprob = pickle_list[0]
|
||||
sceptic_class_logprob = pickle_list[1]
|
||||
word_logprobs = pickle_list[2]
|
||||
#in_file = "test-A/in.tsv"
|
||||
in_file = "dev-0/in.tsv"
|
||||
#out_file = "test-A/out.tsv"
|
||||
out_file = "dev-0/out.tsv"
|
||||
in_file = "test-A/in.tsv"
|
||||
#in_file = "dev-0/in.tsv"
|
||||
out_file = "test-A/out.tsv"
|
||||
#out_file = "dev-0/out.tsv"
|
||||
print (f"in {in_file}")
|
||||
print (f"out {out_file}")
|
||||
with open(in_file) as in_f, open(out_file, 'w') as out_f:
|
||||
|
394
test-A/out.tsv
394
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
18
train.py
18
train.py
@ -21,7 +21,7 @@ def calc_class_logprob(expected_path):
|
||||
|
||||
return math.log(paranol_prob), math.log(sceptic_prob)
|
||||
|
||||
def clear_tokens(tokens):
|
||||
def clear_tokens(tokens, is_text=True):
|
||||
tokens = tokens.replace('\\n', ' ')
|
||||
# delete links, special characters, dots, and \n
|
||||
tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens)
|
||||
@ -30,7 +30,10 @@ def clear_tokens(tokens):
|
||||
tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
|
||||
tokens = re.sub(r'[0-9]+', ' ', tokens)
|
||||
tokens = re.sub(r'œ|·', '', tokens)
|
||||
tokens = re.sub(r' +', ' ', tokens)
|
||||
if is_text:
|
||||
tokens = re.sub(r' +', ' ', tokens)
|
||||
else:
|
||||
tokens = re.sub(r' +', '', tokens)
|
||||
return tokens
|
||||
|
||||
# how many times a word occurs in the documents of a given class
|
||||
@ -41,10 +44,11 @@ def calc_word_count(in_path, expected_path):
|
||||
class_ = exp.rstrip('\n').replace(' ','')
|
||||
text, timestap =line.rstrip('\n').split('\t')
|
||||
#print(f"text {type(text)}")
|
||||
text = clear_tokens(text)
|
||||
text = clear_tokens(text, True)
|
||||
tokens = text.lower().split(' ')
|
||||
#print(f"tokens {type(tokens)}")
|
||||
for token in tokens:
|
||||
clear_tokens(token,False)
|
||||
if class_ == 'P':
|
||||
word_counts['paranormal'][token] += 1
|
||||
elif class_ == 'S':
|
||||
@ -69,10 +73,10 @@ def calc_word_logprobs(word_counts):
|
||||
return word_logprobs
|
||||
|
||||
def main():
|
||||
#expected = './train/expected.tsv'
|
||||
expected = './dev-0/expected.tsv'
|
||||
#in_f = './train/in.tsv'
|
||||
in_f = './dev-0/in.tsv'
|
||||
expected = './train/expected.tsv'
|
||||
#expected = './dev-0/expected.tsv'
|
||||
in_f = './train/in.tsv'
|
||||
#in_f = './dev-0/in.tsv'
|
||||
print (f"expected {expected}")
|
||||
print (f"in {in_f}")
|
||||
paranormal_class_lgprob, skeptic_class_logprob = calc_class_logprob(expected)
|
||||
|
Loading…
Reference in New Issue
Block a user