little rule base added
This commit is contained in:
parent
773683e7d4
commit
c8cb346bf5
BIN
.predict.py.swp
BIN
.predict.py.swp
Binary file not shown.
BIN
.train.py.swp
BIN
.train.py.swp
Binary file not shown.
3929
dev-0/out.tsv
3929
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
Binary file not shown.
24
predict.py
24
predict.py
@ -4,24 +4,29 @@ import pickle
|
|||||||
import math
|
import math
|
||||||
import re
|
import re
|
||||||
|
|
||||||
# Patterns are compiled once at import time: clear_tokens runs for every post
# and again for every single token, so the per-call pattern lookup adds up.
#
# The link pattern is deliberately bounded with a non-greedy \S*? instead of a
# greedy .* -- with .* a post containing two parenthesised links lost all the
# text between the first "(http" and the last ".com)".
# NOTE: train.py carries its own copy of clear_tokens; keep the link pattern
# in sync there so training and prediction tokenize posts the same way.
_LINK_RE = re.compile(r'\(https?\S*?((\.com)|(\.net)|(\.jpg)|(\.html))\)')
_SPECIAL_CHARS_RE = re.compile(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@\’\>\″\±]+')
_DOT_DASH_RUN_RE = re.compile(r'[\.\-][\.\-]+')
_DIGITS_RE = re.compile(r'[0-9]+')
_STRAY_GLYPHS_RE = re.compile(r'œ|·')
_SPACES_RE = re.compile(r' +')


def clear_tokens(tokens, is_text=True):
    """Strip links, punctuation, digits and stray glyphs from a string.

    Args:
        tokens: Raw string to clean; either a whole post or a single token.
        is_text: When True (the default) runs of spaces are collapsed to a
            single space so the result can still be split into words. When
            False every space is removed, which suits a single token.

    Returns:
        The cleaned string.
    """
    # The data stores line breaks as the two characters backslash + "n".
    tokens = tokens.replace('\\n', ' ')
    # Parenthesised links ending in .com/.net/.jpg/.html become one space.
    tokens = _LINK_RE.sub(' ', tokens)
    tokens = _SPECIAL_CHARS_RE.sub(' ', tokens)
    # Runs of two or more dots/dashes ("--", "...") separate words; a single
    # hyphen inside a word is kept.
    tokens = _DOT_DASH_RUN_RE.sub(' ', tokens)
    tokens = _DIGITS_RE.sub(' ', tokens)
    tokens = _STRAY_GLYPHS_RE.sub('', tokens)
    return _SPACES_RE.sub(' ' if is_text else '', tokens)
|
||||||
|
|
||||||
def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_logprobs):
|
def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_logprobs):
|
||||||
# dla kazdego tokenu z danego posta
|
# dla kazdego tokenu z danego posta
|
||||||
text, timestap = post.rstrip('\n').split('\t')
|
text, timestap = post.rstrip('\n').split('\t')
|
||||||
text = clear_tokens(text)
|
text = clear_tokens(text, True)
|
||||||
tokens = text.lower().split(' ')
|
tokens = text.lower().split(' ')
|
||||||
probs = {0.0 : 'sceptic', 0.0 : 'paranormal'}
|
probs = {0.0 : 'sceptic', 0.0 : 'paranormal'}
|
||||||
for class_ in word_logprobs.keys():
|
for class_ in word_logprobs.keys():
|
||||||
product = 1
|
product = 1
|
||||||
for token in tokens:
|
for token in tokens:
|
||||||
|
token = clear_tokens(token, False)
|
||||||
try:
|
try:
|
||||||
product += word_logprobs[class_][token]
|
product += word_logprobs[class_][token]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
@ -34,8 +39,13 @@ def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_l
|
|||||||
probs[abs(product)] = class_
|
probs[abs(product)] = class_
|
||||||
#print(probs)
|
#print(probs)
|
||||||
# mozna jeszcze zrobic aby bralo kluczowe slowa i wtedy decydowalo ze paranormal
|
# mozna jeszcze zrobic aby bralo kluczowe slowa i wtedy decydowalo ze paranormal
|
||||||
|
if search_for_keywords(text):
|
||||||
|
return 'paranormal'
|
||||||
return probs[max(probs.keys())]
|
return probs[max(probs.keys())]
|
||||||
|
|
||||||
|
def search_for_keywords(text):
    """Return True when *text* mentions any of the paranormal keywords.

    Matching is by substring and case-insensitive: the keywords are lowercase,
    so the text is lower-cased here rather than relying on the caller to have
    done it (otherwise "UFO" or "Aliens" would never match).

    Args:
        text: The post text to scan.

    Returns:
        True if at least one keyword occurs in the text, otherwise False.
    """
    keywords = ('paranormal', 'ufo', 'aliens', 'conspiracy')
    lowered = text.lower()
    return any(keyword in lowered for keyword in keywords)
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
with open('naive_base_model.pkl', 'rb') as f:
|
with open('naive_base_model.pkl', 'rb') as f:
|
||||||
@ -43,10 +53,10 @@ def main():
|
|||||||
paranormal_class_logprob = pickle_list[0]
|
paranormal_class_logprob = pickle_list[0]
|
||||||
sceptic_class_logprob = pickle_list[1]
|
sceptic_class_logprob = pickle_list[1]
|
||||||
word_logprobs = pickle_list[2]
|
word_logprobs = pickle_list[2]
|
||||||
#in_file = "test-A/in.tsv"
|
in_file = "test-A/in.tsv"
|
||||||
in_file = "dev-0/in.tsv"
|
#in_file = "dev-0/in.tsv"
|
||||||
#out_file = "test-A/out.tsv"
|
out_file = "test-A/out.tsv"
|
||||||
out_file = "dev-0/out.tsv"
|
#out_file = "dev-0/out.tsv"
|
||||||
print (f"in {in_file}")
|
print (f"in {in_file}")
|
||||||
print (f"out {out_file}")
|
print (f"out {out_file}")
|
||||||
with open(in_file) as in_f, open(out_file, 'w') as out_f:
|
with open(in_file) as in_f, open(out_file, 'w') as out_f:
|
||||||
|
394
test-A/out.tsv
394
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
18
train.py
18
train.py
@ -21,7 +21,7 @@ def calc_class_logprob(expected_path):
|
|||||||
|
|
||||||
return math.log(paranol_prob), math.log(sceptic_prob)
|
return math.log(paranol_prob), math.log(sceptic_prob)
|
||||||
|
|
||||||
def clear_tokens(tokens):
|
def clear_tokens(tokens, is_text=True):
|
||||||
tokens = tokens.replace('\\n', ' ')
|
tokens = tokens.replace('\\n', ' ')
|
||||||
# delete links, special characters, kropki, and \n
|
# delete links, special characters, kropki, and \n
|
||||||
tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens)
|
tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens)
|
||||||
@ -30,7 +30,10 @@ def clear_tokens(tokens):
|
|||||||
tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
|
tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
|
||||||
tokens = re.sub(r'[0-9]+', ' ', tokens)
|
tokens = re.sub(r'[0-9]+', ' ', tokens)
|
||||||
tokens = re.sub(r'œ|·', '', tokens)
|
tokens = re.sub(r'œ|·', '', tokens)
|
||||||
tokens = re.sub(r' +', ' ', tokens)
|
if is_text:
|
||||||
|
tokens = re.sub(r' +', ' ', tokens)
|
||||||
|
else:
|
||||||
|
tokens = re.sub(r' +', '', tokens)
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
# ile razy slowo wystepuje w dokumentach w danej klasie
|
# ile razy slowo wystepuje w dokumentach w danej klasie
|
||||||
@ -41,10 +44,11 @@ def calc_word_count(in_path, expected_path):
|
|||||||
class_ = exp.rstrip('\n').replace(' ','')
|
class_ = exp.rstrip('\n').replace(' ','')
|
||||||
text, timestap =line.rstrip('\n').split('\t')
|
text, timestap =line.rstrip('\n').split('\t')
|
||||||
#print(f"text {type(text)}")
|
#print(f"text {type(text)}")
|
||||||
text = clear_tokens(text)
|
text = clear_tokens(text, True)
|
||||||
tokens = text.lower().split(' ')
|
tokens = text.lower().split(' ')
|
||||||
#print(f"tokens {type(tokens)}")
|
#print(f"tokens {type(tokens)}")
|
||||||
for token in tokens:
|
for token in tokens:
|
||||||
|
clear_tokens(token,False)
|
||||||
if class_ == 'P':
|
if class_ == 'P':
|
||||||
word_counts['paranormal'][token] += 1
|
word_counts['paranormal'][token] += 1
|
||||||
elif class_ == 'S':
|
elif class_ == 'S':
|
||||||
@ -69,10 +73,10 @@ def calc_word_logprobs(word_counts):
|
|||||||
return word_logprobs
|
return word_logprobs
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
#expected = './train/expected.tsv'
|
expected = './train/expected.tsv'
|
||||||
expected = './dev-0/expected.tsv'
|
#expected = './dev-0/expected.tsv'
|
||||||
#in_f = './train/in.tsv'
|
in_f = './train/in.tsv'
|
||||||
in_f = './dev-0/in.tsv'
|
#in_f = './dev-0/in.tsv'
|
||||||
print (f"expected {expected}")
|
print (f"expected {expected}")
|
||||||
print (f"in {in_f}")
|
print (f"in {in_f}")
|
||||||
paranormal_class_lgprob, skeptic_class_logprob = calc_class_logprob(expected)
|
paranormal_class_lgprob, skeptic_class_logprob = calc_class_logprob(expected)
|
||||||
|
Loading…
Reference in New Issue
Block a user