s426135 2020-03-22 13:58:35 +01:00
parent 9ea4e1abab
commit 95e6501fe5
7 changed files with 1518 additions and 1515 deletions

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large.

Binary file not shown.


@@ -6,6 +6,7 @@ import re
 def clear_tokens(tokens, is_text=True):
     tokens = tokens.replace('\\n', ' ')
+    return tokens
     tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens)
     tokens = re.sub(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@\\\\±]+', ' ', tokens)
     tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
@@ -22,7 +23,8 @@ def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_l
     text, timestap = post.rstrip('\n').split('\t')
     text = clear_tokens(text, True)
     tokens = text.lower().split(' ')
-    probs = {0.0 : 'sceptic', 0.0 : 'paranormal'}
+    #probs = {0.0 : 'sceptic', 0.0 : 'paranormal'}
+    probs = {}
     for class_ in word_logprobs.keys():
         product = 1
         for token in tokens:
@@ -30,7 +32,7 @@ def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_l
             try:
                 product += word_logprobs[class_][token]
             except KeyError:
-                pass
+                product += 0
         # use the formula here
         if class_ == 'sceptic':
             product += sceptic_class_logprob
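
Note: the hunks above touch the naive-Bayes scoring in calc_post_prob. probs now starts as an empty dict (the old literal {0.0 : 'sceptic', 0.0 : 'paranormal'} collapses to a single entry, because both classes use the same key), and an unknown token now adds 0 to the running score, which behaves the same as the previous pass. Below is a minimal sketch of that scoring idea with made-up numbers; the helper name score_post is hypothetical, and only the shape of word_logprobs and the per-class log-priors follows the diff.

import math

def score_post(tokens, class_logprobs, word_logprobs):
    # Pick the class with the highest log-prior plus summed token log-likelihoods.
    best_class, best_score = None, float('-inf')
    for class_, class_logprob in class_logprobs.items():
        score = class_logprob
        for token in tokens:
            # Unknown tokens contribute 0, mirroring the "product += 0" branch above.
            score += word_logprobs.get(class_, {}).get(token, 0.0)
        if score > best_score:
            best_class, best_score = class_, score
    return best_class

class_logprobs = {'sceptic': math.log(0.6), 'paranormal': math.log(0.4)}
word_logprobs = {
    'sceptic': {'ghost': math.log(0.01), 'data': math.log(0.05)},
    'paranormal': {'ghost': math.log(0.07), 'data': math.log(0.01)},
}
print(score_post(['ghost', 'data'], class_logprobs, word_logprobs))  # -> 'sceptic'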

File diff suppressed because it is too large.


@@ -23,6 +23,7 @@ def calc_class_logprob(expected_path):
 def clear_tokens(tokens, is_text=True):
     tokens = tokens.replace('\\n', ' ')
+    return tokens
     # delete links, special characters, periods, and \n
     tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens)
     tokens = re.sub(r'(|\-|\_)([a-z]+(\-|\_))+[a-z]+(|\-|\_)', ' ', tokens)
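
In both files the new early return makes clear_tokens exit right after replacing literal "\n" sequences, so the regex cleanup below it never runs. A quick standalone illustration of that effect, keeping only the first substitution from the diff for brevity:

import re

def clear_tokens(tokens, is_text=True):
    tokens = tokens.replace('\\n', ' ')
    return tokens
    # Unreachable after the early return: links are no longer stripped.
    tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)', " ", tokens)

print(clear_tokens(r"spooky story\n(http://example.com)"))
# prints: spooky story (http://example.com)  (the link survives because the sub() is skipped)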