old way
This commit is contained in:
parent
9ea4e1abab
commit
95e6501fe5
BIN
.predict.py.swp
BIN
.predict.py.swp
Binary file not shown.
BIN
.train.py.swp
BIN
.train.py.swp
Binary file not shown.
1182
dev-0/out.tsv
1182
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@ -6,6 +6,7 @@ import re
|
|||||||
|
|
||||||
def clear_tokens(tokens, is_text=True):
|
def clear_tokens(tokens, is_text=True):
|
||||||
tokens = tokens.replace('\\n', ' ')
|
tokens = tokens.replace('\\n', ' ')
|
||||||
|
return tokens
|
||||||
tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens)
|
tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens)
|
||||||
tokens = re.sub(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@\’\>\″\±]+', ' ', tokens)
|
tokens = re.sub(r'[\n\&\"\?\\\'\*\[\]\,\;\.\=\+\(\)\!\/\:\`\~\%\^\$\#\@\’\>\″\±]+', ' ', tokens)
|
||||||
tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
|
tokens = re.sub(r'[\.\-][\.\-]+', ' ', tokens)
|
||||||
@ -22,7 +23,8 @@ def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_l
|
|||||||
text, timestap = post.rstrip('\n').split('\t')
|
text, timestap = post.rstrip('\n').split('\t')
|
||||||
text = clear_tokens(text, True)
|
text = clear_tokens(text, True)
|
||||||
tokens = text.lower().split(' ')
|
tokens = text.lower().split(' ')
|
||||||
probs = {0.0 : 'sceptic', 0.0 : 'paranormal'}
|
#probs = {0.0 : 'sceptic', 0.0 : 'paranormal'}
|
||||||
|
probs = {}
|
||||||
for class_ in word_logprobs.keys():
|
for class_ in word_logprobs.keys():
|
||||||
product = 1
|
product = 1
|
||||||
for token in tokens:
|
for token in tokens:
|
||||||
@ -30,7 +32,7 @@ def calc_post_prob(post, paranormal_class_logprob, sceptic_class_logprob, word_l
|
|||||||
try:
|
try:
|
||||||
product += word_logprobs[class_][token]
|
product += word_logprobs[class_][token]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
product += 0
|
||||||
# tu wzoru uzyj
|
# tu wzoru uzyj
|
||||||
if class_ == 'sceptic':
|
if class_ == 'sceptic':
|
||||||
product += sceptic_class_logprob
|
product += sceptic_class_logprob
|
||||||
|
1844
test-A/out.tsv
1844
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
1
train.py
1
train.py
@ -23,6 +23,7 @@ def calc_class_logprob(expected_path):
|
|||||||
|
|
||||||
def clear_tokens(tokens, is_text=True):
|
def clear_tokens(tokens, is_text=True):
|
||||||
tokens = tokens.replace('\\n', ' ')
|
tokens = tokens.replace('\\n', ' ')
|
||||||
|
return tokens
|
||||||
# delete links, special characters, kropki, and \n
|
# delete links, special characters, kropki, and \n
|
||||||
tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens)
|
tokens = re.sub(r'\(((http)|(https)).*((\.com)|(\.net)|(\.jpg)|(\.html))\)'," ", tokens)
|
||||||
tokens = re.sub(r'(|\-|\_)([a-z]+(\-|\_))+[a-z]+(|\-|\_)', ' ', tokens)
|
tokens = re.sub(r'(|\-|\_)([a-z]+(\-|\_))+[a-z]+(|\-|\_)', ' ', tokens)
|
||||||
|
Loading…
Reference in New Issue
Block a user