add tokenizer
This commit is contained in:
parent
744e5db758
commit
dafa49e690
10544
dev-0/expected.tsv
10544
dev-0/expected.tsv
File diff suppressed because it is too large
Load Diff
10544
dev-0/out.tsv
10544
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
48
solution.py
48
solution.py
@ -5,14 +5,41 @@ import pickle
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Function words that carry no class signal and are dropped from the text.
_STOPWORDS = frozenset({"a", "on", "the", "of", "an", "to"})

# Single-character clean-up done in one pass: separators become a space,
# sentence punctuation and double quotes are deleted outright.
_CHAR_TABLE = str.maketrans(
    {
        "'": " ",
        "(": " ",
        ")": " ",
        "/": " ",
        ".": None,
        "?": None,
        ",": None,
        "!": None,
        '"': None,
    }
)


def tokenize(text):
    """Normalize a raw post before it is split into tokens.

    Steps, in order:

    1. Expand the contractions ``n't``, ``'s`` and ``'ve``.
    2. Turn an escaped paragraph break (the four literal characters
       ``\\n\\n``) into a space.
    3. Replace apostrophes, parentheses and slashes with a space and delete
       ``. ? , !`` and double quotes.
    4. Drop the stop words ``a``, ``on``, ``the``, ``of``, ``an``, ``to``.

    Args:
        text: The raw post text.

    Returns:
        The cleaned text as a single string. Words remain separated by the
        same space characters as in the input, so callers can keep using
        ``.split(' ')`` on the result.
    """
    # Contractions must be expanded before the apostrophe is stripped below.
    for contraction, expansion in (("n't", " not"), ("'s", " is"), ("'ve", " have")):
        text = text.replace(contraction, expansion)

    # A paragraph break separates words; replacing it with nothing would
    # glue the last word of one paragraph to the first word of the next.
    text = text.replace("\\n\\n", " ")

    text = text.translate(_CHAR_TABLE)

    # Filter stop words token by token instead of replacing " word "
    # substrings: this also catches stop words at the very start or end of
    # the text, runs of consecutive stop words, and capitalised forms.
    # Splitting and joining on a single space keeps empty tokens intact, so
    # the spacing of the remaining words is unchanged.
    kept = [tok for tok in text.split(" ") if tok.lower() not in _STOPWORDS]
    return " ".join(kept)
|
||||
|
||||
|
||||
|
||||
|
||||
def calc_class_logprob(expected_path): #zliczamy ogólne prawdopodobieństwo dla klasy (P(c))
|
||||
paranoarmal_class_count = 0
|
||||
skeptic_class_count = 0
|
||||
with open(expected_path) as f:
|
||||
for line in f:
|
||||
if "P" in line:
|
||||
if "1" in line:
|
||||
paranoarmal_class_count +=1
|
||||
elif "S" in line:
|
||||
elif "0" in line:
|
||||
skeptic_class_count +=1
|
||||
|
||||
paranormal_class_prob = paranoarmal_class_count / (paranoarmal_class_count + skeptic_class_count)
|
||||
@ -26,11 +53,12 @@ def calc_word_counts(in_path, expected_path):
|
||||
for in_line, exp_line in zip(in_file, exp_file):
|
||||
class_ = exp_line.rstrip('\n').replace(" ", "")
|
||||
text, timestamp = in_line.rstrip('\n').split('\t')
|
||||
text = tokenize(text)
|
||||
tokens = text.lower().split(' ')
|
||||
for token in tokens:
|
||||
if class_ == 'P':
|
||||
if class_ == '1':
|
||||
word_counts['paranormal'][token] += 1
|
||||
elif class_ == 'S':
|
||||
elif class_ == '0':
|
||||
word_counts['skeptic'][token] += 1
|
||||
|
||||
return word_counts
|
||||
@ -55,7 +83,7 @@ word_counts = calc_word_counts('train/in.tsv','train/expected.tsv')
|
||||
|
||||
word_logprobs = calc_word_logprobs(word_counts)
|
||||
|
||||
print(word_logprobs['skeptic']["hair."]) #-12.166205308815476
|
||||
#print(word_logprobs['skeptic']["hair."]) #-12.166205308815476
|
||||
|
||||
# Next steps: 1. fetch a post, 2. split it into terms, 3. compute the log-probability of each term, 4. sum them, 5. compare the paranormal total with the skeptic total
|
||||
|
||||
@ -74,6 +102,7 @@ def predict_post_class(posts, sprob, pprob, word_logprobs):
|
||||
for post in posts:
|
||||
total_s_prob = sprob
|
||||
total_p_prob = pprob
|
||||
post = tokenize(post)
|
||||
tokens = post.lower().split(' ')
|
||||
for token in tokens:
|
||||
#dlasceptic
|
||||
@ -92,9 +121,9 @@ def predict_post_class(posts, sprob, pprob, word_logprobs):
|
||||
#print(total_p_prob)
|
||||
#print(total_s_prob)
|
||||
if total_p_prob > total_s_prob:
|
||||
out_classes.append('P')
|
||||
out_classes.append('1')
|
||||
else:
|
||||
out_classes.append('S')
|
||||
out_classes.append('0')
|
||||
|
||||
return out_classes
|
||||
|
||||
@ -104,8 +133,7 @@ def predict_posts(path):
|
||||
classes = predict_post_class(posts, skeptic_class_logprob, paranormal_class_logprob, word_logprobs)
|
||||
with open(path+"/out.tsv", 'wt') as tsvfile:
|
||||
tsv_writer = csv.writer(tsvfile, delimiter='\t')
|
||||
for i in classes:
|
||||
tsv_writer.writerow(i)
|
||||
tsv_writer.writerows(map(lambda x: [x], classes))
|
||||
|
||||
predict_posts("dev-0")
|
||||
predict_posts("test-A")
|
||||
@ -115,6 +143,6 @@ with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
|
||||
positive = 0
|
||||
for out_line, exp_line in zip(out_file, exp_file):
|
||||
counter+=1
|
||||
if " "+out_line == exp_line:
|
||||
if out_line == exp_line:
|
||||
positive += 1
|
||||
print(positive/counter)
|
10304
test-A/out.tsv
10304
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user