laptop commit: fixed naive Bayes
This commit is contained in:
parent
4d38c7f755
commit
84b8b45e76
9
code.py
9
code.py
@ -28,18 +28,13 @@ def calc_word_count(in_path, expected_path):
|
||||
word_counts = {'paranormal':defaultdict(int), 'skeptic': defaultdict(int)}
|
||||
with open(in_path,encoding='utf-8') as in_file, open(expected_path,encoding='utf-8') as expected_file:
|
||||
for line, exp in zip(in_file, expected_file):
|
||||
class_ = exp.rstrip('\n')
|
||||
#.replace(' ','')
|
||||
class_ = exp.rstrip('\n').replace(' ','')
|
||||
text, timestamp = line.rstrip('\n').split('\t')
|
||||
text = text.lower()
|
||||
text = re.sub(r'\\n+', " ", text)
|
||||
text = re.sub(r'http\S+', " ", text)
|
||||
text = re.sub(r'\/[a-z]\/', " ", text)
|
||||
text = re.sub(r'[^a-z]', " ", text)
|
||||
text = re.sub(r'\s{2,}', " ", text)
|
||||
text = re.sub(r'(\s+|\\n)', ' ', text)
|
||||
text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)
|
||||
text = re.sub(r'^\s', "", text)
|
||||
tokens = text.split(' ')
|
||||
for token in tokens:
|
||||
if class_ == 'P':
|
||||
@ -67,6 +62,8 @@ def calc_word_logprobs(word_counts):
|
||||
def main():
|
||||
paranomal_class_logprob, skeptic_class_logprob = calc_class_logprob("train/expected.tsv")
|
||||
word_counts=calc_word_count("train/in.tsv","train/expected.tsv")
|
||||
word_counts['paranormal'][''] = 0
|
||||
word_counts['skeptic'][''] = 0
|
||||
word_logprobs = calc_word_logprobs(word_counts)
|
||||
pickle.dump([paranomal_class_logprob, skeptic_class_logprob, word_logprobs], open('naive_base_model.pkl','wb'))
|
||||
|
||||
|
@ -21,12 +21,8 @@ def prediction(input,output):
|
||||
text = text.lower()
|
||||
text = re.sub(r'\\n+', " ", text)
|
||||
text = re.sub(r'http\S+', " ", text)
|
||||
text = re.sub(r'\/[a-z]\/', " ", text)
|
||||
text = re.sub(r'[^a-z]', " ", text)
|
||||
text = re.sub(r'\s{2,}', " ", text)
|
||||
text = re.sub(r'(\s+|\\n)', ' ', text)
|
||||
text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)
|
||||
text = re.sub(r'^\s', "", text)
|
||||
tokens = text.split(' ')
|
||||
for token in tokens:
|
||||
if token not in word_logprobs['paranormal']:
|
||||
|
9660
dev-0/out.tsv
9660
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
Binary file not shown.
9490
test-A/out.tsv
9490
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user