laptop commit fixed naive Bayes

This commit is contained in:
Bartosz Ogonowski 2020-05-02 18:49:56 +02:00
parent 4d38c7f755
commit 84b8b45e76
5 changed files with 9578 additions and 9585 deletions

View File

@@ -28,18 +28,13 @@ def calc_word_count(in_path, expected_path):
word_counts = {'paranormal':defaultdict(int), 'skeptic': defaultdict(int)}
with open(in_path,encoding='utf-8') as in_file, open(expected_path,encoding='utf-8') as expected_file:
for line, exp in zip(in_file, expected_file):
class_ = exp.rstrip('\n')
#.replace(' ','')
class_ = exp.rstrip('\n').replace(' ','')
text, timestamp = line.rstrip('\n').split('\t')
text = text.lower()
text = re.sub(r'\\n+', " ", text)
text = re.sub(r'http\S+', " ", text)
text = re.sub(r'\/[a-z]\/', " ", text)
text = re.sub(r'[^a-z]', " ", text)
text = re.sub(r'\s{2,}', " ", text)
text = re.sub(r'(\s+|\\n)', ' ', text)
text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)
text = re.sub(r'^\s', "", text)
tokens = text.split(' ')
for token in tokens:
if class_ == 'P':
@@ -67,6 +62,8 @@ def calc_word_logprobs(word_counts):
def main():
paranomal_class_logprob, skeptic_class_logprob = calc_class_logprob("train/expected.tsv")
word_counts=calc_word_count("train/in.tsv","train/expected.tsv")
word_counts['paranormal'][''] = 0
word_counts['skeptic'][''] = 0
word_logprobs = calc_word_logprobs(word_counts)
pickle.dump([paranomal_class_logprob, skeptic_class_logprob, word_logprobs], open('naive_base_model.pkl','wb'))

View File

@@ -21,12 +21,8 @@ def prediction(input,output):
text = text.lower()
text = re.sub(r'\\n+', " ", text)
text = re.sub(r'http\S+', " ", text)
text = re.sub(r'\/[a-z]\/', " ", text)
text = re.sub(r'[^a-z]', " ", text)
text = re.sub(r'\s{2,}', " ", text)
text = re.sub(r'(\s+|\\n)', ' ', text)
text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)
text = re.sub(r'^\s', "", text)
tokens = text.split(' ')
for token in tokens:
if token not in word_logprobs['paranormal']:

File diff suppressed because it is too large Load Diff

Binary file not shown.

File diff suppressed because it is too large Load Diff