# Origin: patch c0c541dcedf5150617a20e04d6e5da1bbba35c0a ("PyCharm test commit",
# Bartosz Ogonowski, Sun, 22 Mar 2020 14:21:40 +0100), which added code.py.
"""Train a two-class (paranormal / skeptic) Naive Bayes model and pickle it."""

from collections import defaultdict
import math
import pickle

# Default training-data locations and output file used when run as a script.
DEFAULT_IN_PATH = "F:/UAM/SEMESTR_I_MGR/SYSTEMY_INTELIGENTNE/ic4g/train/in.tsv"
DEFAULT_EXPECTED_PATH = "F:/UAM/SEMESTR_I_MGR/SYSTEMY_INTELIGENTNE/ic4g/train/expected.tsv"
DEFAULT_MODEL_PATH = 'naive_base_model.pkl'


def _parse_label(line):
    """Return the class label of one expected-file line, without newline or spaces."""
    return line.rstrip('\n').replace(' ', '')


def calc_class_logprob(expected_path):
    """Compute the log prior probability of each class.

    Args:
        expected_path: Path to a file with one label per line; 'P' marks a
            paranormal example and 'S' a skeptic one. Spaces around the
            label are ignored; lines with any other content are skipped.

    Returns:
        A tuple ``(paranormal_logprob, skeptic_logprob)``.

    Raises:
        ValueError: If the file has no 'P'/'S' labels, or if one of the two
            classes never occurs (its log prior would be undefined).
    """
    paranormal_classcount = 0
    skeptic_classcount = 0
    with open(expected_path) as f:
        for line in f:
            # Same normalisation as calc_word_count, so the priors and the
            # word counts are always based on the same set of examples.
            label = _parse_label(line)
            if label == 'P':
                paranormal_classcount += 1
            elif label == 'S':
                skeptic_classcount += 1

    total = paranormal_classcount + skeptic_classcount
    if total == 0:
        raise ValueError(f"no 'P' or 'S' labels found in {expected_path!r}")
    if paranormal_classcount == 0 or skeptic_classcount == 0:
        raise ValueError(
            f"{expected_path!r} must contain at least one 'P' and one 'S' label"
        )

    paranormal_prob = paranormal_classcount / total
    skeptic_prob = skeptic_classcount / total
    return math.log(paranormal_prob), math.log(skeptic_prob)


def calc_word_count(in_path, expected_path):
    """Count token occurrences per class.

    Args:
        in_path: Path to a tab-separated file whose first column is the
            document text; any further columns are ignored.
        expected_path: Path to the label file, read in lockstep with
            ``in_path`` (one label per document).

    Returns:
        ``{'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}``
        mapping each lower-cased token to its number of occurrences in
        documents of that class.
    """
    word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
    class_names = {'P': 'paranormal', 'S': 'skeptic'}
    with open(in_path) as in_file, open(expected_path) as expected_file:
        for line, exp in zip(in_file, expected_file):
            class_name = class_names.get(_parse_label(exp))
            if class_name is None:
                # Unknown label: the document contributes to neither class.
                continue
            # Only the first column is used, so rows with a missing or
            # extra column no longer abort the whole run.
            text = line.rstrip('\n').partition('\t')[0]
            # Split on single spaces (not arbitrary whitespace) so the
            # token set stays identical to what this script always produced.
            for token in text.lower().split(' '):
                word_counts[class_name][token] += 1
    return word_counts


def calc_word_logprobs(word_counts):
    """Turn per-class token counts into add-one-smoothed log probabilities.

    Args:
        word_counts: Mapping ``class name -> {token: count}`` as returned by
            ``calc_word_count``.

    Returns:
        Mapping ``class name -> {token: log((count + 1) / denominator)}``
        where ``denominator`` is that class's total token count plus the
        number of distinct tokens seen in that class. The 'paranormal' and
        'skeptic' keys are always present.
    """
    word_logprobs = {'paranormal': {}, 'skeptic': {}}
    for class_, counts in word_counts.items():
        # Each class is normalised by its own total.
        denominator = sum(counts.values()) + len(counts)
        word_logprobs[class_] = {
            token: math.log((count + 1) / denominator)
            for token, count in counts.items()
        }
    return word_logprobs


def main(in_path=DEFAULT_IN_PATH, expected_path=DEFAULT_EXPECTED_PATH,
         model_path=DEFAULT_MODEL_PATH):
    """Train the model and pickle it.

    Args:
        in_path: Training documents (tab-separated, text in first column).
        expected_path: Training labels, one per line.
        model_path: Output file; receives the pickled list
            ``[paranormal_class_logprob, skeptic_class_logprob, word_logprobs]``.
    """
    paranormal_class_logprob, skeptic_class_logprob = calc_class_logprob(expected_path)
    word_counts = calc_word_count(in_path, expected_path)
    word_logprobs = calc_word_logprobs(word_counts)
    # Context manager guarantees the model file is flushed and closed.
    with open(model_path, 'wb') as model_file:
        pickle.dump(
            [paranormal_class_logprob, skeptic_class_logprob, word_logprobs],
            model_file,
        )


if __name__ == "__main__":
    main()