From db710a4df843cfb3d88059f0bb3bbe11204943a5 Mon Sep 17 00:00:00 2001 From: Bartusiak Date: Thu, 2 Apr 2020 12:44:08 +0200 Subject: [PATCH] New branch --- code.py | 57 ---------------------------------------------- code_prediction.py | 38 ------------------------------- 2 files changed, 95 deletions(-) delete mode 100644 code_prediction.py diff --git a/code.py b/code.py index 0114570..fcae292 100644 --- a/code.py +++ b/code.py @@ -2,61 +2,4 @@ from collections import defaultdict import math import pickle -open_file=('test-A/out.tsv') -#---------------TRAIN START - -#Prawdopodobienstwo wylosowania dokumentu -def calc_class_logprob(expected_path): - paranormal_classcount=0 - skeptic_classcount=0 - with open(expected_path,encoding='utf-8') as f: - for line in f: - if 'P' in line: - paranormal_classcount += 1 - if 'S' in line: - skeptic_classcount += 1 - - paranormal_prob = paranormal_classcount / (paranormal_classcount + skeptic_classcount) - skeptic_prob = skeptic_classcount / (paranormal_classcount + skeptic_classcount) - - return math.log(paranormal_prob), math.log(skeptic_prob) - - -def calc_word_count(in_path, expected_path): - word_counts = {'paranormal':defaultdict(int), 'skeptic': defaultdict(int)} - with open(in_path,encoding='utf-8') as in_file, open(expected_path,encoding='utf-8') as expected_file: - for line, exp in zip(in_file, expected_file): - class_ = exp.rstrip('\n').replace(' ','') - text, timestamp = line.rstrip('\n').split('\t') - tokens = text.lower().split(' ') - for token in tokens: - if class_ == 'P': - word_counts['paranormal'][token] += 1 - elif class_ == 'S': - word_counts['skeptic'][token] += 1 - return word_counts - -def calc_word_logprobs(word_counts): - total_skeptic = sum(word_counts['skeptic'].values()) + len(word_counts['skeptic'].keys()) - total_paranormal = sum(word_counts['paranormal'].values()) + len(word_counts['paranormal'].keys()) - - word_logprobs= {'paranormal': {}, 'skeptic': {}} - for class_ in word_counts.keys(): # sceptic paranormal - for token, tokens in word_counts[class_].items(): - if class_ == 'skeptic': - word_prob = (tokens+1)/total_skeptic - else: - word_prob = (tokens+1)/total_paranormal - word_logprobs[class_][token] = math.log(word_prob) - return word_logprobs - -#--------------- TRAIN END - -def main(): - paranomal_class_logprob, skeptic_class_logprob = calc_class_logprob("train/expected.tsv") - word_counts=calc_word_count("train/in.tsv","train/expected.tsv") - word_logprobs = calc_word_logprobs(word_counts) - pickle.dump([paranomal_class_logprob, skeptic_class_logprob, word_logprobs], open('naive_base_model.pkl','wb')) - -main() diff --git a/code_prediction.py b/code_prediction.py deleted file mode 100644 index 969e898..0000000 --- a/code_prediction.py +++ /dev/null @@ -1,38 +0,0 @@ -from collections import defaultdict -import math -import pickle - -open_file = open('naive_base_model.pkl', 'rb') -pickle_loaded = pickle.load(open_file) -paranomal_class_logprob, skeptic_class_logprob, word_logprobs = pickle_loaded -#pickle_loaded=pickle.load(open_file) -#paranomal_class_logprob, skeptic_class_logprob, word_logprobs = pickle_loaded -#Niektórych słów nie bezie w zbiorze treningowym dev-0 i dev-A -def prediction(input,output): - output_file = open(output,'w') - with open(input,encoding='utf-8') as in_file: - for line in in_file: - temp_paranormal_logprob = paranomal_class_logprob - temp_skeptic_logprob = skeptic_class_logprob - text, timestamp = line.rstrip('\n').split('\t') - tokens = text.lower().split(' ') - for token in tokens: - if token not in word_logprobs['paranormal']: - word_logprobs['paranormal'][token] = 0 - if token not in word_logprobs['skeptic']: - word_logprobs['skeptic'][token] = 0 - - temp_paranormal_logprob += paranomal_class_logprob + word_logprobs['paranormal'][token] - temp_skeptic_logprob += skeptic_class_logprob + word_logprobs['skeptic'][token] - - if temp_paranormal_logprob > temp_skeptic_logprob: - output_file.write('P\n') - else: - output_file.write('S\n') - -def main(): - prediction('dev-0/in.tsv','dev-0/out.tsv') - prediction('test-A/in.tsv/in.tsv','test-A/out.tsv') - -main() -