From 710f261670a7fa3968d6c7f16179c108650e49ef Mon Sep 17 00:00:00 2001
From: Bartusiak
Date: Mon, 30 Mar 2020 18:03:14 +0200
Subject: [PATCH] Preparation for ISI-003

---
 code.py            | 64 ----------------------------------------------
 code_prediction.py | 18 -------------
 2 files changed, 82 deletions(-)
 delete mode 100644 code.py
 delete mode 100644 code_prediction.py

diff --git a/code.py b/code.py
deleted file mode 100644
index 6362faf..0000000
--- a/code.py
+++ /dev/null
@@ -1,64 +0,0 @@
-from collections import defaultdict
-import math
-import pickle
-
-open_file=('test-A/out.tsv')
-
-#---------------TRAIN START
-
-#Probability of drawing a document (class prior)
-def calc_class_logprob(expected_path):
-    paranormal_classcount=0
-    skeptic_classcount=0
-    with open(expected_path,encoding='utf-8') as f:
-        for line in f:
-            if 'P' in line:
-                paranormal_classcount += 1
-            if 'S' in line:
-                skeptic_classcount += 1
-
-    paranormal_prob = paranormal_classcount / (paranormal_classcount + skeptic_classcount)
-    skeptic_prob = skeptic_classcount / (paranormal_classcount + skeptic_classcount)
-
-    return math.log(paranormal_prob), math.log(skeptic_prob)
-
-
-def calc_word_count(in_path, expected_path):
-    word_counts = {'paranormal':defaultdict(int), 'skeptic': defaultdict(int)}
-    with open(in_path,encoding='utf-8') as in_file, open(expected_path,encoding='utf-8') as expected_file:
-        for line, exp in zip(in_file, expected_file):
-            class_ = exp.rstrip('\n').replace(' ','')
-            text, timestamp = line.rstrip('\n').split('\t')
-            tokens = text.lower().split(' ')
-            for token in tokens:
-                if class_ == 'P':
-                    word_counts['paranormal'][token] += 1
-                elif class_ == 'S':
-                    word_counts['skeptic'][token] += 1
-    return word_counts
-
-def calc_word_logprobs(word_counts):
-    total_skeptic = sum(word_counts['skeptic'].values()) + len(word_counts['skeptic'].keys())
-    total_paranormal = sum(word_counts['paranormal'].values()) + len(word_counts['paranormal'].keys())
-
-    word_logprobs= {'paranormal': {}, 'skeptic': {}}
-    for class_ in word_counts.keys(): # sceptic paranormal
-        for token, tokens in word_counts[class_].items():
-            if class_ == 'skeptic':
-                word_prob = (tokens+1)/total_skeptic
-            else:
-                word_prob = (tokens+1)/total_paranormal
-            word_logprobs[class_][token] = math.log(word_prob)
-    return word_logprobs
-
-#--------------- TRAIN END
-
-def main():
-    paranomal_class_logprob, skeptic_class_logprob = calc_class_logprob("train/expected.tsv")
-    word_counts=calc_word_count("train/in.tsv","train/expected.tsv")
-    word_logprobs = calc_word_logprobs(word_counts)
-    pickle.dump([paranomal_class_logprob, skeptic_class_logprob, word_logprobs], open('naive_base_model.pkl','wb'))
-#    write_data()
-
-
-main()
diff --git a/code_prediction.py b/code_prediction.py
deleted file mode 100644
index 870459c..0000000
--- a/code_prediction.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from collections import defaultdict
-import math
-import pickle
-
-open_file=open('naive_base_model.pkl','rb')
-write_file_test=open('test-A/out.tsv','w')
-write_file_dev=open('dev-0/out.tsv','w')
-pickle_loaded=pickle.load(open_file)
-paranomal_class_logprob, skeptic_class_logprob, word_logprobs = pickle_loaded
-#Some words from dev-0 and dev-A will not be present in the training set
-for i in word_logprobs.keys():
-    for token, tokens in word_logprobs[i].items():
-        if (word_logprobs['skeptic'][token] > word_logprobs['paranormal'][token]):
-            write_file_test.write("S\n")
-            write_file_dev.write("S\n")
-        else:
-            write_file_test.write("P\n")
-            write_file_dev.write("P\n")
\ No newline at end of file