"""Naive Bayes classifier labelling posts as paranormal ('P') or skeptic ('S').

Run as a script it trains on ``train/in.tsv`` + ``train/expected.tsv``, writes
predictions to ``out.tsv`` inside ``dev-0`` and ``test-A``, and prints the
accuracy obtained on ``dev-0``.
"""
import csv
import math
import os
import pickle
from collections import defaultdict
from pathlib import Path


def _read_label(line):
    """Return the class label of an ``expected.tsv`` line, whitespace removed."""
    return "".join(line.split())


def _post_text(line):
    """Return the text column (everything before the first tab) of an input line."""
    return line.rstrip('\n').partition('\t')[0]


def _tokenize(text):
    """Lower-case *text* and split it on whitespace; never yields empty tokens."""
    return text.lower().split()


def _log_or_neg_inf(probability):
    """Return ``log(probability)``, or ``-inf`` for a probability of zero."""
    return math.log(probability) if probability > 0 else float('-inf')


def calc_class_logprob(expected_path):
    """Compute the log prior P(c) of both classes from a label file.

    Args:
        expected_path: path of a file with one label per line, ``P`` or ``S``
            (surrounding whitespace is ignored); other lines are skipped.

    Returns:
        Tuple ``(paranormal_logprob, skeptic_logprob)``. A class that never
        occurs gets ``-inf`` so that it can never win a comparison.

    Raises:
        ValueError: if the file contains no ``P`` or ``S`` label at all.
    """
    paranormal_class_count = 0
    skeptic_class_count = 0
    with open(expected_path, encoding='utf-8') as f:
        for line in f:
            # Exact match, the same way calc_word_counts interprets labels.
            label = _read_label(line)
            if label == 'P':
                paranormal_class_count += 1
            elif label == 'S':
                skeptic_class_count += 1

    total = paranormal_class_count + skeptic_class_count
    if total == 0:
        raise ValueError(f"no 'P' or 'S' labels found in {expected_path}")
    return (
        _log_or_neg_inf(paranormal_class_count / total),
        _log_or_neg_inf(skeptic_class_count / total),
    )


def calc_word_counts(in_path, expected_path):
    """Count token occurrences per class over the training data.

    Args:
        in_path: path of the tab-separated input file; the first column of
            each line is the post text.
        expected_path: path of the label file, line-aligned with *in_path*.

    Returns:
        ``{'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}``
        mapping each lower-cased token to its number of occurrences. Posts
        whose label is neither ``P`` nor ``S`` are ignored.
    """
    word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
    class_by_label = {'P': 'paranormal', 'S': 'skeptic'}
    with open(in_path, encoding='utf-8') as in_file, \
            open(expected_path, encoding='utf-8') as exp_file:
        for in_line, exp_line in zip(in_file, exp_file):
            class_ = class_by_label.get(_read_label(exp_line))
            if class_ is None:
                continue
            counts = word_counts[class_]
            for token in _tokenize(_post_text(in_line)):
                counts[token] += 1
    return word_counts


def calc_word_logprobs(word_counts):
    """Turn per-class token counts into add-one smoothed log-likelihoods.

    Smoothing is done over the vocabulary shared by both classes, so every
    token seen in either class gets an entry in both tables and each table
    is a proper probability distribution over that vocabulary.

    Args:
        word_counts: mapping with keys ``'paranormal'`` and ``'skeptic'``,
            each a token -> count mapping (as built by calc_word_counts).

    Returns:
        ``{'paranormal': {token: logprob}, 'skeptic': {token: logprob}}``.
    """
    classes = ('paranormal', 'skeptic')
    vocabulary = set()
    for class_ in classes:
        vocabulary.update(word_counts[class_])
    vocabulary_size = len(vocabulary)

    word_logprobs = {}
    for class_ in classes:
        counts = word_counts[class_]
        denominator = sum(counts.values()) + vocabulary_size
        # .get() instead of [] so a defaultdict input is not mutated.
        word_logprobs[class_] = {
            token: math.log((counts.get(token, 0) + 1) / denominator)
            for token in vocabulary
        }
    return word_logprobs


# Prediction steps: 1. fetch a post, 2. split it into terms, 3. look up the
# log-probability of every term, 4. add them up, 5. compare paranormal with
# skeptic.
def get_test_posts(path):
    """Return the text column of every line of the tab-separated file *path*."""
    with open(path, encoding='utf-8') as f:
        return [_post_text(line) for line in f]


def predict_post_class(posts, sprob, pprob, word_logprobs):
    """Classify each post as ``'P'`` (paranormal) or ``'S'`` (skeptic).

    Args:
        posts: iterable of post texts.
        sprob: log prior of the skeptic class.
        pprob: log prior of the paranormal class.
        word_logprobs: ``{'paranormal': {...}, 'skeptic': {...}}`` token
            log-likelihood tables.

    Returns:
        List of labels, one per post. Ties go to ``'S'``.
    """
    skeptic_table = word_logprobs['skeptic']
    paranormal_table = word_logprobs['paranormal']
    out_classes = []
    for post in posts:
        total_s_prob = sprob
        total_p_prob = pprob
        for token in _tokenize(post):
            skeptic_prob = skeptic_table.get(token)
            paranormal_prob = paranormal_table.get(token)
            # A token without a likelihood in both tables is skipped. Adding
            # 0 for the missing side would mean probability 1 and bias the
            # result towards the class that has never seen the word.
            if skeptic_prob is None or paranormal_prob is None:
                continue
            total_s_prob += skeptic_prob
            total_p_prob += paranormal_prob
        out_classes.append('P' if total_p_prob > total_s_prob else 'S')
    return out_classes


def predict_posts(path):
    """Predict labels for ``<path>/in.tsv`` and write them to ``<path>/out.tsv``.

    Uses the module-level ``skeptic_class_logprob``, ``paranormal_class_logprob``
    and ``word_logprobs`` produced by the training step, so training has to
    run before this function is called.
    """
    directory = Path(path)
    posts = get_test_posts(directory / 'in.tsv')
    classes = predict_post_class(
        posts, skeptic_class_logprob, paranormal_class_logprob, word_logprobs
    )
    # newline='' plus an explicit terminator gives plain "\n" line endings on
    # every platform instead of the csv default "\r\n".
    with open(directory / 'out.tsv', 'w', newline='', encoding='utf-8') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
        tsv_writer.writerows([label] for label in classes)


def _accuracy(out_path, expected_path):
    """Return the fraction of line-aligned labels that agree (0.0 if no lines)."""
    counter = 0
    positive = 0
    with open(out_path, encoding='utf-8') as out_file, \
            open(expected_path, encoding='utf-8') as exp_file:
        for out_line, exp_line in zip(out_file, exp_file):
            counter += 1
            # Compare labels only; padding and line endings are irrelevant.
            if _read_label(out_line) == _read_label(exp_line):
                positive += 1
    return positive / counter if counter else 0.0


if __name__ == "__main__":
    # Training: class priors and smoothed token likelihoods.
    paranormal_class_logprob, skeptic_class_logprob = calc_class_logprob("train/expected.tsv")
    word_counts = calc_word_counts('train/in.tsv', 'train/expected.tsv')
    word_logprobs = calc_word_logprobs(word_counts)
    # Sanity check: log-likelihood of one sample training token.
    print(word_logprobs['skeptic']["hair."])

    predict_posts("dev-0")
    predict_posts("test-A")

    print(_accuracy("dev-0/out.tsv", "dev-0/expected.tsv"))