"""Naive Bayes classifier that labels posts as paranormal ("1") or skeptic ("0").

Run as a script, it trains on ``train/``, writes one predicted label per line
to ``dev-0/out.tsv`` and ``test-A/out.tsv``, and prints the accuracy on dev-0.
"""
import csv
from collections import defaultdict
import math
import pickle
import os
from pathlib import Path

# Labels as they appear in expected.tsv and as they are written to out.tsv.
PARANORMAL_LABEL = "1"
SKEPTIC_LABEL = "0"

# Maps a label from expected.tsv to the class key used in the model dicts.
_LABEL_TO_CLASS = {PARANORMAL_LABEL: 'paranormal', SKEPTIC_LABEL: 'skeptic'}

# Ordered (old, new) substitutions applied by tokenize(). Order matters:
# contractions must be expanded before the bare apostrophe is removed, and
# punctuation is stripped before the stop words are dropped.
_TOKEN_REPLACEMENTS = (
    ("n't", " not"),
    ("'s", " is"),
    ("'ve", " have"),
    ("'", " "),
    ("(", " "),
    (")", " "),
    ("/", " "),
    ("\\n\\n", ""),  # literal backslash-n sequences, not real newlines
    (".", ""),
    ("?", ""),
    (",", ""),
    ("!", ""),
    ('"', ""),
    (" a ", " "),
    (" on ", " "),
    (" the ", " "),
    (" of ", " "),
    (" an ", " "),
    (" to ", " "),
)


def tokenize(text):
    """Return *text* with contractions expanded, punctuation and stop words removed.

    Despite the name this returns a string, not a list; callers lower-case
    the result and split it on single spaces themselves.
    """
    for old, new in _TOKEN_REPLACEMENTS:
        text = text.replace(old, new)
    return text


def _normalize_label(line):
    """Strip the trailing newline and any spaces from a label line."""
    return line.rstrip('\n').replace(" ", "")


def calc_class_logprob(expected_path):
    """Estimate the class priors P(c) from the labels in *expected_path*.

    A line counts as paranormal if it contains "1", otherwise as skeptic if
    it contains "0"; other lines are ignored.

    Returns:
        ``(paranormal_prob, skeptic_prob)`` as plain probabilities. Despite
        the function name these are NOT logarithms; predict_post_class()
        takes the log itself.

    Raises:
        ZeroDivisionError: if the file contains no labelled line.
    """
    paranormal_count = 0
    skeptic_count = 0
    with open(expected_path, encoding="utf-8") as f:
        for line in f:
            if PARANORMAL_LABEL in line:
                paranormal_count += 1
            elif SKEPTIC_LABEL in line:
                skeptic_count += 1
    total = paranormal_count + skeptic_count
    return paranormal_count / total, skeptic_count / total


def calc_word_counts(in_path, expected_path):
    """Count token occurrences per class over the training data.

    *in_path* holds ``text<TAB>timestamp`` lines and *expected_path* holds
    the label for the line at the same position. Lines whose label is
    neither "1" nor "0" contribute nothing.

    Returns:
        ``{'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}``
        mapping each lower-cased token to its count.

    Raises:
        ValueError: if an input line does not have exactly two
            tab-separated fields.
    """
    word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
    with open(in_path, encoding="utf-8") as in_file, \
            open(expected_path, encoding="utf-8") as exp_file:
        for in_line, exp_line in zip(in_file, exp_file):
            class_name = _LABEL_TO_CLASS.get(_normalize_label(exp_line))
            text, _timestamp = in_line.rstrip('\n').split('\t')
            if class_name is None:
                continue
            counts = word_counts[class_name]
            for token in tokenize(text).lower().split(' '):
                counts[token] += 1
    return word_counts


def calc_word_logprobs(word_counts):
    """Turn per-class token counts into add-one smoothed probabilities.

    For each class, ``P(token | class) = (count + 1) / (total + vocabulary)``
    where ``total`` is the sum of that class's counts and ``vocabulary`` is
    the number of distinct tokens seen in that class.

    Returns:
        ``{'paranormal': {token: prob}, 'skeptic': {token: prob}}``. Despite
        the function name the values are probabilities, NOT logarithms.
    """
    word_logprobs = {}
    for class_ in ('paranormal', 'skeptic'):
        counts = word_counts[class_]
        denominator = sum(counts.values()) + len(counts)
        word_logprobs[class_] = {
            token: (count + 1) / denominator for token, count in counts.items()
        }
    return word_logprobs


def get_test_posts(path):
    """Read ``text<TAB>timestamp`` lines from *path* and return the texts.

    Raises:
        ValueError: if a line does not have exactly two tab-separated fields.
    """
    posts = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            text, _timestamp = line.rstrip('\n').split('\t')
            posts.append(text)
    return posts


def predict_post_class(posts, sprob, pprob, word_logprobs):
    """Classify each post as paranormal or skeptic.

    Steps per post: tokenize it, add the log of every token's smoothed
    probability to the log prior of each class, then pick the class with the
    higher total. Ties go to skeptic.

    Args:
        posts: raw post texts.
        sprob: prior probability of the skeptic class (not a logarithm).
        pprob: prior probability of the paranormal class (not a logarithm).
        word_logprobs: per-class token probabilities as returned by
            calc_word_logprobs().

    Returns:
        A list with one label per post: "1" for paranormal, "0" for skeptic.
    """
    if not posts:
        return []

    skeptic_probs = word_logprobs['skeptic']
    paranormal_probs = word_logprobs['paranormal']

    # Constant added to every token probability so that unseen tokens never
    # yield log(0). The denominators deliberately keep the original formula
    # (own vocabulary + both vocabularies).
    both_vocabularies = len(skeptic_probs) + len(paranormal_probs)
    skeptic_floor = 1 / (len(skeptic_probs) + both_vocabularies)
    paranormal_floor = 1 / (len(paranormal_probs) + both_vocabularies)

    log_skeptic_prior = math.log(sprob)
    log_paranormal_prior = math.log(pprob)

    out_classes = []
    for post in posts:
        skeptic_score = log_skeptic_prior
        paranormal_score = log_paranormal_prior
        for token in tokenize(post).lower().split(' '):
            skeptic_score += math.log(skeptic_probs.get(token, 0) + skeptic_floor)
            paranormal_score += math.log(
                paranormal_probs.get(token, 0) + paranormal_floor
            )
        # Emit the class label, not the score, so the output is comparable
        # with expected.tsv.
        if paranormal_score > skeptic_score:
            out_classes.append(PARANORMAL_LABEL)
        else:
            out_classes.append(SKEPTIC_LABEL)
    return out_classes


def predict_posts(path):
    """Classify ``<path>/in.tsv`` and write one label per line to ``<path>/out.tsv``.

    Relies on the module-level ``skeptic_class_logprob``,
    ``paranormal_class_logprob`` and ``word_logprobs`` produced by the
    training step in the script entry point below.
    """
    base = Path(path)
    posts = get_test_posts(base / 'in.tsv')
    classes = predict_post_class(
        posts, skeptic_class_logprob, paranormal_class_logprob, word_logprobs
    )
    # newline='' is what the csv module requires for files it writes to.
    with open(base / 'out.tsv', 'wt', encoding="utf-8", newline='') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        tsv_writer.writerows([label] for label in classes)


def calc_accuracy(out_path, expected_path):
    """Return the fraction of lines in *out_path* whose label matches *expected_path*.

    Labels are compared after removing the newline and any spaces, the same
    normalization calc_word_counts() applies to expected labels. Returns 0.0
    when there is nothing to compare.
    """
    counter = 0
    positive = 0
    with open(out_path, encoding="utf-8") as out_file, \
            open(expected_path, encoding="utf-8") as exp_file:
        for out_line, exp_line in zip(out_file, exp_file):
            counter += 1
            if _normalize_label(out_line) == _normalize_label(exp_line):
                positive += 1
    return positive / counter if counter else 0.0


if __name__ == "__main__":
    # Train. These stay module-level names because predict_posts() reads them.
    paranormal_class_logprob, skeptic_class_logprob = calc_class_logprob(
        "train/expected.tsv"
    )
    word_counts = calc_word_counts('train/in.tsv', 'train/expected.tsv')
    word_logprobs = calc_word_logprobs(word_counts)

    predict_posts("dev-0")
    predict_posts("test-A")

    print(calc_accuracy("dev-0/out.tsv", "dev-0/expected.tsv"))