"""Class-prior and word-count utilities for paranormal/skeptic labelled text.

Run as a script, this reads ``in.tsv`` and ``expected.tsv``, prints row and
label counts with add-one smoothed class priors, and writes a baseline
``prediction.tsv`` that contains the label ``S`` for every input row.
"""
import csv
import math
from collections import defaultdict


def calc_class_logprob(expected_path):
    """Return the log prior of each class found in the labels file.

    A line counts as paranormal when it contains ``"P"``; otherwise it counts
    as skeptic when it contains ``"S"``. Lines containing neither are ignored.

    Args:
        expected_path: Path to the file holding one class label per line.

    Returns:
        A tuple ``(log P(paranormal), log P(skeptic))``.

    Raises:
        ZeroDivisionError: If no line carries a label.
        ValueError: If one of the classes never occurs (log of zero).
    """
    paranormal_class_count = 0
    skeptic_class_count = 0
    with open(expected_path) as f:
        for line in f:
            if "P" in line:
                paranormal_class_count += 1
            elif "S" in line:
                skeptic_class_count += 1

    total = paranormal_class_count + skeptic_class_count
    paranormal_class_prob = paranormal_class_count / total
    skeptic_class_prob = skeptic_class_count / total
    return math.log(paranormal_class_prob), math.log(skeptic_class_prob)


def calc_word_counts(in_path, expected_path):
    """Count token occurrences per class.

    The two files are read in lockstep: line *i* of ``expected_path`` is the
    label of line *i* of ``in_path``. Only the text before the first tab of
    each input line is tokenised; it is lower-cased and split on single
    spaces. Lines whose label is neither ``P`` nor ``S`` are skipped.

    Args:
        in_path: Path to the tab-separated input file (text in column one).
        expected_path: Path to the file holding one class label per line.

    Returns:
        ``{'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}``
        mapping each token to the number of times it occurs in that class.
    """
    word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
    class_keys = {'P': 'paranormal', 'S': 'skeptic'}

    # Each file needs its own ``as`` target; ``open(a), open(b) as x, y`` does
    # not bind both files.
    with open(in_path) as in_file, open(expected_path) as exp_file:
        for in_line, exp_line in zip(in_file, exp_file):
            class_key = class_keys.get(exp_line.strip())
            if class_key is None:
                continue
            # Only the text column is needed; anything after the first tab
            # (the timestamp in the original data) is ignored.
            text, _, _ = in_line.rstrip('\n').partition('\t')
            counts = word_counts[class_key]
            for token in text.lower().split(' '):
                counts[token] += 1
    return word_counts


def calc_words_logprobs(words_counts):
    """Return add-one smoothed log-probabilities of every counted token.

    For each class the denominator is the total number of tokens in that
    class plus the number of distinct tokens in that class, and the numerator
    is the token's count plus one.

    NOTE(review): the denominator uses each class's own vocabulary size, as
    the original totals did; textbook add-one smoothing uses the vocabulary
    shared by all classes - confirm which one is intended.

    Args:
        words_counts: Mapping of class name to a ``{token: count}`` mapping,
            as returned by ``calc_word_counts``.

    Returns:
        Mapping of class name to ``{token: log-probability}``. A class with
        no tokens maps to an empty dict.
    """
    logprobs = {}
    for class_name, counts in words_counts.items():
        total = sum(counts.values()) + len(counts)
        logprobs[class_name] = {
            token: math.log((count + 1) / total)
            for token, count in counts.items()
        }
    return logprobs


def _read_rows(path):
    """Return every row of the tab-separated file at *path* as a list."""
    # newline='' lets the csv module handle line endings itself.
    with open(path, newline='') as tsvfile:
        return list(csv.reader(tsvfile, delimiter='\t'))


def _count_labels(path):
    """Return ``(paranormal, skeptic)`` counts of rows labelled " P" / " S"."""
    pcounter = 0
    scounter = 0
    for row in _read_rows(path):
        # Blank lines come back as empty rows; treat them as unlabelled.
        label = row[0] if row else ''
        if label == " P":
            pcounter += 1
        elif label == " S":
            scounter += 1
    return pcounter, scounter


def main():
    """Print corpus statistics and write the all-``S`` baseline predictions."""
    docs = _read_rows('in.tsv')
    counter = len(docs)
    print(counter)

    pcounter, scounter = _count_labels('expected.tsv')
    print(pcounter)
    print(scounter)

    # Add-one smoothing over two classes: (count + 1) / (rows + 2). The
    # parentheses matter; without them the division binds only to "1/counter".
    print("P(S) = " + str((scounter + 1) / (counter + 2)))
    print("P(P) = " + str((pcounter + 1) / (counter + 2)))

    with open('prediction.tsv', 'wt', newline='') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        # One single-field row per input document.
        tsv_writer.writerows(['S'] for _ in range(counter))


if __name__ == "__main__":
    main()