paranormal-or-skeptic/solution.py

72 lines
2.3 KiB
Python
Raw Normal View History

2020-03-09 14:37:26 +01:00
import csv
2020-03-09 18:30:02 +01:00
from collections import defaultdict
import math
2020-03-09 14:37:26 +01:00
# Read every row of the tab-separated input file into memory, then report
# how many rows were read. `docs` holds the parsed rows (lists of fields);
# `counter` is the number of rows.
with open('in.tsv') as tsvfile:
    docs = list(csv.reader(tsvfile, delimiter='\t'))
counter = len(docs)
print(counter)
2020-03-09 18:30:02 +01:00
# Count how many rows of the expected-output file are labelled "P" and how
# many are labelled "S", then print both counts and the class priors.
pcounter = 0
scounter = 0
with open('expected.tsv') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line; there is no
            # label to inspect, so skip it instead of failing on row[0].
            continue
        # The original compared against " P" / " S" (label with a leading
        # space); stripping keeps those matching and also accepts bare labels.
        label = row[0].strip()
        if label == "P":
            pcounter += 1
        elif label == "S":
            scounter += 1
print(pcounter)
print(scounter)
# The +1 / +2 terms look like add-one smoothing over the two classes, with
# `counter` (the number of rows read from in.tsv earlier in this script) as
# the total. The parentheses matter: without them the expression evaluated
# as count + (1 / counter) + 2, which is not a probability.
print("P(S) = " + str((scounter + 1) / (counter + 2)))
print("P(P) = " + str((pcounter + 1) / (counter + 2)))
def calc_class_logprob(expected_path):
    """Return the log prior of each class, estimated from a label file.

    Each line of the file is treated as one label: a line containing "P"
    counts towards the paranormal class, otherwise a line containing "S"
    counts towards the skeptic class. Lines containing neither are ignored.

    Args:
        expected_path: Path to the label file, one label per line.

    Returns:
        A tuple ``(paranormal_logprob, skeptic_logprob)`` holding the natural
        logarithm of each class's share of the counted lines. A class that
        never occurs gets ``float("-inf")`` (the log of probability zero).

    Raises:
        ValueError: If no line contains a "P" or "S" label, so no
            probabilities can be estimated.
    """
    paranormal_class_count = 0
    skeptic_class_count = 0
    with open(expected_path) as f:
        for line in f:
            if "P" in line:
                paranormal_class_count += 1
            elif "S" in line:
                skeptic_class_count += 1

    total = paranormal_class_count + skeptic_class_count
    if total == 0:
        raise ValueError(
            "no 'P' or 'S' labels found in {!r}".format(expected_path)
        )

    def _log_share(count):
        # math.log(0) raises "math domain error"; a class with no examples
        # has probability zero, whose logarithm is negative infinity.
        if count == 0:
            return float("-inf")
        return math.log(count / total)

    return _log_share(paranormal_class_count), _log_share(skeptic_class_count)
def calc_word_counts(in_path, expected_path):
    """Count token occurrences per class from paired input and label files.

    The two files are read in lockstep: line *i* of ``in_path`` is the
    document and line *i* of ``expected_path`` is its label. Reading stops at
    the end of the shorter file.

    Args:
        in_path: Path to the tab-separated input file. Only the first
            tab-separated field of each line (the text) is used.
        expected_path: Path to the label file, one label per line. After
            removing spaces, "P" selects the paranormal class and "S" the
            skeptic class; any other label causes the line to be skipped.

    Returns:
        A dict with the keys ``'paranormal'`` and ``'skeptic'``, each mapping
        to a ``defaultdict(int)`` of lower-cased token -> occurrence count.
    """
    word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
    class_keys = {'P': 'paranormal', 'S': 'skeptic'}

    # Each file needs its own `as` target; the original
    # `open(a), open(b) as x, y` form bound only one name, to the wrong file.
    with open(in_path) as in_file, open(expected_path) as exp_file:
        for in_line, exp_line in zip(in_file, exp_file):
            class_ = exp_line.rstrip('\n').replace(" ", "")
            class_key = class_keys.get(class_)
            if class_key is None:
                continue

            # Only the text column is needed; partition() also tolerates
            # lines with no tab or with more than two fields.
            text = in_line.rstrip('\n').partition('\t')[0]

            # Tokenisation is a plain split on single spaces, as in the
            # original, so consecutive spaces produce empty-string tokens.
            counts = word_counts[class_key]
            for token in text.lower().split(' '):
                counts[token] += 1
    return word_counts
def calc_words_logprobs(words_counts):
    """Return add-one-smoothed log-probabilities of each token per class.

    For each class the denominator is the total number of token occurrences
    in that class plus the number of distinct tokens in that class; each
    token's probability is ``(count + 1) / denominator``.

    NOTE(review): the original body stopped after computing the two
    denominators (and did not parse). The per-token step below is the
    completion those denominators suggest; confirm it matches the intended
    model before relying on it.

    Args:
        words_counts: Dict with the keys ``'paranormal'`` and ``'skeptic'``,
            each mapping token -> occurrence count.

    Returns:
        A dict with the same two keys, each mapping token -> natural log of
        its smoothed probability within that class.
    """
    words_logprobs = {}
    for class_name in ('paranormal', 'skeptic'):
        class_counts = words_counts[class_name]
        # Occurrences plus vocabulary size: the add-one smoothing denominator.
        total = sum(class_counts.values()) + len(class_counts)
        words_logprobs[class_name] = {
            token: math.log((count + 1) / total)
            for token, count in class_counts.items()
        }
    return words_logprobs
2020-03-09 14:37:26 +01:00
2020-03-09 18:30:02 +01:00
# with open('prediction.tsv', 'wt') as tsvfile:
# tsv_writer = csv.writer(tsvfile, delimiter='\t')
# for i in range(counter):
# tsv_writer.writerow('S')
2020-03-09 14:37:26 +01:00