2020-03-09 14:37:26 +01:00
|
|
|
import csv
|
2020-03-09 18:30:02 +01:00
|
|
|
from collections import defaultdict
|
|
|
|
import math
|
2020-03-09 14:37:26 +01:00
|
|
|
|
|
|
|
# Load every input row up front; the number of rows is the corpus size used
# as the denominator of the class priors printed below.
with open('in.tsv', encoding='utf-8', newline='') as tsvfile:
    docs = list(csv.reader(tsvfile, delimiter='\t'))
counter = len(docs)

print(counter)

# Count how many gold labels belong to each class.
pcounter = 0
scounter = 0
with open('expected.tsv', encoding='utf-8', newline='') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    for row in reader:
        if not row:
            # A blank line yields an empty row; there is no label to read.
            continue
        # Labels may carry surrounding whitespace (e.g. " P"), so strip
        # before comparing, as calc_word_counts does.
        label = row[0].strip()
        if label == "P":
            pcounter += 1
        elif label == "S":
            scounter += 1

print(pcounter)
print(scounter)

# Add-one (Laplace) smoothed class priors: (class count + 1) / (total + 2).
# The parentheses matter: without them `/` binds tighter than `+` and the
# expression degenerates to `count + 1/total + 2`.
# NOTE(review): assumes in.tsv and expected.tsv have the same number of rows.
print("P(S) = " + str((scounter + 1) / (counter + 2)))
print("P(P) = " + str((pcounter + 1) / (counter + 2)))
|
|
|
|
|
|
|
|
def calc_class_logprob(expected_path):
    """Estimate the log prior probability of each class from a label file.

    Every line of the file is expected to start with a class label, ``P``
    (paranormal) or ``S`` (skeptic), optionally surrounded by whitespace.
    Lines carrying any other label are ignored.

    Args:
        expected_path: Path to the label file, one label per line.

    Returns:
        A tuple ``(log P(paranormal), log P(skeptic))`` of maximum-likelihood
        log priors. A class that never occurs gets ``-math.inf``.

    Raises:
        ZeroDivisionError: If the file contains no ``P`` or ``S`` label.
    """
    paranormal_class_count = 0
    skeptic_class_count = 0
    with open(expected_path, encoding='utf-8') as f:
        for line in f:
            # Compare the whole (stripped) first field instead of testing for
            # a substring, so stray text containing "P" or "S" is not counted.
            label = line.split('\t', 1)[0].strip()
            if label == 'P':
                paranormal_class_count += 1
            elif label == 'S':
                skeptic_class_count += 1

    total = paranormal_class_count + skeptic_class_count
    paranormal_class_prob = paranormal_class_count / total
    skeptic_class_prob = skeptic_class_count / total

    def _safe_log(prob):
        # math.log(0) raises ValueError; log of a zero probability is -inf.
        return math.log(prob) if prob > 0 else -math.inf

    return _safe_log(paranormal_class_prob), _safe_log(skeptic_class_prob)
|
|
|
|
|
|
|
|
def calc_word_counts(in_path, expected_path):
    """Count token occurrences per class over a labelled corpus.

    The two files are read in lockstep: line *i* of ``expected_path`` holds
    the class label (``P`` or ``S``, spaces ignored) of the document on line
    *i* of ``in_path``. Only the first tab-separated field of each input line
    is treated as text; it is lower-cased and split on single spaces.

    Args:
        in_path: Path to the tab-separated input file (text in column one).
        expected_path: Path to the label file, one label per line.

    Returns:
        A dict with keys ``'paranormal'`` and ``'skeptic'``, each mapping to
        a ``defaultdict(int)`` of token -> occurrence count for that class.
        Documents with any other label contribute nothing.
    """
    word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
    # Maps a cleaned label to the bucket its tokens are counted in.
    bucket_for_label = {'P': 'paranormal', 'S': 'skeptic'}

    # Each file needs its own `as` target; `open(a), open(b) as x, y` would
    # bind only the second file and treat `y` as a third context expression.
    with open(in_path, encoding='utf-8') as in_file, \
            open(expected_path, encoding='utf-8') as exp_file:
        # zip stops at the shorter file, so unpaired trailing lines are skipped.
        for in_line, exp_line in zip(in_file, exp_file):
            class_ = exp_line.rstrip('\n').replace(" ", "")
            bucket = bucket_for_label.get(class_)
            if bucket is None:
                continue
            # Only the text column is needed; any further columns (such as a
            # timestamp) are ignored.
            text = in_line.rstrip('\n').split('\t', 1)[0]
            # Splitting on a single space is kept as the tokenization rule.
            for token in text.lower().split(' '):
                word_counts[bucket][token] += 1

    return word_counts
|
|
|
|
|
|
|
|
|
|
|
|
def calc_words_logprobs(words_counts):
    """Turn per-class token counts into add-one smoothed log-probabilities.

    For each class the denominator is the total number of token occurrences
    in that class plus the number of distinct tokens in that class; each
    token's probability is ``(count + 1) / denominator``.

    Args:
        words_counts: Mapping with keys ``'paranormal'`` and ``'skeptic'``,
            each a mapping of token -> occurrence count.

    Returns:
        A dict with keys ``'paranormal'`` and ``'skeptic'``, each mapping
        token -> natural-log smoothed probability of that token in the class.
        A class with no tokens maps to an empty dict.
    """
    total_skeptic = (sum(words_counts['skeptic'].values())
                     + len(words_counts['skeptic']))
    total_paranormal = (sum(words_counts['paranormal'].values())
                        + len(words_counts['paranormal']))

    return {
        'paranormal': {
            token: math.log((count + 1) / total_paranormal)
            for token, count in words_counts['paranormal'].items()
        },
        'skeptic': {
            token: math.log((count + 1) / total_skeptic)
            for token, count in words_counts['skeptic'].items()
        },
    }
|
|
|
|
|
2020-03-09 14:37:26 +01:00
|
|
|
|
2020-03-09 18:30:02 +01:00
|
|
|
# with open('prediction.tsv', 'wt') as tsvfile:
|
|
|
|
# tsv_writer = csv.writer(tsvfile, delimiter='\t')
|
|
|
|
# for i in range(counter):
|
|
|
|
# tsv_writer.writerow('S')
|
2020-03-09 14:37:26 +01:00
|
|
|
|