# paranormal-or-skeptic/solution.py

import csv
from collections import defaultdict
import math

# Load every input row. `counter` (the number of rows) is kept as a
# module-level name because the prediction writer at the bottom of the
# script relies on it.
docs = []
# newline='' lets the csv module handle line endings itself, as its
# documentation recommends.
with open('in.tsv', newline='') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    for row in reader:
        docs.append(row)
counter = len(docs)

print(counter)

# Count how many gold labels belong to each class. Labels are compared
# including their leading space (" P" / " S").
pcounter = 0
scounter = 0
with open('expected.tsv', newline='') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    for row in reader:
        if not row:
            # csv yields [] for a blank line; indexing it would raise.
            continue
        if row[0] == " P":
            pcounter += 1
        elif row[0] == " S":
            scounter += 1

print(pcounter)
print(scounter)

# Add-one smoothed class priors: (count + 1) / (N + 2). The parentheses
# matter -- without them only `1 / counter` is divided.
print("P(S) = " + str((scounter + 1) / (counter + 2)))
print("P(P) = " + str((pcounter + 1) / (counter + 2)))

def calc_class_logprob(expected_path):
    """Return the log prior of each class found in a labels file.

    Every line of ``expected_path`` containing the letter "P" counts as
    paranormal; any other line containing "S" counts as skeptic. Lines
    with neither letter are ignored.

    Args:
        expected_path: Path to the file with one class label per line.

    Returns:
        A ``(log P(paranormal), log P(skeptic))`` tuple.

    Raises:
        ZeroDivisionError: If no line carries a recognised label.
        ValueError: If one of the classes never occurs (log of zero).
    """
    with open(expected_path) as labels_file:
        labels = labels_file.readlines()

    # "P" takes precedence when a line happens to contain both letters.
    n_paranormal = sum(1 for label in labels if "P" in label)
    n_skeptic = sum(
        1 for label in labels if "P" not in label and "S" in label
    )
    n_labelled = n_paranormal + n_skeptic

    prior_paranormal = n_paranormal / n_labelled
    prior_skeptic = n_skeptic / n_labelled

    return math.log(prior_paranormal), math.log(prior_skeptic)

def calc_word_counts(in_path, expected_path):
    """Count token occurrences per class from parallel input/label files.

    Line *i* of ``in_path`` is paired with line *i* of ``expected_path``.
    The text is taken from the first tab-separated field of the input
    line, lower-cased and split on single spaces. Each token is counted
    under the class named by the label line ("P" or "S" once spaces are
    removed). Rows with any other label are skipped.

    Args:
        in_path: Path to the TSV file whose first column is the text.
        expected_path: Path to the file with one class label per line.

    Returns:
        ``{'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}``
        mapping each token to its count within that class.
    """
    word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
    class_names = {'P': 'paranormal', 'S': 'skeptic'}

    # Each file needs its own `as` target; both are closed on exit.
    with open(in_path) as in_file, open(expected_path) as exp_file:
        for in_line, exp_line in zip(in_file, exp_file):
            class_ = exp_line.rstrip('\n').replace(" ", "")
            class_name = class_names.get(class_)
            if class_name is None:
                continue

            # Only the first column is text; taking field 0 avoids a crash
            # on rows that do not have exactly two columns.
            text = in_line.rstrip('\n').split('\t')[0]
            class_counts = word_counts[class_name]
            for token in text.lower().split(' '):
                class_counts[token] += 1

    return word_counts


def calc_words_logprobs(words_counts):
    """Turn per-class token counts into add-one smoothed log-probabilities.

    For each class the denominator is the total number of token
    occurrences in that class plus the number of distinct tokens seen in
    that class. Each token's log-probability is
    ``log((count + 1) / denominator)``.

    Args:
        words_counts: Mapping with the keys ``'paranormal'`` and
            ``'skeptic'``, each mapping a token to its count in that class.

    Returns:
        ``{'paranormal': {token: logprob}, 'skeptic': {token: logprob}}``.
    """
    # NOTE(review): the original body only computed the two denominators and
    # returned nothing; returning per-token log-probabilities is the assumed
    # intent -- confirm. Tokens unseen in a class get no entry here.
    total_skeptic = (
        sum(words_counts['skeptic'].values()) + len(words_counts['skeptic'])
    )
    total_paranormal = (
        sum(words_counts['paranormal'].values())
        + len(words_counts['paranormal'])
    )

    return {
        'paranormal': {
            token: math.log((count + 1) / total_paranormal)
            for token, count in words_counts['paranormal'].items()
        },
        'skeptic': {
            token: math.log((count + 1) / total_skeptic)
            for token, count in words_counts['skeptic'].items()
        },
    }


# Baseline prediction: label every input row as skeptic ('S').
# newline='' stops the platform from translating the writer's own line
# terminator a second time (which yields "\r\r\n" on Windows).
with open('prediction.tsv', 'wt', newline='') as tsvfile:
    tsv_writer = csv.writer(tsvfile, delimiter='\t')
    for _ in range(counter):
        # writerow expects a sequence of fields; a bare string is iterated
        # character by character and only works for one-letter labels.
        tsv_writer.writerow(['S'])