#!/usr/bin/python3
import pickle
import re
import sys
from tokenize import tokenize
def train():
    """Train a naive-Bayes text classifier from labelled lines on stdin.

    Each input line is expected to be "<label>\t<document>"; label 'S'
    marks a sceptic document, anything else counts as paranormal.
    Malformed lines (no tab field) are skipped.

    The fitted statistics are pickled to "model.pkl" as the tuple:
    (p_skeptic, vocabulary_size, sceptic_words_total,
     paranormal_words_total, skeptic_count, paranormal_count).
    """
    documents_total = 0
    sceptic_documents_total = 0
    vocabulary = set()

    sceptic_words_total = 0
    paranormal_words_total = 0

    skeptic_count = {}
    paranormal_count = {}

    for line in sys.stdin:
        fields = line.rstrip().split('\t')
        if len(fields) < 2:
            # Skip blank/malformed lines instead of raising IndexError
            # on fields[1].
            continue
        label = fields[0].strip()
        document = fields[1]

        # NOTE(review): the original called tokenize() from the stdlib
        # `tokenize` module, which expects a bytes readline callable and
        # would raise on a str document (and yields a generator, so
        # len(terms) below would fail too).  A plain regex word
        # tokenizer (the file already imports `re`) is used instead —
        # confirm this matches the tokenization used at prediction time.
        terms = re.findall(r'\w+', document)

        vocabulary.update(terms)
        documents_total += 1

        if label == 'S':
            sceptic_documents_total += 1
            sceptic_words_total += len(terms)
            counts = skeptic_count
        else:
            paranormal_words_total += len(terms)
            counts = paranormal_count
        for term in terms:
            counts[term] = counts.get(term, 0) + 1

    # Document-level prior of the sceptic class.  (Raises
    # ZeroDivisionError on empty input, as the original did.)
    pskeptic = sceptic_documents_total / documents_total
    model = (pskeptic,
             len(vocabulary),
             sceptic_words_total,
             paranormal_words_total,
             skeptic_count,
             paranormal_count)
    # `with` closes the handle deterministically; the original leaked
    # the file object returned by open().
    with open("model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)
# Run training only when executed as a script, so the module can be
# imported (e.g. by a predictor sharing the tokenizer) without
# consuming stdin or overwriting model.pkl.
if __name__ == "__main__":
    train()