#!/usr/bin/python3
"""Train a naive Bayes style model for skeptic vs. paranormal documents read from stdin."""

import re
import sys
import pickle


def tokenize(text):
    # Simple regex tokenizer: lowercase word tokens.
    # (The stdlib tokenize module parses Python source from a byte stream,
    # so it is not usable on plain-text documents.)
    return re.findall(r'\w+', text.lower())


def train():
    documents_total = 0
    skeptic_documents_total = 0
    vocabulary = set()
    skeptic_words_total = 0
    paranormal_words_total = 0
    skeptic_count = {}
    paranormal_count = {}

    for line in sys.stdin:
        fields = line.rstrip().split('\t')
        if len(fields) < 2:
            continue  # skip blank or malformed lines
        label = fields[0].strip()
        document = fields[1]

        terms = tokenize(document)
        vocabulary.update(terms)
        documents_total += 1

        if label == 'S':
            skeptic_documents_total += 1
            skeptic_words_total += len(terms)
            for term in terms:
                skeptic_count[term] = skeptic_count.get(term, 0) + 1
        else:
            paranormal_words_total += len(terms)
            for term in terms:
                paranormal_count[term] = paranormal_count.get(term, 0) + 1

    pskeptic = skeptic_documents_total / documents_total
    vocabulary_size = len(vocabulary)

    model = (pskeptic, vocabulary_size, skeptic_words_total,
             paranormal_words_total, skeptic_count, paranormal_count)
    with open("model.pkl", "wb") as f:
        pickle.dump(model, f)


train()
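
# A minimal usage sketch, kept as comments. Assumptions not fixed by this script:
# the file name `train.py`, the input file `train.tsv`, and the use of add-one
# (Laplace) smoothing at prediction time; the tuple layout matches what is
# pickled above, with each stdin line formatted as "<label>\t<document>" and
# label 'S' meaning skeptic.
#
#   python3 train.py < train.tsv
#
#   import math, pickle
#   with open("model.pkl", "rb") as f:
#       pskeptic, vocab_size, s_total, p_total, s_count, p_count = pickle.load(f)
#
#   def skeptic_log_odds(terms):
#       # log P(S) - log P(P), plus add-one smoothed per-term likelihood ratios
#       score = math.log(pskeptic) - math.log(1 - pskeptic)
#       for t in terms:
#           score += math.log((s_count.get(t, 0) + 1) / (s_total + vocab_size))
#           score -= math.log((p_count.get(t, 0) + 1) / (p_total + vocab_size))
#       return score  # positive => more likely skeptic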