#!/usr/bin/python3
"""Train a skeptic/paranormal text model from labelled lines on stdin.

Each input line is expected to look like ``<label>\t<document>``. The
collected statistics are printed to stdout and pickled to ``model.pkl``.
"""
import pickle
import sys
from collections import Counter

from normalize import normalize


def train() -> None:
    """Collect per-class term statistics from stdin and save them.

    Reads tab-separated lines from ``sys.stdin``. The first field is the
    label and the second field is the document text, which is turned into
    terms by ``normalize``. A document labelled ``'S'`` is counted as
    skeptic; any other label is counted as paranormal.

    Blank lines are ignored. Non-blank lines without a second
    tab-separated field are skipped with a warning on stderr instead of
    aborting the whole run.

    Prints, one per line: the fraction of skeptic documents, the
    vocabulary size, the paranormal term total and the skeptic term total.

    Writes ``model.pkl`` containing the tuple
    ``(pskeptic, vocabulary_size, skeptic_words_total,
    paranormal_words_total, skeptic_count, paranormal_count)``, where the
    two counts are plain ``dict`` objects mapping term -> occurrences.
    NOTE(review): the shape of the tuple suggests it feeds a naive Bayes
    classifier elsewhere in the project -- confirm against the consumer.

    Raises:
        ZeroDivisionError: if no usable document was read from stdin.
    """
    documents_total = 0
    skeptic_documents_total = 0
    vocabulary = set()
    skeptic_words_total = 0
    paranormal_words_total = 0
    skeptic_count = Counter()
    paranormal_count = Counter()

    for line_number, line in enumerate(sys.stdin, start=1):
        fields = line.rstrip().split('\t')
        if len(fields) < 2:
            # Only complain about lines that actually carry content; a
            # trailing empty line is not worth a warning.
            if line.strip():
                print("warning: skipping malformed line", line_number,
                      file=sys.stderr)
            continue

        label = fields[0].strip()
        terms = normalize(fields[1])

        vocabulary.update(terms)
        documents_total += 1

        if label == 'S':
            skeptic_documents_total += 1
            skeptic_words_total += len(terms)
            skeptic_count.update(terms)
        else:
            paranormal_words_total += len(terms)
            paranormal_count.update(terms)

    pskeptic = skeptic_documents_total / documents_total
    vocabulary_size = len(vocabulary)

    # Output order is kept as-is (paranormal total before skeptic total)
    # in case something parses stdout.
    print(pskeptic)
    print(vocabulary_size)
    print(paranormal_words_total)
    print(skeptic_words_total)

    # Store plain dicts so the pickle looks the same to existing loaders
    # (a Counter would silently return 0 for unseen terms).
    model = (
        pskeptic,
        vocabulary_size,
        skeptic_words_total,
        paranormal_words_total,
        dict(skeptic_count),
        dict(paranormal_count),
    )
    # The context manager guarantees the file is flushed and closed.
    with open("model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)


# Training runs whenever this file is executed or imported, as before.
train()