#!/usr/bin/python3
"""Train a bag-of-words Multinomial Naive Bayes classifier and pickle it.

Inputs (relative to the current working directory):
    train/in.tsv        tab-separated, no header; columns: document, date
    train/expected.tsv  no header; one label per row

Outputs:
    clf.model           pickled, fitted MultinomialNB
    vectorizer.model    pickled, fitted CountVectorizer
"""
import csv
import pickle

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Module-level instance kept so the name stays available to importers;
# train() fits and saves its own, separate vectorizer.
vectorizer = CountVectorizer()


def train():
    """Fit the vectorizer and classifier on the training set and save both.

    Reads the documents from ``train/in.tsv`` and the labels from
    ``train/expected.tsv``, fits a ``CountVectorizer`` on the documents,
    fits a ``MultinomialNB`` on the resulting count matrix, and pickles the
    classifier to ``clf.model`` and the vectorizer to ``vectorizer.model``.

    Returns:
        None. The fitted objects are only written to disk.
    """
    # QUOTE_NONE: quote characters inside documents are treated as plain text
    # instead of field delimiters.
    train_df = pd.read_csv(
        "train/in.tsv",
        delimiter="\t",
        header=None,
        names=["document", "date"],
        quoting=csv.QUOTE_NONE,
    )
    # An empty document field is parsed as NaN, which CountVectorizer rejects;
    # treat such rows as empty documents instead of failing.
    documents = train_df["document"].fillna("")

    # read_csv yields an (n, 1) frame; flatten it so the classifier receives
    # the 1-D label array it expects rather than a column vector.
    labels = pd.read_csv("train/expected.tsv", header=None).to_numpy().ravel()

    # Local vectorizer (intentionally distinct from the module-level one).
    vectorizer = CountVectorizer()
    features = vectorizer.fit_transform(documents)
    clf = MultinomialNB().fit(features, labels)

    # Context managers guarantee the pickles are flushed and closed.
    with open("clf.model", "wb") as clf_file:
        pickle.dump(clf, clf_file)
    with open("vectorizer.model", "wb") as vectorizer_file:
        pickle.dump(vectorizer, vectorizer_file)


# Train and persist the model whenever this file is run.
train()