23 lines
687 B
Python
23 lines
687 B
Python
#!/usr/bin/python3
|
|
|
|
import pandas as pd
|
|
import csv
|
|
import pickle
|
|
from sklearn.naive_bayes import MultinomialNB
|
|
from sklearn.feature_extraction.text import CountVectorizer
|
|
# Module-level, unfitted CountVectorizer. Note that train() creates and fits
# its own separate CountVectorizer, so this instance is not the one that gets
# fitted or pickled by the code in this file.
vectorizer = CountVectorizer()
|
|
|
|
def train():
    """Fit a bag-of-words Naive Bayes classifier and pickle it to disk.

    Reads the training documents from ``train/in.tsv`` (tab-separated, no
    header row; the columns are named ``document`` and ``date`` and only
    ``document`` is used) and the labels from ``train/expected.tsv`` (no
    header row). A ``CountVectorizer`` is fitted on the documents and a
    ``MultinomialNB`` classifier is fitted on the resulting count matrix.

    Side effects:
        Writes the fitted classifier to ``clf.model`` and the fitted
        vectorizer to ``vectorizer.model`` in the current working directory,
        overwriting any existing files.

    Returns:
        None.
    """
    # QUOTE_NONE makes pandas treat quote characters as ordinary text instead
    # of field delimiters.
    frame = pd.read_csv(
        "train/in.tsv",
        delimiter="\t",
        header=None,
        names=["document", "date"],
        quoting=csv.QUOTE_NONE,
    )
    documents = frame["document"]

    # scikit-learn expects a 1-D label vector; handing it the (n, 1)
    # DataFrame directly triggers a DataConversionWarning, so flatten first.
    labels = pd.read_csv("train/expected.tsv", header=None).values.ravel()

    # Named differently from the module-level ``vectorizer`` to avoid
    # shadowing it.
    count_vectorizer = CountVectorizer()
    features = count_vectorizer.fit_transform(documents)
    classifier = MultinomialNB().fit(features, labels)

    # Context managers guarantee the model files are flushed and closed even
    # if pickling raises part-way through.
    with open("clf.model", "wb") as clf_file:
        pickle.dump(classifier, clf_file)
    with open("vectorizer.model", "wb") as vectorizer_file:
        pickle.dump(count_vectorizer, vectorizer_file)
|
|
|
|
# Run training immediately. There is no ``__main__`` guard, so this call
# executes both when the file is run as a script and when it is imported.
train()
|
|
|