from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Shared TF-IDF feature extractor: fitted in train(), reused in classify().
vectorizer = TfidfVectorizer()
# Shared multinomial naive Bayes model: fitted in train(), queried in classify().
classifier = MultinomialNB()


def train():
    """Fit the module-level ``vectorizer`` and ``classifier`` on the training set.

    Reads ``train/train.tsv`` (relative to the current working directory).
    Each non-blank line must hold at least two tab-separated columns: an
    integer label in the first column and the document text in the second.
    Any further columns are ignored.

    Raises:
        ValueError: if a non-blank line has fewer than two columns, or if a
            label cannot be parsed by ``int()``.
    """
    labels = []
    texts = []
    # Explicit UTF-8 so decoding does not depend on the machine's locale.
    with open('train/train.tsv', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            # Blank lines (e.g. a trailing empty line) carry no example.
            if not line.strip():
                continue
            # Strip only the line terminator: a bare rstrip() would also eat
            # a trailing tab and make a row with empty text look malformed.
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < 2:
                raise ValueError(
                    "train/train.tsv line %d: expected '<label>\\t<text>', got %r"
                    % (line_no, line)
                )
            labels.append(int(fields[0]))
            texts.append(fields[1])
    x = vectorizer.fit_transform(texts)
    classifier.fit(x, labels)


def classify(path):
    """Predict a label for every line of ``<path>in.tsv`` and save the results.

    The whole line (trailing whitespace removed) is treated as one document,
    vectorized with the module-level ``vectorizer`` and classified with the
    module-level ``classifier``. One integer prediction per input line is
    written to ``<path>out.tsv``, in input order.

    Args:
        path: Prefix prepended verbatim to ``in.tsv`` / ``out.tsv``; for a
            directory it must therefore end with a path separator
            (e.g. ``'dev-0/'``).
    """
    # Explicit UTF-8 so decoding does not depend on the machine's locale.
    with open(path + 'in.tsv', encoding='utf-8') as f:
        # Blank lines are kept on purpose: every input line must get exactly
        # one output line so the two files stay aligned.
        docs = [line.rstrip() for line in f]
    test_x = vectorizer.transform(docs)
    predictions = classifier.predict(test_x)
    with open(path + 'out.tsv', 'w', encoding='utf-8') as file:
        file.writelines("%i\n" % prediction for prediction in predictions)


# Script entry: fit the model once, then write predictions for each data split.
train()
for _split_dir in ('dev-0/', 'test-A/'):
    classify(_split_dir)