"""Train a TF-IDF + multinomial naive Bayes classifier and label TSV splits.

Running this module fits the model on ``train/train.tsv`` and then writes
predictions for the ``dev-0/`` and ``test-A/`` directories.
"""
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Shared model state: train() fits both objects, classify() reuses them.
classifier = MultinomialNB()
vectorizer = TfidfVectorizer()


def _read_labeled_docs(path):
    """Parse a ``label<TAB>text`` file into parallel lists.

    Args:
        path: Path of the TSV file to read.

    Returns:
        A ``(labels, texts)`` tuple; ``labels`` holds ints, ``texts`` strings.

    Raises:
        ValueError: If a non-blank line has no tab separator or its label is
            not an integer.
    """
    labels = []
    texts = []
    with open(path, encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            # Strip only the newline: a bare rstrip() would also remove the
            # tab of a "label<TAB>" line whose text is empty.
            line = line.rstrip('\n')
            if not line.strip():
                continue  # blank lines carry no training example
            # Split on the first tab only so tabs inside the text are kept.
            label, sep, text = line.partition('\t')
            if not sep:
                raise ValueError(
                    '%s:%d: expected "label<TAB>text"' % (path, line_no)
                )
            labels.append(int(label))
            texts.append(text)
    return labels, texts


def train(path='train/train.tsv'):
    """Fit the module-level vectorizer and classifier on a labeled TSV file.

    Args:
        path: Training file with one ``label<TAB>text`` example per line.
    """
    y, docs = _read_labeled_docs(path)
    x = vectorizer.fit_transform(docs)
    classifier.fit(x, y)


def classify(path):
    """Predict a label for every line of ``in.tsv`` and write ``out.tsv``.

    Args:
        path: Directory containing ``in.tsv``; ``out.tsv`` is written next to
            it. A trailing slash is accepted but not required.
    """
    with open(os.path.join(path, 'in.tsv'), encoding='utf-8') as f:
        # Every line (even an empty one) is kept so that the output stays
        # aligned line-for-line with the input.
        docs = [line.rstrip() for line in f]

    if docs:
        predictions = classifier.predict(vectorizer.transform(docs))
    else:
        # Nothing to predict; avoid handing a zero-sample matrix to the
        # model and just produce an empty output file.
        predictions = []

    with open(os.path.join(path, 'out.tsv'), 'w', encoding='utf-8') as out:
        out.writelines('%i\n' % prediction for prediction in predictions)


train()
classify('dev-0/')
classify('test-A/')