"""Train a TF-IDF + multinomial naive Bayes text classifier and write predictions.

The training table is read from ``PATHS[0]`` (column 0 is used as the label,
column 1 as the text). Predictions are produced for the tables at ``PATHS[1]``
and ``PATHS[2]`` (column 0 is used as the text) and written, one prediction
per line, to the corresponding entries of ``PATHS_OUTPUT``.
"""
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

PATHS = ['./train/train.tsv', './dev-0/in.tsv', './test-A/in.tsv']
PATHS_OUTPUT = ['./dev-0/out.tsv', './test-A/out.tsv']


def get_data(path):
    """Read a headerless tab-separated file into a DataFrame.

    Malformed rows (wrong number of fields) are skipped rather than aborting
    the whole read.

    Args:
        path: Location of the TSV file.

    Returns:
        A ``pandas.DataFrame`` with integer column labels.
    """
    read_options = {'sep': '\t', 'header': None}
    try:
        # ``on_bad_lines`` replaced ``error_bad_lines``/``warn_bad_lines`` in
        # pandas 1.3; 'warn' matches the old "skip the row but report it"
        # behaviour of ``error_bad_lines=False``.
        return pd.read_table(path, on_bad_lines='warn', **read_options)
    except TypeError:
        # pandas < 1.3 does not know ``on_bad_lines``; use the legacy keyword.
        return pd.read_table(path, error_bad_lines=False, **read_options)


def get_X_y_train(data):
    """Split a training table into features and labels.

    Args:
        data: Table whose column 1 holds the inputs and column 0 the labels.

    Returns:
        Tuple ``(X_train, y_train)`` of NumPy arrays.
    """
    X_train = data[1].values
    y_train = data[0].values
    return X_train, y_train


def training(x, y):
    """Fit a TF-IDF vectorizer and a multinomial naive Bayes classifier.

    Args:
        x: Iterable of raw documents.
        y: Labels aligned with ``x``.

    Returns:
        Tuple ``(classifier, vectorizer)``, both fitted.
    """
    vectorizer = TfidfVectorizer()
    features = vectorizer.fit_transform(x)
    classifier = MultinomialNB()
    classifier.fit(features, y)
    return classifier, vectorizer


def predict(vectorizer, classifier, x):
    """Predict labels for raw documents with an already fitted model.

    Args:
        vectorizer: Fitted vectorizer used to transform ``x``.
        classifier: Fitted classifier.
        x: Iterable of raw documents.

    Returns:
        Array of predicted labels, one per document.
    """
    features = vectorizer.transform(x)
    return classifier.predict(features)


def generate_output(pred, path):
    """Write predictions to ``path`` as text, separated by newlines.

    Args:
        pred: NumPy array of predictions.
        path: Destination file.
    """
    pred.tofile(path, sep='\n')


def main():
    """Train on the training set, then predict and save the dev and test sets."""
    # Prepare the training data and fit the model.
    train = get_data(PATHS[0])
    X_train, y_train = get_X_y_train(train)
    classifier, vectorizer = training(X_train, y_train)

    # Predict every evaluation set first (dev, then test), so nothing is
    # written unless all predictions succeeded.
    predictions = []
    for input_path in PATHS[1:]:
        documents = get_data(input_path)[0].values
        predictions.append(predict(vectorizer, classifier, documents))

    # Write each prediction array to its matching output file.
    for pred, output_path in zip(predictions, PATHS_OUTPUT):
        generate_output(pred, output_path)


if __name__ == '__main__':
    main()