import pandas as pd from sklearn.naive_bayes import MultinomialNB from sklearn.feature_extraction.text import TfidfVectorizer def main(): clf = MultinomialNB() vectorizer = TfidfVectorizer() x_train, y_train = get_training_data(vectorizer) clf.fit(x_train, y_train) x_dev = get_dev_data(vectorizer) Y_dev_predicted = clf.predict(x_dev) save_to_tsv(Y_dev_predicted, 'dev-0/out.tsv') x_test = get_test_data(vectorizer) Y_test_predicted = clf.predict(x_test) save_to_tsv(Y_test_predicted, 'test-A/out.tsv') def save_to_tsv(data, path): pd.DataFrame(data).to_csv(path, sep='\t', index=False, header=False) def get_training_data(vectorizer): train_dataset = pd.read_csv('train/train.tsv/train.tsv', sep='\t', header=None, error_bad_lines=False) y_train = train_dataset[0] X_train = train_dataset[1] x_train = [str(item) for item in X_train.to_numpy()] x_train = vectorizer.fit_transform(x_train) return x_train, y_train def get_dev_data(vectorizer): dev_dataset = pd.read_csv('dev-0/in.tsv', sep='\t', header=None, error_bad_lines=False) X_dev = [str(item) for item in dev_dataset.to_numpy()] return vectorizer.transform(X_dev) def get_test_data(vectorizer): test_dataset = pd.read_csv('test-A/in.tsv', sep='\t', header=None, error_bad_lines=False) X_test = [str(item) for item in test_dataset.to_numpy()] return vectorizer.transform(X_test) if __name__ == '__main__': main()