"""Train a Gaussian naive Bayes text classifier and write dev-0 predictions.

Running this file reads ``train/train.tsv`` and ``dev-0/in.tsv``, fits a
TF-IDF + GaussianNB pipeline on the training rows, and writes one predicted
label per input line to ``dev-0/out.tsv``.
"""
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB


def read_train_data(path="train/train.tsv", limit=10000):
    """Load training texts and labels from a headerless tab-separated file.

    Args:
        path: Location of the TSV file. Column 0 is returned as the labels
            and column 1 as the texts.
        limit: Keep only the first ``limit`` rows; ``None`` keeps every row.
            The cap presumably keeps the dense matrix built in
            ``calc_score`` at a manageable size.

    Returns:
        A ``(texts, labels)`` pair of pandas Series (columns 1 and 0).
    """
    print('Load train data')
    train_data = pd.read_csv(path, sep='\t', header=None)
    # Explicit positional slice; equivalent to ``train_data[:limit]``.
    train_data = train_data.iloc[:limit]
    return train_data[1], train_data[0]


def read_pred_data(path="dev-0/in.tsv"):
    """Read the documents to classify, one per line.

    Args:
        path: UTF-8 text file with one document per line.

    Returns:
        A list of the file's lines, each still carrying its line terminator.
    """
    print('Load pred data')
    with open(path, encoding='utf-8') as f:
        return list(f)


def vectorize(x, x_p):
    """Turn raw texts into TF-IDF feature matrices.

    The vectorizer is fitted on ``x`` only, so ``x_p`` is projected onto the
    vocabulary learned from the training texts.

    Args:
        x: Training texts.
        x_p: Texts to predict on.

    Returns:
        A ``(x, x_p)`` pair of TF-IDF matrices sharing the same columns.
    """
    print('Vectorize')
    vectorizer = TfidfVectorizer()
    x = vectorizer.fit_transform(x)
    x_p = vectorizer.transform(x_p)
    return x, x_p


def calc_score(x, y, x_p):
    """Fit GaussianNB on ``(x, y)`` and predict labels for ``x_p``.

    Despite the name, this returns predictions rather than a score.

    Args:
        x: TF-IDF matrix of the training texts (must support ``.toarray()``).
        y: Training labels aligned with the rows of ``x``.
        x_p: TF-IDF matrix of the texts to classify.

    Returns:
        The array of labels predicted for the rows of ``x_p``.
    """
    print('Calculate score')
    model = GaussianNB()
    # GaussianNB needs dense input, hence the ``toarray()`` conversions.
    model.fit(x.toarray(), y)
    return model.predict(x_p.toarray())


def get_result():
    """Run the full pipeline: load data, vectorize, train, and predict.

    Returns:
        The predicted labels for the lines of ``dev-0/in.tsv``.
    """
    x, y = read_train_data()
    x_p = read_pred_data()
    x, x_p = vectorize(x, x_p)
    return calc_score(x, y, x_p)


# One predicted label per line, without header or index column.
pd.DataFrame(get_result()).to_csv("dev-0/out.tsv", header=False, index=False)