"""Train a TF-IDF + logistic-regression text classifier and write predictions.

The script:

1. reads training texts from ``../train/in.tsv`` (last tab-separated field
   of every line) and labels from ``../train/expected.tsv`` (one per line),
2. fits a ``TfidfVectorizer`` and a ``LogisticRegression`` model on them and
   prints the accuracy measured on the training data itself,
3. writes one predicted label per line to ``../<dataset>/out.tsv`` for each
   of ``dev-0``, ``test-A`` and ``test-B``,
4. pickles the fitted vectorizer and model to ``vectorizer.pickle`` and
   ``model.pickle`` in the current working directory.
"""
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# NOTE(review): the data files are assumed to be UTF-8 -- confirm. Passing the
# encoding explicitly keeps results independent of the machine's locale.
ENCODING = 'utf-8'

DATASETS = ('dev-0', 'test-A', 'test-B')


def _read_last_column(path):
    """Return the right-stripped last tab-separated field of each line."""
    with open(path, encoding=ENCODING) as f:
        return [line.split('\t')[-1].rstrip() for line in f]


def _read_lines(path):
    """Return every line of the file with trailing whitespace removed."""
    with open(path, encoding=ENCODING) as f:
        return [line.rstrip() for line in f]


train_in = _read_last_column('../train/in.tsv')
train_expected = _read_lines('../train/expected.tsv')

# Labels are encoded as their position in the sorted list of distinct labels.
classes = sorted(set(train_expected))
# One dict lookup per row instead of a linear ``classes.index`` scan; the
# resulting indices are the same because ``classes`` holds unique values.
class_to_index = {name: index for index, name in enumerate(classes)}

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_in)
y_train = [class_to_index[label] for label in train_expected]

model = LogisticRegression().fit(X_train, y_train)
# Accuracy on the data the model was fitted on (not a held-out estimate).
print(accuracy_score(y_train, model.predict(X_train)))

for DATASET in DATASETS:
    d_in = _read_last_column(f'../{DATASET}/in.tsv')
    # Reuse the vocabulary learned on the training set; do not refit.
    X_d = vectorizer.transform(d_in)
    out = model.predict(X_d)
    with open(f'../{DATASET}/out.tsv', 'w', encoding=ENCODING) as f:
        # Map each predicted index back to its label, one label per line.
        f.writelines(classes[sample] + '\n' for sample in out)

with open('vectorizer.pickle', 'wb') as f:
    pickle.dump(vectorizer, f)
with open('model.pickle', 'wb') as f:
    pickle.dump(model, f)