#!/usr/bin/env python
# coding: utf-8
"""Train a Vowpal Wabbit one-against-all text classifier and write predictions.

Reads the training texts and labels from ``train/``, fits a multiclass
model, and writes one predicted class name per line to ``out.tsv`` for the
``dev-0``, ``test-A`` and ``test-B`` splits.
"""

from pathlib import Path

import pandas as pd
import vowpalwabbit
from sklearn import preprocessing

# Characters with structural meaning in the VW text input format; they must
# not appear inside feature tokens.  Newlines would split one example in two.
_VW_RESERVED = str.maketrans({':': ' ', '|': ' ', '\n': ' ', '\r': ' '})


def _to_vw_features(text):
    """Return *text* as a space-separated token string safe for a VW example."""
    return ' '.join(str(text).translate(_VW_RESERVED).split())


def _read_texts(path_in):
    """Load the text column (third column) of a tab-separated file."""
    frame = pd.read_csv(path_in, sep='\t', usecols=[2], names=['text'])
    # Empty cells are parsed as NaN; treat them as empty text rather than
    # feeding the literal token "nan" to the model.
    return frame['text'].fillna('')


# Load the training data: text from column 2, class name from column 0.
X_train = _read_texts(Path('train') / 'in.tsv')
Y_train = pd.read_csv(
    Path('train') / 'expected.tsv', sep='\t', usecols=[0], names=['class']
)

# Encode class names as integers 0..n-1 (classes are learned from the data;
# expected: business, culture, lifestyle, news, opinion, removed, sport).
le = preprocessing.LabelEncoder()
Y_train['class'] = le.fit_transform(Y_train['class'])

# Size the one-against-all reduction to the real number of classes so every
# prediction can be mapped back to a known class name.
vw = vowpalwabbit.Workspace(f'--oaa {len(le.classes_)}')

# VW --oaa labels must lie in 1..k, while LabelEncoder yields 0..k-1, so the
# encoded label is shifted by one for training (and shifted back on predict).
for x, y in zip(X_train, Y_train['class']):
    vw.learn(f'{y + 1} |text {_to_vw_features(x)}')


def make_prediction(path_in, path_out):
    """Predict a class for every row of *path_in* and write them to *path_out*.

    Uses the module-level model ``vw`` and label encoder ``le``.

    Args:
        path_in: Tab-separated input file; the text is read from column 2.
        path_out: Output file; receives one predicted class name per line.
    """
    texts = _read_texts(path_in)
    # VW returns labels in 1..k; shift back to the encoder's 0..k-1 range.
    encoded = [vw.predict(f'|text {_to_vw_features(x)}') - 1 for x in texts]
    predictions = le.inverse_transform(encoded)
    with open(path_out, 'w', encoding='utf-8') as out_file:
        out_file.writelines(f'{pred}\n' for pred in predictions)


make_prediction(Path('dev-0') / 'in.tsv', Path('dev-0') / 'out.tsv')
make_prediction(Path('test-A') / 'in.tsv', Path('test-A') / 'out.tsv')
make_prediction(Path('test-B') / 'in.tsv', Path('test-B') / 'out.tsv')

# Release the native VW workspace once all predictions are written.
vw.finish()