# ireland-news-headlines/run.py

#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import vowpalwabbit
from sklearn import preprocessing


# In[2]:


# Vowpal Wabbit workspace configured for one-against-all multiclass learning
# ('--oaa') with 20 as the number of classes.
# NOTE(review): confirm that 20 is meant to exceed the number of categories
# actually present in the training labels.
vw = vowpalwabbit.Workspace('--oaa 20')


# In[3]:


# Load the training data. Because `names` is given without `header`, the first
# row of each file is read as data: the headline text is taken from the third
# column of in.tsv and the category from the first column of expected.tsv.
# Forward slashes are used so the paths resolve on POSIX as well as on Windows;
# the previous backslash spelling only worked on Windows and relied on invalid
# string escape sequences.
X_train = pd.read_csv('train/in.tsv', sep='\t', usecols=[2], names=['text'])
Y_train = pd.read_csv('train/expected.tsv', sep='\t', usecols=[0], names=['class'])


# In[4]:


# Notebook leftover: in a notebook cell this displayed the distinct category
# labels. When the file runs as a script the result is simply discarded.
Y_train['class'].unique()


# In[5]:


# Encode the category names as integers. fit_transform() learns the set of
# classes from the training labels themselves (in sorted order, so the codes
# start at 0). A separate fit() on a hard-coded list beforehand is unnecessary
# because fit_transform() would overwrite its result immediately.
le = preprocessing.LabelEncoder()
Y_train['class'] = le.fit_transform(Y_train['class'])


# In[6]:


# VW's one-against-all reduction expects labels in 1..k, whereas LabelEncoder
# produces 0..k-1. Shift by this offset when training and undo it when
# predicting; without the shift, class 0 is an invalid label for VW and can
# never be predicted.
_VW_LABEL_OFFSET = 1

# NOTE(review): the headline is interpolated unescaped, and ':' and '|' are
# special characters in VW's text input format — confirm the examples parse
# as intended.
for x, y in zip(X_train['text'], Y_train['class']):
    vw.learn(f'{y + _VW_LABEL_OFFSET} | text:{x}')


# In[16]:


def make_prediction(path_in, path_out):
    """Classify every headline in *path_in* and write the labels to *path_out*.

    Args:
        path_in: Path to a tab-separated file whose third column holds the
            headline text (the first row is read as data, not as a header).
        path_out: Path of the file to write; it receives one predicted
            category name per line, in input order, encoded as UTF-8.

    Raises:
        ValueError: Propagated from ``le.inverse_transform`` if the model
            predicts a class index the label encoder has not seen.
    """
    test_set = pd.read_csv(path_in, sep='\t', usecols=[2], names=['text'])
    # Map VW's 1-based class predictions back to the encoder's 0-based codes.
    predictions = [
        vw.predict(f'| text:{x}') - _VW_LABEL_OFFSET for x in test_set['text']
    ]
    labels = le.inverse_transform(predictions)
    # The context manager guarantees the file is closed even if a write fails.
    with open(path_out, 'w', encoding='utf-8') as out_file:
        out_file.writelines(f'{label}\n' for label in labels)


# In[17]:


# Write predictions for each evaluation split next to its input file.
# Forward slashes keep the paths portable: the backslash form only resolved on
# Windows and relied on invalid string escape sequences.
for split in ('dev-0', 'test-A', 'test-B'):
    make_prediction(f'{split}/in.tsv', f'{split}/out.tsv')