ireland-news-headlines/run.py

#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import vowpalwabbit
from sklearn import preprocessing


# In[2]:


# Vowpal Wabbit workspace used for both training and prediction.
# '--oaa 20' selects VW's one-against-all multiclass reduction with 20 classes;
# the label list given to the encoder later in this script names only 7.
vw = vowpalwabbit.Workspace('--oaa 20')


# In[3]:


# Training data: the headline text is the third column of in.tsv and the class
# label is the first column of expected.tsv. Column names are supplied via
# names=, so the first line of each file is read as data, not as a header.
# Paths use forward slashes so they resolve on every platform; a backslash is
# only a directory separator on Windows, and '\i' / '\e' are invalid string
# escapes that newer Python versions warn about.
# NOTE(review): pandas applies CSV quote handling by default; if a headline can
# contain an unbalanced double quote, consecutive lines may be merged into one
# row and X_train / Y_train would no longer line up - confirm against the data
# and consider passing quoting=csv.QUOTE_NONE.
X_train = pd.read_csv('train/in.tsv', sep='\t', usecols=[2], names=['text'])
Y_train = pd.read_csv('train/expected.tsv', sep='\t', usecols=[0], names=['class'])


# In[4]:


# Notebook leftover: computes the distinct labels present in the training data.
# When this file is run as a script the result is neither printed nor stored.
Y_train['class'].unique()


# In[5]:


# Map class names to integer ids (LabelEncoder assigns 0..n-1 in sorted order).
# The encoder is fitted once on the explicit class list below, so the
# name <-> id mapping does not depend on which labels happen to occur in the
# training file. The column is then converted with transform(): calling
# fit_transform() here would refit the encoder on the data and silently discard
# the explicit list. transform() raises ValueError if the data contains a label
# that is not in the list.
le = preprocessing.LabelEncoder()
le.fit(['business', 'culture', 'lifestyle', 'news', 'opinion', 'removed', 'sport'])
Y_train['class'] = le.transform(Y_train['class'])


# In[6]:


def _vw_features(text):
    """Return *text* as the feature section of a VW text-format example.

    The words are placed in a namespace called ``text`` (``|text w1 w2 ...``).
    In VW's input format ``name:value`` denotes a feature with a numeric
    value and ``|`` starts a new namespace, so both characters are replaced
    with spaces to keep arbitrary headline text from being misparsed.
    The same helper is used for training and prediction so that both sides
    produce identical features for identical text.
    """
    cleaned = str(text).replace('|', ' ').replace(':', ' ')
    return f'|text {cleaned}'


# Online training: one pass over the data, one example per headline.
# NOTE(review): the label ids produced by LabelEncoder start at 0; confirm that
# the installed VW version accepts 0-based labels for --oaa.
for x, y in zip(X_train['text'], Y_train['class']):
    vw.learn(f'{y} {_vw_features(x)}')


# In[16]:


def make_prediction(path_in, path_out):
    """Predict a class name for every row of *path_in* and write them out.

    Args:
        path_in: Tab-separated file without a header row; the headline text
            is read from its third column.
        path_out: Destination file; receives one predicted class name per
            line, in input order. An existing file is overwritten.

    NOTE(review): the input is read with pandas' default quote handling, so a
    headline containing an unbalanced double quote may merge rows - confirm
    against the data.
    """
    test_set = pd.read_csv(path_in, sep='\t', usecols=[2], names=['text'])
    predictions = [vw.predict(_vw_features(x)) for x in test_set['text']]
    # Turn the integer ids returned by VW back into class names.
    labels = le.inverse_transform(predictions)
    # The context manager closes the file even if a write fails.
    with open(path_out, 'w') as out_file:
        out_file.writelines(f'{label}\n' for label in labels)


# In[17]:


# Write predictions for every evaluation split, in the same order as before.
# Paths use forward slashes so they resolve on every platform; a backslash is
# only a directory separator on Windows, and '\i' / '\o' are invalid string
# escapes that newer Python versions warn about.
for split in ('dev-0', 'test-A', 'test-B'):
    make_prediction(f'{split}/in.tsv', f'{split}/out.tsv')
s444421 2022-05-27 21:21:57 +02:00			`#!/usr/bin/env python`
			`# coding: utf-8`

Fix 2022-05-27 21:33:35 +02:00			`# In[1]:`
s444421 2022-05-27 21:21:57 +02:00

			`import pandas as pd`
			`import vowpalwabbit`
			`from sklearn import preprocessing`


Fix 2022-05-27 21:33:35 +02:00			`# In[2]:`
s444421 2022-05-27 21:21:57 +02:00

			`vw = vowpalwabbit.Workspace('--oaa 20')`


Fix 2022-05-27 21:33:35 +02:00			`# In[3]:`
s444421 2022-05-27 21:21:57 +02:00

			`X_train = pd.read_csv('train\in.tsv', sep='\t', usecols=[2], names=['text'])`
			`Y_train = pd.read_csv('train\expected.tsv', sep='\t', usecols=[0], names=['class'])`


Fix 2022-05-27 21:33:35 +02:00			`# In[4]:`
s444421 2022-05-27 21:21:57 +02:00

			`Y_train['class'].unique()`


Fix 2022-05-27 21:33:35 +02:00			`# In[5]:`
s444421 2022-05-27 21:21:57 +02:00

			`le = preprocessing.LabelEncoder()`
			`le.fit(['business', 'culture', 'lifestyle', 'news', 'opinion', 'removed', 'sport'])`
			`Y_train['class'] = le.fit_transform(Y_train['class'])`


Fix 2022-05-27 21:33:35 +02:00			`# In[6]:`
s444421 2022-05-27 21:21:57 +02:00

			`for x, y in zip(X_train['text'], Y_train['class']):`
			`vw.learn(f'{y} \| text:{x}')`


Fix 2022-05-27 21:33:35 +02:00			`# In[16]:`
s444421 2022-05-27 21:21:57 +02:00

			`def make_prediction(path_in, path_out):`
			`test_set = pd.read_csv(path_in, sep='\t', usecols=[2], names=['text'])`
			`predictions = []`
Fix 2022-05-27 21:33:35 +02:00			`for x in test_set['text']:`
s444421 2022-05-27 21:21:57 +02:00			`predictions.append(vw.predict(f'\| text:{x}'))`
			`predictions = le.inverse_transform(predictions)`
			`file = open(path_out, 'w')`
			`for pred in predictions:`
			`file.write(f'{pred}\n')`
			`file.close()`


Fix 2022-05-27 21:33:35 +02:00			`# In[17]:`
s444421 2022-05-27 21:21:57 +02:00

			`make_prediction('dev-0\in.tsv', 'dev-0\out.tsv')`


Fix 2022-05-27 21:33:35 +02:00			`# In[18]:`
s444421 2022-05-27 21:21:57 +02:00

			`make_prediction('test-A\in.tsv', 'test-A\out.tsv')`


Fix 2022-05-27 21:33:35 +02:00			`# In[19]:`
s444421 2022-05-27 21:21:57 +02:00

			`make_prediction('test-B\in.tsv', 'test-B\out.tsv')`