ireland-news-headlines/run.ipynb

8.5 KiB
Raw Permalink Blame History

import vowpalwabbit
import pandas as pd
import re
def prediction(path_in, path_out, model, categories):
    """Predict a category for every row of a TSV file and write the results.

    Args:
        path_in: Path to a tab-separated file without a header row. The
            column labelled 1 is dropped and the two remaining columns are
            named ``year`` and ``text``.
        path_out: Path of the file to write; it receives one predicted
            category name per line, in input order.
        model: Object exposing ``predict(example)``; its return value is
            looked up among the values of ``categories``.
        categories: Mapping from category name to the label the model uses.

    A prediction that matches no label in ``categories`` is written as the
    string ``None``.
    """
    data = pd.read_csv(path_in, header=None, sep='\t')
    data = data.drop(1, axis=1)
    data.columns = ['year', 'text']

    data['train_input'] = data.apply(lambda row: to_vowpalwabbit(row, categories), axis=1)

    # Invert the mapping once, up front: it does not change between
    # examples, so rebuilding it for every prediction is wasted work.
    label_to_name = {label: name for name, label in categories.items()}

    with open(path_out, 'w', encoding='utf-8') as file:
        for example in data['train_input']:
            predicted = model.predict(example)
            file.write(str(label_to_name.get(predicted)) + '\n')
def to_vowpalwabbit(row, categories):
    """Format one data row as a Vowpal Wabbit text example.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with ``'year'`` and
            ``'text'`` entries and, optionally, a ``'category'`` entry.
        categories: Mapping from category name to label.

    Returns:
        The string ``"<label> | year:<year> text:<cleaned text>\\n"``. The
        label is empty when the row has no ``'category'`` entry or its
        category is not a key of ``categories``.
    """
    text = row['text'].replace('\n', ' ').lower()
    # Keep only letters, spaces, hyphens and apostrophes. The hyphen must be
    # last in the class to be literal: written as " -'" it forms a range
    # from space to apostrophe, which keeps !"#$%& and drops hyphens.
    text = re.sub(r"[^a-zA-Z '-]", '', text)
    # Collapse runs of spaces, then trim: removing punctuation above can
    # leave spaces at either end.
    text = re.sub(r" +", ' ', text).strip()
    year = row['year']
    try:
        category = categories[row['category']]
    except KeyError:
        # Unlabelled row (e.g. at prediction time) or unknown category.
        category = ''

    # NOTE(review): Vowpal Wabbit reads "name:value" as a feature with a
    # numeric value, so "text:<word>" may not be parsed as intended --
    # confirm against the VW input-format documentation.
    return f"{category} | year:{year} text:{text}\n"
# Training inputs: read the header-less TSV, discard the column labelled 1
# and name the two columns that remain.
x_train = pd.read_csv('train/in.tsv', sep='\t', header=None).drop(columns=1)
x_train.columns = ['year', 'text']

# Expected labels: a single column holding the category of each row.
y_train = pd.read_csv('train/expected.tsv', sep='\t', header=None)
y_train.columns = ['category']

# Keep only the first 800000 rows of each frame.
x_train = x_train.iloc[:800000]
y_train = y_train.iloc[:800000]

# Place inputs and labels side by side in one frame.
data = pd.concat([x_train, y_train], axis=1)
x_train
year text
0 2004.508197 Sudan claims it is disarming militias
1 2008.442623 Bluffer's guide to Euro 2008
2 2012.587432 Ennis tallies her highest first day total
3 2009.071233 Sri Lanka continues to battle Tamil Tigers
4 1997.345205 Talks today to avert new health service strike
... ... ...
799995 2010.876712 Top league stars among 135 listed online
799996 2000.879452 Cabinet to consider options for animal disposal
799997 2004.915068 Last orders for Bewley's this evening
799998 2014.797260 Toulon; Ospreys and Toulouse win Champions Cup...
799999 1999.019178 Volatile year in store for the markets

800000 rows × 2 columns

# Give every distinct category an integer label, numbered from 1 in order
# of first appearance in the training data.
categories = {
    name: label
    for label, name in enumerate(data['category'].unique(), start=1)
}

print(categories)

# Render every training row as a Vowpal Wabbit text example.
data['train_input'] = data.apply(lambda row: to_vowpalwabbit(row, categories), axis=1)

# Take the one-against-all class count from the data instead of hard-coding
# it, so the model always agrees with the labels assigned above.
model = vowpalwabbit.Workspace(f'--oaa {len(categories)} --learning_rate 0.99')

# Single pass over the training examples.
for example in data['train_input']:
    model.learn(example)
{'news': 1, 'sport': 2, 'opinion': 3, 'business': 4, 'culture': 5, 'lifestyle': 6, 'removed': 7}
# Write predictions for each evaluation split next to its input file.
for split in ('dev-0', 'test-A', 'test-B'):
    prediction(f'{split}/in.tsv', f'{split}/out.tsv', model, categories)
!jupyter nbconvert --to script run.ipynb
[NbConvertApp] Converting notebook run.ipynb to script
[NbConvertApp] Writing 2030 bytes to run.py