ireland-news-headlines/run.ipynb at master

2022-06-14 23:05:20 +02:00

8.5 KiB

Raw Permalink Blame History

import vowpalwabbit
import pandas as pd
import re

def prediction(path_in, path_out, model, categories):
    """Predict a category for every row of a TSV file, one label per line.

    Args:
        path_in: Path of a headerless, tab-separated input file. The column
            labelled 1 is dropped and the two remaining columns are named
            ``year`` and ``text``.
        path_out: Path of the UTF-8 text file that receives the predictions.
        model: Object exposing ``predict(example)``; its return value is
            looked up among the values of ``categories``.
        categories: Mapping from category name to the label the model emits.

    A prediction that matches no value in ``categories`` is written as the
    literal text ``None``.
    """
    data = pd.read_csv(path_in, header=None, sep='\t')
    data = data.drop(1, axis=1)
    data.columns = ['year', 'text']

    data['train_input'] = data.apply(lambda row: to_vowpalwabbit(row, categories), axis=1)

    # Invert the mapping once; it does not change between examples, so there
    # is no reason to rebuild it for every row.
    label_to_name = {label: name for name, label in categories.items()}

    with open(path_out, 'w', encoding='utf-8') as file:
        for example in data['train_input']:
            predicted = model.predict(example)
            file.write(f"{label_to_name.get(predicted)}\n")

def to_vowpalwabbit(row, categories):
    """Format one headline row as a Vowpal Wabbit text example.

    Args:
        row: Mapping-like object with ``year`` and ``text`` entries and an
            optional ``category`` entry (a dict or a pandas row both work).
        categories: Mapping from category name to the label to emit.

    Returns:
        A newline-terminated string ``"<label> |meta year:<year> |text <words>"``.
        The label is empty when the row has no ``category`` entry or the
        category is not a key of ``categories``.
    """
    raw_text = row['text']
    # pandas reads an empty field as NaN (a float), which has no str methods.
    if not isinstance(raw_text, str):
        raw_text = ''

    text = raw_text.replace('\n', ' ').lower()
    # Keep letters, spaces, apostrophes and hyphens only. The hyphen is placed
    # last in the class so it is a literal rather than forming a character
    # range (" -'" spans space..apostrophe and lets !"#$%& through).
    text = re.sub(r"[^a-z '-]", '', text)
    # Removing characters can leave runs of spaces or spaces at the edges.
    text = re.sub(r" +", ' ', text).strip()

    year = row['year']
    try:
        category = categories[row['category']]
    except KeyError:
        category = ''

    # "name:value" is reserved for numeric feature values, so the headline
    # words go into their own namespace as plain tokens instead of following
    # a "text:" prefix, which would turn the first word into a bogus value.
    return f"{category} |meta year:{year} |text {text}\n"

# Training inputs: headerless TSV; the column labelled 1 is discarded and the
# two remaining columns are named.
x_train = pd.read_csv('train/in.tsv', sep='\t', header=None).drop(1, axis=1)
x_train.columns = ['year', 'text']

# Expected labels: headerless TSV whose single column is named 'category'.
y_train = pd.read_csv('train/expected.tsv', sep='\t', header=None)
y_train.columns = ['category']

# Keep only the first 800000 rows of each frame.
x_train, y_train = x_train[:800000], y_train[:800000]

# Put inputs and labels side by side in one frame.
data = pd.concat([x_train, y_train], axis=1)

# Bare expression: shown as the cell result when run in a notebook.
x_train

	year	text
0	2004.508197	Sudan claims it is disarming militias
1	2008.442623	Bluffer's guide to Euro 2008
2	2012.587432	Ennis tallies her highest first day total
3	2009.071233	Sri Lanka continues to battle Tamil Tigers
4	1997.345205	Talks today to avert new health service strike
...	...	...
799995	2010.876712	Top league stars among 135 listed online
799996	2000.879452	Cabinet to consider options for animal disposal
799997	2004.915068	Last orders for Bewley's this evening
799998	2014.797260	Toulon; Ospreys and Toulouse win Champions Cup...
799999	1999.019178	Volatile year in store for the markets

800000 rows × 2 columns

# Assign each distinct category a 1-based integer label, in order of first
# appearance in the training data.
categories = {
    name: label
    for label, name in enumerate(data['category'].unique(), start=1)
}

print(categories)

data['train_input'] = data.apply(lambda row: to_vowpalwabbit(row, categories), axis=1)

# Take the class count for --oaa from the label map instead of hard-coding it,
# so the model and the labels cannot disagree if the set of categories changes.
model = vowpalwabbit.Workspace(f'--oaa {len(categories)} --learning_rate 0.99')

# Feed the examples to the learner one at a time, in frame order.
for example in data['train_input']:
    model.learn(example)

{'news': 1, 'sport': 2, 'opinion': 3, 'business': 4, 'culture': 5, 'lifestyle': 6, 'removed': 7}

# Write predictions for each evaluation split next to its input file.
for source_path, target_path in (
    ('dev-0/in.tsv', 'dev-0/out.tsv'),
    ('test-A/in.tsv', 'test-A/out.tsv'),
    ('test-B/in.tsv', 'test-B/out.tsv'),
):
    prediction(source_path, target_path, model, categories)

# Notebook shell escape: export this notebook (run.ipynb) as a plain script.
!jupyter nbconvert --to script run.ipynb

[NbConvertApp] Converting notebook run.ipynb to script
[NbConvertApp] Writing 2030 bytes to run.py

8.5 KiB Raw Permalink Blame History Unescape Escape

8.5 KiB

Raw Permalink Blame History