ireland-news-headlines/run.py

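"""Train a Vowpal Wabbit one-against-all classifier on Irish news headline
categories and write predictions for the dev-0, test-A and test-B sets."""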
import re

import pandas as pd
import vowpalwabbit
from sklearn.preprocessing import LabelEncoder
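# Lower-case each headline, strip newlines and non-letter characters, and
# collapse repeated spaces so the text is safe to embed in a VW example line.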
def clean_data(data):
    return [
        re.sub(' +', ' ',
               re.sub('[^a-zA-Z -]', '',
                      elem[0].replace('\n', ' ').strip().lower()))
        for elem in data.values
    ]
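# Run the trained model on cleaned texts and map VW's 1-based predictions
# back to the original category names.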
def predict(data, model, encoder):
    preds = encoder.inverse_transform(
        [model.predict(f' | text:{el}\n') - 1 for el in data])
    return preds
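# Write one prediction per line, matching the expected.tsv layout.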
def write_results(data, path):
    with open(path, 'w') as f:
        for line in data:
            f.write(f'{line}\n')
    print(f"Data written to the file {path}")
if __name__ == '__main__':
    # * Loading & cleaning data
    training_data = pd.read_csv('train/in.tsv',
                                delimiter='\t',
                                usecols=[2],
                                names=['text'])
    expected = pd.read_csv('train/expected.tsv',
                           delimiter='\t',
                           names=['class'])
    cleaned_training = clean_data(training_data)

    # * Encoding the categories
    fit_list = sorted(expected['class'].unique())
    print("Categories: ", fit_list)
    label_enc = LabelEncoder()
    expected['class'] = label_enc.fit_transform(expected['class']) + 1

    # * Training
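    # VW's --oaa mode expects integer labels in 1..k, hence the +1 above and
    # the matching -1 in predict(); 7 is assumed to equal len(fit_list) here.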
    wabbit = vowpalwabbit.Workspace('--oaa 7')
    for text, category in zip(cleaned_training, expected['class']):
        wabbit.learn(f'{category} | text:{text}\n')
    # * Predictions
    for path in ['dev-0/', 'test-A/', 'test-B/']:
        to_predict = clean_data(
            pd.read_csv(f'{path}in.tsv', delimiter='\t', names=['text']))
        predictions = predict(to_predict, wabbit, label_enc)
        write_results(predictions, f'{path}out.tsv')