59 lines
1.8 KiB
Python
59 lines
1.8 KiB
Python
|
from cProfile import label
|
||
|
import pandas as pd, vowpalwabbit
|
||
|
from sklearn.preprocessing import LabelEncoder
|
||
|
import re
|
||
|
|
||
|
|
||
|
def clean_data(data):
    """Normalise the first column of ``data`` into lowercase, letters-only text.

    For every row, the value in column 0 has newlines replaced by spaces,
    is stripped and lowercased, loses every character other than ASCII
    letters, spaces and hyphens, and finally has runs of spaces collapsed
    into a single space.

    Args:
        data: Object exposing ``.values`` as an iterable of rows (e.g. a
            pandas DataFrame); only the first element of each row is read.

    Returns:
        A list of cleaned strings, one per input row, in row order.
    """
    # Compile once per call rather than resolving the patterns for every row.
    disallowed = re.compile(r'[^a-zA-Z -]')
    space_run = re.compile(r' +')

    cleaned = []
    for row in data.values:
        raw = row[0]
        # Missing fields do not arrive as strings (pandas uses NaN for them).
        # Treat any non-string cell as empty text so the result stays aligned
        # row-for-row with the input instead of raising AttributeError.
        if not isinstance(raw, str):
            raw = ''
        text = raw.replace('\n', ' ').strip().lower()
        text = disallowed.sub('', text)
        cleaned.append(space_run.sub(' ', text))
    return cleaned
|
||
|
|
||
|
|
||
|
def predict(data, model, encoder):
    """Classify every text in ``data`` and map the results back to labels.

    Each text is wrapped into an example string and handed to
    ``model.predict``; 1 is subtracted from every prediction before the
    whole batch is passed through ``encoder.inverse_transform``.

    Args:
        data: Iterable of text strings.
        model: Object with a ``predict(example)`` method returning a number.
        encoder: Object with an ``inverse_transform(indices)`` method.

    Returns:
        Whatever ``encoder.inverse_transform`` returns for the list of
        shifted predictions.
    """
    # NOTE(review): the example layout used here (' | text: <words>') is not
    # the same as the one the training loop in the __main__ block feeds to the
    # model ('| text:<words>') - confirm which one is intended.
    class_indices = []
    for el in data:
        raw_prediction = model.predict(f' | text: {el}\n')
        class_indices.append(raw_prediction - 1)
    return encoder.inverse_transform(class_indices)
|
||
|
|
||
|
|
||
|
def write_results(data, path):
    """Write every item of ``data`` to ``path``, one item per line.

    Args:
        data: Iterable of values; each one is rendered through an f-string
            and followed by a newline.
        path: Destination file path; an existing file is overwritten.
    """
    # Fix the encoding explicitly: without it open() falls back to the
    # locale's preferred encoding, so non-ASCII labels could be written
    # differently (or fail to encode) depending on the machine.
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(f'{line}\n' for line in data)
    print(f"Data written to the file {path}")
|
||
|
|
||
|
|
||
|
if __name__ == '__main__':
    # Script entry point: train a one-against-all Vowpal Wabbit classifier on
    # train/ and write predictions for the dev-0, test-A and test-B splits.

    # * Loading & cleaning data
    training_data = pd.read_csv('train/in.tsv',
                                delimiter='\t',
                                usecols=[2],
                                names=['text'])
    expected = pd.read_csv('train/expected.tsv',
                           delimiter='\t',
                           names=['class'])
    cleaned_training = clean_data(training_data)

    # * Encoding the categories
    fit_list = sorted(expected['class'].unique())
    print("Categories: ", fit_list)
    label_enc = LabelEncoder()
    # Shift the encoder's 0-based codes to 1-based labels for the model;
    # predict() subtracts 1 again before decoding.
    expected['class'] = label_enc.fit_transform(expected['class']) + 1

    # * Training
    # Size the one-against-all model from the labels actually present rather
    # than a hard-coded count, so label codes and model outputs always match.
    wabbit = vowpalwabbit.Workspace(f'--oaa {len(label_enc.classes_)}')
    # NOTE(review): the example layout here ('| text:<words>') differs from
    # the one predict() builds (' | text: <words>') - confirm which is meant.
    for text, category in zip(cleaned_training, expected['class']):
        wabbit.learn(f'{category} | text:{text}\n')

    # * Predictions
    for path in ['dev-0/', 'test-A/', 'test-B/']:
        # NOTE(review): unlike the training read, no usecols is given here;
        # verify that the text still ends up in the first data column.
        to_predict = clean_data(
            pd.read_csv(f'{path}in.tsv', delimiter='\t', names=['text']))
        predictions = predict(to_predict, wabbit, label_enc)
        write_results(predictions, f'{path}out.tsv')
|