57 lines
1.6 KiB
Python
57 lines
1.6 KiB
Python
import pandas as pd
|
|
import re
|
|
import vowpalwabbit
|
|
from sklearn.preprocessing import LabelEncoder
|
|
|
|
|
|
def format_data(data):
    """Normalize the first column of ``data`` into cleaned text strings.

    Each row's first value is converted to ``str``, newlines are replaced by
    spaces, the text is lower-cased and stripped of surrounding whitespace,
    and finally every character other than ASCII letters, spaces and hyphens
    is removed.

    Args:
        data: An object exposing ``.values`` as an iterable of rows (for
            example a pandas DataFrame); only element ``0`` of each row is
            used.

    Returns:
        A list of cleaned strings, one per row, in row order.
    """
    # The range must be ``A-Z``: ``A-z`` also spans the ASCII characters
    # between 'Z' and 'a' ('[', '\\', ']', '^', '_', '`'), which would then
    # survive the clean-up.
    disallowed = re.compile(r'[^a-zA-Z -]')
    return [
        disallowed.sub('', str(row[0]).replace('\n', ' ').lower().strip())
        for row in data.values
    ]
|
|
|
|
def load_train_data():
    """Read the training split and prepare texts, labels and the encoder.

    Reads column index 2 of ``train/in.tsv`` as the text and the single
    column of ``train/expected.tsv`` as the class, prints the sorted list of
    distinct classes, and replaces the class column with integer codes
    produced by a ``LabelEncoder`` shifted up by one (so the smallest code
    is 1).

    Returns:
        A tuple ``(texts, labels, encoder)``: the cleaned texts returned by
        ``format_data``, the labels DataFrame whose ``'class'`` column holds
        the shifted integer codes, and the fitted ``LabelEncoder``.
    """
    texts = pd.read_csv(
        'train/in.tsv',
        delimiter='\t',
        usecols=[2],
        names=['text'],
    )
    labels = pd.read_csv(
        'train/expected.tsv',
        delimiter='\t',
        names=['class'],
    )

    categories = sorted(labels['class'].unique())
    print("Categories: ", categories)

    encoder = LabelEncoder()
    codes = encoder.fit_transform(labels['class'])
    # Shift the zero-based encoder codes so that they start at 1.
    labels['class'] = codes + 1

    return format_data(texts), labels, encoder
|
|
|
|
def predict(data, model):
    """Return ``model.predict(...) - 1`` for every text in ``data``.

    Args:
        data: Iterable of texts to classify.
        model: Object with a ``predict(example)`` method that accepts the
            example string and returns a number.

    Returns:
        A list with one entry per input text: the model's prediction minus
        one.
    """
    # NOTE(review): the example here is built as ' || text: <text>' while
    # training in main() uses '<label> | text:<text>' - confirm that the
    # difference in format is intended.
    results = []
    for text in data:
        example = f' || text: {text}\n'
        results.append(model.predict(example) - 1)
    return results
|
|
|
|
def write_res(data, path):
    """Write each item of ``data`` on its own line to the file at ``path``.

    Args:
        data: Iterable of values; each one is formatted with ``str.format``
            semantics (f-string) and followed by a newline.
        path: Path of the output file. It is opened in write mode, so any
            existing content is replaced.
    """
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(f'{line}\n' for line in data)
    # ``path`` already is the full output file path, so report it as-is
    # instead of appending another '/out.tsv' to it.
    print(f"Data written {path}")
|
|
|
|
|
|
def main():
    """Train a one-against-all Vowpal Wabbit model and write predictions.

    Loads the training data, feeds every (text, label) pair to the model
    once, then for each of ``dev-0/``, ``test-A/`` and ``test-B/`` reads
    ``in.tsv``, predicts a class for every row and writes the decoded class
    names to ``out.tsv`` in the same directory.
    """
    texts, exp_df, label_enc = load_train_data()

    # Size the one-against-all reduction from the classes actually present in
    # the training labels rather than hard-coding 7; labels are 1..n_classes.
    n_classes = len(label_enc.classes_)
    wabbit = vowpalwabbit.Workspace(f'--oaa {n_classes}')
    # NOTE(review): training examples use '| text:<text>' while predict()
    # builds ' || text: <text>' - confirm that the difference is intended.
    for text, category in zip(texts, exp_df['class']):
        wabbit.learn(f'{category} | text:{text}\n')

    for path in ['dev-0/', 'test-A/', 'test-B/']:
        to_predict = format_data(
            pd.read_csv(f'{path}in.tsv', delimiter='\t', names=['text'])
        )
        # predict() yields zero-based codes, which the encoder maps back to
        # the original class names.
        predictions = label_enc.inverse_transform(predict(to_predict, wabbit))
        write_res(predictions, f'{path}out.tsv')
|
|
|
|
|
|
|
|
# Run the training and prediction pipeline only when this file is executed as
# a script, not when it is imported.
if __name__ == '__main__':
    main()