"""Train a Vowpal Wabbit one-against-all text classifier and write predictions.

Reads tab-separated training text and labels, trains a multiclass model, and
writes one predicted label per line for each evaluation directory.
"""
# NOTE(review): `label` appears unused (likely an IDE auto-import); kept as-is.
from cProfile import label
import re

import pandas as pd

# NOTE: vowpalwabbit and sklearn are imported inside main() so that the text
# helpers below stay importable without the training stack installed.

# Anything that is not an ASCII letter, a space or a hyphen is dropped.
_DISALLOWED_CHARS = re.compile('[^a-zA-Z -]')
_SPACE_RUN = re.compile(' +')

TRAIN_INPUT = 'train/in.tsv'
TRAIN_EXPECTED = 'train/expected.tsv'
EVAL_DIRS = ('dev-0/', 'test-A/', 'test-B/')


def _vw_features(text):
    """Return the feature part of a VW example: namespace ``text`` + words.

    Training and prediction both go through this helper so the two sides
    always present the model with identically formatted features.
    """
    return f'|text {text}'


def clean_data(data):
    """Normalise the first column of *data* into lowercase letter-only text.

    Args:
        data: object exposing ``.values`` as an iterable of rows (e.g. a
            pandas DataFrame); only the first element of each row is used.

    Returns:
        A list of strings, one per row: newlines turned into spaces,
        lowercased, every character other than ASCII letters, spaces and
        hyphens removed, runs of spaces collapsed, and surrounding
        whitespace stripped. Missing cells become the empty string.
    """
    cleaned = []
    for row in data.values:
        raw = row[0]
        if not isinstance(raw, str):
            # Empty cells are read as NaN; treat them as empty text instead
            # of failing on a missing str method.
            raw = '' if pd.isna(raw) else str(raw)
        text = raw.replace('\n', ' ').strip().lower()
        text = _DISALLOWED_CHARS.sub('', text)
        text = _SPACE_RUN.sub(' ', text)
        # Removing characters can expose new leading/trailing spaces
        # (e.g. "123 foo"), so strip once more at the end.
        cleaned.append(text.strip())
    return cleaned


def predict(data, model, encoder):
    """Predict a class for every cleaned text in *data*.

    Args:
        data: iterable of cleaned text strings.
        model: object whose ``predict(example)`` returns a 1-based integer
            class id for a VW-formatted example string.
        encoder: fitted encoder whose ``inverse_transform`` maps 0-based
            integer ids back to the original class values.

    Returns:
        Whatever ``encoder.inverse_transform`` returns for the predicted ids.
    """
    # The model is trained on labels shifted by +1, so shift back to the
    # encoder's 0-based ids before decoding.
    class_ids = [model.predict(_vw_features(text)) - 1 for text in data]
    return encoder.inverse_transform(class_ids)


def write_results(data, path):
    """Write each item of *data* on its own line to *path* and report it."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(f'{line}\n' for line in data)
    print(f"Data written to the file {path}")


def main():
    """Train on the training files and write predictions for each eval set."""
    import vowpalwabbit
    from sklearn.preprocessing import LabelEncoder

    # * Loading & cleaning data
    training_data = pd.read_csv(TRAIN_INPUT,
                                delimiter='\t',
                                usecols=[2],
                                names=['text'])
    expected = pd.read_csv(TRAIN_EXPECTED, delimiter='\t', names=['class'])
    cleaned_training = clean_data(training_data)

    # * Encoding the categories
    categories = sorted(expected['class'].unique())
    print("Categories: ", categories)
    label_enc = LabelEncoder()
    # The labels passed to the model are kept 1-based, hence the +1 shift.
    expected['class'] = label_enc.fit_transform(expected['class']) + 1

    # * Training
    # Size the one-against-all reduction from the data instead of
    # hard-coding the number of classes.
    wabbit = vowpalwabbit.Workspace(f'--oaa {len(label_enc.classes_)}')
    for text, category in zip(cleaned_training, expected['class']):
        wabbit.learn(f'{category} {_vw_features(text)}')

    # * Predictions
    for path in EVAL_DIRS:
        # NOTE(review): unlike the training read, no `usecols` is given
        # here -- confirm the evaluation files are laid out so that the
        # column picked up by clean_data() is the text column.
        to_predict = clean_data(
            pd.read_csv(f'{path}in.tsv', delimiter='\t', names=['text']))
        predictions = predict(to_predict, wabbit, label_enc)
        write_results(predictions, f'{path}out.tsv')


if __name__ == '__main__':
    main()