"""Build a plain-text training corpus from the train TSV files.

Reads columns 6 and 7 of ``train/in.tsv.xz`` and column 0 of
``train/expected.tsv``, concatenates them per row as
``column 6 + expected + column 7``, normalizes the result with
``clean_text`` and writes one line per row to ``processed_train.txt``.
"""

import csv

import pandas as pd
import regex as re

TRAIN_INPUT_PATH = 'train/in.tsv.xz'
TRAIN_EXPECTED_PATH = 'train/expected.tsv'
OUTPUT_PATH = 'processed_train.txt'

# Compiled once; ``\p{P}`` (any Unicode punctuation) needs the third-party
# ``regex`` module -- the stdlib ``re`` does not support ``\p`` classes.
_PUNCTUATION = re.compile(r'\p{P}')


def clean_text(text):
    """Return *text* lowercased, with encoded line breaks and punctuation removed.

    The two-character sequence backslash + ``n`` (not a real newline) is
    handled in two steps: when preceded by a hyphen the whole sequence is
    dropped, otherwise it is replaced by a single space. Afterwards every
    Unicode punctuation character is removed.
    """
    # NOTE(review): presumably the corpus stores line breaks as a literal
    # backslash-n and "-" + backslash-n marks a hyphenated word split
    # across lines -- confirm against the data.
    text = text.lower().replace('-\\n', '').replace('\\n', ' ')
    return _PUNCTUATION.sub('', text)


def _read_tsv(path):
    """Read a headerless, unquoted TSV file, silently skipping malformed lines."""
    options = dict(sep='\t', header=None, quoting=csv.QUOTE_NONE)
    try:
        # ``error_bad_lines`` / ``warn_bad_lines`` were removed in pandas 2.0;
        # ``on_bad_lines='skip'`` is the equivalent since pandas 1.3.
        return pd.read_csv(path, on_bad_lines='skip', **options)
    except TypeError:
        # pandas < 1.3 does not know ``on_bad_lines``; use the legacy keywords.
        return pd.read_csv(
            path, error_bad_lines=False, warn_bad_lines=False, **options
        )


train_data = _read_tsv(TRAIN_INPUT_PATH)
train_labels = _read_tsv(TRAIN_EXPECTED_PATH)

# NOTE(review): rows are paired by position; if a malformed line is skipped
# in only one of the two files, every following row is paired with the
# wrong label -- verify both files parse to the same number of rows.
train_data = train_data[[6, 7]]
train_data = pd.concat([train_data, train_labels], axis=1)

# After the concat, column 0 is the label column coming from train_labels.
# NOTE(review): the three parts are joined without any separator, and a
# missing value in any part turns the whole row into NaN (written out as
# the literal text "nan") -- confirm both are intended.
train_data['text'] = train_data[6] + train_data[0] + train_data[7]
train_data = train_data[['text']]

# Explicit UTF-8 so the output does not depend on the platform's locale.
with open(OUTPUT_PATH, 'w', encoding='utf-8') as out_file:
    out_file.writelines(
        clean_text(str(value)) + '\n' for value in train_data['text']
    )