challenging-america-word-ga.../process_test.py
2022-04-11 11:05:46 +02:00

26 lines
831 B
Python

import pandas as pd
import csv
import regex as re
def clean_text(text):
    """Return *text* lowercased, flattened to one line, and without punctuation.

    Line breaks are matched as the literal two-character sequence
    backslash + "n" (not as real newline characters).
    """
    lowered = text.lower()
    # A hyphen directly followed by a line-break marker is dropped together
    # with the marker, gluing the two surrounding fragments back together.
    rejoined = lowered.replace('-\\n', '')
    # Every remaining line-break marker becomes a single space.
    flattened = rejoined.replace('\\n', ' ')
    # `\p{P}` is the Unicode punctuation property; it relies on `re` being
    # the third-party `regex` package, as imported at the top of this file.
    return re.sub(r'\p{P}', '', flattened)
def _read_tsv(path):
    """Load a headerless, unquoted, tab-separated file, skipping malformed rows."""
    # `on_bad_lines='skip'` is the replacement for the
    # `error_bad_lines=False, warn_bad_lines=False` pair, which was deprecated
    # in pandas 1.3 and removed in pandas 2.0 (where it raises TypeError).
    return pd.read_csv(
        path,
        sep='\t',
        header=None,
        quoting=csv.QUOTE_NONE,
        on_bad_lines='skip',
    )


train_data = _read_tsv('train/in.tsv.xz')
train_labels = _read_tsv('train/expected.tsv')

# NOTE(review): malformed rows are dropped from each file independently, so a
# row skipped in only one of them would shift the positional input/label
# pairing done by the concat below -- confirm both files load without skips.
train_data = train_data[[6, 7]]
train_data = pd.concat([train_data, train_labels], axis=1)

# After the concat, columns 6 and 7 come from in.tsv and column 0 is the one
# contributed by expected.tsv (presumably left context, expected word and
# right context -- TODO confirm against the dataset description).
# NOTE(review): the three parts are joined with no separator; verify that the
# contexts already carry the surrounding whitespace, otherwise words are glued.
train_data['text'] = train_data[6] + train_data[0] + train_data[7]
train_data = train_data[['text']]

# Write one cleaned record per line. The encoding is pinned so the output does
# not depend on the platform's default locale encoding.
with open('processed_train.txt', 'w', encoding='utf-8') as file:
    # str() keeps non-string cells (e.g. NaN from a missing field) from
    # breaking the .lower() call inside clean_text.
    file.writelines(
        clean_text(str(text)) + '\n' for text in train_data['text']
    )