26 lines
831 B
Python
26 lines
831 B
Python
import pandas as pd
|
|
import csv
|
|
import regex as re
|
|
|
|
def clean_text(text):
    """Return a lower-cased, single-line, punctuation-free copy of ``text``.

    Processing order:

    1. The text is lower-cased.
    2. Every hyphen that is immediately followed by the literal two-character
       sequence backslash + ``n`` is removed together with that sequence, so a
       word hyphenated across a line break is glued back together.
    3. Every remaining literal backslash + ``n`` sequence becomes one space.
    4. Every character in the Unicode punctuation category (``P``) is removed.

    Relies on ``re`` being the third-party ``regex`` module, which supports
    Unicode property classes in patterns.
    """
    lowered = text.lower()
    # Hyphenated breaks must be handled before the plain ones, otherwise the
    # hyphen would be left behind next to a space.
    rejoined = lowered.replace('-\\n', '')
    single_line = rejoined.replace('\\n', ' ')
    return re.sub(r'\p{P}', '', single_line)
|
|
|
|
|
|
# Load the training inputs and the expected labels. Both files are read as
# tab-separated with no header row; csv.QUOTE_NONE makes quote characters
# ordinary text instead of field delimiters.
# on_bad_lines='skip' silently drops malformed rows. It replaces the
# error_bad_lines=False / warn_bad_lines=False pair, which was deprecated in
# pandas 1.3 and removed in pandas 2.0 (requires pandas >= 1.3).
# NOTE(review): a row skipped in only one of the two files shifts the
# positional pairing done by the concat below - verify on the real data.
train_data = pd.read_csv(
    'train/in.tsv.xz',
    sep='\t',
    on_bad_lines='skip',
    header=None,
    quoting=csv.QUOTE_NONE,
)
train_labels = pd.read_csv(
    'train/expected.tsv',
    sep='\t',
    on_bad_lines='skip',
    header=None,
    quoting=csv.QUOTE_NONE,
)

# Keep only input columns 6 and 7, then place the label frame next to them
# (aligned on the row index); its first column is addressed as column 0.
train_data = train_data[[6, 7]]
train_data = pd.concat([train_data, train_labels], axis=1)

# One string per row, in the order: column 6, label (column 0), column 7.
# The pieces are joined with no separator, exactly as before.
# NOTE(review): presumably columns 6 and 7 are the text before and after the
# label - confirm against the dataset layout, and whether a space is needed.
# NOTE(review): if any of the three values is missing the sum is NaN, which
# str() below turns into the line "nan".
train_data['text'] = train_data[6] + train_data[0] + train_data[7]
train_data = train_data[['text']]

# Explicit UTF-8 so the output does not depend on the platform's default
# encoding. Iterating the column directly avoids building a Series per row,
# which is what iterrows() did.
with open('processed_train.txt', 'w', encoding='utf-8') as file:
    for raw_text in train_data['text']:
        file.write(clean_text(str(raw_text)) + '\n')
|
|
|