synthetic_errors/preprocess_plewi.py
2022-04-26 19:24:23 +02:00

20 lines
857 B
Python

import regex as re
# Concatenate the ./plewic/ splits into two files: every *.co input goes to
# ./plewi_co.txt and every *.er input goes to ./plewi_er.txt, in the order
# listed below. Each line is normalised on the way through (see the loop).
filenames = ['test.co', 'test.er', 'train.co', 'train.er', 'tune.co', 'tune.er']
output_filenames = [
    './plewi_co.txt',
    './plewi_er.txt',
    './plewi_co.txt',
    './plewi_er.txt',
    './plewi_co.txt',
    './plewi_er.txt',
]

# Matches a whole line of the form  ! '<text>'  or  ! "<text>" : an
# exclamation mark, one whitespace character, then text wrapped in a matching
# pair of single or double quotes. Compiled once because it runs per line.
_QUOTED_LINE = re.compile(r"""^!\s(['"]).*\1$""")

# Outputs already written to during this run.
_started_outputs = set()

for filename, output_filename in zip(filenames, output_filenames):
    # Several inputs share one output. Truncate the output the first time it
    # is used and append afterwards; opening it with 'w' on every pass would
    # discard the earlier splits and keep only the last one.
    mode = 'a' if output_filename in _started_outputs else 'w'
    _started_outputs.add(output_filename)

    with open('./plewic/' + filename, encoding="utf-8", mode='r') as f, \
            open(output_filename, encoding="utf-8", mode=mode) as f2:
        for line in f:
            # Drop the line terminator and turn tabs into plain spaces.
            new_line = line.replace("\n", "").replace("\t", " ")
            if _QUOTED_LINE.match(new_line):
                # Remove the 3-character "! <quote>" prefix and the closing
                # quote, keeping only the text in between.
                new_line = new_line[3:-1]
            f2.write(new_line.strip() + "\n")