45 lines
1.1 KiB
Python
45 lines
1.1 KiB
Python
|
#!/usr/bin/env python
|
||
|
# coding: utf-8
|
||
|
|
||
|
# In[9]:
|
||
|
|
||
|
|
||
|
import regex as re
|
||
|
from tqdm.notebook import tqdm
|
||
|
|
||
|
def _clean(text):
    r"""Flatten escaped breaks, squeeze runs of spaces, and delete punctuation.

    The literal two-character sequences ``\n`` and ``\t`` (a backslash
    followed by a letter, not real control characters) are turned into
    spaces; a hyphen directly followed by a literal ``\n`` is deleted so the
    two halves of the broken word are joined.

    Args:
        text: String to normalize.

    Returns:
        The normalized string with every character in the Unicode
        punctuation category removed. The result is not trimmed, and no
        further space squeezing happens after the punctuation is removed.
    """
    text = text.replace('-\\n', '').replace('\\n', ' ').replace('\\t', ' ')
    # Collapse each run of two or more spaces in a single pass. A loop of the
    # form "replace one space with one space while a space is present" can
    # never finish, so the run is matched explicitly instead.
    text = re.sub(r' {2,}', ' ', text)
    # `re` is the third-party `regex` package here; \p{P} (Unicode
    # punctuation) is not available in the standard-library `re` module.
    return re.sub(r'\p{P}', '', text)
|
||
|
|
||
|
def clean(text):
    r"""Normalize a text field to punctuation-free, single-spaced text.

    Steps, in order:

    1. A hyphen directly followed by the literal two-character sequence
       ``\n`` is deleted (joining the split word); remaining literal ``\n``
       and ``\t`` sequences become spaces.
    2. A comma or hyphen sitting between two word characters is deleted.
    3. Every character in the Unicode punctuation category is deleted.
    4. Every run of whitespace (including real newlines) becomes one space.
    5. Leading and trailing whitespace is stripped.

    Args:
        text: String to normalize.

    Returns:
        The cleaned string; it contains no punctuation, no leading or
        trailing whitespace, and no consecutive whitespace characters.
    """
    text = text.replace('-\\n', '').replace('\\n', ' ').replace('\\t', ' ')
    text = re.sub(r'(?<=\w)[,-](?=\w)', '', text)
    # `re` is the third-party `regex` package here; \p{P} (Unicode
    # punctuation) is not available in the standard-library `re` module.
    text = re.sub(r'\p{P}', '', text)
    # Whitespace is collapsed only after punctuation is gone: deleting a
    # free-standing mark such as the dash in "a - b" would otherwise leave
    # two adjacent spaces in the result. \s+ also covers real newlines.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
|
||
|
|
||
|
def generate_file(input_path, expected_path, output_path, total=432022):
    """Write one cleaned text line per pair of input row and expected word.

    The input file is read as tab-separated rows. For each row, the seventh
    and eighth fields (indexes 6 and 7) are cleaned with ``clean`` and written
    around the matching line of the expected file as
    ``"<field 6> <expected> <field 7>"``.

    Args:
        input_path: Path of the UTF-8 tab-separated input file.
        expected_path: Path of the UTF-8 file holding one expected value per
            input row.
        output_path: Path of the UTF-8 file to create or overwrite.
        total: Number of rows reported to the progress bar. It only affects
            the progress display; the default keeps the previously hard-coded
            value (presumably the row count of the training set - confirm
            before relying on it for other files).

    Raises:
        IndexError: If an input row has fewer than eight tab-separated fields.
        OSError: If any of the three files cannot be opened.
    """
    with open(input_path, encoding='utf8') as input_file, \
            open(expected_path, encoding='utf8') as expected_file, \
            open(output_path, 'w', encoding='utf-8') as output_file:
        # zip() stops at the shorter file, so surplus lines in either file
        # are ignored without an error.
        for line, word in tqdm(zip(input_file, expected_file), total=total):
            columns = line.split('\t')
            prefix = clean(columns[6])
            suffix = clean(columns[7])
            # clean() already returns stripped text; only the expected word
            # still carries its trailing newline.
            output_file.write(f"{prefix} {word.strip()} {suffix}\n")
|
||
|
|
||
|
# Build the training text file from the training inputs and expected words.
generate_file(
    input_path='train/in.tsv',
    expected_path='train/expected.tsv',
    output_path='train/train.txt',
)
|
||
|
|
||
|
|
||
|
# In[ ]:
|
||
|
|
||
|
|
||
|
|
||
|
|