challenging-america-word-ga.../generate_train_txt.py

45 lines
1.1 KiB
Python
Raw Normal View History

2024-05-14 17:10:07 +02:00
#!/usr/bin/env python
# coding: utf-8
# In[9]:
import regex as re
from tqdm.notebook import tqdm
def _clean(text):
    r"""Flatten escaped line breaks, squeeze spaces and strip punctuation.

    Operates on the literal two-character sequences ``\n`` and ``\t``
    (a backslash followed by a letter), not on real control characters:

    * ``-\n`` is deleted, which glues a word hyphenated across a line break
      back together;
    * every remaining ``\n`` and ``\t`` becomes a single space;
    * runs of spaces are collapsed to one space;
    * every character in Unicode general category P (punctuation) is removed.

    Relies on the module-level ``re`` name being the third-party ``regex``
    package; the standard-library ``re`` module does not support ``\p{P}``.

    Args:
        text: Raw text to normalize.

    Returns:
        The normalized text. Leading and trailing whitespace is not stripped.
    """
    text = text.replace('-\\n', '').replace('\\n', ' ').replace('\\t', ' ')
    # Loop on the two-space pattern: testing for a single space would never
    # become false once the text contains any space at all.
    while '  ' in text:
        text = text.replace('  ', ' ')
    return re.sub(r'\p{P}', '', text)
def clean(text):
    r"""Return ``text`` as one line of space-separated, punctuation-free tokens.

    Processing order:

    1. The literal two-character sequence ``-\n`` (hyphen, backslash, ``n``)
       is deleted, rejoining a word hyphenated across a line break; remaining
       literal ``\n`` and ``\t`` sequences become a space.
    2. Every character in Unicode general category P (punctuation) is deleted
       without inserting a space, so ``1,000`` becomes ``1000`` and
       ``well-known`` becomes ``wellknown``.
    3. Every run of whitespace, including real newlines, is collapsed to a
       single space, and the result is stripped.

    Relies on the module-level ``re`` name being the third-party ``regex``
    package; the standard-library ``re`` module does not support ``\p{P}``.

    Args:
        text: Raw text to normalize.

    Returns:
        The normalized text, with no leading/trailing whitespace and no
        consecutive whitespace characters.
    """
    text = text.replace('-\\n', '').replace('\\n', ' ').replace('\\t', ' ')
    text = re.sub(r'\p{P}', '', text)
    # Collapse whitespace only after punctuation is gone; doing it earlier
    # leaves a double space wherever a free-standing mark such as " - " or
    # " , " is removed.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def generate_file(input_path, expected_path, output_path, total=432022):
    """Write one cleaned training sentence per input row to ``output_path``.

    Lines of ``input_path`` (tab-separated) are paired positionally with lines
    of ``expected_path``. For each pair, the fields at zero-based indexes 6
    and 7 of the input row are passed through ``clean`` and written as
    ``"<field 6> <expected word> <field 7>"`` on a line of their own.

    Pairing uses ``zip``, so processing stops silently at the end of the
    shorter file.

    Args:
        input_path: Path to the tab-separated input file (read as UTF-8).
        expected_path: Path to the file with one expected word per line
            (read as UTF-8).
        output_path: Path of the text file to create or overwrite (UTF-8).
        total: Number of rows to expect, used only to size the progress bar.
            Defaults to the previously hard-coded 432022; pass ``None`` to
            show a running count without a total.

    Raises:
        IndexError: If an input row has fewer than eight tab-separated fields.
    """
    with open(input_path, encoding='utf8') as input_file, \
            open(expected_path, encoding='utf8') as expected_file, \
            open(output_path, 'w', encoding='utf-8') as output_file:
        rows = zip(input_file, expected_file)
        for line, word in tqdm(rows, total=total):
            columns = line.split('\t')
            prefix = clean(columns[6])
            suffix = clean(columns[7])
            # word still carries its trailing newline from file iteration.
            train_line = f"{prefix.strip()} {word.strip()} {suffix.strip()}\n"
            output_file.write(train_line)
# Build train/train.txt from the training inputs and their expected words.
generate_file(
    input_path='train/in.tsv',
    expected_path='train/expected.tsv',
    output_path='train/train.txt',
)
# In[ ]: