# challenging-america-word-ga.../generate_train_txt.py

#!/usr/bin/env python
# coding: utf-8

# In[9]:


import regex as re
from tqdm.notebook import tqdm

def _clean(text):
    """Return ``text`` with escaped line breaks, extra spaces and punctuation removed.

    Older, simpler variant of ``clean``. The steps run in this order:

    1. A hyphen followed by the literal two characters backslash + ``n`` is
       deleted; remaining literal backslash + ``n`` and backslash + ``t``
       sequences become a single space.
    2. Runs of two or more space characters are reduced to one space (other
       whitespace characters are left untouched).
    3. Every Unicode punctuation character is deleted.

    Because punctuation is deleted last, a punctuation mark that stood
    between two spaces leaves those two spaces adjacent in the result.
    """
    escape_table = (
        ('-\\n', ''),
        ('\\n', ' '),
        ('\\t', ' '),
    )
    for escaped, replacement in escape_table:
        text = text.replace(escaped, replacement)

    # Squeeze each run of plain spaces down to a single space.
    squeezed = re.sub(r' {2,}', ' ', text)

    return re.sub(r'\p{P}', '', squeezed)

def clean(text):
    """Normalize a raw text fragment into punctuation-free, single-spaced text.

    Processing steps:

    1. A hyphen followed by the literal two characters backslash + ``n`` is
       deleted (presumably rejoining a word hyphenated across a line break);
       remaining literal backslash + ``n`` and backslash + ``t`` sequences
       become a space.
    2. Every Unicode punctuation character (``\\p{P}``) is deleted.
    3. Each run of whitespace, including real newlines and tabs, is
       collapsed to one space and the result is stripped at both ends.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string; it contains no punctuation, no leading or
        trailing whitespace, and no two consecutive whitespace characters.
    """
    text = text.replace('-\\n', '').replace('\\n', ' ').replace('\\t', ' ')

    # Punctuation must go *before* whitespace is collapsed: deleting a
    # free-standing mark such as the comma in "foo , bar" would otherwise
    # leave a double space behind. Commas and hyphens inside words are
    # punctuation too, so they need no separate pass.
    text = re.sub(r'\p{P}', '', text)

    # \s+ also covers real newline characters, so no dedicated step is needed.
    return re.sub(r'\s+', ' ', text).strip()

def generate_file(input_path, expected_path, output_path, total=432022):
    """Write one plain-text training line per input row to ``output_path``.

    The input and expected files are read in lockstep. Each input line is
    split on tab characters; the fields at zero-based indexes 6 and 7 are
    passed through ``clean`` and the stripped line from the expected file is
    placed between them (presumably left context, missing word, right
    context). Empty pieces are skipped so the written line never starts or
    ends with a space or contains a doubled space.

    Note that iteration stops at the end of the shorter file, so a length
    mismatch between the two inputs silently drops the surplus rows.

    Args:
        input_path: Path of the tab-separated input file (UTF-8).
        expected_path: Path of the file holding one expected word per line
            (UTF-8).
        output_path: Path of the text file to create or overwrite (UTF-8).
        total: Row count passed to the progress bar. The default matches the
            previously hard-coded value (presumably the size of the training
            set); pass ``None`` when the number of rows is unknown.

    Raises:
        IndexError: If an input line has fewer than eight tab-separated
            fields.
    """
    with open(input_path, encoding='utf-8') as input_file, \
            open(expected_path, encoding='utf-8') as expected_file, \
            open(output_path, 'w', encoding='utf-8') as output_file:
        rows = zip(input_file, expected_file)
        for line, word in tqdm(rows, total=total):
            columns = line.split('\t')
            # clean() already strips its result, so no extra strip is needed.
            pieces = (clean(columns[6]), word.strip(), clean(columns[7]))
            train_line = ' '.join(piece for piece in pieces if piece)
            output_file.write(train_line + '\n')

# Build the training text file from the train split's input and expected files.
generate_file(
    input_path='train/in.tsv',
    expected_path='train/expected.tsv',
    output_path='train/train.txt',
)


# In[ ]: