challenging-america-word-ga.../generate_train_txt.py

#!/usr/bin/env python
# coding: utf-8

# In[9]:


import regex as re
from tqdm.notebook import tqdm

def _clean(text):
    """Return ``text`` with escaped breaks flattened and punctuation removed.

    Steps, in order:
      1. Drop a hyphen followed by the literal two-character sequence
         backslash-n, then turn any remaining literal backslash-n and
         backslash-t sequences into single spaces. These are literal
         backslashes in the data, not control characters.
      2. Squeeze every run of spaces down to one space.
      3. Delete every character in the Unicode punctuation category.

    Relies on the module-level ``re`` name (bound to the ``regex`` package
    in this file) for ``\\p{P}`` support.
    """
    for escaped, replacement in (('-\\n', ''), ('\\n', ' '), ('\\t', ' ')):
        text = text.replace(escaped, replacement)

    # Keep halving runs of spaces until a pass changes nothing.
    while True:
        squeezed = text.replace('  ', ' ')
        if squeezed == text:
            break
        text = squeezed

    return re.sub(r'\p{P}', '', text)

def clean(text):
    """Normalise one context field into plain, punctuation-free text.

    The literal escape sequences (hyphen + backslash-n, backslash-n,
    backslash-t) are resolved first with plain string replacement. The
    result then goes through an ordered list of regex substitutions and is
    finally stripped of leading and trailing whitespace.

    NOTE(review): punctuation is removed *after* whitespace is collapsed, so
    a punctuation mark that stood between two spaces leaves two adjacent
    spaces in the output.

    Relies on the module-level ``re`` name (bound to the ``regex`` package
    in this file) for ``\\p{P}`` support.
    """
    result = text.replace('-\\n', '').replace('\\n', ' ').replace('\\t', ' ')

    # (pattern, replacement) pairs; the order is significant.
    substitutions = (
        (r'\n', ' '),                  # real newline characters -> space
        (r'(?<=\w)[,-](?=\w)', ''),    # comma/hyphen glued between word chars
        (r'\s+', ' '),                 # collapse any whitespace run
        (r'\p{P}', ''),                # strip all Unicode punctuation
    )
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)

    return result.strip()

def generate_file(input_path, expected_path, output_path, total=432022):
    """Write one training line per input row: left context, word, right context.

    Both input files are read in lockstep. For each pair, the tab-separated
    fields at index 6 and 7 of the input row are passed through ``clean`` and
    written around the whitespace-stripped expected word, separated by single
    spaces and terminated by a newline.

    Args:
        input_path: Path to the tab-separated input file (read as UTF-8).
        expected_path: Path to the file holding one expected word per line
            (read as UTF-8), aligned row-for-row with ``input_path``.
        output_path: Path of the text file to create or overwrite (UTF-8).
        total: Number of rows reported to the progress bar. Defaults to
            432022, the value that used to be hard-coded; pass the real row
            count (or ``None``) when processing a different file.

    Raises:
        IndexError: If an input row has fewer than eight tab-separated fields.

    Note:
        ``zip`` stops at the shorter of the two files, so a length mismatch
        silently truncates the output rather than raising.
    """
    with open(input_path, encoding='utf8') as input_file, \
            open(expected_path, encoding='utf8') as expected_file, \
            open(output_path, 'w', encoding='utf-8') as output_file:
        for line, word in tqdm(zip(input_file, expected_file), total=total):
            columns = line.split('\t')
            # clean() already strips its result, so no extra strip is needed.
            prefix = clean(columns[6])
            suffix = clean(columns[7])

            output_file.write(f"{prefix} {word.strip()} {suffix}\n")

# Runs at import/execution time: builds train/train.txt from train/in.tsv and
# train/expected.tsv.
generate_file('train/in.tsv', 'train/expected.tsv', 'train/train.txt')


# In[ ]:
init solution trigram. score 580 2024-05-14 17:10:07 +02:00			`#!/usr/bin/env python`
			`# coding: utf-8`

			`# In[9]:`


			`import regex as re`
			`from tqdm.notebook import tqdm`

			`def _clean(text):`
			`text = text.replace('-\\n', '').replace('\\n', ' ').replace('\\t', ' ')#.replace('<s>','s')`
			`while '  ' in text:`
			`text = text.replace('  ',' ')`

			`return re.sub(r'\p{P}', '', text)`

			`def clean(text):`
			`text = text.replace('-\\n', '').replace('\\n', ' ').replace('\\t', ' ')`
			`text = re.sub(r'\n', ' ', text)`
			`text = re.sub(r'(?<=\w)[,-](?=\w)', '', text)`
			`text = re.sub(r'\s+', ' ', text)`
			`text = re.sub(r'\p{P}', '', text)`
			`text = text.strip()`
			`return text`

			`def generate_file(input_path, expected_path, output_path):`
			`with open(input_path, encoding='utf8') as input_file, open(expected_path, encoding='utf8') as expected_file, open(output_path, 'w', encoding='utf-8') as output_file:`
			`for line, word in tqdm(zip(input_file, expected_file), total=432022):`
			`columns = line.split('\t')`
			`prefix = clean(columns[6])`
			`suffix = clean(columns[7])`

			`train_line = f"{prefix.strip()} {word.strip()} {suffix.strip()}\n"`

			`output_file.write(train_line)`

			`generate_file('train/in.tsv', 'train/expected.tsv', 'train/train.txt')`


			`# In[ ]:`