#!/usr/bin/env python
# coding: utf-8
"""Build a plain-text training file from a TSV of contexts and expected words.

Each input row supplies a left context (column 6) and a right context
(column 7); the matching line of the expected file supplies the word that
belongs between them.  Every output line is ``<left> <word> <right>``.

The ``# In[...]`` markers suggest this file was exported from a notebook.
"""

# In[9]:

import regex as re
from tqdm.notebook import tqdm

# Compiled once because ``clean`` runs twice per input row.  ``re`` here is
# the third-party ``regex`` package: the standard-library ``re`` module does
# not understand the ``\p{P}`` (Unicode punctuation) property class.
_PUNCTUATION = re.compile(r'\p{P}')
_INTRA_WORD_SEPARATOR = re.compile(r'(?<=\w)[,-](?=\w)')
_WHITESPACE_RUN = re.compile(r'\s+')
_SPACE_RUN = re.compile(r' {2,}')


def _replace_escaped_whitespace(text):
    """Resolve the literal two-character escapes ``\\n`` and ``\\t`` in *text*.

    A hyphen directly followed by an escaped newline is removed entirely, so
    the two halves of a word split across lines are joined.  Remaining
    escaped newlines and escaped tabs each become a single space.
    """
    return text.replace('-\\n', '').replace('\\n', ' ').replace('\\t', ' ')


def _clean(text):
    """Return *text* with escapes resolved, spaces collapsed, punctuation removed.

    Simpler variant of :func:`clean`: only runs of the space character are
    collapsed, commas/hyphens inside words get no special treatment beyond
    the general punctuation removal, and the result is not stripped.
    """
    text = _replace_escaped_whitespace(text)
    # One pass replaces every run of two or more spaces with a single space.
    text = _SPACE_RUN.sub(' ', text)
    return _PUNCTUATION.sub('', text)


def clean(text):
    """Normalize a context fragment for the training file.

    Steps, in order:

    1. Resolve literal ``\\n`` / ``\\t`` escapes (joining hyphenated line
       breaks).
    2. Delete a comma or hyphen that sits directly between two word
       characters, gluing the neighbours together.
    3. Collapse every run of whitespace (real newlines and tabs included)
       into one space.
    4. Remove all Unicode punctuation.
    5. Strip leading and trailing whitespace.

    Punctuation is removed after whitespace is collapsed, so a punctuation
    mark that stood alone between two spaces leaves both spaces behind.

    Args:
        text: Raw fragment to normalize.

    Returns:
        The normalized string.
    """
    text = _replace_escaped_whitespace(text)
    text = _INTRA_WORD_SEPARATOR.sub('', text)
    # Real newlines need no separate pass: ``\s+`` already covers them.
    text = _WHITESPACE_RUN.sub(' ', text)
    text = _PUNCTUATION.sub('', text)
    return text.strip()


def generate_file(input_path, expected_path, output_path, *, total=432022):
    """Write one ``<left> <word> <right>`` line per input row.

    The input and expected files are read in lockstep; iteration stops at
    the end of the shorter file.

    Args:
        input_path: Tab-separated file whose columns 6 and 7 (0-based) hold
            the left and right context of the missing word.
        expected_path: File with one expected word per line, aligned with
            ``input_path``.
        output_path: Destination file; created or overwritten.
        total: Row count shown by the progress bar.  The default is the
            value that was previously hard-coded (presumably the size of
            the training set; confirm before reusing with other data).

    Raises:
        IndexError: If an input line has fewer than eight tab-separated
            columns.
    """
    with open(input_path, encoding='utf8') as input_file, \
            open(expected_path, encoding='utf8') as expected_file, \
            open(output_path, 'w', encoding='utf-8') as output_file:
        rows = zip(input_file, expected_file)
        for line, word in tqdm(rows, total=total):
            columns = line.split('\t')
            # ``clean`` already strips its result, so only the expected
            # word (which still carries its line ending) needs stripping.
            prefix = clean(columns[6])
            suffix = clean(columns[7])
            output_file.write(f"{prefix} {word.strip()} {suffix}\n")


# Guarded so that importing this module (for example to reuse ``clean``)
# does not regenerate the training file as a side effect.
if __name__ == "__main__":
    generate_file('train/in.tsv', 'train/expected.tsv', 'train/train.txt')


# In[ ]: