import itertools

import nltk
from nltk import tokenize
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Download the sentence-tokenizer models NLTK needs for sent_tokenize
nltk.download('punkt')

# Read the source text and split it into sentences
with open('data/text.txt', 'r', encoding='utf8') as f:
    text = f.read()
sentences = tokenize.sent_tokenize(text)

# Load the fine-tuned mBART-50 model and its tokenizer from the local
# "model" directory; src_lang tells the tokenizer the input is Polish
model = MBartForConditionalGeneration.from_pretrained("model")
tokenizer = MBart50TokenizerFast.from_pretrained("model", src_lang="pl_PL")

# Translate sentence by sentence, forcing English ("en_XX") as the
# first generated token so mBART decodes into the right language
returns = []
for sentence in sentences:
    model_inputs = tokenizer(sentence, return_tensors="pt")
    generated_tokens = model.generate(
        **model_inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]
    )
    returns.append(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True))

# batch_decode returns a list per call, so flatten the nested lists
returns = list(itertools.chain(*returns))

# Write the translations, separating sentences with spaces
with open('translation_output.txt', 'w', encoding='utf8') as f:
    for line in returns:
        f.write(line + ' ')
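
# A batched variant (a sketch, not part of the original script): calling
# generate() once per sentence is slow for long files, so this helper chunks
# sentences and lets the tokenizer pad each chunk into one batch tensor.
# translate_batched and its batch_size=8 default are assumptions introduced
# here for illustration; tune the batch size to your GPU/CPU memory.
def translate_batched(sentences, batch_size=8):
    translations = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]
        # padding=True lets sentences of different lengths share one tensor
        model_inputs = tokenizer(batch, return_tensors="pt", padding=True)
        generated_tokens = model.generate(
            **model_inputs,
            forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]
        )
        # batch_decode returns one string per input row, so extend flattens
        # without needing itertools.chain
        translations.extend(
            tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        )
    return translations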