transfix-translator/resources/video.py

from flask import send_file
from flask_restful import Resource, reqparse
import werkzeug
import time
import io
import itertools
import nltk
from nltk import tokenize
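# Wiring sketch (not part of this module, shown here for context): the model
# and tokenizer are injected by the application via Flask-RESTful's
# resource_class_kwargs on an existing flask_restful.Api instance `api`.
# The mBART-50 checkpoint name and the src_lang value below are assumptions
# based on the "en_XX" target code and "_pl" filenames used in this file;
# the actual loading code lives elsewhere in the repository.
#
#   from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
#   model = MBartForConditionalGeneration.from_pretrained(
#       "facebook/mbart-large-50-many-to-many-mmt")
#   tokenizer = MBart50TokenizerFast.from_pretrained(
#       "facebook/mbart-large-50-many-to-many-mmt", src_lang="pl_PL")
#   api.add_resource(Video, '/video',
#                    resource_class_kwargs={'model': model, 'tokenizer': tokenizer})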


class Video(Resource):
    def __init__(self, **kwargs):
        super().__init__()
        # The translation model and its tokenizer are injected by the app
        # (e.g. via Flask-RESTful's resource_class_kwargs).
        self.parser = reqparse.RequestParser()
        self.model = kwargs['model']
        self.tokenizer = kwargs['tokenizer']
        self.parser.add_argument('file', required=True,
                                 type=werkzeug.datastructures.FileStorage,
                                 location='files')

    def post(self):
        try:
            text_file = self.parser.parse_args().file
            # Use the request timestamp as a simple unique id for the
            # input/output file pair.
            request_id = int(time.time())
            text_path = "in/" + str(request_id) + '_pl.txt'
            text_file.save(text_path)
            self.run_on_video(text_path, request_id)
            path_file = "out/" + str(request_id) + '_en.txt'
            return send_file(path_file, as_attachment=True, conditional=True)
        except Exception as e:
            print(e)
            outcome = 'fail'
            # str(e): exception objects are not JSON serializable.
            return {'file_storage_result': outcome, 'error': str(e)}

    def run_on_video(self, file_path, request_id):
        # Make sure the NLTK sentence tokenizer models are available.
        nltk.download('punkt')
        with io.open(file_path, 'r', encoding='utf8') as f:
            lines = f.readlines()
        sentences = tokenize.sent_tokenize(' '.join(lines))
        returns = []
        for sentence in sentences:
            # Translate sentence by sentence, forcing English ("en_XX")
            # as the target language of the decoder.
            model_inputs = self.tokenizer(sentence, return_tensors="pt")
            generated_tokens = self.model.generate(
                **model_inputs,
                forced_bos_token_id=self.tokenizer.lang_code_to_id["en_XX"]
            )
            returns.append(self.tokenizer.batch_decode(generated_tokens,
                                                       skip_special_tokens=True))
        # batch_decode returns a list per sentence; flatten before writing.
        returns = list(itertools.chain(*returns))
        with io.open('out/' + str(request_id) + '_en.txt', 'w', encoding='utf8') as f:
            for line in returns:
                f.write(line + ' ')
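
# Example client call (a sketch under the assumption that the resource is
# registered at /video as in the wiring note above; the route is not defined
# in this file, and the filenames are placeholders):
#
#   import requests
#   with open('subtitles_pl.txt', 'rb') as f:
#       resp = requests.post('http://localhost:5000/video', files={'file': f})
#   resp.raise_for_status()
#   with open('subtitles_en.txt', 'wb') as out:
#       out.write(resp.content)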