From 98dc92beea04669bfb50163d0ff0a4c956f0dbbf Mon Sep 17 00:00:00 2001 From: kuba Date: Wed, 8 Jun 2022 19:34:35 +0100 Subject: [PATCH] init --- .gitignore | 190 +++++++++++++++++++ .idea/.gitignore | 8 + .idea/inspectionProfiles/Project_Default.xml | 22 +++ .idea/misc.xml | 6 + .idea/modules.xml | 8 + .idea/pbr-private.iml | 11 ++ .idea/vcs.xml | 7 + Dockerfile | 22 +++ app.py | 20 ++ in/1654710055_pl.txt | 1 + in/1654710117_pl.txt | 1 + in/1654710131_pl.txt | 1 + in/1654710175_pl.txt | 2 + readme.md | 3 + resources/video.py | 56 ++++++ 15 files changed, 358 insertions(+) create mode 100644 .gitignore create mode 100644 .idea/.gitignore create mode 100644 .idea/inspectionProfiles/Project_Default.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/pbr-private.iml create mode 100644 .idea/vcs.xml create mode 100644 Dockerfile create mode 100644 app.py create mode 100644 in/1654710055_pl.txt create mode 100644 in/1654710117_pl.txt create mode 100644 in/1654710131_pl.txt create mode 100644 in/1654710175_pl.txt create mode 100644 readme.md create mode 100644 resources/video.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fba7136 --- /dev/null +++ b/.gitignore @@ -0,0 +1,190 @@ +# Created by .ignore support plugin (hsz.mobi) +# MY IGNORES +model-en2pl + + + +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# IPython Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# dotenv +.env + +# virtualenv +venv/ +ENV/ + +# Spyder project settings +.spyderproject + +# Rope project settings +.ropeproject +### VirtualEnv template +# Virtualenv +# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ +[Bb]in +[Ii]nclude +[Ll]ib +[Ll]ib64 +[Ll]ocal +[Ss]cripts +pyvenv.cfg +.venv +pip-selfcheck.json + +### JetBrains template +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# AWS User-specific +.idea/**/aws.xml + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. 
+# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# SonarLint plugin +.idea/sonarlint/ + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +# idea folder, uncomment if you don't need it + .idea \ No newline at end of file diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..6a33865 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,22 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..9a2e088 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..45c7249 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/pbr-private.iml b/.idea/pbr-private.iml new file mode 100644 index 0000000..2cdb1e3 --- /dev/null +++ b/.idea/pbr-private.iml @@ -0,0 +1,11 @@ + + + + + + + + + + 
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..be82563
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..16bda50
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,22 @@
+FROM ubuntu:18.04
+
+WORKDIR app
+
+COPY in in
+COPY out out
+COPY model-pl2en model-pl2en
+COPY translate.py .
+COPY resources resources
+COPY app.py .
+
+RUN apt-get update \
+    && apt-get install -y python3-pip python3-dev \
+    && cd /usr/local/bin \
+    && ln -s /usr/bin/python3 python \
+    && pip3 install --upgrade pip
+
+RUN pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
+RUN pip3 install protobuf==3.17.3 transformers nltk sentencepiece
+RUN pip3 install flask_restful werkzeug
+
+CMD ["python3", "-m", "flask", "run", "--host=0.0.0.0"]
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..a114afc
--- /dev/null
+++ b/app.py
@@ -0,0 +1,35 @@
+#!flask/bin/python
+"""Flask entry point that exposes the Polish -> English translation endpoint.
+
+The mBART model and tokenizer are loaded once at import time and handed to the
+``Video`` resource, so every request reuses the same instances.
+"""
+import os
+
+from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
+from flask import Flask
+from flask_restful import Api
+
+from resources.video import Video
+
+# Local directory containing the model and tokenizer files.
+MODEL_DIR = "model-pl2en"
+
+model = MBartForConditionalGeneration.from_pretrained(MODEL_DIR)
+tokenizer = MBart50TokenizerFast.from_pretrained(MODEL_DIR, src_lang="pl_PL")
+
+app = Flask(__name__)
+api = Api(app)
+
+api.add_resource(
+    Video,
+    '/api/video',
+    resource_class_kwargs={'model': model, 'tokenizer': tokenizer},
+)
+
+if __name__ == '__main__':
+    # The Werkzeug debugger allows arbitrary code execution, so it must never
+    # be enabled unconditionally on a server bound to all interfaces. Debug
+    # mode is opt-in through the FLASK_DEBUG environment variable.
+    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true')
+    app.run(debug=debug_enabled, host='0.0.0.0')
diff --git a/in/1654710055_pl.txt b/in/1654710055_pl.txt
new file mode 100644
index 0000000..3928fc3
--- /dev/null
+++ b/in/1654710055_pl.txt
@@ -0,0 +1 @@
+przepis na ciasto bananowe jest zaskakująco prosty.
diff --git a/in/1654710117_pl.txt b/in/1654710117_pl.txt
new file mode 100644
index 0000000..3928fc3
--- /dev/null
+++ b/in/1654710117_pl.txt
@@ -0,0 +1 @@
+przepis na ciasto bananowe jest zaskakująco prosty.
diff --git a/in/1654710131_pl.txt b/in/1654710131_pl.txt
new file mode 100644
index 0000000..3928fc3
--- /dev/null
+++ b/in/1654710131_pl.txt
@@ -0,0 +1 @@
+przepis na ciasto bananowe jest zaskakująco prosty.
diff --git a/in/1654710175_pl.txt b/in/1654710175_pl.txt
new file mode 100644
index 0000000..513200e
--- /dev/null
+++ b/in/1654710175_pl.txt
@@ -0,0 +1,2 @@
+przepis na ciasto bananowe jest zaskakująco prosty.
+przepis na ciasto bananowe jest zaskakująco prosty i skuteczny.
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..c707465
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,3 @@
+# PBR TransFix translator
+
+Docker with a POST endpoint that receives a file in Polish and returns its English translation.
diff --git a/resources/video.py b/resources/video.py
new file mode 100644
index 0000000..d74bee0
--- /dev/null
+++ b/resources/video.py
@@ -0,0 +1,101 @@
+"""REST resource that translates an uploaded Polish text file into English."""
+from flask import send_file
+from flask_restful import Resource, reqparse
+import werkzeug
+from werkzeug.datastructures import FileStorage
+import time
+import io
+import itertools
+import logging
+import os
+import uuid
+import nltk
+from nltk import tokenize
+
+logger = logging.getLogger(__name__)
+
+# Working directories, resolved against the process working directory.
+INPUT_DIR = 'in'
+OUTPUT_DIR = 'out'
+
+
+def _ensure_punkt():
+    """Download the NLTK 'punkt' sentence tokenizer only if it is missing."""
+    try:
+        nltk.data.find('tokenizers/punkt')
+    except LookupError:
+        nltk.download('punkt')
+
+
+class Video(Resource):
+    """Resource that accepts a text file upload and returns its translation.
+
+    The ``model`` and ``tokenizer`` keyword arguments are required; they are
+    used for tokenizing, ``model.generate(...)`` and ``batch_decode(...)``.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__()
+        self.parser = reqparse.RequestParser()
+        self.model = kwargs['model']
+        self.tokenizer = kwargs['tokenizer']
+        self.parser.add_argument('file', required=True, type=FileStorage, location='files')
+
+    def post(self):
+        """Save the uploaded file, translate it and send the result back.
+
+        Returns:
+            The translated file as an attachment on success, otherwise a JSON
+            body ``{'file_storage_result': 'fail', 'error': <message>}`` with
+            HTTP status 500.
+        """
+        # Parsed outside the try block so that a missing 'file' field is
+        # reported by the parser itself instead of being masked below.
+        text_file = self.parser.parse_args().file
+
+        # A timestamp alone collides when two requests arrive within the same
+        # second; the random suffix keeps the file names unique.
+        request_id = '{}_{}'.format(int(time.time()), uuid.uuid4().hex)
+        try:
+            os.makedirs(INPUT_DIR, exist_ok=True)
+            text_path = os.path.join(INPUT_DIR, '{}_pl.txt'.format(request_id))
+            text_file.save(text_path)
+            self.run_on_video(text_path, request_id)
+            path_file = os.path.join(OUTPUT_DIR, '{}_en.txt'.format(request_id))
+            return send_file(path_file, as_attachment=True, conditional=True)
+        except Exception as e:
+            logger.exception('Translation request %s failed', request_id)
+            # The exception object itself is not JSON serializable.
+            return {'file_storage_result': 'fail', 'error': str(e)}, 500
+
+    def run_on_video(self, file_path, request_id):
+        """Translate the text in ``file_path`` sentence by sentence.
+
+        The result is written to ``out/<request_id>_en.txt`` with a single
+        space after every translated sentence.
+        """
+        _ensure_punkt()
+
+        with io.open(file_path, 'r', encoding='utf8') as f:
+            lines = f.readlines()
+
+        sentences = tokenize.sent_tokenize(' '.join(lines))
+
+        # Hoisted: the target language token is the same for every sentence.
+        english_token_id = self.tokenizer.lang_code_to_id["en_XX"]
+
+        batches = []
+        for sentence in sentences:
+            model_inputs = self.tokenizer(sentence, return_tensors="pt")
+            generated_tokens = self.model.generate(
+                **model_inputs,
+                forced_bos_token_id=english_token_id
+            )
+            batches.append(self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True))
+
+        translations = itertools.chain.from_iterable(batches)
+
+        os.makedirs(OUTPUT_DIR, exist_ok=True)
+        out_path = os.path.join(OUTPUT_DIR, '{}_en.txt'.format(request_id))
+        with io.open(out_path, 'w', encoding='utf8') as f:
+            f.writelines(line + ' ' for line in translations)