This commit is contained in:
kuba 2022-06-08 19:34:35 +01:00
commit 98dc92beea
15 changed files with 358 additions and 0 deletions

190
.gitignore vendored Normal file
View File

@ -0,0 +1,190 @@
# Created by .ignore support plugin (hsz.mobi)
# MY IGNORES
model-en2pl
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# IPython Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# dotenv
.env
# virtualenv
venv/
ENV/
# Spyder project settings
.spyderproject
# Rope project settings
.ropeproject
### VirtualEnv template
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
.venv
pip-selfcheck.json
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
# AWS User-specific
.idea/**/aws.xml
# Generated files
.idea/**/contentModel.xml
# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml
# Gradle
.idea/**/gradle.xml
.idea/**/libraries
# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr
# CMake
cmake-build-*/
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
# File-based project format
*.iws
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# SonarLint plugin
.idea/sonarlint/
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Editor-based Rest Client
.idea/httpRequests
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
# idea folder, uncomment if you don't need it
.idea

8
.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

View File

@ -0,0 +1,22 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="9">
<item index="0" class="java.lang.String" itemvalue="Werkzeug" />
<item index="1" class="java.lang.String" itemvalue="aniso8601" />
<item index="2" class="java.lang.String" itemvalue="MarkupSafe" />
<item index="3" class="java.lang.String" itemvalue="pytz" />
<item index="4" class="java.lang.String" itemvalue="itsdangerous" />
<item index="5" class="java.lang.String" itemvalue="click" />
<item index="6" class="java.lang.String" itemvalue="Jinja2" />
<item index="7" class="java.lang.String" itemvalue="Flask-RESTful" />
<item index="8" class="java.lang.String" itemvalue="Flask" />
</list>
</value>
</option>
</inspection_tool>
</profile>
</component>

6
.idea/misc.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" languageLevel="JDK_17" project-jdk-name="Python 3.8 (pbr-private)" project-jdk-type="Python SDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
</project>

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/pbr-private.iml" filepath="$PROJECT_DIR$/.idea/pbr-private.iml" />
</modules>
</component>
</project>

11
.idea/pbr-private.iml Normal file
View File

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

7
.idea/vcs.xml Normal file
View File

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
<mapping directory="$PROJECT_DIR$/mbart-large-50-one-to-many-mmt" vcs="Git" />
</component>
</project>

22
Dockerfile Normal file
View File

@ -0,0 +1,22 @@
# Image for the Polish -> English translation service (Flask + mBART on CPU).
FROM ubuntu:18.04

# Use an absolute path so the working directory does not depend on the base
# image's current directory.
WORKDIR /app

# Install the interpreter and all Python dependencies BEFORE copying the
# application files: these layers are expensive and rarely change, so keeping
# them first lets Docker reuse them from cache when only code or model changes.
RUN apt-get update \
    && apt-get install -y python3-pip python3-dev \
    && cd /usr/local/bin \
    && ln -s /usr/bin/python3 python \
    && pip3 install --upgrade pip \
    && rm -rf /var/lib/apt/lists/*
# CPU-only PyTorch wheels.
RUN pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
RUN pip3 install protobuf==3.17.3 transformers nltk sentencepiece
RUN pip3 install flask_restful werkzeug

# Request/response scratch directories used by the API.
COPY in in
COPY out out
# Fine-tuned model weights and tokenizer files.
COPY model-pl2en model-pl2en
# Application code.
COPY resources resources
COPY translate.py .
COPY app.py .

# No FLASK_APP is set, so `flask run` relies on Flask's default discovery of
# app.py in the working directory.
CMD ["python3", "-m", "flask", "run", "--host=0.0.0.0"]

20
app.py Normal file
View File

@ -0,0 +1,20 @@
#!flask/bin/python
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from flask import Flask
from flask_restful import Api
from resources.video import Video
# Load the model and tokenizer once at import time; the same objects are
# handed to the resource through resource_class_kwargs below.
model = MBartForConditionalGeneration.from_pretrained("model-pl2en")
tokenizer = MBart50TokenizerFast.from_pretrained("model-pl2en", src_lang="pl_PL")

app = Flask(__name__)
api = Api(app)
api.add_resource(
    Video,
    '/api/video',
    resource_class_kwargs={'model': model, 'tokenizer': tokenizer},
)

if __name__ == '__main__':
    import os

    # The Werkzeug debugger allows arbitrary code execution, so it must never
    # be on by default while listening on all interfaces. Opt in explicitly
    # with FLASK_DEBUG=1 for local development.
    debug = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
    app.run(debug=debug, host='0.0.0.0')

1
in/1654710055_pl.txt Normal file
View File

@ -0,0 +1 @@
przepis na ciasto bananowe jest zaskakująco prosty.

1
in/1654710117_pl.txt Normal file
View File

@ -0,0 +1 @@
przepis na ciasto bananowe jest zaskakująco prosty.

1
in/1654710131_pl.txt Normal file
View File

@ -0,0 +1 @@
przepis na ciasto bananowe jest zaskakująco prosty.

2
in/1654710175_pl.txt Normal file
View File

@ -0,0 +1,2 @@
przepis na ciasto bananowe jest zaskakująco prosty.
przepis na ciasto bananowe jest zaskakująco prosty i skuteczny.

3
readme.md Normal file
View File

@ -0,0 +1,3 @@
# PBR TransFix translator
A Dockerized service exposing a POST endpoint that accepts a text file in Polish and returns its English translation.

56
resources/video.py Normal file
View File

@ -0,0 +1,56 @@
from flask import send_file
from flask_restful import Resource, reqparse
import werkzeug
import time
import io
import itertools
import nltk
from nltk import tokenize
class Video(Resource):
    """REST resource that translates an uploaded text file into English.

    The resource must be constructed with ``model`` and ``tokenizer`` keyword
    arguments; both are used as-is for every request.
    """

    def __init__(self, **kwargs):
        """Store the model/tokenizer and declare the expected upload field.

        Args:
            **kwargs: Must contain ``model`` and ``tokenizer``.

        Raises:
            KeyError: If ``model`` or ``tokenizer`` is missing.
        """
        super().__init__()
        self.model = kwargs['model']
        self.tokenizer = kwargs['tokenizer']
        self.parser = reqparse.RequestParser()
        # The upload is read from the multipart field named "file".
        self.parser.add_argument(
            'file',
            required=True,
            type=werkzeug.datastructures.FileStorage,
            location='files',
        )

    def post(self):
        """Translate the uploaded file and return the result as an attachment.

        The upload is saved to ``in/<id>_pl.txt``, translated into
        ``out/<id>_en.txt`` and that file is sent back.

        Returns:
            The translated file on success, or a
            ``({'file_storage_result': 'fail', 'error': <message>}, 500)``
            tuple when saving, translating or sending fails.
        """
        # Parsed outside the try block so that the parser's own error response
        # for a missing upload reaches the client instead of being swallowed
        # by the generic handler below.
        text_file = self.parser.parse_args().file
        try:
            # NOTE(review): one-second resolution; two requests arriving in
            # the same second would share the same file names.
            request_id = int(time.time())
            text_path = "in/" + str(request_id) + '_pl.txt'
            text_file.save(text_path)
            self.run_on_video(text_path, request_id)
            path_file = "out/" + str(request_id) + '_en.txt'
            return send_file(path_file, as_attachment=True, conditional=True)
        except Exception as e:
            print(e)
            outcome = 'fail'
            # Exception objects are not JSON serialisable; send the message.
            return {'file_storage_result': outcome, 'error': str(e)}, 500

    def run_on_video(self, file_path, request_id):
        """Translate ``file_path`` sentence by sentence into the output file.

        Args:
            file_path: Path of the UTF-8 text file to translate.
            request_id: Identifier used to name ``out/<request_id>_en.txt``.
        """
        # Fetch the Punkt sentence-tokenizer data only when it is missing,
        # rather than contacting the download server on every request.
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')

        with io.open(file_path, 'r', encoding='utf8') as f:
            lines = f.readlines()
        sentences = tokenize.sent_tokenize(' '.join(lines))

        # Loop-invariant: the id that forces English as the target language.
        english_id = self.tokenizer.lang_code_to_id["en_XX"]
        translated = []
        for sentence in sentences:
            model_inputs = self.tokenizer(sentence, return_tensors="pt")
            generated_tokens = self.model.generate(
                **model_inputs,
                forced_bos_token_id=english_id
            )
            translated.extend(
                self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            )

        with io.open('out/' + str(request_id) + '_en.txt', 'w', encoding='utf8') as f:
            # Each translated sentence is followed by a single space.
            f.write(''.join(line + ' ' for line in translated))