init
This commit is contained in:
commit
98dc92beea
190
.gitignore
vendored
Normal file
190
.gitignore
vendored
Normal file
@ -0,0 +1,190 @@
|
|||||||
|
# Created by .ignore support plugin (hsz.mobi)
|
||||||
|
# MY IGNORES
|
||||||
|
model-en2pl
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
### Python template
|
||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
env/
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*,cover
|
||||||
|
.hypothesis/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
target/
|
||||||
|
|
||||||
|
# IPython Notebook
|
||||||
|
.ipynb_checkpoints
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
.python-version
|
||||||
|
|
||||||
|
# celery beat schedule file
|
||||||
|
celerybeat-schedule
|
||||||
|
|
||||||
|
# dotenv
|
||||||
|
.env
|
||||||
|
|
||||||
|
# virtualenv
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
### VirtualEnv template
|
||||||
|
# Virtualenv
|
||||||
|
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
|
||||||
|
[Bb]in
|
||||||
|
[Ii]nclude
|
||||||
|
[Ll]ib
|
||||||
|
[Ll]ib64
|
||||||
|
[Ll]ocal
|
||||||
|
[Ss]cripts
|
||||||
|
pyvenv.cfg
|
||||||
|
.venv
|
||||||
|
pip-selfcheck.json
|
||||||
|
|
||||||
|
### JetBrains template
|
||||||
|
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
|
||||||
|
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
|
||||||
|
|
||||||
|
# User-specific stuff
|
||||||
|
.idea/**/workspace.xml
|
||||||
|
.idea/**/tasks.xml
|
||||||
|
.idea/**/usage.statistics.xml
|
||||||
|
.idea/**/dictionaries
|
||||||
|
.idea/**/shelf
|
||||||
|
|
||||||
|
# AWS User-specific
|
||||||
|
.idea/**/aws.xml
|
||||||
|
|
||||||
|
# Generated files
|
||||||
|
.idea/**/contentModel.xml
|
||||||
|
|
||||||
|
# Sensitive or high-churn files
|
||||||
|
.idea/**/dataSources/
|
||||||
|
.idea/**/dataSources.ids
|
||||||
|
.idea/**/dataSources.local.xml
|
||||||
|
.idea/**/sqlDataSources.xml
|
||||||
|
.idea/**/dynamic.xml
|
||||||
|
.idea/**/uiDesigner.xml
|
||||||
|
.idea/**/dbnavigator.xml
|
||||||
|
|
||||||
|
# Gradle
|
||||||
|
.idea/**/gradle.xml
|
||||||
|
.idea/**/libraries
|
||||||
|
|
||||||
|
# Gradle and Maven with auto-import
|
||||||
|
# When using Gradle or Maven with auto-import, you should exclude module files,
|
||||||
|
# since they will be recreated, and may cause churn. Uncomment if using
|
||||||
|
# auto-import.
|
||||||
|
# .idea/artifacts
|
||||||
|
# .idea/compiler.xml
|
||||||
|
# .idea/jarRepositories.xml
|
||||||
|
# .idea/modules.xml
|
||||||
|
# .idea/*.iml
|
||||||
|
# .idea/modules
|
||||||
|
# *.iml
|
||||||
|
# *.ipr
|
||||||
|
|
||||||
|
# CMake
|
||||||
|
cmake-build-*/
|
||||||
|
|
||||||
|
# Mongo Explorer plugin
|
||||||
|
.idea/**/mongoSettings.xml
|
||||||
|
|
||||||
|
# File-based project format
|
||||||
|
*.iws
|
||||||
|
|
||||||
|
# IntelliJ
|
||||||
|
out/
|
||||||
|
|
||||||
|
# mpeltonen/sbt-idea plugin
|
||||||
|
.idea_modules/
|
||||||
|
|
||||||
|
# JIRA plugin
|
||||||
|
atlassian-ide-plugin.xml
|
||||||
|
|
||||||
|
# Cursive Clojure plugin
|
||||||
|
.idea/replstate.xml
|
||||||
|
|
||||||
|
# SonarLint plugin
|
||||||
|
.idea/sonarlint/
|
||||||
|
|
||||||
|
# Crashlytics plugin (for Android Studio and IntelliJ)
|
||||||
|
com_crashlytics_export_strings.xml
|
||||||
|
crashlytics.properties
|
||||||
|
crashlytics-build.properties
|
||||||
|
fabric.properties
|
||||||
|
|
||||||
|
# Editor-based Rest Client
|
||||||
|
.idea/httpRequests
|
||||||
|
|
||||||
|
# Android studio 3.1+ serialized cache file
|
||||||
|
.idea/caches/build_file_checksums.ser
|
||||||
|
|
||||||
|
# idea folder, uncomment if you don't need it
|
||||||
|
.idea
|
8
.idea/.gitignore
vendored
Normal file
8
.idea/.gitignore
vendored
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
# Default ignored files
|
||||||
|
/shelf/
|
||||||
|
/workspace.xml
|
||||||
|
# Editor-based HTTP Client requests
|
||||||
|
/httpRequests/
|
||||||
|
# Datasource local storage ignored files
|
||||||
|
/dataSources/
|
||||||
|
/dataSources.local.xml
|
22
.idea/inspectionProfiles/Project_Default.xml
Normal file
22
.idea/inspectionProfiles/Project_Default.xml
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
<component name="InspectionProjectProfileManager">
|
||||||
|
<profile version="1.0">
|
||||||
|
<option name="myName" value="Project Default" />
|
||||||
|
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
|
||||||
|
<option name="ignoredPackages">
|
||||||
|
<value>
|
||||||
|
<list size="9">
|
||||||
|
<item index="0" class="java.lang.String" itemvalue="Werkzeug" />
|
||||||
|
<item index="1" class="java.lang.String" itemvalue="aniso8601" />
|
||||||
|
<item index="2" class="java.lang.String" itemvalue="MarkupSafe" />
|
||||||
|
<item index="3" class="java.lang.String" itemvalue="pytz" />
|
||||||
|
<item index="4" class="java.lang.String" itemvalue="itsdangerous" />
|
||||||
|
<item index="5" class="java.lang.String" itemvalue="click" />
|
||||||
|
<item index="6" class="java.lang.String" itemvalue="Jinja2" />
|
||||||
|
<item index="7" class="java.lang.String" itemvalue="Flask-RESTful" />
|
||||||
|
<item index="8" class="java.lang.String" itemvalue="Flask" />
|
||||||
|
</list>
|
||||||
|
</value>
|
||||||
|
</option>
|
||||||
|
</inspection_tool>
|
||||||
|
</profile>
|
||||||
|
</component>
|
6
.idea/misc.xml
Normal file
6
.idea/misc.xml
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectRootManager" version="2" languageLevel="JDK_17" project-jdk-name="Python 3.8 (pbr-private)" project-jdk-type="Python SDK">
|
||||||
|
<output url="file://$PROJECT_DIR$/out" />
|
||||||
|
</component>
|
||||||
|
</project>
|
8
.idea/modules.xml
Normal file
8
.idea/modules.xml
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectModuleManager">
|
||||||
|
<modules>
|
||||||
|
<module fileurl="file://$PROJECT_DIR$/.idea/pbr-private.iml" filepath="$PROJECT_DIR$/.idea/pbr-private.iml" />
|
||||||
|
</modules>
|
||||||
|
</component>
|
||||||
|
</project>
|
11
.idea/pbr-private.iml
Normal file
11
.idea/pbr-private.iml
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<module type="JAVA_MODULE" version="4">
|
||||||
|
<component name="NewModuleRootManager" inherit-compiler-output="true">
|
||||||
|
<exclude-output />
|
||||||
|
<content url="file://$MODULE_DIR$">
|
||||||
|
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
||||||
|
</content>
|
||||||
|
<orderEntry type="inheritedJdk" />
|
||||||
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
</component>
|
||||||
|
</module>
|
7
.idea/vcs.xml
Normal file
7
.idea/vcs.xml
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="VcsDirectoryMappings">
|
||||||
|
<mapping directory="" vcs="Git" />
|
||||||
|
<mapping directory="$PROJECT_DIR$/mbart-large-50-one-to-many-mmt" vcs="Git" />
|
||||||
|
</component>
|
||||||
|
</project>
|
22
Dockerfile
Normal file
22
Dockerfile
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
FROM ubuntu:18.04

# Absolute path: a relative WORKDIR is resolved against the previous WORKDIR,
# which makes the final location depend on the base image.
WORKDIR /app

# System Python and pip. The apt package lists are removed in the same layer
# so they do not end up in the final image.
RUN apt-get update \
    && apt-get install -y python3-pip python3-dev \
    && cd /usr/local/bin \
    && ln -s /usr/bin/python3 python \
    && pip3 install --upgrade pip \
    && rm -rf /var/lib/apt/lists/*

# Python dependencies are installed BEFORE the application files are copied,
# so editing the source does not invalidate these large, slow layers.
RUN pip3 install --no-cache-dir torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
RUN pip3 install --no-cache-dir protobuf==3.17.3 transformers nltk sentencepiece
RUN pip3 install --no-cache-dir flask_restful werkzeug

# Application data and code, ordered roughly from least to most frequently changed.
COPY in in
COPY out out
COPY model-pl2en model-pl2en
COPY resources resources
COPY translate.py .
COPY app.py .

CMD ["python3", "-m", "flask", "run", "--host=0.0.0.0"]
|
20
app.py
Normal file
20
app.py
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
#!flask/bin/python
"""Flask entry point for the Polish-to-English translation service.

Loads the mBART model and tokenizer once at import time and registers the
``Video`` resource under ``/api/video``, handing both objects to it through
``resource_class_kwargs``.
"""
import os

from flask import Flask
from flask_restful import Api
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

from resources.video import Video

# Directory holding the fine-tuned checkpoint (copied into the image by the Dockerfile).
MODEL_DIR = "model-pl2en"

model = MBartForConditionalGeneration.from_pretrained(MODEL_DIR)
tokenizer = MBart50TokenizerFast.from_pretrained(MODEL_DIR, src_lang="pl_PL")

app = Flask(__name__)
api = Api(app)

api.add_resource(
    Video,
    '/api/video',
    resource_class_kwargs={'model': model, 'tokenizer': tokenizer},
)

if __name__ == '__main__':
    # Debug mode enables the interactive Werkzeug debugger, which lets anyone
    # who can reach the port execute code. Since the server binds to all
    # interfaces, debug must be requested explicitly via FLASK_DEBUG.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(debug=debug_enabled, host='0.0.0.0')
|
1
in/1654710055_pl.txt
Normal file
1
in/1654710055_pl.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
przepis na ciasto bananowe jest zaskakująco prosty.
|
1
in/1654710117_pl.txt
Normal file
1
in/1654710117_pl.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
przepis na ciasto bananowe jest zaskakująco prosty.
|
1
in/1654710131_pl.txt
Normal file
1
in/1654710131_pl.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
przepis na ciasto bananowe jest zaskakująco prosty.
|
2
in/1654710175_pl.txt
Normal file
2
in/1654710175_pl.txt
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
przepis na ciasto bananowe jest zaskakująco prosty.
|
||||||
|
przepis na ciasto bananowe jest zaskakująco prosty i skuteczny.
|
3
readme.md
Normal file
3
readme.md
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
# PBR TransFix translator
|
||||||
|
|
||||||
|
A Dockerized Flask service exposing a POST endpoint (`/api/video`) that accepts a text file in Polish and returns its English translation.
|
56
resources/video.py
Normal file
56
resources/video.py
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
from flask import send_file
|
||||||
|
from flask_restful import Resource, reqparse
|
||||||
|
import werkzeug
|
||||||
|
import time
|
||||||
|
import io
|
||||||
|
import itertools
|
||||||
|
import nltk
|
||||||
|
from nltk import tokenize
|
||||||
|
|
||||||
|
|
||||||
|
class Video(Resource):
    """REST resource that translates an uploaded Polish text file into English.

    The upload is stored under ``in/``, translated sentence by sentence with
    the model and tokenizer given at construction time, and the result is
    written to ``out/`` and sent back to the client as an attachment.
    """

    def __init__(self, **kwargs):
        """Keep the shared model/tokenizer and declare the expected upload.

        Args:
            **kwargs: must contain ``model`` and ``tokenizer``.

        Raises:
            KeyError: if ``model`` or ``tokenizer`` is missing.
        """
        super().__init__()
        self.model = kwargs['model']
        self.tokenizer = kwargs['tokenizer']
        self.parser = reqparse.RequestParser()
        self.parser.add_argument(
            'file',
            required=True,
            type=werkzeug.datastructures.FileStorage,
            location='files',
        )

    def post(self):
        """Translate the uploaded file and return the English text file.

        Returns:
            The translated file as an attachment on success; otherwise a
            ``(dict, 500)`` pair with ``file_storage_result`` set to
            ``'fail'`` and ``error`` holding the exception message.
        """
        # Local import: keeps this class self-contained without touching the
        # module's import block.
        import uuid

        # Parsed outside the try block so the parser's own validation error
        # for a missing ``file`` field reaches the client instead of being
        # reported as a generic failure below.
        text_file = self.parser.parse_args().file

        # A bare second-resolution timestamp collides when two requests land
        # in the same second, making them overwrite each other's files. The
        # random suffix makes the id unique while keeping the time prefix.
        request_id = '{}_{}'.format(int(time.time()), uuid.uuid4().hex[:8])

        try:
            text_path = "in/" + str(request_id) + '_pl.txt'
            text_file.save(text_path)
            self.run_on_video(text_path, request_id)
            path_file = "out/" + str(request_id) + '_en.txt'
            return send_file(path_file, as_attachment=True, conditional=True)
        except Exception as e:
            print(e)
            # The exception object itself is not JSON-serializable; send its
            # message and flag the response as a server error.
            return {'file_storage_result': 'fail', 'error': str(e)}, 500

    @staticmethod
    def _ensure_punkt():
        """Download the NLTK ``punkt`` data only when it is not already installed."""
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')

    def run_on_video(self, file_path, request_id):
        """Translate the text in ``file_path`` and write it to ``out/<request_id>_en.txt``.

        Args:
            file_path: path of the UTF-8 encoded source text.
            request_id: identifier used to build the output file name.
        """
        self._ensure_punkt()

        with io.open(file_path, 'r', encoding='utf8') as f:
            text = ' '.join(f.readlines())

        # The input is Polish, so use the Polish Punkt model rather than the
        # English default when looking for sentence boundaries.
        sentences = tokenize.sent_tokenize(text, language='polish')

        # Loop-invariant: every sentence is decoded into English.
        target_lang_id = self.tokenizer.lang_code_to_id["en_XX"]

        translations = []
        for sentence in sentences:
            model_inputs = self.tokenizer(sentence, return_tensors="pt")
            generated_tokens = self.model.generate(
                **model_inputs,
                forced_bos_token_id=target_lang_id
            )
            translations.extend(
                self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            )

        with io.open('out/' + str(request_id) + '_en.txt', 'w', encoding='utf8') as f:
            # Same output as before: each translated sentence followed by a space.
            f.writelines(line + ' ' for line in translations)
|
Loading…
Reference in New Issue
Block a user