diff --git a/concordia-aligner/.gitignore b/concordia-aligner/.gitignore
index 2493a16..c712278 100644
--- a/concordia-aligner/.gitignore
+++ b/concordia-aligner/.gitignore
@@ -1,5 +1,5 @@
 corpora/**/*.lem
-corpora/**/*.norm
+corpora/**/*.tok
 corpora/**/alignments.txt
 corpora/**/falign_corpus.txt
 corpora/**/falign_result.txt
@@ -8,3 +8,4 @@ corpora/**/src.dict
 corpora/**/src_clean.txt
 corpora/**/trg.dict
 corpora/**/trg_clean.txt
+corpus.cfg
diff --git a/concordia-aligner/Dockerfile b/concordia-aligner/Dockerfile
index e37deda..30182c8 100644
--- a/concordia-aligner/Dockerfile
+++ b/concordia-aligner/Dockerfile
@@ -8,7 +8,11 @@ ENV LANG pl_PL.UTF-8
 ENV LANGUAGE pl_PL:pl
 ENV LC_ALL pl_PL.UTF-8
 
-RUN apt-get install -y git cmake g++ python3 python3-pip
+RUN apt-get install -y git cmake g++ python3 python3-pip libfcgi-dev libpq-dev python3-psycopg2 nginx php apache2 libapache2-mod-php spawn-fcgi cmake g++ libboost-dev libboost-serialization-dev libboost-test-dev libboost-filesystem-dev libboost-system-dev libboost-program-options-dev libboost-iostreams-dev libboost-regex-dev libboost-locale-dev liblog4cpp5-dev libconfig++-dev libconfig-dev libpcre3-dev
+
+RUN git clone https://git.code.sf.net/p/tmconcordia/code
+RUN cd code && mkdir build && cd build && ../cmake.sh && make && make install && ldconfig
+
 RUN pip3 install requests
 RUN git clone https://git.wmi.amu.edu.pl/rjawor/concordia-aligner.git
 RUN git clone https://github.com/clab/fast_align
diff --git a/concordia-aligner/add_corpus.sh b/concordia-aligner/add_corpus.sh
new file mode 100755
index 0000000..5a079a6
--- /dev/null
+++ b/concordia-aligner/add_corpus.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+
+# Uploads the corpus files found in CORPUS_PATH through add_fast_aligned_TM.py.
+
+CORPUS_NAME=opensubtitles
+CORPUS_PATH=../fast-aligner/corpora/$CORPUS_NAME
+SRC_LANG_ID=1
+TRG_LANG_ID=2
+
+./add_fast_aligned_TM.py "$CORPUS_NAME" "$CORPUS_PATH/src_clean.txt" "$CORPUS_PATH/src_clean.lem" "$SRC_LANG_ID" "$CORPUS_PATH/trg_clean.txt" "$TRG_LANG_ID" "$CORPUS_PATH/alignments.txt" "$CORPUS_PATH/ids_clean.txt"
diff --git a/concordia-aligner/add_fast_aligned_TM.py b/concordia-aligner/add_fast_aligned_TM.py
new file mode 100755
index 0000000..4b0396e
--- /dev/null
+++ b/concordia-aligner/add_fast_aligned_TM.py
@@ -0,0 +1,154 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+"""Upload a fast-aligned corpus to the server at `address` as a new TM.
+
+Usage:
+    add_fast_aligned_TM.py NAME SRC_FILE LEMMATIZED_SRC_FILE SRC_LANG_ID
+                           TRG_FILE TRG_LANG_ID ALIGNMENTS_FILE SRC_IDS_FILE
+
+All five input files must have the same number of lines. The script sends an
+'addTm' request, uploads the examples in batches of BUFFER_SIZE through
+'addSentences' requests and finally sends a 'refreshIndex' request.
+
+NOTE: this is Python 2 code (it imports urllib2).
+"""
+
+import json
+import urllib2
+import sys
+import time
+
+BUFFER_SIZE = 500
+LEAVE_OUT = 1  # keep every LEAVE_OUT-th example; 1 does not leave out anything
+
+address = 'http://127.0.0.1:10001'
+
+
+def file_len(fname):
+    """Return the number of lines in `fname` (0 for an empty file)."""
+    count = 0
+    with open(fname) as f:
+        # enumerate(f, 1) leaves `count` at the number of lines read; it
+        # stays 0 when the loop body never runs (empty file).
+        for count, _ in enumerate(f, 1):
+            pass
+    return count
+
+
+def post_json(payload):
+    """POST `payload` as JSON to `address` and return the raw response body."""
+    req = urllib2.Request(address)
+    req.add_header('Content-Type', 'application/json')
+    return urllib2.urlopen(req, json.dumps(payload), timeout=3600).read()
+
+
+def add_examples(examplesData):
+    """Send `examplesData`; raise if the response status is 'error'."""
+    response = json.loads(post_json(examplesData))
+    print(response)
+    if response['status'] == 'error':
+        raise Exception(response['message'])
+
+
+def per_second(count, seconds):
+    """Return count / seconds, or 0.0 when no time has elapsed."""
+    return count / seconds if seconds > 0 else 0.0
+
+
+if len(sys.argv) != 9:
+    raise Exception("wrong number of arguments")
+
+name = sys.argv[1]
+sourceFile = sys.argv[2]
+lemmatizedSourceFile = sys.argv[3]
+sourceLangId = int(sys.argv[4])
+targetFile = sys.argv[5]
+targetLangId = int(sys.argv[6])
+alignmentsFile = sys.argv[7]
+sourceIdsFile = sys.argv[8]
+
+sourceFileLength = file_len(sourceFile)
+lemmatizedSourceFileLength = file_len(lemmatizedSourceFile)
+targetFileLength = file_len(targetFile)
+alignmentsFileLength = file_len(alignmentsFile)
+sourceIdsFileLength = file_len(sourceIdsFile)
+
+fileLengths = (sourceFileLength, lemmatizedSourceFileLength, targetFileLength, alignmentsFileLength, sourceIdsFileLength)
+if len(set(fileLengths)) != 1:
+    print("File lengths:")
+    print("source file: %d\nlemmatized source file: %d\ntarget file: %d\nalignments file: %d\nsource ids file: %d" % fileLengths)
+    raise Exception("files are not of the same length!")
+
+# Lines 0, LEAVE_OUT, 2*LEAVE_OUT, ... are uploaded: ceil(length / LEAVE_OUT).
+totalExamples = (sourceFileLength + LEAVE_OUT - 1) // LEAVE_OUT
+
+data = {
+    'operation': 'addTm',
+    'sourceLangId': sourceLangId,
+    'targetLangId': targetLangId,
+    'name': name,
+    'tmLemmatized': True
+}
+
+response = json.loads(post_json(data))
+print(response)
+# Report the server's own message rather than a KeyError on 'newTmId'.
+if response.get('status') == 'error':
+    raise Exception(response.get('message', 'addTm failed'))
+tmId = int(response['newTmId'])
+print("Added new tm: %d" % tmId)
+
+data = {
+    'operation': 'addSentences',
+    'tmId': tmId
+}
+
+examples = []
+addedCount = 0
+start = time.time()
+with open(sourceFile) as source_file, open(lemmatizedSourceFile) as lemmatized_source_file, open(targetFile) as target_file, open(alignmentsFile) as alignments_file, open(sourceIdsFile) as source_ids_file:
+    for lineNumber in range(sourceFileLength):
+        # Consume one line from EVERY file on each iteration so the files
+        # stay in step when an example is skipped because of LEAVE_OUT.
+        sourceLine = source_file.readline()
+        lemmatizedSourceLine = lemmatized_source_file.readline()
+        targetLine = target_file.readline()
+        alignmentLine = alignments_file.readline()
+        sourceIdLine = source_ids_file.readline()
+        if lineNumber % LEAVE_OUT != 0:
+            continue
+
+        examples.append([
+            sourceLine.strip(),
+            lemmatizedSourceLine.strip(),
+            targetLine.strip(),
+            json.loads(alignmentLine.strip()),
+            int(sourceIdLine.strip())
+        ])
+        addedCount += 1
+        if len(examples) >= BUFFER_SIZE:
+            data['examples'] = examples
+            add_examples(data)
+            elapsed = time.time() - start
+            print("Added %d of %d lemmatized examples. Time elapsed: %.4f s, current speed: %.4f examples/second" % (addedCount, totalExamples, elapsed, per_second(addedCount, elapsed)))
+            examples = []
+
+# Send the last, partially filled batch.
+if examples:
+    data['examples'] = examples
+    add_examples(data)
+
+elapsed = time.time() - start
+print("Added all %d lemmatized sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % (addedCount, elapsed, per_second(addedCount, elapsed)))
+
+print("Generating index...")
+start = time.time()
+data = {
+    'operation': 'refreshIndex',
+    'tmId': tmId
+}
+# The response body is read but, as before, not inspected.
+post_json(data)
+
+print("Index regeneration complete. The operation took %.4f s" % (time.time() - start))
diff --git a/concordia-aligner/align_corpus.sh b/concordia-aligner/align_corpus.sh
index 6f3defb..7747a6a 100755
--- a/concordia-aligner/align_corpus.sh
+++ b/concordia-aligner/align_corpus.sh
@@ -1,3 +1,4 @@
 #!/bin/sh
 
-docker run -it --rm --name=concordia-aligner --network=concordiadocker_default --mount src="$(pwd)"/corpora,target=/concordia-aligner/corpora,type=bind concordia-aligner:Dockerfile cd concordia-aligner && make clean && make
\ No newline at end of file
+#docker run -it --rm --name=concordia-aligner --network=concordiadocker_default --mount src="$(pwd)"/corpora,target=/concordia-aligner/corpora,type=bind concordia-aligner:Dockerfile cd concordia-aligner && make clean && make
+docker run -it --rm --name=concordia-aligner --network=concordiadocker_default --mount src="$(pwd)"/corpora,target=/concordia-aligner/corpora,type=bind concordia-aligner:Dockerfile
\ No newline at end of file
diff --git a/concordia-aligner/corpus.cfg_sample b/concordia-aligner/corpus.cfg_sample
new file mode 100644
index 0000000..c915bcf
--- /dev/null
+++ b/concordia-aligner/corpus.cfg_sample
@@ -0,0 +1,8 @@
+CORPUS_NAME=opensubtitles_sample
+CORPUS_PATH=corpora/$CORPUS_NAME
+
+# Language configuration. Available language ids:
+# 1 - Polish
+# 2 - English
+SRC_LANG_ID=1
+TRG_LANG_ID=2
\ No newline at end of file
diff --git a/concordia-aligner/setup.sh b/concordia-aligner/setup_aligner.sh
similarity index 100%
rename from concordia-aligner/setup.sh
rename to concordia-aligner/setup_aligner.sh