diff --git a/.gitignore b/.gitignore index b3ee04f..efdcffc 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,4 @@ mgiza-aligner/mgiza/mgizapp/src/mkcls/-MT mgiza-aligner/mgiza/mgizapp/src/mkcls/CMakeFiles/ mgiza-aligner/mgiza/mgizapp/src/mkcls/Makefile mgiza-aligner/mgiza/mgizapp/src/mkcls/cmake_install.cmake +__pycache__ diff --git a/INSTALL.txt b/INSTALL.txt index 5599fb5..37f1dd2 100644 --- a/INSTALL.txt +++ b/INSTALL.txt @@ -1,4 +1,4 @@ -- sudo apt-get install postgresql libfcgi-dev libpq-dev mono-complete +- sudo apt-get install postgresql libfcgi-dev libpq-dev mono-complete python3-psycopg2 - clone github repo, mkdir build, cd build, ../cmake.sh, make - sudo -u postgres psql postgres - create user concordia with encrypted password 'concordia'; diff --git a/cat/css/concordia_cat.css b/cat/css/concordia_cat.css index 2903305..375c99d 100644 --- a/cat/css/concordia_cat.css +++ b/cat/css/concordia_cat.css @@ -213,6 +213,7 @@ h2 { color: #e32; font-family:'Gill Sans','lucida grande', helvetica, arial, sans-serif; font-size: 190%; + margin-top: 30px; } h3 { color: #2c6877; @@ -620,6 +621,7 @@ label { display: block; font-size: 110%; margin-bottom:3px; + margin-top:15px; } input, textarea { clear: both; @@ -685,14 +687,14 @@ form .submit input[type=submit]:hover { background: #5BA150; } /* Form errors */ -form .error { +.error { background: #FFDACC; -moz-border-radius: 4px; -webkit-border-radius: 4px; border-radius: 4px; font-weight: normal; } -form .error-message { +.error-message { -moz-border-radius: none; -webkit-border-radius: none; border-radius: none; @@ -702,8 +704,8 @@ form .error-message { padding-left: 4px; padding-right: 0; } -form .error, -form .error-message { +.error, +.error-message { color: #9E2424; -webkit-box-shadow: none; -moz-box-shadow: none; diff --git a/cat/js/cat.js b/cat/js/cat.js index eddcde7..93bf02b 100644 --- a/cat/js/cat.js +++ b/cat/js/cat.js @@ -1,3 +1,28 @@ +function showNewTmOptions() { + showField('tm_name'); + 
def config_file(config, file_name, root_dir):
    """Instantiate a ``*_pattern`` template into the publish directory.

    Reads ``<file_name>_pattern`` from the current working directory,
    replaces every ``@field@`` placeholder with the matching value from
    *config*, and writes the result to ``<root_dir>/<file_name>``.

    Args:
        config: mapping of placeholder name to replacement text.
        file_name: name of the output file; the template is expected at
            ``file_name + '_pattern'``.
        root_dir: directory the generated file is written into.
    """
    # Build the literal placeholders once instead of once per line.
    # ``items()`` works on Python 2 and 3 (``iteritems()`` is Python 2 only).
    substitutions = [('@' + field + '@', value)
                     for field, value in config.items()]

    with open(file_name + '_pattern', 'r') as pattern_file, \
            open(root_dir + '/' + file_name, 'w') as out_file:
        for line in pattern_file:
            for placeholder, value in substitutions:
                # Plain text replacement: the placeholder must not be read
                # as a regular expression and the value must not be read as
                # a regex replacement template (backslashes, \1, ...).
                line = line.replace(placeholder, value)
            out_file.write(line)
/**
 * Count the lines of a text file.
 *
 * Only lines actually returned by fgets() are counted, so a file that ends
 * with a newline and one that does not yield the same count, and an
 * unreadable file cannot make the loop spin on an invalid handle.
 *
 * @param string $file_name Path of the file to inspect.
 * @return int Number of lines; 0 when the file is empty or cannot be opened.
 */
function lineCount($file_name)
{
    $handle = is_readable($file_name) ? fopen($file_name, 'r') : false;
    if ($handle === false) {
        return 0;
    }

    $linecount = 0;
    while (fgets($handle) !== false) {
        ++$linecount;
    }

    fclose($handle);

    return $linecount;
}

/**
 * Validate an uploaded source/target file pair and register an import
 * request with the Concordia server.
 *
 * @param string $url        Address of the Concordia server.
 * @param array  $postArray  Submitted form fields (src_lang_id, trg_lang_id,
 *                           tm_name, tm_type, tm_id).
 * @param array  $filesArray Upload descriptors for 'src_file' and 'trg_file'.
 * @return string Empty string if no error occurred, otherwise a message
 *                suitable for showing to the user.
 */
function addRequest($url, $postArray, $filesArray) {
    // A file is acceptable only if it was uploaded without error AND is not
    // empty; either condition failing must reject the request.
    $uploadErrors = array(
        'src_file' => "Error uploading source file or no source file given.",
        'trg_file' => "Error uploading target file or no target file given.",
    );
    foreach ($uploadErrors as $field => $message) {
        if (!isset($filesArray[$field])
            || $filesArray[$field]['error'] != UPLOAD_ERR_OK
            || $filesArray[$field]['size'] <= 0) {
            return $message;
        }
    }

    $srcFilePath = "/tmp/".uniqid("srcFile", true);
    $trgFilePath = "/tmp/".uniqid("trgFile", true);

    // Remove our copies whenever the request is not registered, so failed
    // submissions do not accumulate files in /tmp.
    $discardCopies = function () use ($srcFilePath, $trgFilePath) {
        foreach (array($srcFilePath, $trgFilePath) as $path) {
            if (is_file($path)) {
                unlink($path);
            }
        }
    };

    if (!move_uploaded_file($filesArray['src_file']['tmp_name'], $srcFilePath)
        || !move_uploaded_file($filesArray['trg_file']['tmp_name'], $trgFilePath)) {
        $discardCopies();
        return "Error storing uploaded files.";
    }

    $srcLineCount = lineCount($srcFilePath);
    $trgLineCount = lineCount($trgFilePath);

    if ($srcLineCount != $trgLineCount) {
        $discardCopies();
        return "Files have different number of lines ($srcLineCount and $trgLineCount).";
    }

    $request = array (
        "operation" => "addRequest",
        "sourceFilePath" => $srcFilePath,
        "targetFilePath" => $trgFilePath,
        "sourceLangId" => intval($postArray['src_lang_id']),
        "targetLangId" => intval($postArray['trg_lang_id']),
        "name" => $postArray['tm_name'],
        "type" => intval($postArray['tm_type']),
        "tmId" => intval($postArray['tm_id'])
    );
    $response = postJson($url, $request);

    // json_decode() yields null when the server could not be reached or sent
    // something that is not JSON.
    if ($response === null) {
        $discardCopies();
        return "Could not register the import request with the Concordia server.";
    }
    // NOTE(review): assumes failures are reported as status "error" with a
    // "message" field - confirm against the server's error response.
    if (isset($response->status) && $response->status === 'error') {
        $discardCopies();
        return isset($response->message)
            ? "Import request rejected: ".$response->message
            : "Import request rejected by the Concordia server.";
    }

    return "";
}
+ + +
+ + +

Available translation memories:

+ + + + + + + + tms as $tm) { + ?> + + + + + + + + +
IdNameSource languageTarget language
id ?>name ?>sourceLanguageCode ?>targetLanguageCode ?>
+

Import new translations

+
+
+ + + + +
+ + + + + + + + + + + + + + +
+ + + + + +
+ + + + + +
+ + +
+

Latest import requests:

+ + + + + + + + + + + + requests as $request) { + ?> + + + + + + + + + + + + +
IdNameSource languageTarget languageStatusTypeTM idCreated
id ?>name ?> + type == 0) { + echo $request->sourceLanguageCode; + } else { + echo "N/A"; + } + ?> + + type == 0) { + echo $request->targetLanguageCode; + } else { + echo "N/A"; + } + ?> + status ?> + type == 0) { + echo "new TM"; + } else { + echo "extend TM"; + } + ?> + + type == 0) { + echo "N/A"; + } else { + echo $request->tmId; + } + ?> + created ?>
+
+ + diff --git a/cat/versions_enabled/europarl_sample.cfg b/cat/versions_enabled/europarl_sample.cfg deleted file mode 120000 index c90ed2e..0000000 --- a/cat/versions_enabled/europarl_sample.cfg +++ /dev/null @@ -1 +0,0 @@ -../versions_available/europarl_sample.cfg \ No newline at end of file diff --git a/cat/versions_enabled/stocznia_enpl.cfg b/cat/versions_enabled/stocznia_enpl.cfg new file mode 120000 index 0000000..884dd56 --- /dev/null +++ b/cat/versions_enabled/stocznia_enpl.cfg @@ -0,0 +1 @@ +../versions_available/stocznia_enpl.cfg \ No newline at end of file diff --git a/cat/versions_enabled/stocznia_plen.cfg b/cat/versions_enabled/stocznia_plen.cfg new file mode 120000 index 0000000..0ba3868 --- /dev/null +++ b/cat/versions_enabled/stocznia_plen.cfg @@ -0,0 +1 @@ +../versions_available/stocznia_plen.cfg \ No newline at end of file diff --git a/concordia-server/concordia_server.cpp b/concordia-server/concordia_server.cpp index c78ece9..be6455f 100644 --- a/concordia-server/concordia_server.cpp +++ b/concordia-server/concordia_server.cpp @@ -13,6 +13,8 @@ #include "config.hpp" #include "logger.hpp" #include "tm.hpp" +#include "request.hpp" +#include "language.hpp" #include "rapidjson/rapidjson.h" #include #include @@ -125,7 +127,7 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) { } } _indexController->addAlignedLemmatizedSentences(jsonWriter, sourceSentences, targetSentences, alignmentStrings, tmId); - } else if (operation == GET_TMS_INFO_PARAM) { + } else if (operation == GET_TMS_INFO_OP) { std::vector tms = _tmDAO.getTms(); jsonWriter.StartObject(); @@ -148,6 +150,79 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) { jsonWriter.EndArray(); jsonWriter.EndObject(); + } else if (operation == GET_REQUESTS_INFO_OP) { + std::vector requests = _requestDAO.getRequests(); + + jsonWriter.StartObject(); + jsonWriter.String("status"); + jsonWriter.String("success"); + jsonWriter.String("requests"); + 
jsonWriter.StartArray(); + BOOST_FOREACH(Request & request, requests) { + jsonWriter.StartObject(); + jsonWriter.String("id"); + jsonWriter.Int(request.getId()); + jsonWriter.String("sourceFilePath"); + jsonWriter.String(request.getSourceFilePath().c_str()); + jsonWriter.String("targetFilePath"); + jsonWriter.String(request.getTargetFilePath().c_str()); + jsonWriter.String("name"); + jsonWriter.String(request.getName().c_str()); + jsonWriter.String("sourceLanguageCode"); + jsonWriter.String(request.getSourceLanguageCode().c_str()); + jsonWriter.String("targetLanguageCode"); + jsonWriter.String(request.getTargetLanguageCode().c_str()); + jsonWriter.String("status"); + jsonWriter.Int(request.getStatus()); + jsonWriter.String("type"); + jsonWriter.Int(request.getType()); + jsonWriter.String("tmId"); + jsonWriter.Int(request.getTmId()); + jsonWriter.String("created"); + jsonWriter.String(request.getCreated().c_str()); + jsonWriter.EndObject(); + } + jsonWriter.EndArray(); + jsonWriter.EndObject(); + + + } else if (operation == GET_LANGUAGES_OP) { + std::vector languages = _languageDAO.getLanguages(); + + jsonWriter.StartObject(); + jsonWriter.String("status"); + jsonWriter.String("success"); + jsonWriter.String("languages"); + jsonWriter.StartArray(); + BOOST_FOREACH(Language & language, languages) { + jsonWriter.StartObject(); + jsonWriter.String("id"); + jsonWriter.Int(language.getId()); + jsonWriter.String("code"); + jsonWriter.String(language.getCode().c_str()); + jsonWriter.String("name"); + jsonWriter.String(language.getName().c_str()); + jsonWriter.EndObject(); + } + jsonWriter.EndArray(); + jsonWriter.EndObject(); + + + } else if (operation == ADD_REQUEST_OP) { + std::string sourceFilePath = _getStringParameter(d, SOURCE_FILE_PARAM); + std::string targetFilePath = _getStringParameter(d, TARGET_FILE_PARAM); + int sourceLangId = _getIntParameter(d, SOURCE_LANG_PARAM); + int targetLangId = _getIntParameter(d, TARGET_LANG_PARAM); + std::string name = 
_getStringParameter(d, NAME_PARAM); + int type = _getIntParameter(d, TYPE_PARAM); + int tmId = _getIntParameter(d, TM_ID_PARAM); + int newId = _requestDAO.addRequest(sourceFilePath, targetFilePath, sourceLangId, targetLangId, name, type, tmId); + jsonWriter.StartObject(); + jsonWriter.String("status"); + jsonWriter.String("success"); + jsonWriter.String("newRequestId"); + jsonWriter.Int(newId); + jsonWriter.EndObject(); } else if (operation == "lemmatize") { std::string sentence = _getStringParameter(d, "sentence"); diff --git a/concordia-server/concordia_server.hpp b/concordia-server/concordia_server.hpp index 2822a9e..635f07b 100644 --- a/concordia-server/concordia_server.hpp +++ b/concordia-server/concordia_server.hpp @@ -12,6 +12,8 @@ #include "rapidjson/error/en.h" #include "tm_dao.hpp" +#include "request_dao.hpp" +#include "language_dao.hpp" #include "index_controller.hpp" #include "searcher_controller.hpp" #include "lemmatizer_facade.hpp" @@ -48,6 +50,10 @@ private: TmDAO _tmDAO; + RequestDAO _requestDAO; + + LanguageDAO _languageDAO; + boost::shared_ptr _indexController; boost::shared_ptr _searcherController; diff --git a/concordia-server/config.hpp.in b/concordia-server/config.hpp.in index c287822..65e5e1f 100644 --- a/concordia-server/config.hpp.in +++ b/concordia-server/config.hpp.in @@ -21,14 +21,20 @@ #define EXAMPLES_PARAM "examples" #define SOURCE_LANG_PARAM "sourceLangId" #define TARGET_LANG_PARAM "targetLangId" +#define SOURCE_FILE_PARAM "sourceFilePath" +#define TARGET_FILE_PARAM "targetFilePath" #define NAME_PARAM "name" +#define TYPE_PARAM "type" #define INTERVALS_PARAM "intervals" -#define GET_TMS_INFO_PARAM "getTmsInfo" #define ADD_SENTENCE_OP "addSentence" #define ADD_SENTENCES_OP "addSentences" #define ADD_ALIGNED_SENTENCES_OP "addAlignedSentences" #define ADD_ALIGNED_LEMMATIZED_SENTENCES_OP "addAlignedLemmatizedSentences" +#define ADD_REQUEST_OP "addRequest" +#define GET_TMS_INFO_OP "getTmsInfo" +#define GET_REQUESTS_INFO_OP 
"getRequestsInfo" +#define GET_LANGUAGES_OP "getLanguages" #define REFRESH_INDEX_OP "refreshIndex" #define SIMPLE_SEARCH_OP "simpleSearch" #define CONCORDIA_SEARCH_OP "concordiaSearch" diff --git a/concordia-server/language.cpp b/concordia-server/language.cpp new file mode 100644 index 0000000..52cec65 --- /dev/null +++ b/concordia-server/language.cpp @@ -0,0 +1,13 @@ +#include "language.hpp" + +Language::Language( + const int id, + const std::string & code, + const std::string & name) : + _id(id), + _code(code), + _name(name) { +} + +Language::~Language() { +} diff --git a/concordia-server/language.hpp b/concordia-server/language.hpp new file mode 100644 index 0000000..21bde1f --- /dev/null +++ b/concordia-server/language.hpp @@ -0,0 +1,41 @@ +#ifndef LANGUAGE_HDR +#define LANGUAGE_HDR + +#include +#include + +class Language { +public: + /*! Constructor. + */ + Language( + const int id, + const std::string & code, + const std::string & name + ); + /*! Destructor. + */ + virtual ~Language(); + + int getId() const { + return _id; + } + + const std::string & getCode() const { + return _code; + } + + const std::string & getName() const { + return _name; + } + + +private: + int _id; + + std::string _code; + + std::string _name; +}; + +#endif diff --git a/concordia-server/language_dao.cpp b/concordia-server/language_dao.cpp new file mode 100644 index 0000000..01a26d5 --- /dev/null +++ b/concordia-server/language_dao.cpp @@ -0,0 +1,37 @@ +#include "language_dao.hpp" + +#include "query_param.hpp" +#include "string_param.hpp" +#include "int_param.hpp" +#include "bool_param.hpp" +#include "int_array_param.hpp" +#include "logger.hpp" + +#include +#include + +LanguageDAO::LanguageDAO() { +} + +LanguageDAO::~LanguageDAO() { +} + +std::vector LanguageDAO::getLanguages() { + std::vector result; + DBconnection connection; + connection.startTransaction(); + std::string query = "select * from language order by name;"; + PGresult * dbResult = connection.execute(query); + for (int 
i=0;i +#include +#include + +#include +#include "db_connection.hpp" +#include "language.hpp" + +class LanguageDAO { +public: + /*! Constructor. + */ + LanguageDAO(); + /*! Destructor. + */ + virtual ~LanguageDAO(); + + std::vector getLanguages(); + +private: + +}; + +#endif diff --git a/concordia-server/request.cpp b/concordia-server/request.cpp new file mode 100644 index 0000000..db5620b --- /dev/null +++ b/concordia-server/request.cpp @@ -0,0 +1,27 @@ +#include "request.hpp" + +Request::Request( + const int id, + const std::string & sourceFilePath, + const std::string & targetFilePath, + const std::string & name, + const std::string & sourceLanguageCode, + const std::string & targetLanguageCode, + const int status, + const int type, + const int tmId, + const std::string & created) : + _id(id), + _sourceFilePath(sourceFilePath), + _targetFilePath(targetFilePath), + _name(name), + _sourceLanguageCode(sourceLanguageCode), + _targetLanguageCode(targetLanguageCode), + _status(status), + _type(type), + _tmId(tmId), + _created(created) { +} + +Request::~Request() { +} diff --git a/concordia-server/request.hpp b/concordia-server/request.hpp new file mode 100644 index 0000000..7b08d9f --- /dev/null +++ b/concordia-server/request.hpp @@ -0,0 +1,89 @@ +#ifndef REQUEST_HDR +#define REQUEST_HDR + +#include +#include + +class Request { +public: + /*! Constructor. + */ + Request( + const int id, + const std::string & sourceFilePath, + const std::string & targetFilePath, + const std::string & name, + const std::string & sourceLanguageCode, + const std::string & targetLanguageCode, + const int status, + const int type, + const int tm_id, + const std::string & created + ); + /*! Destructor. 
+ */ + virtual ~Request(); + + int getId() const { + return _id; + } + + const std::string & getSourceFilePath() const { + return _sourceFilePath; + } + + const std::string & getTargetFilePath() const { + return _targetFilePath; + } + + const std::string & getName() const { + return _name; + } + + const std::string & getSourceLanguageCode() const { + return _sourceLanguageCode; + } + + const std::string & getTargetLanguageCode() const { + return _targetLanguageCode; + } + + int getStatus() const { + return _status; + } + + int getType() const { + return _type; + } + + int getTmId() const { + return _tmId; + } + + const std::string & getCreated() const { + return _created; + } + +private: + int _id; + + std::string _sourceFilePath; + + std::string _targetFilePath; + + std::string _name; + + std::string _sourceLanguageCode; + + std::string _targetLanguageCode; + + int _status; + + int _type; + + int _tmId; + + std::string _created; +}; + +#endif diff --git a/concordia-server/request_dao.cpp b/concordia-server/request_dao.cpp new file mode 100644 index 0000000..4f428c3 --- /dev/null +++ b/concordia-server/request_dao.cpp @@ -0,0 +1,80 @@ +#include "request_dao.hpp" + +#include "query_param.hpp" +#include "string_param.hpp" +#include "int_param.hpp" +#include "bool_param.hpp" +#include "int_array_param.hpp" +#include "logger.hpp" + +#include +#include + +RequestDAO::RequestDAO() { +} + +RequestDAO::~RequestDAO() { +} + +std::vector RequestDAO::getRequests() { + std::vector result; + DBconnection connection; + connection.startTransaction(); + std::string query = "select request.id, request.source_file_path, request.target_file_path, request.name, src_lang.code as src_code, trg_lang.code as trg_code, request.status, request.type, request.tm_id, to_char(request.created,'YYYY-MM-DD HH24:MI:SS') from request inner join language as src_lang on src_lang.id = request.source_lang_id inner join language as trg_lang on trg_lang.id = request.target_lang_id order by request.created 
desc limit 20;"; + PGresult * dbResult = connection.execute(query); + for (int i=0;i params; + params.push_back(new StringParam(sourceFilePath)); + params.push_back(new StringParam(targetFilePath)); + params.push_back(new IntParam(sourceLangId)); + params.push_back(new IntParam(targetLangId)); + params.push_back(new StringParam(name)); + params.push_back(new IntParam(0)); + params.push_back(new IntParam(type)); + params.push_back(new IntParam(tmId)); + + PGresult * result = connection.execute(query, params); + int newId = connection.getIntValue(result, 0, 0); + connection.clearResult(result); + connection.endTransaction(); + BOOST_FOREACH (QueryParam * param, params) { + delete param; + } + + return newId; + +} diff --git a/concordia-server/request_dao.hpp b/concordia-server/request_dao.hpp new file mode 100644 index 0000000..aeb2a15 --- /dev/null +++ b/concordia-server/request_dao.hpp @@ -0,0 +1,29 @@ +#ifndef REQUEST_DAO_HDR +#define REQUEST_DAO_HDR + +#include +#include +#include + +#include +#include "db_connection.hpp" +#include "request.hpp" + +class RequestDAO { +public: + /*! Constructor. + */ + RequestDAO(); + /*! Destructor. 
BUFFER_SIZE = 500  # number of examples sent to the server in one request


def postJson(address, data):
    """POST *data* as JSON to *address* and return the decoded JSON reply."""
    http = urllib3.PoolManager()
    response = http.request(
        'POST',
        address,
        headers={'Content-Type': 'application/json'},
        body=json.dumps(data).encode('utf-8'),
    )
    return json.loads(response.data.decode('utf-8'))


def _post_or_raise(address, data):
    """Post *data* and raise if the reply carries status ``error``."""
    response = postJson(address, data)
    if response.get('status') == 'error':
        raise Exception(response.get('message'))
    return response


def add_examples(address, examplesData):
    """Send one batch of examples; raise if the server reports an error."""
    _post_or_raise(address, examplesData)


def file_len(fname):
    """Return the number of lines in *fname* (0 for an empty file)."""
    with open(fname) as f:
        return sum(1 for _ in f)


def _fetch_oldest_request():
    """Return the oldest row of the ``request`` table, or None if it is empty.

    The cursor and connection are always closed, also when the query fails.
    """
    # NOTE(review): the query does not filter on request.status and nothing
    # here updates it, so the same oldest request is selected on every run -
    # confirm the intended status life cycle.
    conn = psycopg2.connect("dbname='concordia_server' user='concordia' host='localhost' port='6543' password='concordia'")
    try:
        cur = conn.cursor()
        try:
            cur.execute(
                "select request.id, request.source_file_path, request.target_file_path, request.name, "
                "src_lang.id as src_lang_id, src_lang.code as src_code, "
                "trg_lang.id as trg_lang_id, trg_lang.code as trg_code, "
                "request.status, request.type, request.tm_id "
                "from request "
                "inner join language as src_lang on src_lang.id = request.source_lang_id "
                "inner join language as trg_lang on trg_lang.id = request.target_lang_id "
                "order by request.created limit 1"
            )
            return cur.fetchone()
        finally:
            cur.close()
    finally:
        conn.close()


def _iter_examples(sourceFile, targetFile, alignmentsFile):
    """Yield ``[source, target, alignment]`` for every sentence pair.

    The alignments file holds three lines per sentence; the first two are
    lemmatized text and are skipped because the raw sentences are taken from
    the source and target files.
    """
    with open(sourceFile) as sf, open(targetFile) as tf, open(alignmentsFile) as af:
        for sourceLine in sf:
            sourceSentence = sourceLine.strip()
            targetSentence = tf.readline().strip()

            # skip two lines of the alignments file (lemmatized sentences)
            af.readline()
            af.readline()

            alignmentString = af.readline().strip()

            yield [sourceSentence, targetSentence, alignmentString]


def main():
    """Align the oldest import request with mgiza and load it into a new TM."""
    script_dir = os.path.dirname(os.path.realpath(__file__))
    mgiza_path = script_dir+'/../mgiza-aligner'

    request = _fetch_oldest_request()
    if request is None:
        # Nothing queued: unpacking None would otherwise raise TypeError.
        print("No import requests to handle.")
        return

    # NOTE(review): status, tm_type and tm_id are read but unused - a new TM
    # is always created, even for requests meant to extend an existing one.
    (request_id, src_file_path, trg_file_path, tm_name,
     src_lang_id, src_lang_code, trg_lang_id, trg_lang_code,
     status, tm_type, tm_id) = request

    request_corpus_path = mgiza_path+'/corpora/request_'+str(request_id)
    os.makedirs(request_corpus_path)
    shutil.copy(src_file_path, request_corpus_path+'/src.txt')
    shutil.copy(trg_file_path, request_corpus_path+'/trg.txt')

    # check=True: stop here if the alignment build fails instead of reading
    # missing or stale output files below.
    subprocess.run(
        ["make", "SRC_LANG="+src_lang_code, "TRG_LANG="+trg_lang_code, "CORPUS_NAME=request_"+str(request_id)],
        cwd=mgiza_path,
        check=True,
    )

    host = SourceFileLoader("host", script_dir+'/../tests/host.py').load_module()
    address = 'http://'+host.concordia_host
    if len(host.concordia_port) > 0:
        address += ':'+host.concordia_port

    sourceFile = request_corpus_path+'/src_final.txt'
    targetFile = request_corpus_path+'/trg_final.txt'
    alignmentsFile = request_corpus_path+'/aligned_final.txt'

    source_len = file_len(sourceFile)
    if source_len != file_len(targetFile):
        raise Exception("source and target files are not of the same length!")

    if file_len(alignmentsFile) != 3*source_len:
        raise Exception("alignments file is not exactly 3 times longer than source and target")

    data = {
        'operation': 'addTm',
        'sourceLangId': src_lang_id,
        'targetLangId': trg_lang_id,
        'name': tm_name,
        'tmLemmatized': True
    }

    response = _post_or_raise(address, data)

    tmId = int(response['newTmId'])
    print("Added new tm: %d" % tmId)

    data = {
        'operation': 'addAlignedLemmatizedSentences',
        'tmId': tmId
    }

    # Send the examples in batches of BUFFER_SIZE.
    examples = []
    for example in _iter_examples(sourceFile, targetFile, alignmentsFile):
        examples.append(example)
        if len(examples) >= BUFFER_SIZE:
            data['examples'] = examples
            add_examples(address, data)
            examples = []

    if len(examples) > 0:
        data['examples'] = examples
        add_examples(address, data)

    print("Generating index...")
    start = time.time()
    data = {
        'operation': 'refreshIndex',
        'tmId': tmId
    }

    _post_or_raise(address, data)

    end = time.time()
    print("Index regeneration complete. The operation took %.4f s" % (end - start))


if __name__ == "__main__":
    main()