lemmatization
This commit is contained in:
parent
534e14db9f
commit
f73842e21f
@ -43,6 +43,12 @@ with open('concordia_search.php_pattern', 'r') as search_pattern_file, open(root
|
|||||||
line = re.sub('@concordia_port@', concordia_port, line)
|
line = re.sub('@concordia_port@', concordia_port, line)
|
||||||
search_file.write(line)
|
search_file.write(line)
|
||||||
|
|
||||||
|
# Instantiate tm_info.php from its pattern file by filling in the
# @concordia_host@ and @concordia_port@ placeholders line by line.
with open('tm_info.php_pattern', 'r') as tm_info_pattern_file, open(root_dir+'/tm_info.php', 'w') as tm_info_file:
    for line in tm_info_pattern_file:
        # The placeholders are plain literals, so use str.replace: unlike
        # re.sub it never interprets backslashes or group references that
        # might appear in the substituted value.
        line = line.replace('@concordia_host@', concordia_host)
        line = line.replace('@concordia_port@', concordia_port)
        tm_info_file.write(line)
|
||||||
|
|
||||||
|
|
||||||
versions_dir = 'versions_enabled'
|
versions_dir = 'versions_enabled'
|
||||||
|
|
||||||
|
25
cat/tm_info.php_pattern
Normal file
25
cat/tm_info.php_pattern
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
// Asks the Concordia server for the list of translation memories and prints
// one line per TM: "<id>\t<name> (<source code>-> <target code>)".
// The @concordia_host@ / @concordia_port@ placeholders are filled in when this
// pattern file is turned into tm_info.php.
$url = 'http://@concordia_host@:@concordia_port@';
$request = array("operation" => "getTmsInfo");

// use key 'http' even if you send the request to https://...
// NOTE(review): the body is JSON although the header declares
// form-urlencoded; left unchanged - confirm the server ignores the header.
$options = array(
    'http' => array(
        'header'  => "Content-type: application/x-www-form-urlencoded\r\n",
        'method'  => 'POST',
        'content' => json_encode($request),
    ),
);
$context = stream_context_create($options);
$response = file_get_contents($url, false, $context);

// file_get_contents() returns false when the request fails.
if ($response === false) {
    echo "Error: could not get a response from the Concordia server.\n";
    exit(1);
}

$data = json_decode($response);

// json_decode() returns null for malformed JSON; also make sure the reply
// really carries the list of TMs before iterating over it.
if (!is_object($data) || !isset($data->tms) || !is_array($data->tms)) {
    echo "Error: unexpected response from the Concordia server.\n";
    exit(1);
}

foreach ($data->tms as $tm) {
    echo $tm->id."\t".$tm->name." (".$tm->sourceLanguageCode."-> ".$tm->targetLanguageCode.")\n";
}
|
||||||
|
|
||||||
|
|
||||||
|
?>
|
@ -1,8 +1,8 @@
|
|||||||
dir@#@jrc_enpl
|
dir@#@jrc_enpl
|
||||||
concordia_host@#@concordia.vm.wmi.amu.edu.pl
|
concordia_host@#@concordia.vm.wmi.amu.edu.pl
|
||||||
concordia_port@#@8800
|
concordia_port@#@8800
|
||||||
tmid@#@2
|
tmid@#@1
|
||||||
desc@#@Welcome to the interactive Concordia demo. The system finds the longest matches of the pattern sentence in its translation memory. This translation memory is 200 000 sentences taken from English-Polish corpus of European Law. Please enter an English sentence in the field below and press Enter (or use the search button). This instance of Concordia works best with law sentences, but is very likely to output some results for any English sentence. You can also use predefined samples, simply use the link "show/hide samples" and apply one of the sample sentences. After the search, click on the highlighted fragments to see their context.
|
desc@#@Welcome to the interactive Concordia demo. The system finds the longest matches of the pattern sentence in its translation memory. This translation memory is over 1.5M sentences taken from English-Polish corpus of European Law (Europarl + JRC-Acquis). Please enter an English sentence in the field below and press Enter (or use the search button). This instance of Concordia works best with law sentences, but is very likely to output some results for any English sentence. You can also use predefined samples, simply use the link "show/hide samples" and apply one of the sample sentences. After the search, click on the highlighted fragments to see their context.
|
||||||
enjoy@#@Enjoy your work with the system!
|
enjoy@#@Enjoy your work with the system!
|
||||||
prompt@#@Enter search pattern (English sentence):
|
prompt@#@Enter search pattern (English sentence):
|
||||||
suggestion@#@Every ship in the European Union must have a crew of 50 or more workers.
|
suggestion@#@Every ship in the European Union must have a crew of 50 or more workers.
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
dir@#@jrc_plen
|
dir@#@jrc_plen
|
||||||
concordia_host@#@concordia.vm.wmi.amu.edu.pl
|
concordia_host@#@concordia.vm.wmi.amu.edu.pl
|
||||||
concordia_port@#@8800
|
concordia_port@#@8800
|
||||||
tmid@#@1
|
tmid@#@2
|
||||||
desc@#@Witamy w interaktywnym demo systemu Concordia. System znajduje najdłuższe fragmenty zdania wejściowego w pamięci tłumaczeń. W pamięci tej znajduje się 200 000 zdań z polsko-angielskiego korpusu ustawodawstwa Unii Europejskiej JRC-Acquis. Proszę wpisać polskie zdanie w poniższe pole i nacisnąć Enter (albo użyć przycisku "search"). Ta wersja Concordii działa najlepiej ze zdaniami prawniczymi, ale jest bardzo prawdopodobne, że znalezione zostaną choćby krótkie fragmenty dowolnego polskiego zdania. Aby zapoznać się z systemem możesz użyć wcześniej przygotowanych przykładów - po prostu kliknij link "apply" przy wybranym przykładzie. Po wyszukaniu, kliknij na wybrany podświetlony fragment, aby zobaczyć jego kontekst.
|
desc@#@Witamy w interaktywnym demo systemu Concordia. System znajduje najdłuższe fragmenty zdania wejściowego w pamięci tłumaczeń. W pamięci tej znajduje się 200 000 zdań z polsko-angielskiego korpusu ustawodawstwa Unii Europejskiej JRC-Acquis. Proszę wpisać polskie zdanie w poniższe pole i nacisnąć Enter (albo użyć przycisku "search"). Ta wersja Concordii działa najlepiej ze zdaniami prawniczymi, ale jest bardzo prawdopodobne, że znalezione zostaną choćby krótkie fragmenty dowolnego polskiego zdania. Aby zapoznać się z systemem możesz użyć wcześniej przygotowanych przykładów - po prostu kliknij link "apply" przy wybranym przykładzie. Po wyszukaniu, kliknij na wybrany podświetlony fragment, aby zobaczyć jego kontekst.
|
||||||
enjoy@#@Życzymy udanej pracy z systemem!
|
enjoy@#@Życzymy udanej pracy z systemem!
|
||||||
prompt@#@Wprowadź zdanie (po polsku):
|
prompt@#@Wprowadź zdanie (po polsku):
|
||||||
|
@ -12,6 +12,7 @@
|
|||||||
#include "json_generator.hpp"
|
#include "json_generator.hpp"
|
||||||
#include "config.hpp"
|
#include "config.hpp"
|
||||||
#include "logger.hpp"
|
#include "logger.hpp"
|
||||||
|
#include "tm.hpp"
|
||||||
#include "rapidjson/rapidjson.h"
|
#include "rapidjson/rapidjson.h"
|
||||||
#include <boost/foreach.hpp>
|
#include <boost/foreach.hpp>
|
||||||
#include <boost/ptr_container/ptr_map.hpp>
|
#include <boost/ptr_container/ptr_map.hpp>
|
||||||
@ -118,6 +119,30 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
_indexController->addAlignedLemmatizedSentences(jsonWriter, sourceSentences, targetSentences, alignmentStrings, tmId);
|
_indexController->addAlignedLemmatizedSentences(jsonWriter, sourceSentences, targetSentences, alignmentStrings, tmId);
|
||||||
|
} else if (operation == GET_TMS_INFO_PARAM) {
|
||||||
|
std::vector<Tm> tms = _tmDAO.getTms();
|
||||||
|
|
||||||
|
jsonWriter.StartObject();
|
||||||
|
jsonWriter.String("status");
|
||||||
|
jsonWriter.String("success");
|
||||||
|
jsonWriter.String("tms");
|
||||||
|
jsonWriter.StartArray();
|
||||||
|
BOOST_FOREACH(Tm & tm, tms) {
|
||||||
|
jsonWriter.StartObject();
|
||||||
|
jsonWriter.String("id");
|
||||||
|
jsonWriter.Int(tm.getId());
|
||||||
|
jsonWriter.String("name");
|
||||||
|
jsonWriter.String(tm.getName().c_str());
|
||||||
|
jsonWriter.String("sourceLanguageCode");
|
||||||
|
jsonWriter.String(tm.getSourceLanguageCode().c_str());
|
||||||
|
jsonWriter.String("targetLanguageCode");
|
||||||
|
jsonWriter.String(tm.getTargetLanguageCode().c_str());
|
||||||
|
jsonWriter.EndObject();
|
||||||
|
}
|
||||||
|
jsonWriter.EndArray();
|
||||||
|
jsonWriter.EndObject();
|
||||||
|
|
||||||
|
|
||||||
} else if (operation == "lemmatize") {
|
} else if (operation == "lemmatize") {
|
||||||
std::string sentence = _getStringParameter(d, "sentence");
|
std::string sentence = _getStringParameter(d, "sentence");
|
||||||
std::string languageCode = _getStringParameter(d, "languageCode");
|
std::string languageCode = _getStringParameter(d, "languageCode");
|
||||||
|
@ -23,6 +23,7 @@
|
|||||||
#define TARGET_LANG_PARAM "targetLangId"
|
#define TARGET_LANG_PARAM "targetLangId"
|
||||||
#define NAME_PARAM "name"
|
#define NAME_PARAM "name"
|
||||||
#define INTERVALS_PARAM "intervals"
|
#define INTERVALS_PARAM "intervals"
|
||||||
|
// Value of the request's operation field that selects the "list all
// translation memories" handler in ConcordiaServer::handleRequest.
// NOTE(review): this is an operation name, yet it carries the _PARAM suffix
// while sibling operations (ADD_SENTENCE_OP, ADD_SENTENCES_OP) use _OP.
#define GET_TMS_INFO_PARAM "getTmsInfo"
|
||||||
|
|
||||||
#define ADD_SENTENCE_OP "addSentence"
|
#define ADD_SENTENCE_OP "addSentence"
|
||||||
#define ADD_SENTENCES_OP "addSentences"
|
#define ADD_SENTENCES_OP "addSentences"
|
||||||
|
@ -115,8 +115,9 @@ void SearcherController::concordiaSearch(rapidjson::Writer<rapidjson::StringBuff
|
|||||||
|
|
||||||
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
|
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
|
||||||
if (it != _concordiasMap->end()) {
|
if (it != _concordiasMap->end()) {
|
||||||
pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
|
std::string lemmatizedPattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
|
||||||
CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult(it->second->concordiaSearch(pattern));
|
TokenizedSentence originalPattern = it->second->tokenize(pattern, true, false);
|
||||||
|
CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult(it->second->concordiaSearch(lemmatizedPattern), originalPattern);
|
||||||
|
|
||||||
jsonWriter.StartObject();
|
jsonWriter.StartObject();
|
||||||
jsonWriter.String("status");
|
jsonWriter.String("status");
|
||||||
|
13
concordia-server/tm.cpp
Normal file
13
concordia-server/tm.cpp
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
#include "tm.hpp"
|
||||||
|
|
||||||
|
// Builds a Tm that stores copies of the given id, name and language codes.
Tm::Tm(const int id,
       const std::string & name,
       const std::string & sourceLanguageCode,
       const std::string & targetLanguageCode)
    : _id(id),
      _name(name),
      _sourceLanguageCode(sourceLanguageCode),
      _targetLanguageCode(targetLanguageCode) {
}
|
||||||
|
|
||||||
|
// Nothing to release explicitly: the destructor body is empty.
Tm::~Tm() {}
|
46
concordia-server/tm.hpp
Normal file
46
concordia-server/tm.hpp
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
#ifndef TM_HDR
|
||||||
|
#define TM_HDR
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
/*! Read-only description of a single translation memory (TM): its id,
    its name and the codes of its source and target languages.
*/
class Tm {
public:
    /*! Constructor.
      \param id id of the translation memory
      \param name name of the translation memory
      \param sourceLanguageCode code of the source language
      \param targetLanguageCode code of the target language
    */
    Tm(const int id,
       const std::string & name,
       const std::string & sourceLanguageCode,
       const std::string & targetLanguageCode);

    /*! Destructor.
    */
    virtual ~Tm();

    /*! Getter for the TM id.
      \returns the id given at construction
    */
    int getId() const { return _id; }

    /*! Getter for the TM name.
      \returns the name given at construction
    */
    const std::string & getName() const { return _name; }

    /*! Getter for the source language code.
      \returns the source language code given at construction
    */
    const std::string & getSourceLanguageCode() const { return _sourceLanguageCode; }

    /*! Getter for the target language code.
      \returns the target language code given at construction
    */
    const std::string & getTargetLanguageCode() const { return _targetLanguageCode; }

private:
    // id of the translation memory
    int _id;

    // name of the translation memory
    std::string _name;

    // code of the source language
    std::string _sourceLanguageCode;

    // code of the target language
    std::string _targetLanguageCode;
};
|
||||||
|
|
||||||
|
#endif
|
@ -32,6 +32,30 @@ std::vector<int> TmDAO::getTmIds() {
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::vector<Tm> TmDAO::getTms() {
|
||||||
|
std::vector<Tm> result;
|
||||||
|
DBconnection connection;
|
||||||
|
connection.startTransaction();
|
||||||
|
std::string query = "select tm.id, tm.name, src_lang.code as src_code, trg_lang.code as trg_code from tm inner join language as src_lang on src_lang.id = tm.source_lang_id inner join language as trg_lang on trg_lang.id = tm.target_lang_id;";
|
||||||
|
PGresult * dbResult = connection.execute(query);
|
||||||
|
for (int i=0;i<connection.getRowCount(dbResult);i++) {
|
||||||
|
int id = connection.getIntValue(dbResult, i, 0);
|
||||||
|
std::string name = connection.getStringValue(dbResult, i, 1);
|
||||||
|
std::string sourceLanguageCode = connection.getStringValue(dbResult, i, 2);
|
||||||
|
std::string targetLanguageCode = connection.getStringValue(dbResult, i, 3);
|
||||||
|
result.push_back(Tm(id, name, sourceLanguageCode, targetLanguageCode));
|
||||||
|
}
|
||||||
|
connection.clearResult(dbResult);
|
||||||
|
connection.endTransaction();
|
||||||
|
|
||||||
|
return result;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::string name) {
|
int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::string name) {
|
||||||
addTm(sourceLangId, targetLangId, name, false);
|
addTm(sourceLangId, targetLangId, name, false);
|
||||||
}
|
}
|
||||||
|
@ -7,6 +7,7 @@
|
|||||||
|
|
||||||
#include <concordia/common/config.hpp>
|
#include <concordia/common/config.hpp>
|
||||||
#include "db_connection.hpp"
|
#include "db_connection.hpp"
|
||||||
|
#include "tm.hpp"
|
||||||
|
|
||||||
class TmDAO {
|
class TmDAO {
|
||||||
public:
|
public:
|
||||||
@ -23,6 +24,8 @@ public:
|
|||||||
|
|
||||||
std::vector<int> getTmIds();
|
std::vector<int> getTmIds();
|
||||||
|
|
||||||
|
std::vector<Tm> getTms();
|
||||||
|
|
||||||
std::pair<bool, std::string> getTmInfo(int tmId);
|
std::pair<bool, std::string> getTmInfo(int tmId);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -80,6 +80,15 @@ CompleteConcordiaSearchResult UnitDAO::getConcordiaResult(boost::shared_ptr<Conc
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Overload that additionally receives the tokenized original pattern.
// Creates the complete result from the raw best overlay score, then fills its
// best overlay from the raw best-overlay fragments, forwarding
// originalPattern to _getResultsFromFragments.
CompleteConcordiaSearchResult UnitDAO::getConcordiaResult(boost::shared_ptr<ConcordiaSearchResult> rawConcordiaResult, TokenizedSentence originalPattern) {
    CompleteConcordiaSearchResult completeResult(rawConcordiaResult->getBestOverlayScore());

    _getResultsFromFragments(completeResult.getBestOverlay(), rawConcordiaResult->getBestOverlay(), originalPattern);

    return completeResult;
}
|
||||||
|
|
||||||
|
|
||||||
void UnitDAO::_getResultsFromFragments(
|
void UnitDAO::_getResultsFromFragments(
|
||||||
std::vector<SimpleSearchResult> & results,
|
std::vector<SimpleSearchResult> & results,
|
||||||
const std::vector<MatchedPatternFragment> & fragments,
|
const std::vector<MatchedPatternFragment> & fragments,
|
||||||
@ -212,7 +221,7 @@ int UnitDAO::_addAlignedUnit (
|
|||||||
// sentence, because giza can truncate the sentence. In this case, we have to
|
// sentence, because giza can truncate the sentence. In this case, we have to
|
||||||
// truncate the source sentence too.
|
// truncate the source sentence too.
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
|
std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
|
||||||
|
@ -45,6 +45,8 @@ public:
|
|||||||
|
|
||||||
CompleteConcordiaSearchResult getConcordiaResult(boost::shared_ptr<ConcordiaSearchResult> rawConcordiaResult);
|
CompleteConcordiaSearchResult getConcordiaResult(boost::shared_ptr<ConcordiaSearchResult> rawConcordiaResult);
|
||||||
|
|
||||||
|
CompleteConcordiaSearchResult getConcordiaResult(boost::shared_ptr<ConcordiaSearchResult> rawConcordiaResult, TokenizedSentence originalPattern);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void _getResultsFromFragments(std::vector<SimpleSearchResult> & results,
|
void _getResultsFromFragments(std::vector<SimpleSearchResult> & results,
|
||||||
const std::vector<MatchedPatternFragment> & fragments,
|
const std::vector<MatchedPatternFragment> & fragments,
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
|
|
||||||
CORPUS_NAME="europarljrc"
|
CORPUS_NAME="setimes_enhr"
|
||||||
SRC_LANG_ID=2
|
SRC_LANG_ID=2
|
||||||
TRG_LANG_ID=1
|
TRG_LANG_ID=6
|
||||||
|
|
||||||
./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src.tok $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg.tok $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned.txt
|
./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src.tok $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg.tok $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned.txt
|
||||||
|
23
tests/getTmsInfo.py
Executable file
23
tests/getTmsInfo.py
Executable file
@ -0,0 +1,23 @@
|
|||||||
|
#!/usr/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import json
import sys
import time

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3: urllib.request offers the same Request/urlopen API.
    import urllib.request as urllib2

import host

# Base URL of the Concordia server; the port is appended only when set.
address = 'http://' + host.concordia_host
if host.concordia_port:
    address += ':' + host.concordia_port

# Request asking the server to list all translation memories.
data = {
    'operation': 'getTmsInfo'
}

req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')

# urlopen() requires a bytes body on Python 3; for the ASCII output of
# json.dumps this encoding step changes nothing on Python 2.
payload = json.dumps(data).encode('utf-8')
response = json.loads(urllib2.urlopen(req, payload).read().decode('utf-8'))

# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(response)
|
Loading…
Reference in New Issue
Block a user