example source

Rafał Jaworski 2019-02-25 13:27:35 +01:00
parent d900e806d9
commit 0fc4558ea7
9 changed files with 122 additions and 11 deletions

View File

@@ -66,7 +66,8 @@ function presentFullSearchResults(data) {
     for (j=0;j<data['result']['occurrences'].length;j++) {
         var occurence = data['result']['occurrences'][j];
-        result += '<table class="example"><tr><td>';
+        result += '<table class="example">';
+        result += '<tr><td>';
         // source segment
         var sourceSegment = occurence['sourceSegment'];
@@ -88,7 +89,8 @@ function presentFullSearchResults(data) {
             currStart = occurence['targetFragments'][i][1];
         }
         result += targetSegment.slice(currStart);
-        result += '</td></tr></table>';
+        result += '</td></tr>';
+        result += '<tr><td colspan="2" style="text-align:right;font-style:italic;font-size:70%">Source: <a target="_blank" href="'+occurence['sourceLink']+'">'+occurence['sourceName']+'</a></td></tr></table>';
     }
     $('#occurrences').html(result);
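
Judging by the loop above, 'targetFragments' is a list of [start, end) character offsets into targetSegment: the loop interleaves plain slices with highlighted ones, and the final slice(currStart) appends the trailing plain text. A minimal C++ sketch of the same slicing logic (illustrative only; the function name and the <b> markup are invented, not the markup the demo actually emits):

    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    // Wraps each matched [start, end) range of the target segment in
    // <b>...</b>, mirroring how the JavaScript above interleaves plain
    // and highlighted slices.
    std::string markFragments(const std::string & targetSegment,
                              const std::vector<std::pair<int, int> > & fragments) {
        std::string result;
        int currStart = 0;
        for (size_t i = 0; i < fragments.size(); ++i) {
            result += targetSegment.substr(currStart, fragments[i].first - currStart);
            result += "<b>" + targetSegment.substr(fragments[i].first,
                          fragments[i].second - fragments[i].first) + "</b>";
            currStart = fragments[i].second;
        }
        return result + targetSegment.substr(currStart);  // trailing plain text
    }

    int main() {
        std::vector<std::pair<int, int> > fragments;
        fragments.push_back(std::make_pair(4, 9));
        std::cout << markFragments("The quick brown fox", fragments) << std::endl;
        // Prints: The <b>quick</b> brown fox
        return 0;
    }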

View File

@@ -0,0 +1,8 @@
+dir@#@opensubtitles_plen
+concordia_host@#@concordia.poleng
+concordia_port@#@8800
+tmid@#@2
+desc@#@Welcome to the interactive demo of the Concordia system. The system finds the longest fragments of the input sentence in a translation memory. Please type a Polish sentence into the field below and press Enter (or use the "search" button). To get acquainted with the system, you can use the previously prepared examples - simply click the "apply" link next to the chosen example. After the search, click a highlighted fragment to see its context.
+enjoy@#@We wish you productive work with the system!
+prompt@#@Enter a sentence (in Polish):
+suggestion@#@Nawet zepsute zegary pokazują dwa razy dziennie właściwą godzinę
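
The new version config uses '@#@' as a key/value delimiter, one pair per line (the suggestion value stays in Polish, since it is a sample input for the Polish-English memory). A minimal sketch of a reader for this format, splitting on the first occurrence of the delimiter; the actual loader is not part of this commit and the function name is invented:

    #include <fstream>
    #include <iostream>
    #include <map>
    #include <string>

    // Reads key@#@value lines into a map; lines without the delimiter
    // are skipped.
    std::map<std::string, std::string> readVersionConfig(const std::string & path) {
        std::map<std::string, std::string> config;
        std::ifstream file(path.c_str());
        std::string line;
        const std::string delimiter = "@#@";
        while (std::getline(file, line)) {
            std::string::size_type pos = line.find(delimiter);
            if (pos != std::string::npos) {
                config[line.substr(0, pos)] = line.substr(pos + delimiter.size());
            }
        }
        return config;
    }

    int main() {
        std::map<std::string, std::string> cfg = readVersionConfig("opensubtitles_plen.cfg");
        std::cout << cfg["concordia_host"] << ":" << cfg["concordia_port"] << std::endl;
        return 0;
    }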

View File

@@ -0,0 +1 @@
+../versions_available/opensubtitles_plen.cfg

View File

@@ -5,12 +5,16 @@ ExampleOccurrence::ExampleOccurrence(
     const int matchedExampleStart,
     const int matchedExampleEnd,
     const std::string & sourceSegment,
-    const std::string & targetSegment):
+    const std::string & targetSegment,
+    const std::string & sourceName,
+    const std::string & sourceLink):
     _id(id),
     _matchedExampleStart(matchedExampleStart),
     _matchedExampleEnd(matchedExampleEnd),
     _sourceSegment(sourceSegment),
-    _targetSegment(targetSegment) {
+    _targetSegment(targetSegment),
+    _sourceName(sourceName),
+    _sourceLink(sourceLink) {
 }
 ExampleOccurrence::~ExampleOccurrence() {

View File

@@ -12,7 +12,9 @@ public:
         const int matchedExampleStart,
         const int matchedExampleEnd,
         const std::string & sourceSegment,
-        const std::string & targetSegment
+        const std::string & targetSegment,
+        const std::string & sourceName,
+        const std::string & sourceLink
     );
     /*! Destructor.
     */
@@ -42,6 +44,14 @@ public:
         return _targetFragments;
     }
+    const std::string & getSourceName() const {
+        return _sourceName;
+    }
+    const std::string & getSourceLink() const {
+        return _sourceLink;
+    }
     void addMatchedTargetFragment(const std::pair<int,int> & targetFragment);
 private:
@@ -56,6 +66,10 @@ private:
     std::string _targetSegment;
     std::vector<std::pair<int,int> > _targetFragments;
+    std::string _sourceName;
+    std::string _sourceLink;
 };
 #endif
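
A hedged usage sketch of the extended class (the leading id argument is inferred from the _id(id) initializer above; the header filename, segments, source name and link are all invented for illustration):

    #include <iostream>
    // #include "ExampleOccurrence.hpp"  // assumed header name, not shown in this commit

    int main() {
        // Constructor arguments in the order shown above: id, matched
        // example start/end, source segment, target segment, and the
        // two new fields: source name and source link.
        ExampleOccurrence occurrence(
            1, 0, 2,
            "nawet zepsute zegary",
            "even broken clocks",
            "OpenSubtitles",                   // invented source name
            "http://www.opensubtitles.org");   // invented source link
        std::cout << occurrence.getSourceName() << " -> "
                  << occurrence.getSourceLink() << std::endl;
        return 0;
    }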

View File

@@ -72,10 +72,12 @@ void IndexController::addSentences(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
     try {
         boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
         if (it != _concordiasMap->end()) {
-            std::vector<TokenizedSentence> tokenizedLemmatizedSentences = it->second->tokenizeAll(_lemmatizerFacade->lemmatizeSentencesIfNeeded(sourceSentences, tmId));
-            std::vector<TokenizedSentence> tokenizedSentences = it->second->tokenizeAll(sourceSentences);
-            std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addSentences(tokenizedSentences, targetSentences, tmId);
-            it->second->addAllTokenizedExamples(tokenizedLemmatizedSentences, sentenceIds);
+            std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, false, false);
+            std::vector<TokenizedSentence> tokenizedLemmatizedSourceSentences = it->second->tokenizeAll(lemmatizedSourceSentences, true, true);
+            std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, false, false);
+            std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, alignments, sourceIds, tmId);
+            it->second->addAllTokenizedExamples(tokenizedLemmatizedSourceSentences, sentenceIds);
             jsonWriter.StartObject();
             jsonWriter.String("status");

View File

@@ -82,6 +82,10 @@ void JsonGenerator::writeFullSearchResult(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
     jsonWriter.String(occurrence.getSourceSegment().c_str());
     jsonWriter.String("targetSegment");
     jsonWriter.String(occurrence.getTargetSegment().c_str());
+    jsonWriter.String("sourceName");
+    jsonWriter.String(occurrence.getSourceName().c_str());
+    jsonWriter.String("sourceLink");
+    jsonWriter.String(occurrence.getSourceLink().c_str());
     jsonWriter.String("targetFragments");
     jsonWriter.StartArray(); // all target fragments
     for (std::vector<std::pair<int,int> >::const_iterator it = occurrence.getTargetFragments().begin();
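
With the two added fields, a single occurrence object should serialize roughly as in the sketch below. This is a self-contained rapidjson example, not the generator itself: all values are invented, and the enclosing result envelope lies outside this hunk:

    #include <iostream>
    #include "rapidjson/stringbuffer.h"
    #include "rapidjson/writer.h"

    // Emits one occurrence object with the two fields added above.
    int main() {
        rapidjson::StringBuffer buffer;
        rapidjson::Writer<rapidjson::StringBuffer> jsonWriter(buffer);
        jsonWriter.StartObject();
        jsonWriter.String("sourceSegment");
        jsonWriter.String("Nawet zepsute zegary...");
        jsonWriter.String("targetSegment");
        jsonWriter.String("Even broken clocks...");
        jsonWriter.String("sourceName");
        jsonWriter.String("OpenSubtitles");
        jsonWriter.String("sourceLink");
        jsonWriter.String("http://www.opensubtitles.org");
        jsonWriter.String("targetFragments");
        jsonWriter.StartArray();
        jsonWriter.StartArray();  // one [start, end] pair
        jsonWriter.Int(0);
        jsonWriter.Int(18);
        jsonWriter.EndArray();
        jsonWriter.EndArray();
        jsonWriter.EndObject();
        std::cout << buffer.GetString() << std::endl;
        // {"sourceSegment":"Nawet zepsute zegary...","targetSegment":
        //  "Even broken clocks...","sourceName":"OpenSubtitles",
        //  "sourceLink":"http://www.opensubtitles.org","targetFragments":[[0,18]]}
        return 0;
    }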

View File

@@ -54,6 +54,7 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addSentences(
     return newIds;
 }
+
 std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
     const std::vector<TokenizedSentence> & sourceSentences,
     const std::vector<TokenizedSentence> & targetSentences,
@@ -72,6 +73,26 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
     return newIds;
 }
+std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
+    const std::vector<TokenizedSentence> & sourceSentences,
+    const std::vector<TokenizedSentence> & targetSentences,
+    const std::vector<std::vector<std::vector<int> > > & allAlignments,
+    const std::vector<int> & sourceIds,
+    const int tmId) throw (ConcordiaException) {
+    DBconnection connection;
+    std::vector<SUFFIX_MARKER_TYPE> newIds;
+    connection.startTransaction();
+    for (int i=0; i< sourceSentences.size(); i++) {
+        newIds.push_back(_addAlignedUnit(connection, sourceSentences.at(i), targetSentences.at(i), allAlignments.at(i), sourceIds.at(i), tmId));
+    }
+    connection.endTransaction();
+    return newIds;
+}
 SimpleSearchResult UnitDAO::getSimpleSearchResult(const MatchedPatternFragment & fragment) {
     SimpleSearchResult result(fragment.getStart(), fragment.getEnd());
     TokenizedSentence ts("");
@@ -146,7 +167,8 @@ SimpleSearchResult UnitDAO::_getResultFromFragment(
 }
 ExampleOccurrence UnitDAO::_getExampleOccurrence(DBconnection & connection, const SubstringOccurrence sOccurrence, const int matchedLength) {
-    std::string query = "SELECT id, source_segment, target_segment, source_tokens[$1::integer], source_tokens[$2::integer], target_tokens, alignments FROM unit WHERE id = $3::integer;";
+    Logger::log("_getExampleOccurrence");
+    std::string query = "SELECT unit.id, unit.source_segment, unit.target_segment, unit.source_tokens[$1::integer], unit.source_tokens[$2::integer], unit.target_tokens, unit.alignments, source.name, source.link FROM unit LEFT JOIN source ON unit.source_id = source.external_id WHERE unit.id = $3::integer;";
     std::vector<QueryParam*> params;
     params.push_back(new IntParam(2*sOccurrence.getOffset()+1));
     params.push_back(new IntParam(2*(sOccurrence.getOffset()+matchedLength)));
@@ -156,7 +178,9 @@ ExampleOccurrence UnitDAO::_getExampleOccurrence(DBconnection & connection, const SubstringOccurrence sOccurrence, const int matchedLength) {
         connection.getIntValue(result,0,3), // matched example start
         connection.getIntValue(result,0,4), // matched example end
         connection.getStringValue(result,0,1), // source segment
-        connection.getStringValue(result,0,2)); // target segment
+        connection.getStringValue(result,0,2), // target segment
+        connection.getStringValue(result,0,7), // source name
+        connection.getStringValue(result,0,8)); // source link
     std::string targetTokensRaw = connection.getStringValue(result,0,5);
     std::string alignmentsRaw = connection.getStringValue(result,0,6);
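
The $1/$2 parameters index into source_tokens, which, judging by the arithmetic above, stores each token as a flattened (start, end) pair of character offsets, addressed 1-based on the PostgreSQL side: token k occupies array slots 2k+1 and 2k+2. A worked example under that assumption:

    #include <iostream>

    int main() {
        // Assumed source_tokens layout for "ala ma kota" (3 tokens):
        //   {0,3, 4,6, 7,11}
        // For an occurrence starting at token offset 2 ("kota") with
        // matchedLength 1:
        int offset = 2;
        int matchedLength = 1;
        std::cout << "$1 = " << (2 * offset + 1) << std::endl;               // 5 -> char 7, match start
        std::cout << "$2 = " << (2 * (offset + matchedLength)) << std::endl; // 6 -> char 11, match end
        return 0;
    }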
@@ -273,6 +297,43 @@ int UnitDAO::_addAlignedUnit (
     return newId;
 }
+int UnitDAO::_addAlignedUnit (
+    DBconnection & connection,
+    const TokenizedSentence & sourceSentence,
+    const TokenizedSentence & targetSentence,
+    const std::vector<std::vector<int> > & alignments,
+    const int sourceId,
+    const int tmId) throw(ConcordiaException) {
+    if (sourceSentence.getTokens().size() != alignments.size()) {
+        // Check that the source sentence (taken from src.tok) has
+        // exactly one alignments entry per token.
+        std::stringstream ss;
+        ss << "The size of the source sentence differs from the size of the alignments array. Source sentence: " << sourceSentence.getSentence() << ", alignments size: " << alignments.size();
+        throw ConcordiaException(ss.str());
+    }
+    std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens, alignments, source_id) VALUES($1::text,$2::text,$3::integer,$4,$5,$6,$7) RETURNING id";
+    std::vector<QueryParam*> params;
+    params.push_back(new StringParam(sourceSentence.getOriginalSentence()));
+    params.push_back(new StringParam(targetSentence.getOriginalSentence()));
+    params.push_back(new IntParam(tmId));
+    params.push_back(new IntArrayParam(_getTokenPositions(sourceSentence)));
+    params.push_back(new IntArrayParam(_getTokenPositions(targetSentence)));
+    params.push_back(new Int2DArrayParam(alignments));
+    params.push_back(new IntParam(sourceId));
+    PGresult * result = connection.execute(query, params);
+    int newId = connection.getIntValue(result, 0, 0);
+    connection.clearResult(result);
+    BOOST_FOREACH (QueryParam * param, params) {
+        delete param;
+    }
+    return newId;
+}
 std::vector<int> UnitDAO::_getArray(std::string arrayString) {
     std::vector<int> result;
     if (arrayString.length()>2) {
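
The length check in the new _addAlignedUnit implies one alignments entry per source token; each inner vector plausibly lists the target-token indices aligned to that source token (an assumption, since the commit only pins down the outer length). A sketch of a well-formed argument under that reading:

    #include <vector>

    int main() {
        // Alignments for a 3-token source sentence: one entry per source
        // token; an empty inner vector marks an unaligned token.
        std::vector<std::vector<int> > alignments(3);
        alignments[0].push_back(0);  // source token 0 -> target token 0
        alignments[1].push_back(1);  // source token 1 -> target tokens 1 and 2
        alignments[1].push_back(2);
        // alignments[2] left empty: source token 2 has no counterpart.
        // _addAlignedUnit accepts this only if the tokenized source
        // sentence also has exactly 3 tokens.
        return alignments.size() == 3 ? 0 : 1;
    }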

View File

@@ -43,6 +43,13 @@ public:
         const std::vector<std::vector<std::vector<int> > > & allAlignments,
         const int tmId) throw (ConcordiaException);
+    std::vector<SUFFIX_MARKER_TYPE> addAlignedSentences(
+        const std::vector<TokenizedSentence> & sourceSentences,
+        const std::vector<TokenizedSentence> & targetSentences,
+        const std::vector<std::vector<std::vector<int> > > & allAlignments,
+        const std::vector<int> & sourceIds,
+        const int tmId) throw (ConcordiaException);
     SimpleSearchResult getSimpleSearchResult(const MatchedPatternFragment & fragment);
     FullSearchResult getFullSearchResult(const OccurrencesList & occurrencesList, const int patternLength);
@@ -78,6 +85,14 @@ private:
         const std::vector<std::vector<int> > & alignments,
         const int tmId) throw(ConcordiaException);
+    int _addAlignedUnit(
+        DBconnection & connection,
+        const TokenizedSentence & sourceSentence,
+        const TokenizedSentence & targetSentence,
+        const std::vector<std::vector<int> > & alignments,
+        const int sourceId,
+        const int tmId) throw(ConcordiaException);
    std::vector<int> _getArray(std::string arrayString);
    std::vector<std::vector<int> > _get2DArray(std::string arrayString);