example source

Rafał Jaworski 2019-02-25 13:27:35 +01:00
parent d900e806d9
commit 0fc4558ea7
9 changed files with 122 additions and 11 deletions

View File

@@ -66,7 +66,8 @@ function presentFullSearchResults(data) {
     for (j=0;j<data['result']['occurrences'].length;j++) {
         var occurence = data['result']['occurrences'][j];
-        result += '<table class="example"><tr><td>';
+        result += '<table class="example">';
+        result += '<tr><td>';
         // source segment
         var sourceSegment = occurence['sourceSegment'];
@@ -88,7 +89,8 @@ function presentFullSearchResults(data) {
             currStart = occurence['targetFragments'][i][1];
         }
         result += targetSegment.slice(currStart);
-        result += '</td></tr></table>';
+        result += '</td></tr>';
+        result += '<tr><td colspan="2" style="text-align:right;font-style:italic;font-size:70%">Source: <a target="_blank" href="'+occurence['sourceLink']+'">'+occurence['sourceName']+'</a></td></tr></table>';
     }
     $('#occurrences').html(result);
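
Judging by the loop above, 'targetFragments' is a list of [start, end) character offsets into targetSegment: the loop interleaves plain slices with highlighted ones, and the final slice(currStart) appends the trailing plain text. A minimal C++ sketch of the same slicing logic (illustrative only; the function name and the <b> markup are invented, not the markup the demo actually emits):

    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    // Wraps each matched [start, end) range of the target segment in
    // <b>...</b>, mirroring how the JavaScript above interleaves plain
    // and highlighted slices.
    std::string markFragments(const std::string & targetSegment,
                              const std::vector<std::pair<int, int> > & fragments) {
        std::string result;
        int currStart = 0;
        for (size_t i = 0; i < fragments.size(); ++i) {
            result += targetSegment.substr(currStart, fragments[i].first - currStart);
            result += "<b>" + targetSegment.substr(fragments[i].first,
                          fragments[i].second - fragments[i].first) + "</b>";
            currStart = fragments[i].second;
        }
        return result + targetSegment.substr(currStart);  // trailing plain text
    }

    int main() {
        std::vector<std::pair<int, int> > fragments;
        fragments.push_back(std::make_pair(4, 9));
        std::cout << markFragments("The quick brown fox", fragments) << std::endl;
        // Prints: The <b>quick</b> brown fox
        return 0;
    }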

View File

@@ -0,0 +1,8 @@
+dir@#@opensubtitles_plen
+concordia_host@#@concordia.poleng
+concordia_port@#@8800
+tmid@#@2
+desc@#@Welcome to the interactive demo of the Concordia system. The system finds the longest fragments of the input sentence in a translation memory. Please type a Polish sentence into the field below and press Enter (or use the "search" button). To get acquainted with the system, you can use the previously prepared examples - simply click the "apply" link next to the chosen example. After the search, click a highlighted fragment to see its context.
+enjoy@#@We wish you productive work with the system!
+prompt@#@Enter a sentence (in Polish):
+suggestion@#@Nawet zepsute zegary pokazują dwa razy dziennie właściwą godzinę
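
The new version config uses '@#@' as a key/value delimiter, one pair per line (the suggestion value stays in Polish, since it is a sample input for the Polish-English memory). A minimal sketch of a reader for this format, splitting on the first occurrence of the delimiter; the actual loader is not part of this commit and the function name is invented:

    #include <fstream>
    #include <iostream>
    #include <map>
    #include <string>

    // Reads key@#@value lines into a map; lines without the delimiter
    // are skipped.
    std::map<std::string, std::string> readVersionConfig(const std::string & path) {
        std::map<std::string, std::string> config;
        std::ifstream file(path.c_str());
        std::string line;
        const std::string delimiter = "@#@";
        while (std::getline(file, line)) {
            std::string::size_type pos = line.find(delimiter);
            if (pos != std::string::npos) {
                config[line.substr(0, pos)] = line.substr(pos + delimiter.size());
            }
        }
        return config;
    }

    int main() {
        std::map<std::string, std::string> cfg = readVersionConfig("opensubtitles_plen.cfg");
        std::cout << cfg["concordia_host"] << ":" << cfg["concordia_port"] << std::endl;
        return 0;
    }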

View File

@@ -0,0 +1 @@
+../versions_available/opensubtitles_plen.cfg

View File

@@ -5,12 +5,16 @@ ExampleOccurrence::ExampleOccurrence(
     const int matchedExampleStart,
     const int matchedExampleEnd,
     const std::string & sourceSegment,
-    const std::string & targetSegment):
+    const std::string & targetSegment,
+    const std::string & sourceName,
+    const std::string & sourceLink):
     _id(id),
     _matchedExampleStart(matchedExampleStart),
     _matchedExampleEnd(matchedExampleEnd),
     _sourceSegment(sourceSegment),
-    _targetSegment(targetSegment) {
+    _targetSegment(targetSegment),
+    _sourceName(sourceName),
+    _sourceLink(sourceLink) {
 }
 ExampleOccurrence::~ExampleOccurrence() {

View File

@@ -12,7 +12,9 @@ public:
         const int matchedExampleStart,
         const int matchedExampleEnd,
         const std::string & sourceSegment,
-        const std::string & targetSegment
+        const std::string & targetSegment,
+        const std::string & sourceName,
+        const std::string & sourceLink
     );
     /*! Destructor.
     */
@@ -42,6 +44,14 @@ public:
         return _targetFragments;
     }
+    const std::string & getSourceName() const {
+        return _sourceName;
+    }
+    const std::string & getSourceLink() const {
+        return _sourceLink;
+    }
     void addMatchedTargetFragment(const std::pair<int,int> & targetFragment);
 private:
@@ -56,6 +66,10 @@ private:
     std::string _targetSegment;
     std::vector<std::pair<int,int> > _targetFragments;
+    std::string _sourceName;
+    std::string _sourceLink;
 };
 #endif
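
A hedged usage sketch of the extended class (the leading id argument is inferred from the _id(id) initializer above; the header filename, segments, source name and link are all invented for illustration):

    #include <iostream>
    // #include "ExampleOccurrence.hpp"  // assumed header name, not shown in this commit

    int main() {
        // Constructor arguments in the order shown above: id, matched
        // example start/end, source segment, target segment, and the
        // two new fields: source name and source link.
        ExampleOccurrence occurrence(
            1, 0, 2,
            "nawet zepsute zegary",
            "even broken clocks",
            "OpenSubtitles",                   // invented source name
            "http://www.opensubtitles.org");   // invented source link
        std::cout << occurrence.getSourceName() << " -> "
                  << occurrence.getSourceLink() << std::endl;
        return 0;
    }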

View File

@@ -72,10 +72,12 @@ void IndexController::addSentences(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
     try {
         boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
         if (it != _concordiasMap->end()) {
-            std::vector<TokenizedSentence> tokenizedLemmatizedSentences = it->second->tokenizeAll(_lemmatizerFacade->lemmatizeSentencesIfNeeded(sourceSentences, tmId));
-            std::vector<TokenizedSentence> tokenizedSentences = it->second->tokenizeAll(sourceSentences);
-            std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addSentences(tokenizedSentences, targetSentences, tmId);
-            it->second->addAllTokenizedExamples(tokenizedLemmatizedSentences, sentenceIds);
+            std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, false, false);
+            std::vector<TokenizedSentence> tokenizedLemmatizedSourceSentences = it->second->tokenizeAll(lemmatizedSourceSentences, true, true);
+            std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, false, false);
+            std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, alignments, sourceIds, tmId);
+            it->second->addAllTokenizedExamples(tokenizedLemmatizedSourceSentences, sentenceIds);
             jsonWriter.StartObject();
             jsonWriter.String("status");

View File

@@ -82,6 +82,10 @@ void JsonGenerator::writeFullSearchResult(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
     jsonWriter.String(occurrence.getSourceSegment().c_str());
     jsonWriter.String("targetSegment");
     jsonWriter.String(occurrence.getTargetSegment().c_str());
+    jsonWriter.String("sourceName");
+    jsonWriter.String(occurrence.getSourceName().c_str());
+    jsonWriter.String("sourceLink");
+    jsonWriter.String(occurrence.getSourceLink().c_str());
     jsonWriter.String("targetFragments");
     jsonWriter.StartArray(); // all target fragments
     for (std::vector<std::pair<int,int> >::const_iterator it = occurrence.getTargetFragments().begin();
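
With the two added fields, a single occurrence object should serialize roughly as in the sketch below. This is a self-contained rapidjson example, not the generator itself: all values are invented, and the enclosing result envelope lies outside this hunk:

    #include <iostream>
    #include "rapidjson/stringbuffer.h"
    #include "rapidjson/writer.h"

    // Emits one occurrence object with the two fields added above.
    int main() {
        rapidjson::StringBuffer buffer;
        rapidjson::Writer<rapidjson::StringBuffer> jsonWriter(buffer);
        jsonWriter.StartObject();
        jsonWriter.String("sourceSegment");
        jsonWriter.String("Nawet zepsute zegary...");
        jsonWriter.String("targetSegment");
        jsonWriter.String("Even broken clocks...");
        jsonWriter.String("sourceName");
        jsonWriter.String("OpenSubtitles");
        jsonWriter.String("sourceLink");
        jsonWriter.String("http://www.opensubtitles.org");
        jsonWriter.String("targetFragments");
        jsonWriter.StartArray();
        jsonWriter.StartArray();  // one [start, end] pair
        jsonWriter.Int(0);
        jsonWriter.Int(18);
        jsonWriter.EndArray();
        jsonWriter.EndArray();
        jsonWriter.EndObject();
        std::cout << buffer.GetString() << std::endl;
        // {"sourceSegment":"Nawet zepsute zegary...","targetSegment":
        //  "Even broken clocks...","sourceName":"OpenSubtitles",
        //  "sourceLink":"http://www.opensubtitles.org","targetFragments":[[0,18]]}
        return 0;
    }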

View File

@@ -54,6 +54,7 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addSentences(
     return newIds;
 }
+
 std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
     const std::vector<TokenizedSentence> & sourceSentences,
     const std::vector<TokenizedSentence> & targetSentences,
@@ -72,6 +73,26 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
     return newIds;
 }
+std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
+    const std::vector<TokenizedSentence> & sourceSentences,
+    const std::vector<TokenizedSentence> & targetSentences,
+    const std::vector<std::vector<std::vector<int> > > & allAlignments,
+    const std::vector<int> & sourceIds,
+    const int tmId) throw (ConcordiaException) {
+    DBconnection connection;
+    std::vector<SUFFIX_MARKER_TYPE> newIds;
+    connection.startTransaction();
+    for (int i=0; i< sourceSentences.size(); i++) {
+        newIds.push_back(_addAlignedUnit(connection, sourceSentences.at(i), targetSentences.at(i), allAlignments.at(i), sourceIds.at(i), tmId));
+    }
+    connection.endTransaction();
+    return newIds;
+}
 SimpleSearchResult UnitDAO::getSimpleSearchResult(const MatchedPatternFragment & fragment) {
     SimpleSearchResult result(fragment.getStart(), fragment.getEnd());
     TokenizedSentence ts("");
@@ -146,7 +167,8 @@ SimpleSearchResult UnitDAO::_getResultFromFragment(
 }
 ExampleOccurrence UnitDAO::_getExampleOccurrence(DBconnection & connection, const SubstringOccurrence sOccurrence, const int matchedLength) {
-    std::string query = "SELECT id, source_segment, target_segment, source_tokens[$1::integer], source_tokens[$2::integer], target_tokens, alignments FROM unit WHERE id = $3::integer;";
+    Logger::log("_getExampleOccurrence");
+    std::string query = "SELECT unit.id, unit.source_segment, unit.target_segment, unit.source_tokens[$1::integer], unit.source_tokens[$2::integer], unit.target_tokens, unit.alignments, source.name, source.link FROM unit LEFT JOIN source ON unit.source_id = source.external_id WHERE unit.id = $3::integer;";
     std::vector<QueryParam*> params;
     params.push_back(new IntParam(2*sOccurrence.getOffset()+1));
     params.push_back(new IntParam(2*(sOccurrence.getOffset()+matchedLength)));
@@ -156,7 +178,9 @@ ExampleOccurrence UnitDAO::_getExampleOccurrence(DBconnection & connection, const SubstringOccurrence sOccurrence, const int matchedLength) {
         connection.getIntValue(result,0,3), // matched example start
         connection.getIntValue(result,0,4), // matched example end
         connection.getStringValue(result,0,1), // source segment
-        connection.getStringValue(result,0,2)); // target segment
+        connection.getStringValue(result,0,2), // target segment
+        connection.getStringValue(result,0,7), // source name
+        connection.getStringValue(result,0,8)); // source link
     std::string targetTokensRaw = connection.getStringValue(result,0,5);
     std::string alignmentsRaw = connection.getStringValue(result,0,6);
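
The $1/$2 parameters index into source_tokens, which, judging by the arithmetic above, stores each token as a flattened (start, end) pair of character offsets, addressed 1-based on the PostgreSQL side: token k occupies array slots 2k+1 and 2k+2. A worked example under that assumption:

    #include <iostream>

    int main() {
        // Assumed source_tokens layout for "ala ma kota" (3 tokens):
        //   {0,3, 4,6, 7,11}
        // For an occurrence starting at token offset 2 ("kota") with
        // matchedLength 1:
        int offset = 2;
        int matchedLength = 1;
        std::cout << "$1 = " << (2 * offset + 1) << std::endl;               // 5 -> char 7, match start
        std::cout << "$2 = " << (2 * (offset + matchedLength)) << std::endl; // 6 -> char 11, match end
        return 0;
    }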
@@ -273,6 +297,43 @@ int UnitDAO::_addAlignedUnit (
     return newId;
 }
+int UnitDAO::_addAlignedUnit (
+    DBconnection & connection,
+    const TokenizedSentence & sourceSentence,
+    const TokenizedSentence & targetSentence,
+    const std::vector<std::vector<int> > & alignments,
+    const int sourceId,
+    const int tmId) throw(ConcordiaException) {
+    if (sourceSentence.getTokens().size() != alignments.size()) {
+        // Check that the source sentence (taken from src.tok) has
+        // exactly one alignments entry per token.
+        std::stringstream ss;
+        ss << "The size of the source sentence differs from the size of the alignments array. Source sentence: " << sourceSentence.getSentence() << ", alignments size: " << alignments.size();
+        throw ConcordiaException(ss.str());
+    }
+    std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens, alignments, source_id) VALUES($1::text,$2::text,$3::integer,$4,$5,$6,$7) RETURNING id";
+    std::vector<QueryParam*> params;
+    params.push_back(new StringParam(sourceSentence.getOriginalSentence()));
+    params.push_back(new StringParam(targetSentence.getOriginalSentence()));
+    params.push_back(new IntParam(tmId));
+    params.push_back(new IntArrayParam(_getTokenPositions(sourceSentence)));
+    params.push_back(new IntArrayParam(_getTokenPositions(targetSentence)));
+    params.push_back(new Int2DArrayParam(alignments));
+    params.push_back(new IntParam(sourceId));
+    PGresult * result = connection.execute(query, params);
+    int newId = connection.getIntValue(result, 0, 0);
+    connection.clearResult(result);
+    BOOST_FOREACH (QueryParam * param, params) {
+        delete param;
+    }
+    return newId;
+}
 std::vector<int> UnitDAO::_getArray(std::string arrayString) {
     std::vector<int> result;
     if (arrayString.length()>2) {
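
The length check in the new _addAlignedUnit implies one alignments entry per source token; each inner vector plausibly lists the target-token indices aligned to that source token (an assumption, since the commit only pins down the outer length). A sketch of a well-formed argument under that reading:

    #include <vector>

    int main() {
        // Alignments for a 3-token source sentence: one entry per source
        // token; an empty inner vector marks an unaligned token.
        std::vector<std::vector<int> > alignments(3);
        alignments[0].push_back(0);  // source token 0 -> target token 0
        alignments[1].push_back(1);  // source token 1 -> target tokens 1 and 2
        alignments[1].push_back(2);
        // alignments[2] left empty: source token 2 has no counterpart.
        // _addAlignedUnit accepts this only if the tokenized source
        // sentence also has exactly 3 tokens.
        return alignments.size() == 3 ? 0 : 1;
    }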

View File

@@ -43,6 +43,13 @@ public:
         const std::vector<std::vector<std::vector<int> > > & allAlignments,
         const int tmId) throw (ConcordiaException);
+    std::vector<SUFFIX_MARKER_TYPE> addAlignedSentences(
+        const std::vector<TokenizedSentence> & sourceSentences,
+        const std::vector<TokenizedSentence> & targetSentences,
+        const std::vector<std::vector<std::vector<int> > > & allAlignments,
+        const std::vector<int> & sourceIds,
+        const int tmId) throw (ConcordiaException);
     SimpleSearchResult getSimpleSearchResult(const MatchedPatternFragment & fragment);
     FullSearchResult getFullSearchResult(const OccurrencesList & occurrencesList, const int patternLength);
@@ -78,6 +85,14 @@ private:
         const std::vector<std::vector<int> > & alignments,
         const int tmId) throw(ConcordiaException);
+    int _addAlignedUnit(
+        DBconnection & connection,
+        const TokenizedSentence & sourceSentence,
+        const TokenizedSentence & targetSentence,
+        const std::vector<std::vector<int> > & alignments,
+        const int sourceId,
+        const int tmId) throw(ConcordiaException);
    std::vector<int> _getArray(std::string arrayString);
    std::vector<std::vector<int> > _get2DArray(std::string arrayString);