diff --git a/cat/index.html_pattern b/cat/index.html_pattern index 19f8b43..29e7a3d 100644 --- a/cat/index.html_pattern +++ b/cat/index.html_pattern @@ -3,6 +3,7 @@ + Concordia diff --git a/concordia-server/int_2d_array_param.cpp b/concordia-server/int_2d_array_param.cpp new file mode 100644 index 0000000..be5edb3 --- /dev/null +++ b/concordia-server/int_2d_array_param.cpp @@ -0,0 +1,44 @@ +#include "int_2d_array_param.hpp" + +#include +#include + +Int2DArrayParam::Int2DArrayParam(std::vector > array) { + std::stringstream ss; + ss << "["; + int i = 0; + BOOST_FOREACH(std::vector & intArray, array) { + ss << "["; + int j = 0; + BOOST_FOREACH(int & number, intArray) { + ss << number; + if (j < intArray.size() - 1) { + ss << ","; + } + j++; + } + ss << "]"; + + if (i < array.size() -1) { + ss << ";"; + } + i++; + } + ss << "]"; + _arrayString = ss.str(); +} + +Int2DArrayParam::~Int2DArrayParam() { +} + +const char * Int2DArrayParam::getValue() { + return _arrayString.c_str(); +} + +const int Int2DArrayParam::getLength() { + return _arrayString.size(); +} + +const int Int2DArrayParam::isBinary() { + return 0; +} diff --git a/concordia-server/int_2d_array_param.hpp b/concordia-server/int_2d_array_param.hpp new file mode 100644 index 0000000..08e72d6 --- /dev/null +++ b/concordia-server/int_2d_array_param.hpp @@ -0,0 +1,27 @@ +#ifndef INT_2D_ARRAY_PARAM_HDR +#define INT_2D_ARRAY_PARAM_HDR + +#include "query_param.hpp" + +#include +#include + +class Int2DArrayParam : public QueryParam { +public: + /*! Constructor. + */ + Int2DArrayParam(std::vector > array); + /*! Destructor. + */ + virtual ~Int2DArrayParam(); + + const char * getValue(); + + const int getLength(); + + const int isBinary(); +private: + std::string _arrayString; +}; + +#endif diff --git a/concordia-server/unit_dao.cpp b/concordia-server/unit_dao.cpp index 2bc6c6c..b236410 100644 --- a/concordia-server/unit_dao.cpp +++ b/concordia-server/unit_dao.cpp @@ -2,11 +2,15 @@ #include #include +#include +#include +#include #include "query_param.hpp" #include "string_param.hpp" #include "int_param.hpp" #include "int_array_param.hpp" +#include "int_2d_array_param.hpp" #include "logger.hpp" #include "example_occurence.hpp" @@ -14,6 +18,8 @@ #include #include +#include + UnitDAO::UnitDAO() { } @@ -140,7 +146,7 @@ SimpleSearchResult UnitDAO::_getResultFromFragment( } ExampleOccurence UnitDAO::_getExampleOccurence(DBconnection & connection, const SubstringOccurence sOccurence, const int matchedLength) { - std::string query = "SELECT id, source_segment, target_segment, source_tokens[$1::integer], source_tokens[$2::integer] FROM unit WHERE id = $3::integer;"; + std::string query = "SELECT id, source_segment, target_segment, source_tokens[$1::integer], source_tokens[$2::integer], target_tokens, alignments FROM unit WHERE id = $3::integer;"; std::vector params; params.push_back(new IntParam(2*sOccurence.getOffset()+1)); params.push_back(new IntParam(2*(sOccurence.getOffset()+matchedLength))); @@ -151,28 +157,34 @@ ExampleOccurence UnitDAO::_getExampleOccurence(DBconnection & connection, const connection.getIntValue(result,0,4), // matched example end connection.getStringValue(result,0,1), // source segment connection.getStringValue(result,0,2)); // target segment + std::string targetTokensRaw = connection.getStringValue(result,0,5); + std::string alignmentsRaw = connection.getStringValue(result,0,6); + connection.clearResult(result); BOOST_FOREACH (QueryParam * param, params) { delete param; } - // now add all target fragments matched with this fragment - std::string targetQuery = "SELECT target_token_pos, target_tokens[2*target_token_pos+1], target_tokens[2*target_token_pos+2] FROM unit INNER JOIN alignment ON alignment.unit_id = unit.id AND unit.id = $1::integer AND source_token_pos between $2::integer and $3::integer ORDER BY target_token_pos"; - std::vector targetParams; - targetParams.push_back(new IntParam(sOccurence.getId())); - targetParams.push_back(new IntParam(sOccurence.getOffset())); - targetParams.push_back(new IntParam(sOccurence.getOffset() + matchedLength - 1)); - PGresult * targetResult = connection.execute(targetQuery, targetParams); + std::vector targetTokens = _getArray(targetTokensRaw); + std::vector > alignments = _get2DArray(alignmentsRaw); + + std::set matchedTargetTokens; + for(int sourceTokenIndex = sOccurence.getOffset(); sourceTokenIndex < sOccurence.getOffset()+matchedLength; sourceTokenIndex++) { + BOOST_FOREACH(int & targetTokenIndex, alignments.at(sourceTokenIndex)) { + matchedTargetTokens.insert(targetTokenIndex); + } + } + int prevPos = -2; int currStart = -1; int currEnd = -1; - for (int i=0;i::iterator iter; + for(iter=matchedTargetTokens.begin(); iter!=matchedTargetTokens.end();++iter) { + int targetPos = *iter; + int targetStart = targetTokens.at(2*targetPos); + int targetEnd = targetTokens.at(2*targetPos+1); if (prevPos < targetPos - 1) { // beginning of detached fragment // check if there is a fragment to end @@ -186,16 +198,12 @@ ExampleOccurence UnitDAO::_getExampleOccurence(DBconnection & connection, const prevPos = targetPos; } + // check if there are remaining fragments if (currStart >= 0) { occurence.addMatchedTargetFragment(std::pair(currStart,currEnd)); } - connection.clearResult(targetResult); - BOOST_FOREACH (QueryParam * param, targetParams) { - delete param; - } - return occurence; } @@ -247,13 +255,14 @@ int UnitDAO::_addAlignedUnit ( throw ConcordiaException(ss.str()); } - std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id"; + std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens, alignments) values($1::text,$2::text,$3::integer,$4,$5,$6) RETURNING id"; std::vector params; params.push_back(new StringParam(sourceSentence.getOriginalSentence())); params.push_back(new StringParam(targetSentence.getOriginalSentence())); params.push_back(new IntParam(tmId)); params.push_back(new IntArrayParam(_getTokenPositions(sourceSentence))); params.push_back(new IntArrayParam(_getTokenPositions(targetSentence))); + params.push_back(new Int2DArrayParam(alignments)); PGresult * result = connection.execute(query, params); int newId = connection.getIntValue(result, 0, 0); @@ -262,23 +271,31 @@ int UnitDAO::_addAlignedUnit ( delete param; } - // add alignments - bool nonEmpty = false; - std::stringstream alignmentsQuery; - alignmentsQuery << "INSERT INTO alignment(unit_id, source_token_pos, target_token_pos) values "; - - for(int i=0;i UnitDAO::_getArray(std::string arrayString) { + std::vector result; + if (arrayString.length()>2) { + std::vector numberStrings; + std::string strippedArrayString = arrayString.substr(1,arrayString.length()-2); + boost::split(numberStrings,strippedArrayString,boost::is_any_of(",")); + BOOST_FOREACH (std::string & numberString, numberStrings) { + result.push_back(atoi(numberString.c_str())); + } + } + + return result; +} + +std::vector > UnitDAO::_get2DArray(std::string arrayString) { + std::vector > result; + std::vector arrayStrings; + std::string strippedArrayString = arrayString.substr(1,arrayString.length()-2); + boost::split(arrayStrings,strippedArrayString,boost::is_any_of(";")); + BOOST_FOREACH (std::string & arrayString, arrayStrings) { + result.push_back(_getArray(arrayString)); + } + + return result; +} diff --git a/concordia-server/unit_dao.hpp b/concordia-server/unit_dao.hpp index 8a157b6..3c3939e 100644 --- a/concordia-server/unit_dao.hpp +++ b/concordia-server/unit_dao.hpp @@ -77,6 +77,10 @@ private: const TokenizedSentence & targetSentence, const std::vector > & alignments, const int tmId) throw(ConcordiaException); + + std::vector _getArray(std::string arrayString); + + std::vector > _get2DArray(std::string arrayString); }; #endif diff --git a/db/concordia_server.sql b/db/concordia_server.sql index 88f4b43..43d873e 100644 --- a/db/concordia_server.sql +++ b/db/concordia_server.sql @@ -36,17 +36,8 @@ CREATE TABLE unit ( source_segment text, target_segment text, source_tokens integer[], - target_tokens integer[] + target_tokens integer[], + alignments text ); -DROP TABLE IF EXISTS alignment; -CREATE TABLE alignment ( - id SERIAL PRIMARY KEY, - unit_id integer, - source_token_pos integer, - target_token_pos integer -); - -CREATE INDEX ON alignment(unit_id, source_token_pos); - CREATE INDEX ON unit(tm_id);