multiple results

This commit is contained in:
rjawor 2017-04-21 14:51:58 +02:00
parent c3826919ba
commit 31e4f091ad
16 changed files with 317 additions and 185 deletions

View File

@ -174,17 +174,18 @@ int main(int argc, char** argv) {
std::cout << "\tSearching for pattern: \"" << pattern << std::cout << "\tSearching for pattern: \"" << pattern <<
"\"" << std::endl; "\"" << std::endl;
time_start = boost::posix_time::microsec_clock::local_time(); time_start = boost::posix_time::microsec_clock::local_time();
std::vector<MatchedPatternFragment> result = MatchedPatternFragment result =
concordia.simpleSearch(pattern); concordia.simpleSearch(pattern);
time_end = boost::posix_time::microsec_clock::local_time(); time_end = boost::posix_time::microsec_clock::local_time();
msdiff = time_end - time_start; msdiff = time_end - time_start;
std::cout << "\tFound: " << result.size() << " matches. " std::cout << "\tFound: " << result.getOccurences().size()
<< "Search took: " << << " matches. " << "Search took: "
msdiff.total_milliseconds() << "ms." << std::endl; << msdiff.total_milliseconds() << "ms." << std::endl;
if (!cli.count("silent")) { if (!cli.count("silent")) {
BOOST_FOREACH(MatchedPatternFragment occurence, result) { BOOST_FOREACH(SubstringOccurence occurence,
result.getOccurences()) {
std::cout << "\t\tfound match in sentence number: " std::cout << "\t\tfound match in sentence number: "
<< occurence.getExampleId() << std::endl; << occurence.getId() << std::endl;
} }
} }
} else if (cli.count("anubis-search")) { } else if (cli.count("anubis-search")) {
@ -234,10 +235,9 @@ int main(int argc, char** argv) {
result->getBestOverlay()) { result->getBestOverlay()) {
std::cout << "\t\tfragment [" << fragment.getStart() std::cout << "\t\tfragment [" << fragment.getStart()
<< "," << fragment.getEnd() << "," << fragment.getEnd()
<< "] (exampleId, exampleOffset," << "] (exampleCount,"
<< " patternOffset, length): " << " patternOffset, length): "
<< fragment.getExampleId() << "," << fragment.getOccurences().size() << ","
<< fragment.getExampleOffset() << ","
<< fragment.getPatternOffset() << "," << fragment.getPatternOffset() << ","
<< fragment.getMatchedLength() << fragment.getMatchedLength()
<< std::endl; << std::endl;
@ -248,10 +248,9 @@ int main(int argc, char** argv) {
result->getFragments()) { result->getFragments()) {
std::cout << "\t\tfragment [" << fragment.getStart() std::cout << "\t\tfragment [" << fragment.getStart()
<< "," << fragment.getEnd() << "," << fragment.getEnd()
<< "] (exampleId, exampleOffset," << "] (exampleCount,"
<< " patternOffset, length): " << " patternOffset, length): "
<< fragment.getExampleId() << "," << fragment.getOccurences().size() << ","
<< fragment.getExampleOffset() << ","
<< fragment.getPatternOffset() << "," << fragment.getPatternOffset() << ","
<< fragment.getMatchedLength() << fragment.getMatchedLength()
<< std::endl; << std::endl;

View File

@ -25,7 +25,7 @@ typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
// sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES store the suffix offset // sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES store the suffix offset
// and the last SUFFIX_MARKER_SENTENCE_BYTES store the sentence length. // and the last SUFFIX_MARKER_SENTENCE_BYTES store the sentence length.
#define CONCORDIA_SEARCH_MAX_RESULTS 3 #define CONCORDIA_SEARCH_MAX_RESULTS 5
#define WORD_MAP_FILE_NAME "word_map.bin" #define WORD_MAP_FILE_NAME "word_map.bin"
#define MARKERS_FILE_NAME "markers.bin" #define MARKERS_FILE_NAME "markers.bin"

View File

@ -72,13 +72,15 @@ std::vector<TokenizedSentence> Concordia::tokenizeAll(
if (generateCodes) { if (generateCodes) {
BOOST_FOREACH(std::string sentence, sentences) { BOOST_FOREACH(std::string sentence, sentences) {
result.push_back(_hashGenerator->generateHash(sentence, byWhitespace)); result.push_back(_hashGenerator->generateHash(sentence,
byWhitespace));
} }
_hashGenerator->serializeWordMap(); _hashGenerator->serializeWordMap();
} else { } else {
BOOST_FOREACH(std::string sentence, sentences) { BOOST_FOREACH(std::string sentence, sentences) {
result.push_back(_hashGenerator->generateTokens(sentence, byWhitespace)); result.push_back(_hashGenerator->generateTokens(sentence,
byWhitespace));
} }
} }
return result; return result;
@ -208,14 +210,14 @@ SUFFIX_MARKER_TYPE Concordia::countOccurences(const std::string & pattern)
} }
std::vector<MatchedPatternFragment> Concordia::simpleSearch( MatchedPatternFragment Concordia::simpleSearch(
const std::string & pattern) const std::string & pattern)
throw(ConcordiaException) { throw(ConcordiaException) {
if (_T->size() > 0) { if (_T->size() > 0) {
return _searcher->simpleSearch(_hashGenerator, _T, return _searcher->simpleSearch(_hashGenerator, _T,
_markers, _SA, pattern); _markers, _SA, pattern);
} else { } else {
std::vector<MatchedPatternFragment> result; MatchedPatternFragment result(0, 0);
return result; return result;
} }
} }
@ -269,4 +271,3 @@ std::string Concordia::_getHashedIndexFilePath() {
std::string Concordia::_getMarkersFilePath() { std::string Concordia::_getMarkersFilePath() {
return _indexPath+"/"+MARKERS_FILE_NAME; return _indexPath+"/"+MARKERS_FILE_NAME;
} }

View File

@ -126,11 +126,10 @@ public:
/*! Performs a simple substring lookup on the index. /*! Performs a simple substring lookup on the index.
For more info see \ref tutorial1_2. For more info see \ref tutorial1_2.
\param pattern pattern to be searched in the index \param pattern pattern to be searched in the index
\returns vector of matched results \returns matched pattern fragment containing vector of occurences
\throws ConcordiaException \throws ConcordiaException
*/ */
std::vector<MatchedPatternFragment> simpleSearch( MatchedPatternFragment simpleSearch(const std::string & pattern)
const std::string & pattern)
throw(ConcordiaException); throw(ConcordiaException);
SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern) SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern)

View File

@ -75,6 +75,24 @@ public:
return _bestOverlayScore; return _bestOverlayScore;
} }
friend std::ostream & operator << (std::ostream & o,
const ConcordiaSearchResult & result) {
o << "Best overlay {" << std::endl;
BOOST_FOREACH(MatchedPatternFragment fragment,
result.getBestOverlay()) {
o << fragment << std::endl;
}
o << "}" << std::endl;
o << "All fragments {" << std::endl;
BOOST_FOREACH(MatchedPatternFragment fragment,
result.getFragments()) {
o << fragment << std::endl;
}
o << "}";
return o;
}
private: private:
void _checkPossibleOverlays( void _checkPossibleOverlays(
std::vector<MatchedPatternFragment> currentOverlay, std::vector<MatchedPatternFragment> currentOverlay,

View File

@ -36,12 +36,14 @@ void ConcordiaSearcher::concordiaSearch(
std::vector<SubstringOccurence> occurences = std::vector<SubstringOccurence> occurences =
lcpSearch(T, markers, SA, currentPattern, lcpLength); lcpSearch(T, markers, SA, currentPattern, lcpLength);
if (occurences.size() > 0) {
MatchedPatternFragment fragment(offset,
lcpLength / sizeof(INDEX_CHARACTER_TYPE));
BOOST_FOREACH(SubstringOccurence occurence, occurences) { BOOST_FOREACH(SubstringOccurence occurence, occurences) {
result->addFragment(MatchedPatternFragment( fragment.addOccurence(occurence);
occurence.getId(), }
occurence.getOffset(), result->addFragment(fragment);
offset,
lcpLength / sizeof(INDEX_CHARACTER_TYPE)));
} }
} }

View File

@ -13,14 +13,12 @@ IndexSearcher::IndexSearcher() {
IndexSearcher::~IndexSearcher() { IndexSearcher::~IndexSearcher() {
} }
std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch( MatchedPatternFragment IndexSearcher::simpleSearch(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA, boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException) { const std::string & pattern) throw(ConcordiaException) {
std::vector<MatchedPatternFragment> result;
int left; int left;
std::vector<INDEX_CHARACTER_TYPE> hash = std::vector<INDEX_CHARACTER_TYPE> hash =
hashGenerator->generateHash(pattern).getCodes(); hashGenerator->generateHash(pattern).getCodes();
@ -30,6 +28,7 @@ std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
int size = sa_search(T->data(), (saidx_t) T->size(), int size = sa_search(T->data(), (saidx_t) T->size(),
(const sauchar_t *) patternArray, patternLength, (const sauchar_t *) patternArray, patternLength,
SA->data(), (saidx_t) SA->size(), &left); SA->data(), (saidx_t) SA->size(), &left);
MatchedPatternFragment result(0, hash.size());
for (int i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
saidx_t resultPos = SA->at(left + i); saidx_t resultPos = SA->at(left + i);
if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) { if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
@ -40,12 +39,11 @@ std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
// removes these accidental results. // removes these accidental results.
saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE); saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos); SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);
result.push_back(MatchedPatternFragment(
Utils::getIdFromMarker(marker), SubstringOccurence occurence;
Utils::getOffsetFromMarker(marker), occurence.enterDataFromMarker(marker);
0, result.addOccurence(occurence);
hash.size())); if (result.getOccurences().size() >= CONCORDIA_SEARCH_MAX_RESULTS) {
if (result.size() >= CONCORDIA_SEARCH_MAX_RESULTS) {
break; break;
} }
} }

View File

@ -42,10 +42,10 @@ public:
\param markers markers array for the needs of searching \param markers markers array for the needs of searching
\param SA suffix array for the needs of searching \param SA suffix array for the needs of searching
\param pattern string pattern to be searched in the index. \param pattern string pattern to be searched in the index.
\returns vector of occurences of the pattern in the index \returns matched pattern fragment, containing occurences of the pattern in the index
\throws ConcordiaException \throws ConcordiaException
*/ */
std::vector<MatchedPatternFragment> simpleSearch( MatchedPatternFragment simpleSearch(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,

View File

@ -1,14 +1,10 @@
#include "concordia/matched_pattern_fragment.hpp" #include "concordia/matched_pattern_fragment.hpp"
MatchedPatternFragment::MatchedPatternFragment( MatchedPatternFragment::MatchedPatternFragment(
const SUFFIX_MARKER_TYPE & exampleId,
const SUFFIX_MARKER_TYPE & exampleOffset,
const SUFFIX_MARKER_TYPE & patternOffset, const SUFFIX_MARKER_TYPE & patternOffset,
const SUFFIX_MARKER_TYPE & matchedLength): const SUFFIX_MARKER_TYPE & matchedLength):
Interval(patternOffset, Interval(patternOffset,
patternOffset + matchedLength), patternOffset + matchedLength),
_exampleId(exampleId),
_exampleOffset(exampleOffset),
_patternOffset(patternOffset), _patternOffset(patternOffset),
_matchedLength(matchedLength) { _matchedLength(matchedLength) {
} }
@ -16,3 +12,7 @@ MatchedPatternFragment::MatchedPatternFragment(
MatchedPatternFragment::~MatchedPatternFragment() { MatchedPatternFragment::~MatchedPatternFragment() {
} }
void MatchedPatternFragment::addOccurence(
const SubstringOccurence & occurence) {
_occurences.push_back(occurence);
}

View File

@ -3,6 +3,10 @@
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include "concordia/interval.hpp" #include "concordia/interval.hpp"
#include "concordia/substring_occurence.hpp"
#include <vector>
#include <iostream>
#include <boost/foreach.hpp>
/*! /*!
Class representing matched pattern fragment in concordia search. Class representing matched pattern fragment in concordia search.
@ -17,32 +21,26 @@
class MatchedPatternFragment : public Interval { class MatchedPatternFragment : public Interval {
public: public:
/*! Constructor. /*! Constructor.
\param exampleId id of the example where the pattern fragment was matched
\param exampleOffset offset of the matched fragment in the example
\param patternOffset offset of the matched fragment in the pattern \param patternOffset offset of the matched fragment in the pattern
\param matchedLength length of the matched pattern \param matchedLength length of the matched pattern
*/ */
MatchedPatternFragment(const SUFFIX_MARKER_TYPE & exampleId, MatchedPatternFragment(const SUFFIX_MARKER_TYPE & patternOffset,
const SUFFIX_MARKER_TYPE & exampleOffset,
const SUFFIX_MARKER_TYPE & patternOffset,
const SUFFIX_MARKER_TYPE & matchedLength); const SUFFIX_MARKER_TYPE & matchedLength);
/*! Destructor. /*! Destructor.
*/ */
virtual ~MatchedPatternFragment(); virtual ~MatchedPatternFragment();
/*! Getter for example id. /*! Getter for occurences.
\returns example id \returns occurences
*/ */
SUFFIX_MARKER_TYPE getExampleId() const { std::vector<SubstringOccurence> getOccurences() const {
return _exampleId; return _occurences;
} }
/*! Getter for example offset. /*! Adds an occurence to the list.
\returns example offset \param occurence occurence to be added
*/ */
SUFFIX_MARKER_TYPE getExampleOffset() const { void addOccurence(const SubstringOccurence & occurence);
return _exampleOffset;
}
/*! Getter for pattern offset. /*! Getter for pattern offset.
\returns pattern offset \returns pattern offset
@ -65,10 +63,22 @@ public:
return (_matchedLength > other.getMatchedLength()); return (_matchedLength > other.getMatchedLength());
} }
private: friend std::ostream & operator << (std::ostream & o,
SUFFIX_MARKER_TYPE _exampleId; const MatchedPatternFragment & fragment) {
o << "fragment(patternOffset=" << fragment.getPatternOffset()
<< ", matchedLength=" << fragment.getMatchedLength() << ") {"
<< std::endl;
BOOST_FOREACH(SubstringOccurence occurence, fragment.getOccurences()) {
o << "\t" << occurence << std::endl;
}
SUFFIX_MARKER_TYPE _exampleOffset; o << "}";
return o;
}
private:
std::vector<SubstringOccurence> _occurences;
SUFFIX_MARKER_TYPE _patternOffset; SUFFIX_MARKER_TYPE _patternOffset;

View File

@ -3,6 +3,7 @@
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include <string> #include <string>
#include <iostream>
/*! /*!
Class representing occurence of a searched substring. Class representing occurence of a searched substring.
@ -65,6 +66,13 @@ public:
*/ */
void enterDataFromMarker(const SUFFIX_MARKER_TYPE & marker); void enterDataFromMarker(const SUFFIX_MARKER_TYPE & marker);
friend std::ostream & operator << (std::ostream & o,
const SubstringOccurence & occurence) {
return o << "occurence(exampleId=" << occurence.getId()
<< ", offset=" << occurence.getOffset() << ")";
}
private: private:
SUFFIX_MARKER_TYPE _id; SUFFIX_MARKER_TYPE _id;

View File

@ -67,21 +67,19 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
*/ */
std::vector<MatchedPatternFragment> searchResult1 = concordia.simpleSearch("posiada rysia"); MatchedPatternFragment searchResult1 = concordia.simpleSearch("posiada rysia");
std::vector<MatchedPatternFragment> searchResult2 = concordia.simpleSearch("posiada kota Ala"); MatchedPatternFragment searchResult2 = concordia.simpleSearch("posiada kota Ala");
concordia.clearIndex(); concordia.clearIndex();
BOOST_CHECK_EQUAL(searchResult1.size(), 2); BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 123); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 123);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 1); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 2); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getId(), 51);
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleId(), 51); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(1).getMatchedLength(), 2);
// Checking pattern spanning over 2 segments // Checking pattern spanning over 2 segments
BOOST_CHECK_EQUAL(searchResult2.size(), 0); BOOST_CHECK_EQUAL(searchResult2.getOccurences().size(), 0);
} }
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 ) BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
@ -133,29 +131,24 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(), Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(),
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
std::vector<MatchedPatternFragment> searchResult1 = concordia2.simpleSearch("xto xjest"); MatchedPatternFragment searchResult1 = concordia2.simpleSearch("xto xjest");
std::vector<MatchedPatternFragment> searchResult2 = concordia2.simpleSearch("xjest okno"); MatchedPatternFragment searchResult2 = concordia2.simpleSearch("xjest okno");
concordia2.clearIndex(); concordia2.clearIndex();
BOOST_CHECK_EQUAL(searchResult1.size(), 3); BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 3);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 312); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 312);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 0); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 2); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getId(), 45);
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleId(), 45); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleOffset(), 1); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(2).getId(), 29);
BOOST_CHECK_EQUAL(searchResult1.at(1).getMatchedLength(), 2); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(2).getOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1.at(2).getExampleId(), 29);
BOOST_CHECK_EQUAL(searchResult1.at(2).getExampleOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1.at(2).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult2.size(), 2); BOOST_CHECK_EQUAL(searchResult2.getOccurences().size(), 2);
BOOST_CHECK_EQUAL(searchResult2.at(0).getExampleId(), 202); BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(0).getId(), 202);
BOOST_CHECK_EQUAL(searchResult2.at(0).getExampleOffset(), 1); BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(0).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult2.at(0).getMatchedLength(), 2); BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(1).getId(), 312);
BOOST_CHECK_EQUAL(searchResult2.at(1).getExampleId(), 312); BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(1).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult2.at(1).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult2.at(1).getMatchedLength(), 2);
} }
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 ) BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
@ -169,14 +162,13 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(), Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(),
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
std::vector<MatchedPatternFragment> searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia"); MatchedPatternFragment searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia");
concordia2.clearIndex(); concordia2.clearIndex();
BOOST_CHECK_EQUAL(searchResult1.size(), 1); BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 312); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 312);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 2); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 6);
} }
BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 ) BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
@ -247,53 +239,50 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 3); BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 7);
/* /*
addFragment 45,2,0,2 adding fragment: offset=0, length=2
addFragment 51,1,0,2 adding occurence: example id=167, offset=2
addFragment 123,1,0,2 adding occurence: example id=45, offset=3
addFragment 45,3,1,1 adding occurence: example id=51, offset=1
addFragment 51,2,1,1 adding occurence: example id=123, offset=1
addFragment 123,2,1,1 adding fragment: offset=1, length=1
addFragment 167,1,2,1 adding occurence: example id=167, offset=3
adding occurence: example id=45, offset=4
adding occurence: example id=51, offset=2
adding occurence: example id=123, offset=2
adding fragment: offset=2, length=1
adding occurence: example id=167, offset=1
*/ */
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 167); BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 0); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 2); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(1).getId(), 45);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(1).getOffset(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getId(), 51);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(3).getId(), 123);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(3).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleId(), 45); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleOffset(), 3); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 0); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(0).getId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 2); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(0).getOffset(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(1).getId(), 45);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(1).getOffset(), 4);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getId(), 51);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(3).getId(), 123);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(3).getOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleId(), 51); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleOffset(), 1); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 0); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getOccurences().at(0).getId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 2); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getOccurences().at(0).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleOffset(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getPatternOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleId(), 45);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleOffset(), 4);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getPatternOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleId(), 51);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getPatternOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getExampleId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getPatternOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getMatchedLength(), 1);
concordia.clearIndex(); concordia.clearIndex();
} }
@ -326,6 +315,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers"); boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers");
// best overlay: // best overlay:
// std::cout << *searchResult1 << std::endl;
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2); BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.655, 0.1); BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.655, 0.1);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getStart(), 1); BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getStart(), 1);
@ -344,10 +335,46 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
Matched pattern fragment found. Pattern fragment: [3,5]321,2,3,2 Matched pattern fragment found. Pattern fragment: [3,5]321,2,3,2
Matched pattern fragment found. Pattern fragment: [7,9]14,9,7,2 Matched pattern fragment found. Pattern fragment: [7,9]14,9,7,2
Matched pattern fragment found. Pattern fragment: [8,9]14,10,8,1 Matched pattern fragment found. Pattern fragment: [8,9]14,10,8,1
Best overlay {
fragment(patternOffset=1, matchedLength=4) {
occurence(exampleId=321, offset=0)
}
fragment(patternOffset=5, matchedLength=4) {
occurence(exampleId=14, offset=7)
}
}
All fragments {
fragment(patternOffset=4, matchedLength=5) {
occurence(exampleId=14, offset=6)
}
fragment(patternOffset=1, matchedLength=4) {
occurence(exampleId=321, offset=0)
}
fragment(patternOffset=5, matchedLength=4) {
occurence(exampleId=14, offset=7)
}
fragment(patternOffset=2, matchedLength=3) {
occurence(exampleId=321, offset=1)
}
fragment(patternOffset=6, matchedLength=3) {
occurence(exampleId=14, offset=8)
}
fragment(patternOffset=3, matchedLength=2) {
occurence(exampleId=321, offset=2)
}
fragment(patternOffset=7, matchedLength=2) {
occurence(exampleId=14, offset=9)
}
fragment(patternOffset=8, matchedLength=1) {
occurence(exampleId=14, offset=10)
}
}
*/ */
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 14); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getId(), 14);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleOffset(), 6); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getOffset(), 6);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 4); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 4);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 5); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 5);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getStart(), 4); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getStart(), 4);

BIN
scripts/concordia_json.zip Normal file

Binary file not shown.

View File

@ -0,0 +1,22 @@
{
"status": "success", //status operacji
"result": {
"bestOverlayScore" : <liczba 0-1>, // Concordia podaje score znalezionego przez siebie dopasowania
"bestOverlay" : [ // lista fragmentów zdania wejściowego, które znalazły się w pamięci tłumaczeń
{ // jeden fragment
"matchedPatternStart": 0, // index (character-based) początku fragmentu zdania wejściowego
"matchedPatternEnd": 8, // index końca fragmentu (exclusive, przedział prawostronnie otwarty)
"occurences": [{ // lista przykładów z pamięci tłumaczeń, w których znalazł się dany fragment zdania wejściowego
"id": 1782145, // id przykładu z pamięci tłumaczeń (przykład to para zdań źródłowe-docelowe)
"matchedExampleStart": 305, // index początku fragmentu w zdaniu źródłowym przykładu
"matchedExampleEnd": 314, // index końca fragmentu w zdaniu źródłowym przykładu (exclusive)
"sourceSegment": <text>, // pełny tekst zdania źródłowego przykładu
"targetSegment": <text>, // pełny tekst zdania docelowego przykładu
"targetFragments": [ // lista fragmentów zdania docelowego, do których urównoleglony jest znaleziony fragment zdania źródłowego
[257, 264]
]
}, .... // mogą być jeszcze kolejne fragmenty
]
}
}

View File

@ -0,0 +1,5 @@
{
"operation": "concordiaSearch",
"pattern":"Ala ma kota",
"tmId":1
}

View File

@ -0,0 +1,43 @@
{
"status": "success",
"result": {
"bestOverlayScore": 0.5,
"bestOverlay": [{
"matchedPatternStart": 0,
"matchedPatternEnd": 8,
"occurences": [{
"id": 1782145,
"matchedExampleStart": 305,
"matchedExampleEnd": 314,
"sourceSegment": "10 . państwa członkowskie odpowiedzialne są za drukowanie formularzy . formularze można również drukować w drukarniach wyznaczonych przez państwo członkowskie , w którym są ustanowione . w ostatnim przypadku na każdym formularzu musi znaleźć się odniesienie do tego postanowienia państwa członkowskiego . na każdym formularzu znajduje się informacja dotycząca nazwy i adresu drukarni lub znak umożliwiający jej identyfikację . \" ;",
"targetSegment": "10 . member states shall be responsible for having the forms printed . the forms may also be printed by printers appointed by the member state in which they are established . in the latter case , reference to the appointment by the member state must appear on each form . each form shall bear an indication of the printer 's name and address or a mark enabling the printer to be identified . '",
"targetFragments": [
[257, 264]
]
}, {
"id": 1782145,
"matchedExampleStart": 326,
"matchedExampleEnd": 335,
"sourceSegment": "10 . państwa członkowskie odpowiedzialne są za drukowanie formularzy . formularze można również drukować w drukarniach wyznaczonych przez państwo członkowskie , w którym są ustanowione . w ostatnim przypadku na każdym formularzu musi znaleźć się odniesienie do tego postanowienia państwa członkowskiego . na każdym formularzu znajduje się informacja dotycząca nazwy i adresu drukarni lub znak umożliwiający jej identyfikację . \" ;",
"targetSegment": "10 . member states shall be responsible for having the forms printed . the forms may also be printed by printers appointed by the member state in which they are established . in the latter case , reference to the appointment by the member state must appear on each form . each form shall bear an indication of the printer 's name and address or a mark enabling the printer to be identified . '",
"targetFragments": [
[300, 315]
]
}]
}, {
"matchedPatternStart": 9,
"matchedPatternEnd": 47,
"occurences": [{
"id": 1623941,
"matchedExampleStart": 54,
"matchedExampleEnd": 93,
"sourceSegment": "wszelkie spory między albo islandią , albo norwegią a państwem członkowskim unii europejskiej dotyczące interpretacji lub stosowania niniejszej umowy mogą być przekazywane przez stronę sporu na posiedzeniu przedstawicieli rządów państw członkowskich unii europejskiej oraz islandii i norwegii w celu rozstrzygnięcia sporu w terminie sześciu miesięcy .",
"targetSegment": "any dispute between either iceland or norway and a member state of the european union regarding the interpretation or the application of this agreement may be referred by a party to the dispute to a meeting of representatives of the governments of the member states of the european union and of iceland and norway , with a view to its settlement within six months .",
"targetFragments": [
[51, 85],
[96, 99]
]
}]
}]
}
}