multiple results
This commit is contained in:
parent
c3826919ba
commit
31e4f091ad
@ -174,17 +174,18 @@ int main(int argc, char** argv) {
|
||||
std::cout << "\tSearching for pattern: \"" << pattern <<
|
||||
"\"" << std::endl;
|
||||
time_start = boost::posix_time::microsec_clock::local_time();
|
||||
std::vector<MatchedPatternFragment> result =
|
||||
MatchedPatternFragment result =
|
||||
concordia.simpleSearch(pattern);
|
||||
time_end = boost::posix_time::microsec_clock::local_time();
|
||||
msdiff = time_end - time_start;
|
||||
std::cout << "\tFound: " << result.size() << " matches. "
|
||||
<< "Search took: " <<
|
||||
msdiff.total_milliseconds() << "ms." << std::endl;
|
||||
std::cout << "\tFound: " << result.getOccurences().size()
|
||||
<< " matches. " << "Search took: "
|
||||
<< msdiff.total_milliseconds() << "ms." << std::endl;
|
||||
if (!cli.count("silent")) {
|
||||
BOOST_FOREACH(MatchedPatternFragment occurence, result) {
|
||||
BOOST_FOREACH(SubstringOccurence occurence,
|
||||
result.getOccurences()) {
|
||||
std::cout << "\t\tfound match in sentence number: "
|
||||
<< occurence.getExampleId() << std::endl;
|
||||
<< occurence.getId() << std::endl;
|
||||
}
|
||||
}
|
||||
} else if (cli.count("anubis-search")) {
|
||||
@ -234,10 +235,9 @@ int main(int argc, char** argv) {
|
||||
result->getBestOverlay()) {
|
||||
std::cout << "\t\tfragment [" << fragment.getStart()
|
||||
<< "," << fragment.getEnd()
|
||||
<< "] (exampleId, exampleOffset,"
|
||||
<< "] (exampleCount,"
|
||||
<< " patternOffset, length): "
|
||||
<< fragment.getExampleId() << ","
|
||||
<< fragment.getExampleOffset() << ","
|
||||
<< fragment.getOccurences().size() << ","
|
||||
<< fragment.getPatternOffset() << ","
|
||||
<< fragment.getMatchedLength()
|
||||
<< std::endl;
|
||||
@ -248,10 +248,9 @@ int main(int argc, char** argv) {
|
||||
result->getFragments()) {
|
||||
std::cout << "\t\tfragment [" << fragment.getStart()
|
||||
<< "," << fragment.getEnd()
|
||||
<< "] (exampleId, exampleOffset,"
|
||||
<< "] (exampleCount,"
|
||||
<< " patternOffset, length): "
|
||||
<< fragment.getExampleId() << ","
|
||||
<< fragment.getExampleOffset() << ","
|
||||
<< fragment.getOccurences().size() << ","
|
||||
<< fragment.getPatternOffset() << ","
|
||||
<< fragment.getMatchedLength()
|
||||
<< std::endl;
|
||||
|
@ -25,7 +25,7 @@ typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
|
||||
// sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES store the suffix offset
|
||||
// and the last SUFFIX_MARKER_SENTENCE_BYTES store the sentence length.
|
||||
|
||||
#define CONCORDIA_SEARCH_MAX_RESULTS 3
|
||||
#define CONCORDIA_SEARCH_MAX_RESULTS 5
|
||||
|
||||
#define WORD_MAP_FILE_NAME "word_map.bin"
|
||||
#define MARKERS_FILE_NAME "markers.bin"
|
||||
|
@ -72,13 +72,15 @@ std::vector<TokenizedSentence> Concordia::tokenizeAll(
|
||||
|
||||
if (generateCodes) {
|
||||
BOOST_FOREACH(std::string sentence, sentences) {
|
||||
result.push_back(_hashGenerator->generateHash(sentence, byWhitespace));
|
||||
result.push_back(_hashGenerator->generateHash(sentence,
|
||||
byWhitespace));
|
||||
}
|
||||
|
||||
_hashGenerator->serializeWordMap();
|
||||
} else {
|
||||
BOOST_FOREACH(std::string sentence, sentences) {
|
||||
result.push_back(_hashGenerator->generateTokens(sentence, byWhitespace));
|
||||
result.push_back(_hashGenerator->generateTokens(sentence,
|
||||
byWhitespace));
|
||||
}
|
||||
}
|
||||
return result;
|
||||
@ -208,14 +210,14 @@ SUFFIX_MARKER_TYPE Concordia::countOccurences(const std::string & pattern)
|
||||
}
|
||||
|
||||
|
||||
std::vector<MatchedPatternFragment> Concordia::simpleSearch(
|
||||
const std::string & pattern)
|
||||
MatchedPatternFragment Concordia::simpleSearch(
|
||||
const std::string & pattern)
|
||||
throw(ConcordiaException) {
|
||||
if (_T->size() > 0) {
|
||||
return _searcher->simpleSearch(_hashGenerator, _T,
|
||||
_markers, _SA, pattern);
|
||||
} else {
|
||||
std::vector<MatchedPatternFragment> result;
|
||||
MatchedPatternFragment result(0, 0);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
@ -269,4 +271,3 @@ std::string Concordia::_getHashedIndexFilePath() {
|
||||
std::string Concordia::_getMarkersFilePath() {
|
||||
return _indexPath+"/"+MARKERS_FILE_NAME;
|
||||
}
|
||||
|
||||
|
@ -126,12 +126,11 @@ public:
|
||||
/*! Performs a simple substring lookup on the index.
|
||||
For more info see \ref tutorial1_2.
|
||||
\param pattern pattern to be searched in the index
|
||||
\returns vector of matched results
|
||||
\returns matched pattern fragment containing vector of occurences
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
std::vector<MatchedPatternFragment> simpleSearch(
|
||||
const std::string & pattern)
|
||||
throw(ConcordiaException);
|
||||
MatchedPatternFragment simpleSearch(const std::string & pattern)
|
||||
throw(ConcordiaException);
|
||||
|
||||
SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern)
|
||||
throw(ConcordiaException);
|
||||
|
@ -75,6 +75,24 @@ public:
|
||||
return _bestOverlayScore;
|
||||
}
|
||||
|
||||
friend std::ostream & operator << (std::ostream & o,
|
||||
const ConcordiaSearchResult & result) {
|
||||
o << "Best overlay {" << std::endl;
|
||||
BOOST_FOREACH(MatchedPatternFragment fragment,
|
||||
result.getBestOverlay()) {
|
||||
o << fragment << std::endl;
|
||||
}
|
||||
o << "}" << std::endl;
|
||||
o << "All fragments {" << std::endl;
|
||||
BOOST_FOREACH(MatchedPatternFragment fragment,
|
||||
result.getFragments()) {
|
||||
o << fragment << std::endl;
|
||||
}
|
||||
o << "}";
|
||||
return o;
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
void _checkPossibleOverlays(
|
||||
std::vector<MatchedPatternFragment> currentOverlay,
|
||||
|
@ -36,12 +36,14 @@ void ConcordiaSearcher::concordiaSearch(
|
||||
std::vector<SubstringOccurence> occurences =
|
||||
lcpSearch(T, markers, SA, currentPattern, lcpLength);
|
||||
|
||||
BOOST_FOREACH(SubstringOccurence occurence, occurences) {
|
||||
result->addFragment(MatchedPatternFragment(
|
||||
occurence.getId(),
|
||||
occurence.getOffset(),
|
||||
offset,
|
||||
lcpLength / sizeof(INDEX_CHARACTER_TYPE)));
|
||||
if (occurences.size() > 0) {
|
||||
MatchedPatternFragment fragment(offset,
|
||||
lcpLength / sizeof(INDEX_CHARACTER_TYPE));
|
||||
|
||||
BOOST_FOREACH(SubstringOccurence occurence, occurences) {
|
||||
fragment.addOccurence(occurence);
|
||||
}
|
||||
result->addFragment(fragment);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -13,14 +13,12 @@ IndexSearcher::IndexSearcher() {
|
||||
IndexSearcher::~IndexSearcher() {
|
||||
}
|
||||
|
||||
std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
|
||||
MatchedPatternFragment IndexSearcher::simpleSearch(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
const std::string & pattern) throw(ConcordiaException) {
|
||||
std::vector<MatchedPatternFragment> result;
|
||||
|
||||
int left;
|
||||
std::vector<INDEX_CHARACTER_TYPE> hash =
|
||||
hashGenerator->generateHash(pattern).getCodes();
|
||||
@ -30,6 +28,7 @@ std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
|
||||
int size = sa_search(T->data(), (saidx_t) T->size(),
|
||||
(const sauchar_t *) patternArray, patternLength,
|
||||
SA->data(), (saidx_t) SA->size(), &left);
|
||||
MatchedPatternFragment result(0, hash.size());
|
||||
for (int i = 0; i < size; ++i) {
|
||||
saidx_t resultPos = SA->at(left + i);
|
||||
if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
|
||||
@ -40,12 +39,11 @@ std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
|
||||
// removes these accidental results.
|
||||
saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
|
||||
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);
|
||||
result.push_back(MatchedPatternFragment(
|
||||
Utils::getIdFromMarker(marker),
|
||||
Utils::getOffsetFromMarker(marker),
|
||||
0,
|
||||
hash.size()));
|
||||
if (result.size() >= CONCORDIA_SEARCH_MAX_RESULTS) {
|
||||
|
||||
SubstringOccurence occurence;
|
||||
occurence.enterDataFromMarker(marker);
|
||||
result.addOccurence(occurence);
|
||||
if (result.getOccurences().size() >= CONCORDIA_SEARCH_MAX_RESULTS) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -42,10 +42,10 @@ public:
|
||||
\param markers markers array for the needs of searching
|
||||
\param SA suffix array for the needs of searching
|
||||
\param pattern string pattern to be searched in the index.
|
||||
\returns vector of occurences of the pattern in the index
|
||||
\returns matched pattern fragment, containing occurences of the pattern in the index
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
std::vector<MatchedPatternFragment> simpleSearch(
|
||||
MatchedPatternFragment simpleSearch(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
|
@ -1,14 +1,10 @@
|
||||
#include "concordia/matched_pattern_fragment.hpp"
|
||||
|
||||
MatchedPatternFragment::MatchedPatternFragment(
|
||||
const SUFFIX_MARKER_TYPE & exampleId,
|
||||
const SUFFIX_MARKER_TYPE & exampleOffset,
|
||||
const SUFFIX_MARKER_TYPE & patternOffset,
|
||||
const SUFFIX_MARKER_TYPE & matchedLength):
|
||||
Interval(patternOffset,
|
||||
patternOffset + matchedLength),
|
||||
_exampleId(exampleId),
|
||||
_exampleOffset(exampleOffset),
|
||||
_patternOffset(patternOffset),
|
||||
_matchedLength(matchedLength) {
|
||||
}
|
||||
@ -16,3 +12,7 @@ MatchedPatternFragment::MatchedPatternFragment(
|
||||
MatchedPatternFragment::~MatchedPatternFragment() {
|
||||
}
|
||||
|
||||
void MatchedPatternFragment::addOccurence(
|
||||
const SubstringOccurence & occurence) {
|
||||
_occurences.push_back(occurence);
|
||||
}
|
||||
|
@ -3,6 +3,10 @@
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/interval.hpp"
|
||||
#include "concordia/substring_occurence.hpp"
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <boost/foreach.hpp>
|
||||
|
||||
/*!
|
||||
Class representing matched pattern fragment in concordia search.
|
||||
@ -17,32 +21,26 @@
|
||||
class MatchedPatternFragment : public Interval {
|
||||
public:
|
||||
/*! Constructor.
|
||||
\param exampleId id of the example where the pattern fragment was matched
|
||||
\param exampleOffset offset of the matched fragment in the example
|
||||
\param patternOffset offset of the matched fragment in the pattern
|
||||
\param matchedLength length of the matched pattern
|
||||
*/
|
||||
MatchedPatternFragment(const SUFFIX_MARKER_TYPE & exampleId,
|
||||
const SUFFIX_MARKER_TYPE & exampleOffset,
|
||||
const SUFFIX_MARKER_TYPE & patternOffset,
|
||||
MatchedPatternFragment(const SUFFIX_MARKER_TYPE & patternOffset,
|
||||
const SUFFIX_MARKER_TYPE & matchedLength);
|
||||
/*! Destructor.
|
||||
*/
|
||||
virtual ~MatchedPatternFragment();
|
||||
|
||||
/*! Getter for example id.
|
||||
\returns example id
|
||||
/*! Getter for occurences.
|
||||
\returns occurences
|
||||
*/
|
||||
SUFFIX_MARKER_TYPE getExampleId() const {
|
||||
return _exampleId;
|
||||
std::vector<SubstringOccurence> getOccurences() const {
|
||||
return _occurences;
|
||||
}
|
||||
|
||||
/*! Getter for example offset.
|
||||
\returns example offset
|
||||
/*! Adds an occurence to the list.
|
||||
\param occurence occurence to be added
|
||||
*/
|
||||
SUFFIX_MARKER_TYPE getExampleOffset() const {
|
||||
return _exampleOffset;
|
||||
}
|
||||
void addOccurence(const SubstringOccurence & occurence);
|
||||
|
||||
/*! Getter for pattern offset.
|
||||
\returns pattern offset
|
||||
@ -65,10 +63,22 @@ public:
|
||||
return (_matchedLength > other.getMatchedLength());
|
||||
}
|
||||
|
||||
private:
|
||||
SUFFIX_MARKER_TYPE _exampleId;
|
||||
friend std::ostream & operator << (std::ostream & o,
|
||||
const MatchedPatternFragment & fragment) {
|
||||
o << "fragment(patternOffset=" << fragment.getPatternOffset()
|
||||
<< ", matchedLength=" << fragment.getMatchedLength() << ") {"
|
||||
<< std::endl;
|
||||
BOOST_FOREACH(SubstringOccurence occurence, fragment.getOccurences()) {
|
||||
o << "\t" << occurence << std::endl;
|
||||
}
|
||||
|
||||
SUFFIX_MARKER_TYPE _exampleOffset;
|
||||
o << "}";
|
||||
return o;
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
std::vector<SubstringOccurence> _occurences;
|
||||
|
||||
SUFFIX_MARKER_TYPE _patternOffset;
|
||||
|
||||
|
@ -3,6 +3,7 @@
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
|
||||
/*!
|
||||
Class representing occurence of a searched substring.
|
||||
@ -65,6 +66,13 @@ public:
|
||||
*/
|
||||
void enterDataFromMarker(const SUFFIX_MARKER_TYPE & marker);
|
||||
|
||||
friend std::ostream & operator << (std::ostream & o,
|
||||
const SubstringOccurence & occurence) {
|
||||
return o << "occurence(exampleId=" << occurence.getId()
|
||||
<< ", offset=" << occurence.getOffset() << ")";
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
SUFFIX_MARKER_TYPE _id;
|
||||
|
||||
|
@ -67,21 +67,19 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
|
||||
|
||||
*/
|
||||
|
||||
std::vector<MatchedPatternFragment> searchResult1 = concordia.simpleSearch("posiada rysia");
|
||||
std::vector<MatchedPatternFragment> searchResult2 = concordia.simpleSearch("posiada kota Ala");
|
||||
MatchedPatternFragment searchResult1 = concordia.simpleSearch("posiada rysia");
|
||||
MatchedPatternFragment searchResult2 = concordia.simpleSearch("posiada kota Ala");
|
||||
|
||||
concordia.clearIndex();
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1.size(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 123);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleId(), 51);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(1).getMatchedLength(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 123);
|
||||
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getId(), 51);
|
||||
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getOffset(), 1);
|
||||
|
||||
// Checking pattern spanning over 2 segments
|
||||
BOOST_CHECK_EQUAL(searchResult2.size(), 0);
|
||||
BOOST_CHECK_EQUAL(searchResult2.getOccurences().size(), 0);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
||||
@ -133,29 +131,24 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
||||
|
||||
Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(),
|
||||
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||
std::vector<MatchedPatternFragment> searchResult1 = concordia2.simpleSearch("xto xjest");
|
||||
std::vector<MatchedPatternFragment> searchResult2 = concordia2.simpleSearch("xjest okno");
|
||||
MatchedPatternFragment searchResult1 = concordia2.simpleSearch("xto xjest");
|
||||
MatchedPatternFragment searchResult2 = concordia2.simpleSearch("xjest okno");
|
||||
|
||||
concordia2.clearIndex();
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1.size(), 3);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 312);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 0);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleId(), 45);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(1).getMatchedLength(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(2).getExampleId(), 29);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(2).getExampleOffset(), 0);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(2).getMatchedLength(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 3);
|
||||
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 312);
|
||||
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 0);
|
||||
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getId(), 45);
|
||||
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(2).getId(), 29);
|
||||
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(2).getOffset(), 0);
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult2.size(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult2.at(0).getExampleId(), 202);
|
||||
BOOST_CHECK_EQUAL(searchResult2.at(0).getExampleOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult2.at(0).getMatchedLength(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult2.at(1).getExampleId(), 312);
|
||||
BOOST_CHECK_EQUAL(searchResult2.at(1).getExampleOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult2.at(1).getMatchedLength(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult2.getOccurences().size(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(0).getId(), 202);
|
||||
BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(0).getOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(1).getId(), 312);
|
||||
BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(1).getOffset(), 1);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
|
||||
@ -169,14 +162,13 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
|
||||
|
||||
Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(),
|
||||
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||
std::vector<MatchedPatternFragment> searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia");
|
||||
MatchedPatternFragment searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia");
|
||||
|
||||
concordia2.clearIndex();
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1.size(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 312);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 6);
|
||||
BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 312);
|
||||
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 2);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
|
||||
@ -247,53 +239,50 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
|
||||
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 3);
|
||||
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 7);
|
||||
|
||||
/*
|
||||
addFragment 45,2,0,2
|
||||
addFragment 51,1,0,2
|
||||
addFragment 123,1,0,2
|
||||
addFragment 45,3,1,1
|
||||
addFragment 51,2,1,1
|
||||
addFragment 123,2,1,1
|
||||
addFragment 167,1,2,1
|
||||
adding fragment: offset=0, length=2
|
||||
adding occurence: example id=167, offset=2
|
||||
adding occurence: example id=45, offset=3
|
||||
adding occurence: example id=51, offset=1
|
||||
adding occurence: example id=123, offset=1
|
||||
adding fragment: offset=1, length=1
|
||||
adding occurence: example id=167, offset=3
|
||||
adding occurence: example id=45, offset=4
|
||||
adding occurence: example id=51, offset=2
|
||||
adding occurence: example id=123, offset=2
|
||||
adding fragment: offset=2, length=1
|
||||
adding occurence: example id=167, offset=1
|
||||
*/
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 167);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleOffset(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 3);
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 0);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getId(), 167);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getOffset(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(1).getId(), 45);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(1).getOffset(), 3);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getId(), 51);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(3).getId(), 123);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(3).getOffset(), 1);
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleId(), 45);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleOffset(), 3);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 0);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(0).getId(), 167);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(0).getOffset(), 3);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(1).getId(), 45);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(1).getOffset(), 4);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getId(), 51);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getOffset(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(3).getId(), 123);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(3).getOffset(), 2);
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleId(), 51);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 0);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 2);
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleId(), 167);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleOffset(), 3);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getPatternOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getMatchedLength(), 1);
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleId(), 45);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleOffset(), 4);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getPatternOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getMatchedLength(), 1);
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleId(), 51);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleOffset(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getPatternOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getMatchedLength(), 1);
|
||||
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getExampleId(), 167);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getExampleOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getPatternOffset(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getMatchedLength(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getOccurences().at(0).getId(), 167);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getOccurences().at(0).getOffset(), 1);
|
||||
|
||||
concordia.clearIndex();
|
||||
}
|
||||
@ -326,6 +315,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
|
||||
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers");
|
||||
// best overlay:
|
||||
|
||||
// std::cout << *searchResult1 << std::endl;
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
|
||||
BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.655, 0.1);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getStart(), 1);
|
||||
@ -344,10 +335,46 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
|
||||
Matched pattern fragment found. Pattern fragment: [3,5]321,2,3,2
|
||||
Matched pattern fragment found. Pattern fragment: [7,9]14,9,7,2
|
||||
Matched pattern fragment found. Pattern fragment: [8,9]14,10,8,1
|
||||
|
||||
Best overlay {
|
||||
fragment(patternOffset=1, matchedLength=4) {
|
||||
occurence(exampleId=321, offset=0)
|
||||
}
|
||||
fragment(patternOffset=5, matchedLength=4) {
|
||||
occurence(exampleId=14, offset=7)
|
||||
}
|
||||
}
|
||||
All fragments {
|
||||
fragment(patternOffset=4, matchedLength=5) {
|
||||
occurence(exampleId=14, offset=6)
|
||||
}
|
||||
fragment(patternOffset=1, matchedLength=4) {
|
||||
occurence(exampleId=321, offset=0)
|
||||
}
|
||||
fragment(patternOffset=5, matchedLength=4) {
|
||||
occurence(exampleId=14, offset=7)
|
||||
}
|
||||
fragment(patternOffset=2, matchedLength=3) {
|
||||
occurence(exampleId=321, offset=1)
|
||||
}
|
||||
fragment(patternOffset=6, matchedLength=3) {
|
||||
occurence(exampleId=14, offset=8)
|
||||
}
|
||||
fragment(patternOffset=3, matchedLength=2) {
|
||||
occurence(exampleId=321, offset=2)
|
||||
}
|
||||
fragment(patternOffset=7, matchedLength=2) {
|
||||
occurence(exampleId=14, offset=9)
|
||||
}
|
||||
fragment(patternOffset=8, matchedLength=1) {
|
||||
occurence(exampleId=14, offset=10)
|
||||
}
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 14);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleOffset(), 6);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getId(), 14);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getOffset(), 6);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 4);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 5);
|
||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getStart(), 4);
|
||||
|
BIN
scripts/concordia_json.zip
Normal file
BIN
scripts/concordia_json.zip
Normal file
Binary file not shown.
22
scripts/responseExplained.txt
Normal file
22
scripts/responseExplained.txt
Normal file
@ -0,0 +1,22 @@
|
||||
{
|
||||
"status": "success", //status operacji
|
||||
"result": {
|
||||
"bestOverlayScore" : <liczba 0-1> // Concordia podaje score znalezionego przez siebie dopasowania
|
||||
"bestOverlay" : [ // lista fragmentów zdania wejściowego, które znalazły się w pamięci tłumaczeń
|
||||
{ // jeden fragment
|
||||
"matchedPatternStart": 0, // index (character-based) początku fragmentu zdania wejściowego
|
||||
"matchedPatternEnd": 8, // index końca fragmentu (exclusive, przedział prawostronnie otwarty)
|
||||
"occurences": [{ // lista przykładów z pamięci tłumaczeń, w których znalazł się dany fragment zdania wejściowego
|
||||
"id": 1782145, // id przykładu z pamięci tłumaczeń (przykład to para zdań źródłowe-docelowe)
|
||||
"matchedExampleStart": 305, // index początku fragmentu w zdaniu źródłowym przykładu
|
||||
"matchedExampleEnd": 314, // index końca fragmentu w zdaniu źródłowym przykładu (exclusive)
|
||||
"sourceSegment": <text>, // pełny tekst zdania źródłowego przykładu
|
||||
"targetSegment": <text>, // pełny tekst zdania docelowego przykładu
|
||||
"targetFragments": [ // lista fragmentów zdania docelowego, do których urównoleglony jest znaleziony fragment zdania źródłowego
|
||||
[257, 264]
|
||||
]
|
||||
}, .... // mogą być jeszcze kolejne fragmenty
|
||||
|
||||
]
|
||||
}
|
||||
}
|
5
scripts/sampleRequest.json
Normal file
5
scripts/sampleRequest.json
Normal file
@ -0,0 +1,5 @@
|
||||
{
|
||||
"operation": "concordiaSearch",
|
||||
"pattern":"Ala ma kota",
|
||||
"tmId":1
|
||||
}
|
43
scripts/sampleResponse.json
Normal file
43
scripts/sampleResponse.json
Normal file
@ -0,0 +1,43 @@
|
||||
{
|
||||
"status": "success",
|
||||
"result": {
|
||||
"bestOverlayScore": 0.5,
|
||||
"bestOverlay": [{
|
||||
"matchedPatternStart": 0,
|
||||
"matchedPatternEnd": 8,
|
||||
"occurences": [{
|
||||
"id": 1782145,
|
||||
"matchedExampleStart": 305,
|
||||
"matchedExampleEnd": 314,
|
||||
"sourceSegment": "10 . państwa członkowskie odpowiedzialne są za drukowanie formularzy . formularze można również drukować w drukarniach wyznaczonych przez państwo członkowskie , w którym są ustanowione . w ostatnim przypadku na każdym formularzu musi znaleźć się odniesienie do tego postanowienia państwa członkowskiego . na każdym formularzu znajduje się informacja dotycząca nazwy i adresu drukarni lub znak umożliwiający jej identyfikację . \" ;",
|
||||
"targetSegment": "10 . member states shall be responsible for having the forms printed . the forms may also be printed by printers appointed by the member state in which they are established . in the latter case , reference to the appointment by the member state must appear on each form . each form shall bear an indication of the printer 's name and address or a mark enabling the printer to be identified . '",
|
||||
"targetFragments": [
|
||||
[257, 264]
|
||||
]
|
||||
}, {
|
||||
"id": 1782145,
|
||||
"matchedExampleStart": 326,
|
||||
"matchedExampleEnd": 335,
|
||||
"sourceSegment": "10 . państwa członkowskie odpowiedzialne są za drukowanie formularzy . formularze można również drukować w drukarniach wyznaczonych przez państwo członkowskie , w którym są ustanowione . w ostatnim przypadku na każdym formularzu musi znaleźć się odniesienie do tego postanowienia państwa członkowskiego . na każdym formularzu znajduje się informacja dotycząca nazwy i adresu drukarni lub znak umożliwiający jej identyfikację . \" ;",
|
||||
"targetSegment": "10 . member states shall be responsible for having the forms printed . the forms may also be printed by printers appointed by the member state in which they are established . in the latter case , reference to the appointment by the member state must appear on each form . each form shall bear an indication of the printer 's name and address or a mark enabling the printer to be identified . '",
|
||||
"targetFragments": [
|
||||
[300, 315]
|
||||
]
|
||||
}]
|
||||
}, {
|
||||
"matchedPatternStart": 9,
|
||||
"matchedPatternEnd": 47,
|
||||
"occurences": [{
|
||||
"id": 1623941,
|
||||
"matchedExampleStart": 54,
|
||||
"matchedExampleEnd": 93,
|
||||
"sourceSegment": "wszelkie spory między albo islandią , albo norwegią a państwem członkowskim unii europejskiej dotyczące interpretacji lub stosowania niniejszej umowy mogą być przekazywane przez stronę sporu na posiedzeniu przedstawicieli rządów państw członkowskich unii europejskiej oraz islandii i norwegii w celu rozstrzygnięcia sporu w terminie sześciu miesięcy .",
|
||||
"targetSegment": "any dispute between either iceland or norway and a member state of the european union regarding the interpretation or the application of this agreement may be referred by a party to the dispute to a meeting of representatives of the governments of the member states of the european union and of iceland and norway , with a view to its settlement within six months .",
|
||||
"targetFragments": [
|
||||
[51, 85],
|
||||
[96, 99]
|
||||
]
|
||||
}]
|
||||
}]
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user