multiple results

This commit is contained in:
rjawor 2017-04-21 14:51:58 +02:00
parent c3826919ba
commit 31e4f091ad
16 changed files with 317 additions and 185 deletions

View File

@ -174,17 +174,18 @@ int main(int argc, char** argv) {
std::cout << "\tSearching for pattern: \"" << pattern <<
"\"" << std::endl;
time_start = boost::posix_time::microsec_clock::local_time();
std::vector<MatchedPatternFragment> result =
MatchedPatternFragment result =
concordia.simpleSearch(pattern);
time_end = boost::posix_time::microsec_clock::local_time();
msdiff = time_end - time_start;
std::cout << "\tFound: " << result.size() << " matches. "
<< "Search took: " <<
msdiff.total_milliseconds() << "ms." << std::endl;
std::cout << "\tFound: " << result.getOccurences().size()
<< " matches. " << "Search took: "
<< msdiff.total_milliseconds() << "ms." << std::endl;
if (!cli.count("silent")) {
BOOST_FOREACH(MatchedPatternFragment occurence, result) {
BOOST_FOREACH(SubstringOccurence occurence,
result.getOccurences()) {
std::cout << "\t\tfound match in sentence number: "
<< occurence.getExampleId() << std::endl;
<< occurence.getId() << std::endl;
}
}
} else if (cli.count("anubis-search")) {
@ -234,10 +235,9 @@ int main(int argc, char** argv) {
result->getBestOverlay()) {
std::cout << "\t\tfragment [" << fragment.getStart()
<< "," << fragment.getEnd()
<< "] (exampleId, exampleOffset,"
<< "] (exampleCount,"
<< " patternOffset, length): "
<< fragment.getExampleId() << ","
<< fragment.getExampleOffset() << ","
<< fragment.getOccurences().size() << ","
<< fragment.getPatternOffset() << ","
<< fragment.getMatchedLength()
<< std::endl;
@ -248,10 +248,9 @@ int main(int argc, char** argv) {
result->getFragments()) {
std::cout << "\t\tfragment [" << fragment.getStart()
<< "," << fragment.getEnd()
<< "] (exampleId, exampleOffset,"
<< "] (exampleCount,"
<< " patternOffset, length): "
<< fragment.getExampleId() << ","
<< fragment.getExampleOffset() << ","
<< fragment.getOccurences().size() << ","
<< fragment.getPatternOffset() << ","
<< fragment.getMatchedLength()
<< std::endl;

View File

@ -25,7 +25,7 @@ typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
// sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES store the suffix offset
// and the last SUFFIX_MARKER_SENTENCE_BYTES store the sentence length.
#define CONCORDIA_SEARCH_MAX_RESULTS 3
#define CONCORDIA_SEARCH_MAX_RESULTS 5
#define WORD_MAP_FILE_NAME "word_map.bin"
#define MARKERS_FILE_NAME "markers.bin"

View File

@ -72,13 +72,15 @@ std::vector<TokenizedSentence> Concordia::tokenizeAll(
if (generateCodes) {
BOOST_FOREACH(std::string sentence, sentences) {
result.push_back(_hashGenerator->generateHash(sentence, byWhitespace));
result.push_back(_hashGenerator->generateHash(sentence,
byWhitespace));
}
_hashGenerator->serializeWordMap();
} else {
BOOST_FOREACH(std::string sentence, sentences) {
result.push_back(_hashGenerator->generateTokens(sentence, byWhitespace));
result.push_back(_hashGenerator->generateTokens(sentence,
byWhitespace));
}
}
return result;
@ -208,14 +210,14 @@ SUFFIX_MARKER_TYPE Concordia::countOccurences(const std::string & pattern)
}
std::vector<MatchedPatternFragment> Concordia::simpleSearch(
MatchedPatternFragment Concordia::simpleSearch(
const std::string & pattern)
throw(ConcordiaException) {
if (_T->size() > 0) {
return _searcher->simpleSearch(_hashGenerator, _T,
_markers, _SA, pattern);
} else {
std::vector<MatchedPatternFragment> result;
MatchedPatternFragment result(0, 0);
return result;
}
}
@ -269,4 +271,3 @@ std::string Concordia::_getHashedIndexFilePath() {
std::string Concordia::_getMarkersFilePath() {
return _indexPath+"/"+MARKERS_FILE_NAME;
}

View File

@ -126,11 +126,10 @@ public:
/*! Performs a simple substring lookup on the index.
For more info see \ref tutorial1_2.
\param pattern pattern to be searched in the index
\returns vector of matched results
\returns matched pattern fragment containing vector of occurences
\throws ConcordiaException
*/
std::vector<MatchedPatternFragment> simpleSearch(
const std::string & pattern)
MatchedPatternFragment simpleSearch(const std::string & pattern)
throw(ConcordiaException);
SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern)

View File

@ -75,6 +75,24 @@ public:
return _bestOverlayScore;
}
friend std::ostream & operator << (std::ostream & o,
const ConcordiaSearchResult & result) {
o << "Best overlay {" << std::endl;
BOOST_FOREACH(MatchedPatternFragment fragment,
result.getBestOverlay()) {
o << fragment << std::endl;
}
o << "}" << std::endl;
o << "All fragments {" << std::endl;
BOOST_FOREACH(MatchedPatternFragment fragment,
result.getFragments()) {
o << fragment << std::endl;
}
o << "}";
return o;
}
private:
void _checkPossibleOverlays(
std::vector<MatchedPatternFragment> currentOverlay,

View File

@ -36,12 +36,14 @@ void ConcordiaSearcher::concordiaSearch(
std::vector<SubstringOccurence> occurences =
lcpSearch(T, markers, SA, currentPattern, lcpLength);
if (occurences.size() > 0) {
MatchedPatternFragment fragment(offset,
lcpLength / sizeof(INDEX_CHARACTER_TYPE));
BOOST_FOREACH(SubstringOccurence occurence, occurences) {
result->addFragment(MatchedPatternFragment(
occurence.getId(),
occurence.getOffset(),
offset,
lcpLength / sizeof(INDEX_CHARACTER_TYPE)));
fragment.addOccurence(occurence);
}
result->addFragment(fragment);
}
}

View File

@ -13,14 +13,12 @@ IndexSearcher::IndexSearcher() {
IndexSearcher::~IndexSearcher() {
}
std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
MatchedPatternFragment IndexSearcher::simpleSearch(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException) {
std::vector<MatchedPatternFragment> result;
int left;
std::vector<INDEX_CHARACTER_TYPE> hash =
hashGenerator->generateHash(pattern).getCodes();
@ -30,6 +28,7 @@ std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
int size = sa_search(T->data(), (saidx_t) T->size(),
(const sauchar_t *) patternArray, patternLength,
SA->data(), (saidx_t) SA->size(), &left);
MatchedPatternFragment result(0, hash.size());
for (int i = 0; i < size; ++i) {
saidx_t resultPos = SA->at(left + i);
if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
@ -40,12 +39,11 @@ std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
// removes these accidental results.
saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);
result.push_back(MatchedPatternFragment(
Utils::getIdFromMarker(marker),
Utils::getOffsetFromMarker(marker),
0,
hash.size()));
if (result.size() >= CONCORDIA_SEARCH_MAX_RESULTS) {
SubstringOccurence occurence;
occurence.enterDataFromMarker(marker);
result.addOccurence(occurence);
if (result.getOccurences().size() >= CONCORDIA_SEARCH_MAX_RESULTS) {
break;
}
}

View File

@ -42,10 +42,10 @@ public:
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern string pattern to be searched in the index.
\returns vector of occurences of the pattern in the index
\returns matched pattern fragment, containing occurences of the pattern in the index
\throws ConcordiaException
*/
std::vector<MatchedPatternFragment> simpleSearch(
MatchedPatternFragment simpleSearch(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,

View File

@ -1,14 +1,10 @@
#include "concordia/matched_pattern_fragment.hpp"
MatchedPatternFragment::MatchedPatternFragment(
const SUFFIX_MARKER_TYPE & exampleId,
const SUFFIX_MARKER_TYPE & exampleOffset,
const SUFFIX_MARKER_TYPE & patternOffset,
const SUFFIX_MARKER_TYPE & matchedLength):
Interval(patternOffset,
patternOffset + matchedLength),
_exampleId(exampleId),
_exampleOffset(exampleOffset),
_patternOffset(patternOffset),
_matchedLength(matchedLength) {
}
@ -16,3 +12,7 @@ MatchedPatternFragment::MatchedPatternFragment(
MatchedPatternFragment::~MatchedPatternFragment() {
}
/*! Appends a copy of the given occurence at the end of the
    occurences stored in this fragment.
    \param occurence occurence to be stored
*/
void MatchedPatternFragment::addOccurence(
        const SubstringOccurence & occurence) {
    _occurences.insert(_occurences.end(), occurence);
}

View File

@ -3,6 +3,10 @@
#include "concordia/common/config.hpp"
#include "concordia/interval.hpp"
#include "concordia/substring_occurence.hpp"
#include <vector>
#include <iostream>
#include <boost/foreach.hpp>
/*!
Class representing matched pattern fragment in concordia search.
@ -17,32 +21,26 @@
class MatchedPatternFragment : public Interval {
public:
/*! Constructor.
\param exampleId id of the example where the pattern fragment was matched
\param exampleOffset offset of the matched fragment in the example
\param patternOffset offset of the matched fragment in the pattern
\param matchedLength length of the matched pattern
*/
MatchedPatternFragment(const SUFFIX_MARKER_TYPE & exampleId,
const SUFFIX_MARKER_TYPE & exampleOffset,
const SUFFIX_MARKER_TYPE & patternOffset,
MatchedPatternFragment(const SUFFIX_MARKER_TYPE & patternOffset,
const SUFFIX_MARKER_TYPE & matchedLength);
/*! Destructor.
*/
virtual ~MatchedPatternFragment();
/*! Getter for example id.
\returns example id
/*! Getter for occurences.
\returns occurences
*/
SUFFIX_MARKER_TYPE getExampleId() const {
return _exampleId;
std::vector<SubstringOccurence> getOccurences() const {
return _occurences;
}
/*! Getter for example offset.
\returns example offset
/*! Adds an occurence to the list.
\param occurence occurence to be added
*/
SUFFIX_MARKER_TYPE getExampleOffset() const {
return _exampleOffset;
}
void addOccurence(const SubstringOccurence & occurence);
/*! Getter for pattern offset.
\returns pattern offset
@ -65,10 +63,22 @@ public:
return (_matchedLength > other.getMatchedLength());
}
private:
SUFFIX_MARKER_TYPE _exampleId;
friend std::ostream & operator << (std::ostream & o,
const MatchedPatternFragment & fragment) {
o << "fragment(patternOffset=" << fragment.getPatternOffset()
<< ", matchedLength=" << fragment.getMatchedLength() << ") {"
<< std::endl;
BOOST_FOREACH(SubstringOccurence occurence, fragment.getOccurences()) {
o << "\t" << occurence << std::endl;
}
SUFFIX_MARKER_TYPE _exampleOffset;
o << "}";
return o;
}
private:
std::vector<SubstringOccurence> _occurences;
SUFFIX_MARKER_TYPE _patternOffset;

View File

@ -3,6 +3,7 @@
#include "concordia/common/config.hpp"
#include <string>
#include <iostream>
/*!
Class representing occurence of a searched substring.
@ -65,6 +66,13 @@ public:
*/
void enterDataFromMarker(const SUFFIX_MARKER_TYPE & marker);
/*! Prints a one-line textual form of an occurence:
    "occurence(exampleId=<id>, offset=<offset>)".
    \param o output stream to write to
    \param occurence occurence to be printed
    \returns the stream passed as o, to allow chaining
*/
friend std::ostream & operator << (std::ostream & o,
                                   const SubstringOccurence & occurence) {
    o << "occurence(exampleId=" << occurence.getId();
    o << ", offset=" << occurence.getOffset() << ")";
    return o;
}
private:
SUFFIX_MARKER_TYPE _id;

View File

@ -67,21 +67,19 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
*/
std::vector<MatchedPatternFragment> searchResult1 = concordia.simpleSearch("posiada rysia");
std::vector<MatchedPatternFragment> searchResult2 = concordia.simpleSearch("posiada kota Ala");
MatchedPatternFragment searchResult1 = concordia.simpleSearch("posiada rysia");
MatchedPatternFragment searchResult2 = concordia.simpleSearch("posiada kota Ala");
concordia.clearIndex();
BOOST_CHECK_EQUAL(searchResult1.size(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 123);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleId(), 51);
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(1).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 2);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 123);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getId(), 51);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getOffset(), 1);
// Checking pattern spanning over 2 segments
BOOST_CHECK_EQUAL(searchResult2.size(), 0);
BOOST_CHECK_EQUAL(searchResult2.getOccurences().size(), 0);
}
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
@ -133,29 +131,24 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(),
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
std::vector<MatchedPatternFragment> searchResult1 = concordia2.simpleSearch("xto xjest");
std::vector<MatchedPatternFragment> searchResult2 = concordia2.simpleSearch("xjest okno");
MatchedPatternFragment searchResult1 = concordia2.simpleSearch("xto xjest");
MatchedPatternFragment searchResult2 = concordia2.simpleSearch("xjest okno");
concordia2.clearIndex();
BOOST_CHECK_EQUAL(searchResult1.size(), 3);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 312);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleId(), 45);
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(1).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(2).getExampleId(), 29);
BOOST_CHECK_EQUAL(searchResult1.at(2).getExampleOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1.at(2).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 3);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 312);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getId(), 45);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(2).getId(), 29);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(2).getOffset(), 0);
BOOST_CHECK_EQUAL(searchResult2.size(), 2);
BOOST_CHECK_EQUAL(searchResult2.at(0).getExampleId(), 202);
BOOST_CHECK_EQUAL(searchResult2.at(0).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult2.at(0).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult2.at(1).getExampleId(), 312);
BOOST_CHECK_EQUAL(searchResult2.at(1).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult2.at(1).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult2.getOccurences().size(), 2);
BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(0).getId(), 202);
BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(0).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(1).getId(), 312);
BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(1).getOffset(), 1);
}
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
@ -169,14 +162,13 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(),
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
std::vector<MatchedPatternFragment> searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia");
MatchedPatternFragment searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia");
concordia2.clearIndex();
BOOST_CHECK_EQUAL(searchResult1.size(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 312);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 6);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 1);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 312);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 2);
}
BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
@ -247,53 +239,50 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 7);
/*
addFragment 45,2,0,2
addFragment 51,1,0,2
addFragment 123,1,0,2
addFragment 45,3,1,1
addFragment 51,2,1,1
addFragment 123,2,1,1
addFragment 167,1,2,1
adding fragment: offset=0, length=2
adding occurence: example id=167, offset=2
adding occurence: example id=45, offset=3
adding occurence: example id=51, offset=1
adding occurence: example id=123, offset=1
adding fragment: offset=1, length=1
adding occurence: example id=167, offset=3
adding occurence: example id=45, offset=4
adding occurence: example id=51, offset=2
adding occurence: example id=123, offset=2
adding fragment: offset=2, length=1
adding occurence: example id=167, offset=1
*/
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(1).getId(), 45);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(1).getOffset(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getId(), 51);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(3).getId(), 123);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(3).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleId(), 45);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleOffset(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(0).getId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(0).getOffset(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(1).getId(), 45);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(1).getOffset(), 4);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getId(), 51);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(3).getId(), 123);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(3).getOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleId(), 51);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleOffset(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getPatternOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleId(), 45);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleOffset(), 4);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getPatternOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleId(), 51);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getPatternOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getExampleId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getPatternOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getOccurences().at(0).getId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getOccurences().at(0).getOffset(), 1);
concordia.clearIndex();
}
@ -326,6 +315,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers");
// best overlay:
// std::cout << *searchResult1 << std::endl;
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.655, 0.1);
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getStart(), 1);
@ -344,10 +335,46 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
Matched pattern fragment found. Pattern fragment: [3,5]321,2,3,2
Matched pattern fragment found. Pattern fragment: [7,9]14,9,7,2
Matched pattern fragment found. Pattern fragment: [8,9]14,10,8,1
Best overlay {
fragment(patternOffset=1, matchedLength=4) {
occurence(exampleId=321, offset=0)
}
fragment(patternOffset=5, matchedLength=4) {
occurence(exampleId=14, offset=7)
}
}
All fragments {
fragment(patternOffset=4, matchedLength=5) {
occurence(exampleId=14, offset=6)
}
fragment(patternOffset=1, matchedLength=4) {
occurence(exampleId=321, offset=0)
}
fragment(patternOffset=5, matchedLength=4) {
occurence(exampleId=14, offset=7)
}
fragment(patternOffset=2, matchedLength=3) {
occurence(exampleId=321, offset=1)
}
fragment(patternOffset=6, matchedLength=3) {
occurence(exampleId=14, offset=8)
}
fragment(patternOffset=3, matchedLength=2) {
occurence(exampleId=321, offset=2)
}
fragment(patternOffset=7, matchedLength=2) {
occurence(exampleId=14, offset=9)
}
fragment(patternOffset=8, matchedLength=1) {
occurence(exampleId=14, offset=10)
}
}
*/
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 14);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleOffset(), 6);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getId(), 14);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getOffset(), 6);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 4);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 5);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getStart(), 4);

BIN
scripts/concordia_json.zip Normal file

Binary file not shown.

View File

@ -0,0 +1,22 @@
{
"status": "success", //status operacji
"result": {
"bestOverlayScore" : <liczba 0-1> // Concordia podaje score znalezionego przez siebie dopasowania
"bestOverlay" : [ // lista fragmentów zdania wejściowego, które znalazły się w pamięci tłumaczeń
{ // jeden fragment
"matchedPatternStart": 0, // index (character-based) początku fragmentu zdania wejściowego
"matchedPatternEnd": 8, // index końca fragmentu (exclusive, przedział prawostronnie otwarty)
"occurences": [{ // lista przykładów z pamięci tłumaczeń, w których znalazł się dany fragment zdania wejściowego
"id": 1782145, // id przykładu z pamięci tłumaczeń (przykład to para zdań źródłowe-docelowe)
"matchedExampleStart": 305, // index początku fragmentu w zdaniu źródłowym przykładu
"matchedExampleEnd": 314, // index końca fragmentu w zdaniu źródłowym przykładu (exclusive)
"sourceSegment": <text>, // pełny tekst zdania źródłowego przykładu
"targetSegment": <text>, // pełny tekst zdania docelowego przykładu
"targetFragments": [ // lista fragmentów zdania docelowego, do których urównoleglony jest znaleziony fragment zdania źródłowego
[257, 264]
]
}, .... // mogą być jeszcze kolejne fragmenty
]
}
}

View File

@ -0,0 +1,5 @@
{
"operation": "concordiaSearch",
"pattern":"Ala ma kota",
"tmId":1
}

View File

@ -0,0 +1,43 @@
{
"status": "success",
"result": {
"bestOverlayScore": 0.5,
"bestOverlay": [{
"matchedPatternStart": 0,
"matchedPatternEnd": 8,
"occurences": [{
"id": 1782145,
"matchedExampleStart": 305,
"matchedExampleEnd": 314,
"sourceSegment": "10 . państwa członkowskie odpowiedzialne są za drukowanie formularzy . formularze można również drukować w drukarniach wyznaczonych przez państwo członkowskie , w którym są ustanowione . w ostatnim przypadku na każdym formularzu musi znaleźć się odniesienie do tego postanowienia państwa członkowskiego . na każdym formularzu znajduje się informacja dotycząca nazwy i adresu drukarni lub znak umożliwiający jej identyfikację . \" ;",
"targetSegment": "10 . member states shall be responsible for having the forms printed . the forms may also be printed by printers appointed by the member state in which they are established . in the latter case , reference to the appointment by the member state must appear on each form . each form shall bear an indication of the printer 's name and address or a mark enabling the printer to be identified . '",
"targetFragments": [
[257, 264]
]
}, {
"id": 1782145,
"matchedExampleStart": 326,
"matchedExampleEnd": 335,
"sourceSegment": "10 . państwa członkowskie odpowiedzialne są za drukowanie formularzy . formularze można również drukować w drukarniach wyznaczonych przez państwo członkowskie , w którym są ustanowione . w ostatnim przypadku na każdym formularzu musi znaleźć się odniesienie do tego postanowienia państwa członkowskiego . na każdym formularzu znajduje się informacja dotycząca nazwy i adresu drukarni lub znak umożliwiający jej identyfikację . \" ;",
"targetSegment": "10 . member states shall be responsible for having the forms printed . the forms may also be printed by printers appointed by the member state in which they are established . in the latter case , reference to the appointment by the member state must appear on each form . each form shall bear an indication of the printer 's name and address or a mark enabling the printer to be identified . '",
"targetFragments": [
[300, 315]
]
}]
}, {
"matchedPatternStart": 9,
"matchedPatternEnd": 47,
"occurences": [{
"id": 1623941,
"matchedExampleStart": 54,
"matchedExampleEnd": 93,
"sourceSegment": "wszelkie spory między albo islandią , albo norwegią a państwem członkowskim unii europejskiej dotyczące interpretacji lub stosowania niniejszej umowy mogą być przekazywane przez stronę sporu na posiedzeniu przedstawicieli rządów państw członkowskich unii europejskiej oraz islandii i norwegii w celu rozstrzygnięcia sporu w terminie sześciu miesięcy .",
"targetSegment": "any dispute between either iceland or norway and a member state of the european union regarding the interpretation or the application of this agreement may be referred by a party to the dispute to a meeting of representatives of the governments of the member states of the european union and of iceland and norway , with a view to its settlement within six months .",
"targetFragments": [
[51, 85],
[96, 99]
]
}]
}]
}
}