multiple results

This commit is contained in:
rjawor 2017-04-21 14:51:58 +02:00
parent c3826919ba
commit 31e4f091ad
16 changed files with 317 additions and 185 deletions

View File

@ -174,17 +174,18 @@ int main(int argc, char** argv) {
std::cout << "\tSearching for pattern: \"" << pattern <<
"\"" << std::endl;
time_start = boost::posix_time::microsec_clock::local_time();
std::vector<MatchedPatternFragment> result =
MatchedPatternFragment result =
concordia.simpleSearch(pattern);
time_end = boost::posix_time::microsec_clock::local_time();
msdiff = time_end - time_start;
std::cout << "\tFound: " << result.size() << " matches. "
<< "Search took: " <<
msdiff.total_milliseconds() << "ms." << std::endl;
std::cout << "\tFound: " << result.getOccurences().size()
<< " matches. " << "Search took: "
<< msdiff.total_milliseconds() << "ms." << std::endl;
if (!cli.count("silent")) {
BOOST_FOREACH(MatchedPatternFragment occurence, result) {
BOOST_FOREACH(SubstringOccurence occurence,
result.getOccurences()) {
std::cout << "\t\tfound match in sentence number: "
<< occurence.getExampleId() << std::endl;
<< occurence.getId() << std::endl;
}
}
} else if (cli.count("anubis-search")) {
@ -234,10 +235,9 @@ int main(int argc, char** argv) {
result->getBestOverlay()) {
std::cout << "\t\tfragment [" << fragment.getStart()
<< "," << fragment.getEnd()
<< "] (exampleId, exampleOffset,"
<< "] (exampleCount,"
<< " patternOffset, length): "
<< fragment.getExampleId() << ","
<< fragment.getExampleOffset() << ","
<< fragment.getOccurences().size() << ","
<< fragment.getPatternOffset() << ","
<< fragment.getMatchedLength()
<< std::endl;
@ -248,10 +248,9 @@ int main(int argc, char** argv) {
result->getFragments()) {
std::cout << "\t\tfragment [" << fragment.getStart()
<< "," << fragment.getEnd()
<< "] (exampleId, exampleOffset,"
<< "] (exampleCount,"
<< " patternOffset, length): "
<< fragment.getExampleId() << ","
<< fragment.getExampleOffset() << ","
<< fragment.getOccurences().size() << ","
<< fragment.getPatternOffset() << ","
<< fragment.getMatchedLength()
<< std::endl;

View File

@ -25,7 +25,7 @@ typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
// sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES store the suffix offset
// and the last SUFFIX_MARKER_SENTENCE_BYTES store the sentence length.
#define CONCORDIA_SEARCH_MAX_RESULTS 3
#define CONCORDIA_SEARCH_MAX_RESULTS 5
#define WORD_MAP_FILE_NAME "word_map.bin"
#define MARKERS_FILE_NAME "markers.bin"

View File

@ -69,17 +69,19 @@ std::vector<TokenizedSentence> Concordia::tokenizeAll(
bool generateCodes)
throw(ConcordiaException) {
std::vector<TokenizedSentence> result;
if (generateCodes) {
BOOST_FOREACH(std::string sentence, sentences) {
result.push_back(_hashGenerator->generateHash(sentence, byWhitespace));
result.push_back(_hashGenerator->generateHash(sentence,
byWhitespace));
}
_hashGenerator->serializeWordMap();
} else {
BOOST_FOREACH(std::string sentence, sentences) {
result.push_back(_hashGenerator->generateTokens(sentence, byWhitespace));
}
result.push_back(_hashGenerator->generateTokens(sentence,
byWhitespace));
}
}
return result;
}
@ -208,14 +210,14 @@ SUFFIX_MARKER_TYPE Concordia::countOccurences(const std::string & pattern)
}
std::vector<MatchedPatternFragment> Concordia::simpleSearch(
const std::string & pattern)
MatchedPatternFragment Concordia::simpleSearch(
const std::string & pattern)
throw(ConcordiaException) {
if (_T->size() > 0) {
return _searcher->simpleSearch(_hashGenerator, _T,
_markers, _SA, pattern);
} else {
std::vector<MatchedPatternFragment> result;
MatchedPatternFragment result(0, 0);
return result;
}
}
@ -269,4 +271,3 @@ std::string Concordia::_getHashedIndexFilePath() {
std::string Concordia::_getMarkersFilePath() {
return _indexPath+"/"+MARKERS_FILE_NAME;
}

View File

@ -126,12 +126,11 @@ public:
/*! Performs a simple substring lookup on the index.
For more info see \ref tutorial1_2.
\param pattern pattern to be searched in the index
\returns vector of matched results
\returns matched pattern fragment containing vector of occurences
\throws ConcordiaException
*/
std::vector<MatchedPatternFragment> simpleSearch(
const std::string & pattern)
throw(ConcordiaException);
MatchedPatternFragment simpleSearch(const std::string & pattern)
throw(ConcordiaException);
SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern)
throw(ConcordiaException);
@ -160,7 +159,7 @@ public:
/*! Loads HDD stored index files to RAM and generates
suffix array based on RAM stored data structures.
For more info see \ref tutorial2.
For more info see \ref tutorial2.
\throws ConcordiaException
*/
void loadRAMIndexFromDisk() throw(ConcordiaException);

View File

@ -16,7 +16,7 @@
- list of longest matched fragments sorted in descending order by length
- the best overlay
- the score of the best overlay.
For more info about concordia searching see \ref tutorial1_3.
*/
@ -75,6 +75,24 @@ public:
return _bestOverlayScore;
}
friend std::ostream & operator << (std::ostream & o,
const ConcordiaSearchResult & result) {
o << "Best overlay {" << std::endl;
BOOST_FOREACH(MatchedPatternFragment fragment,
result.getBestOverlay()) {
o << fragment << std::endl;
}
o << "}" << std::endl;
o << "All fragments {" << std::endl;
BOOST_FOREACH(MatchedPatternFragment fragment,
result.getFragments()) {
o << fragment << std::endl;
}
o << "}";
return o;
}
private:
void _checkPossibleOverlays(
std::vector<MatchedPatternFragment> currentOverlay,

View File

@ -36,12 +36,14 @@ void ConcordiaSearcher::concordiaSearch(
std::vector<SubstringOccurence> occurences =
lcpSearch(T, markers, SA, currentPattern, lcpLength);
BOOST_FOREACH(SubstringOccurence occurence, occurences) {
result->addFragment(MatchedPatternFragment(
occurence.getId(),
occurence.getOffset(),
offset,
lcpLength / sizeof(INDEX_CHARACTER_TYPE)));
if (occurences.size() > 0) {
MatchedPatternFragment fragment(offset,
lcpLength / sizeof(INDEX_CHARACTER_TYPE));
BOOST_FOREACH(SubstringOccurence occurence, occurences) {
fragment.addOccurence(occurence);
}
result->addFragment(fragment);
}
}

View File

@ -13,14 +13,12 @@ IndexSearcher::IndexSearcher() {
IndexSearcher::~IndexSearcher() {
}
std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
MatchedPatternFragment IndexSearcher::simpleSearch(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException) {
std::vector<MatchedPatternFragment> result;
int left;
std::vector<INDEX_CHARACTER_TYPE> hash =
hashGenerator->generateHash(pattern).getCodes();
@ -30,6 +28,7 @@ std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
int size = sa_search(T->data(), (saidx_t) T->size(),
(const sauchar_t *) patternArray, patternLength,
SA->data(), (saidx_t) SA->size(), &left);
MatchedPatternFragment result(0, hash.size());
for (int i = 0; i < size; ++i) {
saidx_t resultPos = SA->at(left + i);
if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
@ -40,12 +39,11 @@ std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
// removes these accidental results.
saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);
result.push_back(MatchedPatternFragment(
Utils::getIdFromMarker(marker),
Utils::getOffsetFromMarker(marker),
0,
hash.size()));
if (result.size() >= CONCORDIA_SEARCH_MAX_RESULTS) {
SubstringOccurence occurence;
occurence.enterDataFromMarker(marker);
result.addOccurence(occurence);
if (result.getOccurences().size() >= CONCORDIA_SEARCH_MAX_RESULTS) {
break;
}
}

View File

@ -18,7 +18,7 @@
/*!
Class for searching the index with a sentence. In all searches the sentence
is first hashed and then used as a query.
IndexSearcher performs the simpleSearch on its own, but uses a
ConcordiaSearcher object to carry out concordiaSearch.
@ -42,10 +42,10 @@ public:
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern string pattern to be searched in the index.
\returns vector of occurences of the pattern in the index
\returns matched pattern fragment, containing occurences of the pattern in the index
\throws ConcordiaException
*/
std::vector<MatchedPatternFragment> simpleSearch(
MatchedPatternFragment simpleSearch(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,

View File

@ -1,14 +1,10 @@
#include "concordia/matched_pattern_fragment.hpp"
MatchedPatternFragment::MatchedPatternFragment(
const SUFFIX_MARKER_TYPE & exampleId,
const SUFFIX_MARKER_TYPE & exampleOffset,
const SUFFIX_MARKER_TYPE & patternOffset,
const SUFFIX_MARKER_TYPE & matchedLength):
Interval(patternOffset,
patternOffset + matchedLength),
_exampleId(exampleId),
_exampleOffset(exampleOffset),
_patternOffset(patternOffset),
_matchedLength(matchedLength) {
}
@ -16,3 +12,7 @@ MatchedPatternFragment::MatchedPatternFragment(
MatchedPatternFragment::~MatchedPatternFragment() {
}
// Appends a copy of the given occurence at the end of this fragment's
// list of occurences.
void MatchedPatternFragment::addOccurence(
                                 const SubstringOccurence & occurence) {
    _occurences.insert(_occurences.end(), occurence);
}

View File

@ -3,11 +3,15 @@
#include "concordia/common/config.hpp"
#include "concordia/interval.hpp"
#include "concordia/substring_occurence.hpp"
#include <vector>
#include <iostream>
#include <boost/foreach.hpp>
/*!
Class representing matched pattern fragment in concordia search.
This fragment can be seen as a word interval of the pattern.
This class holds information about:
- where the pattern fragment was matched (example id and example offset)
- where the fragment is located within the pattern
@ -17,32 +21,26 @@
class MatchedPatternFragment : public Interval {
public:
/*! Constructor.
\param exampleId id of the example where the pattern fragment was matched
\param exampleOffset offset of the matched fragment in the example
\param patternOffset offset of the matched fragment in the pattern
\param matchedLength length of the matched pattern
*/
MatchedPatternFragment(const SUFFIX_MARKER_TYPE & exampleId,
const SUFFIX_MARKER_TYPE & exampleOffset,
const SUFFIX_MARKER_TYPE & patternOffset,
MatchedPatternFragment(const SUFFIX_MARKER_TYPE & patternOffset,
const SUFFIX_MARKER_TYPE & matchedLength);
/*! Destructor.
*/
virtual ~MatchedPatternFragment();
/*! Getter for example id.
\returns example id
/*! Getter for occurences.
\returns occurences
*/
SUFFIX_MARKER_TYPE getExampleId() const {
return _exampleId;
std::vector<SubstringOccurence> getOccurences() const {
return _occurences;
}
/*! Getter for example offset.
\returns example offset
/*! Adds an occurence to the list.
\param occurence occurence to be added
*/
SUFFIX_MARKER_TYPE getExampleOffset() const {
return _exampleOffset;
}
void addOccurence(const SubstringOccurence & occurence);
/*! Getter for pattern offset.
\returns pattern offset
@ -65,10 +63,22 @@ public:
return (_matchedLength > other.getMatchedLength());
}
private:
SUFFIX_MARKER_TYPE _exampleId;
friend std::ostream & operator << (std::ostream & o,
const MatchedPatternFragment & fragment) {
o << "fragment(patternOffset=" << fragment.getPatternOffset()
<< ", matchedLength=" << fragment.getMatchedLength() << ") {"
<< std::endl;
BOOST_FOREACH(SubstringOccurence occurence, fragment.getOccurences()) {
o << "\t" << occurence << std::endl;
}
SUFFIX_MARKER_TYPE _exampleOffset;
o << "}";
return o;
}
private:
std::vector<SubstringOccurence> _occurences;
SUFFIX_MARKER_TYPE _patternOffset;

View File

@ -3,6 +3,7 @@
#include "concordia/common/config.hpp"
#include <string>
#include <iostream>
/*!
Class representing occurence of a searched substring.
@ -65,6 +66,13 @@ public:
*/
void enterDataFromMarker(const SUFFIX_MARKER_TYPE & marker);
/*! Prints the occurence in the form
    occurence(exampleId=<id>, offset=<offset>).
    \param o output stream to write to
    \param occurence occurence to be printed
    \returns the stream o, so that calls can be chained
*/
friend std::ostream & operator << (std::ostream & o,
                                   const SubstringOccurence & occurence) {
    o << "occurence(exampleId=" << occurence.getId();
    o << ", offset=" << occurence.getOffset() << ")";
    return o;
}
private:
SUFFIX_MARKER_TYPE _id;

View File

@ -40,53 +40,51 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 11);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada");
concordia.addExample(Example("Ala posiada rysia",51));
concordia.addExample(Example("Marysia posiada rysia",123));
concordia.refreshSAfromRAM();
/*The test index contains 3 sentences:
/*The test index contains 3 sentences:
14: "Ala posiada kota"
51: "Ala posiada rysia"
123: "Marysia posiada rysia"
Test word map:
Ala -> 0
posiada -> 1
kota -> 2
rysia -> 3
Marysia -> 4
Test hashed index:
n: 0 1 2 3 4 5 6 7 8 9 10 11
T[n]: 0 1 2 | 0 1 3 | 4 1 3 |
Test suffix array:
n: 0 1 2 3 4 5 6 7 8 9 10 11
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
*/
std::vector<MatchedPatternFragment> searchResult1 = concordia.simpleSearch("posiada rysia");
std::vector<MatchedPatternFragment> searchResult2 = concordia.simpleSearch("posiada kota Ala");
MatchedPatternFragment searchResult1 = concordia.simpleSearch("posiada rysia");
MatchedPatternFragment searchResult2 = concordia.simpleSearch("posiada kota Ala");
concordia.clearIndex();
BOOST_CHECK_EQUAL(searchResult1.size(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 123);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleId(), 51);
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(1).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 2);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 123);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getId(), 51);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getOffset(), 1);
// Checking pattern spanning over 2 segments
BOOST_CHECK_EQUAL(searchResult2.size(), 0);
BOOST_CHECK_EQUAL(searchResult2.getOccurences().size(), 0);
}
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
{
// modified stop words to avoid anonymization
// modified stop words to avoid anonymization
Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
std::vector<Example> testExamples;
@ -106,12 +104,12 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getType(), 1);
BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getValue(), "xjest");
/*The test index contains 4 sentences:
/*The test index contains 4 sentences:
312: "xto xjest okno"
202: "czy xjest okno otwarte"
45: "chyba xto xjest xtutaj"
29: "xto xjest"
Test word map:
xto -> 0
xjest -> 1
@ -120,42 +118,37 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
otwarte -> 4
chyba -> 5
xtutaj -> 6
Test hashed index:
n: 0 1 2 3 4 5 6 7 8 9 10 11 12
T[n]: 0 1 2 3 1 2 4 5 0 1 6 0 1
Test suffix array:
n: 0 1 2 3 4 5 6 7 8 9 10 11 12
SA[n]: 11 0 8 12 1 4 9 2 5 3 6 7 10
*/
Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(),
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
std::vector<MatchedPatternFragment> searchResult1 = concordia2.simpleSearch("xto xjest");
std::vector<MatchedPatternFragment> searchResult2 = concordia2.simpleSearch("xjest okno");
MatchedPatternFragment searchResult1 = concordia2.simpleSearch("xto xjest");
MatchedPatternFragment searchResult2 = concordia2.simpleSearch("xjest okno");
concordia2.clearIndex();
BOOST_CHECK_EQUAL(searchResult1.size(), 3);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 312);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleId(), 45);
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(1).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(2).getExampleId(), 29);
BOOST_CHECK_EQUAL(searchResult1.at(2).getExampleOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1.at(2).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 3);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 312);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getId(), 45);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(2).getId(), 29);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(2).getOffset(), 0);
BOOST_CHECK_EQUAL(searchResult2.size(), 2);
BOOST_CHECK_EQUAL(searchResult2.at(0).getExampleId(), 202);
BOOST_CHECK_EQUAL(searchResult2.at(0).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult2.at(0).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult2.at(1).getExampleId(), 312);
BOOST_CHECK_EQUAL(searchResult2.at(1).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult2.at(1).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult2.getOccurences().size(), 2);
BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(0).getId(), 202);
BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(0).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(1).getId(), 312);
BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(1).getOffset(), 1);
}
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
@ -166,17 +159,16 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
testExamples.push_back(Example("2. Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem, z jakiego korzystają obywatele tego państwa.",312));
testExamples.push_back(Example("czy xjest żółte otwarte",202));
concordia.addAllExamples(testExamples);
Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(),
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
std::vector<MatchedPatternFragment> searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia");
MatchedPatternFragment searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia");
concordia2.clearIndex();
BOOST_CHECK_EQUAL(searchResult1.size(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 312);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 6);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 1);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 312);
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 2);
}
BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
@ -187,15 +179,15 @@ BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
concordia.addExample(Example("Ala posiada rysia",51));
concordia.addExample(Example("Marysia posiada rysia",123));
concordia.refreshSAfromRAM();
/*The test index contains 3 sentences:
/*The test index contains 3 sentences:
14: "Ala posiada kota"
51: "Ala posiada rysia"
123: "Marysia posiada rysia"
*/
// the below expectations assume 0.3 anubis threshold
std::vector<AnubisSearchResult> searchResult1 = concordia.anubisSearch("posiada rysia chyba");
BOOST_CHECK_EQUAL(searchResult1.size(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 51);
@ -235,9 +227,9 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
concordia.addExample(Example("Gosia chyba posiada rysia też",167));
concordia.addExample(Example("Ania od wczoraj posiada rysia",45));
concordia.refreshSAfromRAM();
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("posiada rysia chyba");
// best overlay: [0,2], [2,3], score = 0.829
// best overlay: [0,2], [2,3], score = 0.829
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.829, 0.1);
@ -247,53 +239,50 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 7);
/*
addFragment 45,2,0,2
addFragment 51,1,0,2
addFragment 123,1,0,2
addFragment 45,3,1,1
addFragment 51,2,1,1
addFragment 123,2,1,1
addFragment 167,1,2,1
adding fragment: offset=0, length=2
adding occurence: example id=167, offset=2
adding occurence: example id=45, offset=3
adding occurence: example id=51, offset=1
adding occurence: example id=123, offset=1
adding fragment: offset=1, length=1
adding occurence: example id=167, offset=3
adding occurence: example id=45, offset=4
adding occurence: example id=51, offset=2
adding occurence: example id=123, offset=2
adding fragment: offset=2, length=1
adding occurence: example id=167, offset=1
*/
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(1).getId(), 45);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(1).getOffset(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getId(), 51);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(3).getId(), 123);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(3).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleId(), 45);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleOffset(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(0).getId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(0).getOffset(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(1).getId(), 45);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(1).getOffset(), 4);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getId(), 51);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(3).getId(), 123);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(3).getOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleId(), 51);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleOffset(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getPatternOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleId(), 45);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleOffset(), 4);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getPatternOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleId(), 51);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getPatternOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getExampleId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getPatternOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getOccurences().at(0).getId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getOccurences().at(0).getOffset(), 1);
concordia.clearIndex();
}
@ -322,9 +311,11 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
concordia.addTokenizedExample(ts, 14);
concordia.refreshSAfromRAM();
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers");
// best overlay:
// best overlay:
// std::cout << *searchResult1 << std::endl;
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.655, 0.1);
@ -344,10 +335,46 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
Matched pattern fragment found. Pattern fragment: [3,5]321,2,3,2
Matched pattern fragment found. Pattern fragment: [7,9]14,9,7,2
Matched pattern fragment found. Pattern fragment: [8,9]14,10,8,1
Best overlay {
fragment(patternOffset=1, matchedLength=4) {
occurence(exampleId=321, offset=0)
}
fragment(patternOffset=5, matchedLength=4) {
occurence(exampleId=14, offset=7)
}
}
All fragments {
fragment(patternOffset=4, matchedLength=5) {
occurence(exampleId=14, offset=6)
}
fragment(patternOffset=1, matchedLength=4) {
occurence(exampleId=321, offset=0)
}
fragment(patternOffset=5, matchedLength=4) {
occurence(exampleId=14, offset=7)
}
fragment(patternOffset=2, matchedLength=3) {
occurence(exampleId=321, offset=1)
}
fragment(patternOffset=6, matchedLength=3) {
occurence(exampleId=14, offset=8)
}
fragment(patternOffset=3, matchedLength=2) {
occurence(exampleId=321, offset=2)
}
fragment(patternOffset=7, matchedLength=2) {
occurence(exampleId=14, offset=9)
}
fragment(patternOffset=8, matchedLength=1) {
occurence(exampleId=14, offset=10)
}
}
*/
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 14);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleOffset(), 6);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getId(), 14);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getOffset(), 6);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 4);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 5);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getStart(), 4);
@ -373,13 +400,13 @@ BOOST_AUTO_TEST_CASE( Tokenize )
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 16);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada");
std::vector<std::string> sentences;
sentences.push_back("Marysia, ma rysia;");
sentences.push_back("Testing complete;");
sentences.push_back("This, is (a) weird;! sentence <>");
std::vector<TokenizedSentence> tokenizedSentences = concordia.tokenizeAll(sentences);
concordia.clearIndex();
@ -387,7 +414,7 @@ BOOST_AUTO_TEST_CASE( Tokenize )
BOOST_CHECK_EQUAL(tokenizedSentences.at(0).getTokens().size(), 3);
BOOST_CHECK_EQUAL(tokenizedSentences.at(1).getTokens().size(), 2);
BOOST_CHECK_EQUAL(tokenizedSentences.at(2).getTokens().size(), 5);
}
BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences )
@ -400,30 +427,30 @@ BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences )
concordia.addExample(Example("Marysia posiada rysia",123));
concordia.addExample(Example("Ala posiada kota i psa",542));
concordia.refreshSAfromRAM();
/*The test index contains 3 sentences:
/*The test index contains 3 sentences:
14: "Ala posiada kota"
51: "Ala posiada rysia"
123: "Marysia posiada rysia"
Test word map:
Ala -> 0
posiada -> 1
kota -> 2
rysia -> 3
Marysia -> 4
Test hashed index:
n: 0 1 2 3 4 5 6 7 8 9 10 11
T[n]: 0 1 2 | 0 1 3 | 4 1 3 |
Test suffix array:
n: 0 1 2 3 4 5 6 7 8 9 10 11
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
*/
BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada"), 0);
BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada"), 0);
BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada rysia"), 1);
@ -446,7 +473,7 @@ BOOST_AUTO_TEST_CASE( TokenizeWhitespace )
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23");
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11);
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18);
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
@ -455,7 +482,7 @@ BOOST_AUTO_TEST_CASE( TokenizeWhitespace )
BOOST_CHECK_EQUAL(ts.getCodes().size(), 7);
concordia.clearIndex();
}
BOOST_AUTO_TEST_CASE( TokenizeOnly )
@ -469,7 +496,7 @@ BOOST_AUTO_TEST_CASE( TokenizeOnly )
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23");
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11);
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18);
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
@ -478,7 +505,7 @@ BOOST_AUTO_TEST_CASE( TokenizeOnly )
BOOST_CHECK_EQUAL(ts.getCodes().size(), 0); //there should be no codes, only tokens
concordia.clearIndex();
}
BOOST_AUTO_TEST_SUITE_END()

BIN
scripts/concordia_json.zip Normal file

Binary file not shown.

View File

@ -0,0 +1,22 @@
{
"status": "success", //status operacji
"result": {
"bestOverlayScore" : <liczba 0-1>, // Concordia podaje score znalezionego przez siebie dopasowania
"bestOverlay" : [ // lista fragmentów zdania wejściowego, które znalazły się w pamięci tłumaczeń
{ // jeden fragment
"matchedPatternStart": 0, // index (character-based) początku fragmentu zdania wejściowego
"matchedPatternEnd": 8, // index końca fragmentu (exclusive, przedział prawostronnie otwarty)
"occurences": [{ // lista przykładów z pamięci tłumaczeń, w których znalazł się dany fragment zdania wejściowego
"id": 1782145, // id przykładu z pamięci tłumaczeń (przykład to para zdań źródłowe-docelowe)
"matchedExampleStart": 305, // index początku fragmentu w zdaniu źródłowym przykładu
"matchedExampleEnd": 314, // index końca fragmentu w zdaniu źródłowym przykładu (exclusive)
"sourceSegment": <text>, // pełny tekst zdania źródłowego przykładu
"targetSegment": <text>, // pełny tekst zdania docelowego przykładu
"targetFragments": [ // lista fragmentów zdania docelowego, do których urównoleglony jest znaleziony fragment zdania źródłowego
[257, 264]
]
}]
}, .... // mogą być jeszcze kolejne fragmenty
]
}
}

View File

@ -0,0 +1,5 @@
{
"operation": "concordiaSearch",
"pattern":"Ala ma kota",
"tmId":1
}

View File

@ -0,0 +1,43 @@
{
"status": "success",
"result": {
"bestOverlayScore": 0.5,
"bestOverlay": [{
"matchedPatternStart": 0,
"matchedPatternEnd": 8,
"occurences": [{
"id": 1782145,
"matchedExampleStart": 305,
"matchedExampleEnd": 314,
"sourceSegment": "10 . państwa członkowskie odpowiedzialne są za drukowanie formularzy . formularze można również drukować w drukarniach wyznaczonych przez państwo członkowskie , w którym są ustanowione . w ostatnim przypadku na każdym formularzu musi znaleźć się odniesienie do tego postanowienia państwa członkowskiego . na każdym formularzu znajduje się informacja dotycząca nazwy i adresu drukarni lub znak umożliwiający jej identyfikację . \" ;",
"targetSegment": "10 . member states shall be responsible for having the forms printed . the forms may also be printed by printers appointed by the member state in which they are established . in the latter case , reference to the appointment by the member state must appear on each form . each form shall bear an indication of the printer 's name and address or a mark enabling the printer to be identified . '",
"targetFragments": [
[257, 264]
]
}, {
"id": 1782145,
"matchedExampleStart": 326,
"matchedExampleEnd": 335,
"sourceSegment": "10 . państwa członkowskie odpowiedzialne są za drukowanie formularzy . formularze można również drukować w drukarniach wyznaczonych przez państwo członkowskie , w którym są ustanowione . w ostatnim przypadku na każdym formularzu musi znaleźć się odniesienie do tego postanowienia państwa członkowskiego . na każdym formularzu znajduje się informacja dotycząca nazwy i adresu drukarni lub znak umożliwiający jej identyfikację . \" ;",
"targetSegment": "10 . member states shall be responsible for having the forms printed . the forms may also be printed by printers appointed by the member state in which they are established . in the latter case , reference to the appointment by the member state must appear on each form . each form shall bear an indication of the printer 's name and address or a mark enabling the printer to be identified . '",
"targetFragments": [
[300, 315]
]
}]
}, {
"matchedPatternStart": 9,
"matchedPatternEnd": 47,
"occurences": [{
"id": 1623941,
"matchedExampleStart": 54,
"matchedExampleEnd": 93,
"sourceSegment": "wszelkie spory między albo islandią , albo norwegią a państwem członkowskim unii europejskiej dotyczące interpretacji lub stosowania niniejszej umowy mogą być przekazywane przez stronę sporu na posiedzeniu przedstawicieli rządów państw członkowskich unii europejskiej oraz islandii i norwegii w celu rozstrzygnięcia sporu w terminie sześciu miesięcy .",
"targetSegment": "any dispute between either iceland or norway and a member state of the european union regarding the interpretation or the application of this agreement may be referred by a party to the dispute to a meeting of representatives of the governments of the member states of the european union and of iceland and norway , with a view to its settlement within six months .",
"targetFragments": [
[51, 85],
[96, 99]
]
}]
}]
}
}