multiple results

This commit is contained in:
rjawor 2017-04-21 14:51:58 +02:00
parent c3826919ba
commit 31e4f091ad
16 changed files with 317 additions and 185 deletions

View File

@ -174,17 +174,18 @@ int main(int argc, char** argv) {
std::cout << "\tSearching for pattern: \"" << pattern << std::cout << "\tSearching for pattern: \"" << pattern <<
"\"" << std::endl; "\"" << std::endl;
time_start = boost::posix_time::microsec_clock::local_time(); time_start = boost::posix_time::microsec_clock::local_time();
std::vector<MatchedPatternFragment> result = MatchedPatternFragment result =
concordia.simpleSearch(pattern); concordia.simpleSearch(pattern);
time_end = boost::posix_time::microsec_clock::local_time(); time_end = boost::posix_time::microsec_clock::local_time();
msdiff = time_end - time_start; msdiff = time_end - time_start;
std::cout << "\tFound: " << result.size() << " matches. " std::cout << "\tFound: " << result.getOccurences().size()
<< "Search took: " << << " matches. " << "Search took: "
msdiff.total_milliseconds() << "ms." << std::endl; << msdiff.total_milliseconds() << "ms." << std::endl;
if (!cli.count("silent")) { if (!cli.count("silent")) {
BOOST_FOREACH(MatchedPatternFragment occurence, result) { BOOST_FOREACH(SubstringOccurence occurence,
result.getOccurences()) {
std::cout << "\t\tfound match in sentence number: " std::cout << "\t\tfound match in sentence number: "
<< occurence.getExampleId() << std::endl; << occurence.getId() << std::endl;
} }
} }
} else if (cli.count("anubis-search")) { } else if (cli.count("anubis-search")) {
@ -234,10 +235,9 @@ int main(int argc, char** argv) {
result->getBestOverlay()) { result->getBestOverlay()) {
std::cout << "\t\tfragment [" << fragment.getStart() std::cout << "\t\tfragment [" << fragment.getStart()
<< "," << fragment.getEnd() << "," << fragment.getEnd()
<< "] (exampleId, exampleOffset," << "] (exampleCount,"
<< " patternOffset, length): " << " patternOffset, length): "
<< fragment.getExampleId() << "," << fragment.getOccurences().size() << ","
<< fragment.getExampleOffset() << ","
<< fragment.getPatternOffset() << "," << fragment.getPatternOffset() << ","
<< fragment.getMatchedLength() << fragment.getMatchedLength()
<< std::endl; << std::endl;
@ -248,10 +248,9 @@ int main(int argc, char** argv) {
result->getFragments()) { result->getFragments()) {
std::cout << "\t\tfragment [" << fragment.getStart() std::cout << "\t\tfragment [" << fragment.getStart()
<< "," << fragment.getEnd() << "," << fragment.getEnd()
<< "] (exampleId, exampleOffset," << "] (exampleCount,"
<< " patternOffset, length): " << " patternOffset, length): "
<< fragment.getExampleId() << "," << fragment.getOccurences().size() << ","
<< fragment.getExampleOffset() << ","
<< fragment.getPatternOffset() << "," << fragment.getPatternOffset() << ","
<< fragment.getMatchedLength() << fragment.getMatchedLength()
<< std::endl; << std::endl;

View File

@ -25,7 +25,7 @@ typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
// sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES store the suffix offset // sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES store the suffix offset
// and the last SUFFIX_MARKER_SENTENCE_BYTES store the sentence length. // and the last SUFFIX_MARKER_SENTENCE_BYTES store the sentence length.
#define CONCORDIA_SEARCH_MAX_RESULTS 3 #define CONCORDIA_SEARCH_MAX_RESULTS 5
#define WORD_MAP_FILE_NAME "word_map.bin" #define WORD_MAP_FILE_NAME "word_map.bin"
#define MARKERS_FILE_NAME "markers.bin" #define MARKERS_FILE_NAME "markers.bin"

View File

@ -69,17 +69,19 @@ std::vector<TokenizedSentence> Concordia::tokenizeAll(
bool generateCodes) bool generateCodes)
throw(ConcordiaException) { throw(ConcordiaException) {
std::vector<TokenizedSentence> result; std::vector<TokenizedSentence> result;
if (generateCodes) { if (generateCodes) {
BOOST_FOREACH(std::string sentence, sentences) { BOOST_FOREACH(std::string sentence, sentences) {
result.push_back(_hashGenerator->generateHash(sentence, byWhitespace)); result.push_back(_hashGenerator->generateHash(sentence,
byWhitespace));
} }
_hashGenerator->serializeWordMap(); _hashGenerator->serializeWordMap();
} else { } else {
BOOST_FOREACH(std::string sentence, sentences) { BOOST_FOREACH(std::string sentence, sentences) {
result.push_back(_hashGenerator->generateTokens(sentence, byWhitespace)); result.push_back(_hashGenerator->generateTokens(sentence,
} byWhitespace));
}
} }
return result; return result;
} }
@ -208,14 +210,14 @@ SUFFIX_MARKER_TYPE Concordia::countOccurences(const std::string & pattern)
} }
std::vector<MatchedPatternFragment> Concordia::simpleSearch( MatchedPatternFragment Concordia::simpleSearch(
const std::string & pattern) const std::string & pattern)
throw(ConcordiaException) { throw(ConcordiaException) {
if (_T->size() > 0) { if (_T->size() > 0) {
return _searcher->simpleSearch(_hashGenerator, _T, return _searcher->simpleSearch(_hashGenerator, _T,
_markers, _SA, pattern); _markers, _SA, pattern);
} else { } else {
std::vector<MatchedPatternFragment> result; MatchedPatternFragment result(0, 0);
return result; return result;
} }
} }
@ -269,4 +271,3 @@ std::string Concordia::_getHashedIndexFilePath() {
std::string Concordia::_getMarkersFilePath() { std::string Concordia::_getMarkersFilePath() {
return _indexPath+"/"+MARKERS_FILE_NAME; return _indexPath+"/"+MARKERS_FILE_NAME;
} }

View File

@ -126,12 +126,11 @@ public:
/*! Performs a simple substring lookup on the index. /*! Performs a simple substring lookup on the index.
For more info see \ref tutorial1_2. For more info see \ref tutorial1_2.
\param pattern pattern to be searched in the index \param pattern pattern to be searched in the index
\returns vector of matched results \returns matched pattern fragment containing vector of occurences
\throws ConcordiaException \throws ConcordiaException
*/ */
std::vector<MatchedPatternFragment> simpleSearch( MatchedPatternFragment simpleSearch(const std::string & pattern)
const std::string & pattern) throw(ConcordiaException);
throw(ConcordiaException);
SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern) SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern)
throw(ConcordiaException); throw(ConcordiaException);
@ -160,7 +159,7 @@ public:
/*! Loads HDD stored index files to RAM and generates /*! Loads HDD stored index files to RAM and generates
suffix array based on RAM stored data structures. suffix array based on RAM stored data structures.
For more info see \ref tutorial2. For more info see \ref tutorial2.
\throws ConcordiaException \throws ConcordiaException
*/ */
void loadRAMIndexFromDisk() throw(ConcordiaException); void loadRAMIndexFromDisk() throw(ConcordiaException);

View File

@ -16,7 +16,7 @@
- list of longest matched fragments sorted in descending order by length - list of longest matched fragments sorted in descending order by length
- the best overlay - the best overlay
- the score of the best overlay. - the score of the best overlay.
For more info about concordia searching see \ref tutorial1_3. For more info about concordia searching see \ref tutorial1_3.
*/ */
@ -75,6 +75,24 @@ public:
return _bestOverlayScore; return _bestOverlayScore;
} }
friend std::ostream & operator << (std::ostream & o,
const ConcordiaSearchResult & result) {
o << "Best overlay {" << std::endl;
BOOST_FOREACH(MatchedPatternFragment fragment,
result.getBestOverlay()) {
o << fragment << std::endl;
}
o << "}" << std::endl;
o << "All fragments {" << std::endl;
BOOST_FOREACH(MatchedPatternFragment fragment,
result.getFragments()) {
o << fragment << std::endl;
}
o << "}";
return o;
}
private: private:
void _checkPossibleOverlays( void _checkPossibleOverlays(
std::vector<MatchedPatternFragment> currentOverlay, std::vector<MatchedPatternFragment> currentOverlay,

View File

@ -36,12 +36,14 @@ void ConcordiaSearcher::concordiaSearch(
std::vector<SubstringOccurence> occurences = std::vector<SubstringOccurence> occurences =
lcpSearch(T, markers, SA, currentPattern, lcpLength); lcpSearch(T, markers, SA, currentPattern, lcpLength);
BOOST_FOREACH(SubstringOccurence occurence, occurences) { if (occurences.size() > 0) {
result->addFragment(MatchedPatternFragment( MatchedPatternFragment fragment(offset,
occurence.getId(), lcpLength / sizeof(INDEX_CHARACTER_TYPE));
occurence.getOffset(),
offset, BOOST_FOREACH(SubstringOccurence occurence, occurences) {
lcpLength / sizeof(INDEX_CHARACTER_TYPE))); fragment.addOccurence(occurence);
}
result->addFragment(fragment);
} }
} }

View File

@ -13,14 +13,12 @@ IndexSearcher::IndexSearcher() {
IndexSearcher::~IndexSearcher() { IndexSearcher::~IndexSearcher() {
} }
std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch( MatchedPatternFragment IndexSearcher::simpleSearch(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA, boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException) { const std::string & pattern) throw(ConcordiaException) {
std::vector<MatchedPatternFragment> result;
int left; int left;
std::vector<INDEX_CHARACTER_TYPE> hash = std::vector<INDEX_CHARACTER_TYPE> hash =
hashGenerator->generateHash(pattern).getCodes(); hashGenerator->generateHash(pattern).getCodes();
@ -30,6 +28,7 @@ std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
int size = sa_search(T->data(), (saidx_t) T->size(), int size = sa_search(T->data(), (saidx_t) T->size(),
(const sauchar_t *) patternArray, patternLength, (const sauchar_t *) patternArray, patternLength,
SA->data(), (saidx_t) SA->size(), &left); SA->data(), (saidx_t) SA->size(), &left);
MatchedPatternFragment result(0, hash.size());
for (int i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
saidx_t resultPos = SA->at(left + i); saidx_t resultPos = SA->at(left + i);
if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) { if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
@ -40,12 +39,11 @@ std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
// removes these accidental results. // removes these accidental results.
saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE); saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos); SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);
result.push_back(MatchedPatternFragment(
Utils::getIdFromMarker(marker), SubstringOccurence occurence;
Utils::getOffsetFromMarker(marker), occurence.enterDataFromMarker(marker);
0, result.addOccurence(occurence);
hash.size())); if (result.getOccurences().size() >= CONCORDIA_SEARCH_MAX_RESULTS) {
if (result.size() >= CONCORDIA_SEARCH_MAX_RESULTS) {
break; break;
} }
} }

View File

@ -18,7 +18,7 @@
/*! /*!
Class for searching the index with a sentence. In all searches the sentence Class for searching the index with a sentence. In all searches the sentence
is first hashed and then used as a query. is first hashed and then used as a query.
IndexSearcher performs the simpleSearch on its own, but uses a IndexSearcher performs the simpleSearch on its own, but uses a
ConcordiaSearcher object to carry out concordiaSearch. ConcordiaSearcher object to carry out concordiaSearch.
@ -42,10 +42,10 @@ public:
\param markers markers array for the needs of searching \param markers markers array for the needs of searching
\param SA suffix array for the needs of searching \param SA suffix array for the needs of searching
\param pattern string pattern to be searched in the index. \param pattern string pattern to be searched in the index.
\returns vector of occurences of the pattern in the index \returns matched pattern fragment, containing occurences of the pattern in the index
\throws ConcordiaException \throws ConcordiaException
*/ */
std::vector<MatchedPatternFragment> simpleSearch( MatchedPatternFragment simpleSearch(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,

View File

@ -1,14 +1,10 @@
#include "concordia/matched_pattern_fragment.hpp" #include "concordia/matched_pattern_fragment.hpp"
MatchedPatternFragment::MatchedPatternFragment( MatchedPatternFragment::MatchedPatternFragment(
const SUFFIX_MARKER_TYPE & exampleId,
const SUFFIX_MARKER_TYPE & exampleOffset,
const SUFFIX_MARKER_TYPE & patternOffset, const SUFFIX_MARKER_TYPE & patternOffset,
const SUFFIX_MARKER_TYPE & matchedLength): const SUFFIX_MARKER_TYPE & matchedLength):
Interval(patternOffset, Interval(patternOffset,
patternOffset + matchedLength), patternOffset + matchedLength),
_exampleId(exampleId),
_exampleOffset(exampleOffset),
_patternOffset(patternOffset), _patternOffset(patternOffset),
_matchedLength(matchedLength) { _matchedLength(matchedLength) {
} }
@ -16,3 +12,7 @@ MatchedPatternFragment::MatchedPatternFragment(
MatchedPatternFragment::~MatchedPatternFragment() { MatchedPatternFragment::~MatchedPatternFragment() {
} }
void MatchedPatternFragment::addOccurence(
const SubstringOccurence & occurence) {
_occurences.push_back(occurence);
}

View File

@ -3,11 +3,15 @@
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include "concordia/interval.hpp" #include "concordia/interval.hpp"
#include "concordia/substring_occurence.hpp"
#include <vector>
#include <iostream>
#include <boost/foreach.hpp>
/*! /*!
Class representing matched pattern fragment in concordia search. Class representing matched pattern fragment in concordia search.
This fragment can be seen as a word interval of the pattern. This fragment can be seen as a word interval of the pattern.
This class holds information about: This class holds information about:
- where the pattern fragment was matched (example id and example offset) - where the pattern fragment was matched (example id and example offset)
- where the fragment is located within the pattern - where the fragment is located within the pattern
@ -17,32 +21,26 @@
class MatchedPatternFragment : public Interval { class MatchedPatternFragment : public Interval {
public: public:
/*! Constructor. /*! Constructor.
\param exampleId id of the example where the pattern fragment was matched
\param exampleOffset offset of the matched fragment in the example
\param patternOffset offset of the matched fragment in the pattern \param patternOffset offset of the matched fragment in the pattern
\param matchedLength length of the matched pattern \param matchedLength length of the matched pattern
*/ */
MatchedPatternFragment(const SUFFIX_MARKER_TYPE & exampleId, MatchedPatternFragment(const SUFFIX_MARKER_TYPE & patternOffset,
const SUFFIX_MARKER_TYPE & exampleOffset,
const SUFFIX_MARKER_TYPE & patternOffset,
const SUFFIX_MARKER_TYPE & matchedLength); const SUFFIX_MARKER_TYPE & matchedLength);
/*! Destructor. /*! Destructor.
*/ */
virtual ~MatchedPatternFragment(); virtual ~MatchedPatternFragment();
/*! Getter for example id. /*! Getter for occurences.
\returns example id \returns occurences
*/ */
SUFFIX_MARKER_TYPE getExampleId() const { std::vector<SubstringOccurence> getOccurences() const {
return _exampleId; return _occurences;
} }
/*! Getter for example offset. /*! Adds an occurence to the list.
\returns example offset \param occurence occurence to be added
*/ */
SUFFIX_MARKER_TYPE getExampleOffset() const { void addOccurence(const SubstringOccurence & occurence);
return _exampleOffset;
}
/*! Getter for pattern offset. /*! Getter for pattern offset.
\returns pattern offset \returns pattern offset
@ -65,10 +63,22 @@ public:
return (_matchedLength > other.getMatchedLength()); return (_matchedLength > other.getMatchedLength());
} }
private: friend std::ostream & operator << (std::ostream & o,
SUFFIX_MARKER_TYPE _exampleId; const MatchedPatternFragment & fragment) {
o << "fragment(patternOffset=" << fragment.getPatternOffset()
<< ", matchedLength=" << fragment.getMatchedLength() << ") {"
<< std::endl;
BOOST_FOREACH(SubstringOccurence occurence, fragment.getOccurences()) {
o << "\t" << occurence << std::endl;
}
SUFFIX_MARKER_TYPE _exampleOffset; o << "}";
return o;
}
private:
std::vector<SubstringOccurence> _occurences;
SUFFIX_MARKER_TYPE _patternOffset; SUFFIX_MARKER_TYPE _patternOffset;

View File

@ -3,6 +3,7 @@
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include <string> #include <string>
#include <iostream>
/*! /*!
Class representing occurence of a searched substring. Class representing occurence of a searched substring.
@ -65,6 +66,13 @@ public:
*/ */
void enterDataFromMarker(const SUFFIX_MARKER_TYPE & marker); void enterDataFromMarker(const SUFFIX_MARKER_TYPE & marker);
friend std::ostream & operator << (std::ostream & o,
const SubstringOccurence & occurence) {
return o << "occurence(exampleId=" << occurence.getId()
<< ", offset=" << occurence.getOffset() << ")";
}
private: private:
SUFFIX_MARKER_TYPE _id; SUFFIX_MARKER_TYPE _id;

View File

@ -40,53 +40,51 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 11); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 11);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada"); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada");
concordia.addExample(Example("Ala posiada rysia",51)); concordia.addExample(Example("Ala posiada rysia",51));
concordia.addExample(Example("Marysia posiada rysia",123)); concordia.addExample(Example("Marysia posiada rysia",123));
concordia.refreshSAfromRAM(); concordia.refreshSAfromRAM();
/*The test index contains 3 sentences: /*The test index contains 3 sentences:
14: "Ala posiada kota" 14: "Ala posiada kota"
51: "Ala posiada rysia" 51: "Ala posiada rysia"
123: "Marysia posiada rysia" 123: "Marysia posiada rysia"
Test word map: Test word map:
Ala -> 0 Ala -> 0
posiada -> 1 posiada -> 1
kota -> 2 kota -> 2
rysia -> 3 rysia -> 3
Marysia -> 4 Marysia -> 4
Test hashed index: Test hashed index:
n: 0 1 2 3 4 5 6 7 8 9 10 11 n: 0 1 2 3 4 5 6 7 8 9 10 11
T[n]: 0 1 2 | 0 1 3 | 4 1 3 | T[n]: 0 1 2 | 0 1 3 | 4 1 3 |
Test suffix array: Test suffix array:
n: 0 1 2 3 4 5 6 7 8 9 10 11 n: 0 1 2 3 4 5 6 7 8 9 10 11
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7 SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
*/ */
std::vector<MatchedPatternFragment> searchResult1 = concordia.simpleSearch("posiada rysia"); MatchedPatternFragment searchResult1 = concordia.simpleSearch("posiada rysia");
std::vector<MatchedPatternFragment> searchResult2 = concordia.simpleSearch("posiada kota Ala"); MatchedPatternFragment searchResult2 = concordia.simpleSearch("posiada kota Ala");
concordia.clearIndex(); concordia.clearIndex();
BOOST_CHECK_EQUAL(searchResult1.size(), 2); BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 123); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 123);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 1); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 2); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getId(), 51);
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleId(), 51); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(1).getMatchedLength(), 2);
// Checking pattern spanning over 2 segments // Checking pattern spanning over 2 segments
BOOST_CHECK_EQUAL(searchResult2.size(), 0); BOOST_CHECK_EQUAL(searchResult2.getOccurences().size(), 0);
} }
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 ) BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
{ {
// modified stop words to avoid anonymization // modified stop words to avoid anonymization
Concordia concordia = Concordia(TestResourcesManager::getTempPath(), Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
std::vector<Example> testExamples; std::vector<Example> testExamples;
@ -106,12 +104,12 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getType(), 1); BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getType(), 1);
BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getValue(), "xjest"); BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getValue(), "xjest");
/*The test index contains 4 sentences: /*The test index contains 4 sentences:
312: "xto xjest okno" 312: "xto xjest okno"
202: "czy xjest okno otwarte" 202: "czy xjest okno otwarte"
45: "chyba xto xjest xtutaj" 45: "chyba xto xjest xtutaj"
29: "xto xjest" 29: "xto xjest"
Test word map: Test word map:
xto -> 0 xto -> 0
xjest -> 1 xjest -> 1
@ -120,42 +118,37 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
otwarte -> 4 otwarte -> 4
chyba -> 5 chyba -> 5
xtutaj -> 6 xtutaj -> 6
Test hashed index: Test hashed index:
n: 0 1 2 3 4 5 6 7 8 9 10 11 12 n: 0 1 2 3 4 5 6 7 8 9 10 11 12
T[n]: 0 1 2 3 1 2 4 5 0 1 6 0 1 T[n]: 0 1 2 3 1 2 4 5 0 1 6 0 1
Test suffix array: Test suffix array:
n: 0 1 2 3 4 5 6 7 8 9 10 11 12 n: 0 1 2 3 4 5 6 7 8 9 10 11 12
SA[n]: 11 0 8 12 1 4 9 2 5 3 6 7 10 SA[n]: 11 0 8 12 1 4 9 2 5 3 6 7 10
*/ */
Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(), Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(),
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
std::vector<MatchedPatternFragment> searchResult1 = concordia2.simpleSearch("xto xjest"); MatchedPatternFragment searchResult1 = concordia2.simpleSearch("xto xjest");
std::vector<MatchedPatternFragment> searchResult2 = concordia2.simpleSearch("xjest okno"); MatchedPatternFragment searchResult2 = concordia2.simpleSearch("xjest okno");
concordia2.clearIndex(); concordia2.clearIndex();
BOOST_CHECK_EQUAL(searchResult1.size(), 3); BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 3);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 312); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 312);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 0); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 2); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getId(), 45);
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleId(), 45); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleOffset(), 1); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(2).getId(), 29);
BOOST_CHECK_EQUAL(searchResult1.at(1).getMatchedLength(), 2); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(2).getOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1.at(2).getExampleId(), 29);
BOOST_CHECK_EQUAL(searchResult1.at(2).getExampleOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1.at(2).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult2.size(), 2); BOOST_CHECK_EQUAL(searchResult2.getOccurences().size(), 2);
BOOST_CHECK_EQUAL(searchResult2.at(0).getExampleId(), 202); BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(0).getId(), 202);
BOOST_CHECK_EQUAL(searchResult2.at(0).getExampleOffset(), 1); BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(0).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult2.at(0).getMatchedLength(), 2); BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(1).getId(), 312);
BOOST_CHECK_EQUAL(searchResult2.at(1).getExampleId(), 312); BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(1).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult2.at(1).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult2.at(1).getMatchedLength(), 2);
} }
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 ) BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
@ -166,17 +159,16 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
testExamples.push_back(Example("2. Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem, z jakiego korzystają obywatele tego państwa.",312)); testExamples.push_back(Example("2. Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem, z jakiego korzystają obywatele tego państwa.",312));
testExamples.push_back(Example("czy xjest żółte otwarte",202)); testExamples.push_back(Example("czy xjest żółte otwarte",202));
concordia.addAllExamples(testExamples); concordia.addAllExamples(testExamples);
Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(), Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(),
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
std::vector<MatchedPatternFragment> searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia"); MatchedPatternFragment searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia");
concordia2.clearIndex(); concordia2.clearIndex();
BOOST_CHECK_EQUAL(searchResult1.size(), 1); BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 312); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 312);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 2); BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 6);
} }
BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 ) BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
@ -187,15 +179,15 @@ BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
concordia.addExample(Example("Ala posiada rysia",51)); concordia.addExample(Example("Ala posiada rysia",51));
concordia.addExample(Example("Marysia posiada rysia",123)); concordia.addExample(Example("Marysia posiada rysia",123));
concordia.refreshSAfromRAM(); concordia.refreshSAfromRAM();
/*The test index contains 3 sentences: /*The test index contains 3 sentences:
14: "Ala posiada kota" 14: "Ala posiada kota"
51: "Ala posiada rysia" 51: "Ala posiada rysia"
123: "Marysia posiada rysia" 123: "Marysia posiada rysia"
*/ */
// the below expectations assume 0.3 anubis threshold // the below expectations assume 0.3 anubis threshold
std::vector<AnubisSearchResult> searchResult1 = concordia.anubisSearch("posiada rysia chyba"); std::vector<AnubisSearchResult> searchResult1 = concordia.anubisSearch("posiada rysia chyba");
BOOST_CHECK_EQUAL(searchResult1.size(), 2); BOOST_CHECK_EQUAL(searchResult1.size(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 51); BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 51);
@ -235,9 +227,9 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
concordia.addExample(Example("Gosia chyba posiada rysia też",167)); concordia.addExample(Example("Gosia chyba posiada rysia też",167));
concordia.addExample(Example("Ania od wczoraj posiada rysia",45)); concordia.addExample(Example("Ania od wczoraj posiada rysia",45));
concordia.refreshSAfromRAM(); concordia.refreshSAfromRAM();
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("posiada rysia chyba"); boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("posiada rysia chyba");
// best overlay: [0,2], [2,3], score = 0.829 // best overlay: [0,2], [2,3], score = 0.829
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2); BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.829, 0.1); BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.829, 0.1);
@ -247,53 +239,50 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 3); BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 7);
/* /*
addFragment 45,2,0,2 adding fragment: offset=0, length=2
addFragment 51,1,0,2 adding occurence: example id=167, offset=2
addFragment 123,1,0,2 adding occurence: example id=45, offset=3
addFragment 45,3,1,1 adding occurence: example id=51, offset=1
addFragment 51,2,1,1 adding occurence: example id=123, offset=1
addFragment 123,2,1,1 adding fragment: offset=1, length=1
addFragment 167,1,2,1 adding occurence: example id=167, offset=3
adding occurence: example id=45, offset=4
adding occurence: example id=51, offset=2
adding occurence: example id=123, offset=2
adding fragment: offset=2, length=1
adding occurence: example id=167, offset=1
*/ */
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 167); BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 0); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 2); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(1).getId(), 45);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(1).getOffset(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getId(), 51);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(3).getId(), 123);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(3).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleId(), 45); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleOffset(), 3); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 0); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(0).getId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 2); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(0).getOffset(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(1).getId(), 45);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(1).getOffset(), 4);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getId(), 51);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(3).getId(), 123);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(3).getOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleId(), 51); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleOffset(), 1); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 0); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getOccurences().at(0).getId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 2); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getOccurences().at(0).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleOffset(), 3);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getPatternOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleId(), 45);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleOffset(), 4);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getPatternOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleId(), 51);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getPatternOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getMatchedLength(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getExampleId(), 167);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getPatternOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getMatchedLength(), 1);
concordia.clearIndex(); concordia.clearIndex();
} }
@ -322,9 +311,11 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
concordia.addTokenizedExample(ts, 14); concordia.addTokenizedExample(ts, 14);
concordia.refreshSAfromRAM(); concordia.refreshSAfromRAM();
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers"); boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers");
// best overlay: // best overlay:
// std::cout << *searchResult1 << std::endl;
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2); BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.655, 0.1); BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.655, 0.1);
@ -344,10 +335,46 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
Matched pattern fragment found. Pattern fragment: [3,5]321,2,3,2 Matched pattern fragment found. Pattern fragment: [3,5]321,2,3,2
Matched pattern fragment found. Pattern fragment: [7,9]14,9,7,2 Matched pattern fragment found. Pattern fragment: [7,9]14,9,7,2
Matched pattern fragment found. Pattern fragment: [8,9]14,10,8,1 Matched pattern fragment found. Pattern fragment: [8,9]14,10,8,1
Best overlay {
fragment(patternOffset=1, matchedLength=4) {
occurence(exampleId=321, offset=0)
}
fragment(patternOffset=5, matchedLength=4) {
occurence(exampleId=14, offset=7)
}
}
All fragments {
fragment(patternOffset=4, matchedLength=5) {
occurence(exampleId=14, offset=6)
}
fragment(patternOffset=1, matchedLength=4) {
occurence(exampleId=321, offset=0)
}
fragment(patternOffset=5, matchedLength=4) {
occurence(exampleId=14, offset=7)
}
fragment(patternOffset=2, matchedLength=3) {
occurence(exampleId=321, offset=1)
}
fragment(patternOffset=6, matchedLength=3) {
occurence(exampleId=14, offset=8)
}
fragment(patternOffset=3, matchedLength=2) {
occurence(exampleId=321, offset=2)
}
fragment(patternOffset=7, matchedLength=2) {
occurence(exampleId=14, offset=9)
}
fragment(patternOffset=8, matchedLength=1) {
occurence(exampleId=14, offset=10)
}
}
*/ */
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 14); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getId(), 14);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleOffset(), 6); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getOffset(), 6);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 4); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 4);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 5); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 5);
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getStart(), 4); BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getStart(), 4);
@ -373,13 +400,13 @@ BOOST_AUTO_TEST_CASE( Tokenize )
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 16); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 16);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada"); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada");
std::vector<std::string> sentences; std::vector<std::string> sentences;
sentences.push_back("Marysia, ma rysia;"); sentences.push_back("Marysia, ma rysia;");
sentences.push_back("Testing complete;"); sentences.push_back("Testing complete;");
sentences.push_back("This, is (a) weird;! sentence <>"); sentences.push_back("This, is (a) weird;! sentence <>");
std::vector<TokenizedSentence> tokenizedSentences = concordia.tokenizeAll(sentences); std::vector<TokenizedSentence> tokenizedSentences = concordia.tokenizeAll(sentences);
concordia.clearIndex(); concordia.clearIndex();
@ -387,7 +414,7 @@ BOOST_AUTO_TEST_CASE( Tokenize )
BOOST_CHECK_EQUAL(tokenizedSentences.at(0).getTokens().size(), 3); BOOST_CHECK_EQUAL(tokenizedSentences.at(0).getTokens().size(), 3);
BOOST_CHECK_EQUAL(tokenizedSentences.at(1).getTokens().size(), 2); BOOST_CHECK_EQUAL(tokenizedSentences.at(1).getTokens().size(), 2);
BOOST_CHECK_EQUAL(tokenizedSentences.at(2).getTokens().size(), 5); BOOST_CHECK_EQUAL(tokenizedSentences.at(2).getTokens().size(), 5);
} }
BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences ) BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences )
@ -400,30 +427,30 @@ BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences )
concordia.addExample(Example("Marysia posiada rysia",123)); concordia.addExample(Example("Marysia posiada rysia",123));
concordia.addExample(Example("Ala posiada kota i psa",542)); concordia.addExample(Example("Ala posiada kota i psa",542));
concordia.refreshSAfromRAM(); concordia.refreshSAfromRAM();
/*The test index contains 3 sentences: /*The test index contains 3 sentences:
14: "Ala posiada kota" 14: "Ala posiada kota"
51: "Ala posiada rysia" 51: "Ala posiada rysia"
123: "Marysia posiada rysia" 123: "Marysia posiada rysia"
Test word map: Test word map:
Ala -> 0 Ala -> 0
posiada -> 1 posiada -> 1
kota -> 2 kota -> 2
rysia -> 3 rysia -> 3
Marysia -> 4 Marysia -> 4
Test hashed index: Test hashed index:
n: 0 1 2 3 4 5 6 7 8 9 10 11 n: 0 1 2 3 4 5 6 7 8 9 10 11
T[n]: 0 1 2 | 0 1 3 | 4 1 3 | T[n]: 0 1 2 | 0 1 3 | 4 1 3 |
Test suffix array: Test suffix array:
n: 0 1 2 3 4 5 6 7 8 9 10 11 n: 0 1 2 3 4 5 6 7 8 9 10 11
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7 SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
*/ */
BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada"), 0); BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada"), 0);
BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada"), 0); BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada"), 0);
BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada rysia"), 1); BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada rysia"), 1);
@ -446,7 +473,7 @@ BOOST_AUTO_TEST_CASE( TokenizeWhitespace )
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23"); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23");
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11); BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11);
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18); BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18);
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1); BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
@ -455,7 +482,7 @@ BOOST_AUTO_TEST_CASE( TokenizeWhitespace )
BOOST_CHECK_EQUAL(ts.getCodes().size(), 7); BOOST_CHECK_EQUAL(ts.getCodes().size(), 7);
concordia.clearIndex(); concordia.clearIndex();
} }
BOOST_AUTO_TEST_CASE( TokenizeOnly ) BOOST_AUTO_TEST_CASE( TokenizeOnly )
@ -469,7 +496,7 @@ BOOST_AUTO_TEST_CASE( TokenizeOnly )
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23"); BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23");
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11); BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11);
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18); BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18);
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1); BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
@ -478,7 +505,7 @@ BOOST_AUTO_TEST_CASE( TokenizeOnly )
BOOST_CHECK_EQUAL(ts.getCodes().size(), 0); //there should be no codes, only tokens BOOST_CHECK_EQUAL(ts.getCodes().size(), 0); //there should be no codes, only tokens
concordia.clearIndex(); concordia.clearIndex();
} }
BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE_END()

BIN
scripts/concordia_json.zip Normal file

Binary file not shown.

View File

@ -0,0 +1,22 @@
{
"status": "success", //status operacji
"result": {
"bestOverlayScore" : <liczba 0-1>, // Concordia podaje score znalezionego przez siebie dopasowania
"bestOverlay" : [ // lista fragmentów zdania wejściowego, które znalazły się w pamięci tłumaczeń
{ // jeden fragment
"matchedPatternStart": 0, // index (character-based) początku fragmentu zdania wejściowego
"matchedPatternEnd": 8, // index końca fragmentu (exclusive, przedział prawostronnie otwarty)
"occurences": [{ // lista przykładów z pamięci tłumaczeń, w których znalazł się dany fragment zdania wejściowego
"id": 1782145, // id przykładu z pamięci tłumaczeń (przykład to para zdań źródłowe-docelowe)
"matchedExampleStart": 305, // index początku fragmentu w zdaniu źródłowym przykładu
"matchedExampleEnd": 314, // index końca fragmentu w zdaniu źródłowym przykładu (exclusive)
"sourceSegment": <text>, // pełny tekst zdania źródłowego przykładu
"targetSegment": <text>, // pełny tekst zdania docelowego przykładu
"targetFragments": [ // lista fragmentów zdania docelowego, do których urównoleglony jest znaleziony fragment zdania źródłowego
[257, 264]
]
}, .... // mogą być jeszcze kolejne przykłady (occurences)
]
}, .... // mogą być jeszcze kolejne fragmenty
]
}
}

View File

@ -0,0 +1,5 @@
{
"operation": "concordiaSearch",
"pattern":"Ala ma kota",
"tmId":1
}

View File

@ -0,0 +1,43 @@
{
"status": "success",
"result": {
"bestOverlayScore": 0.5,
"bestOverlay": [{
"matchedPatternStart": 0,
"matchedPatternEnd": 8,
"occurences": [{
"id": 1782145,
"matchedExampleStart": 305,
"matchedExampleEnd": 314,
"sourceSegment": "10 . państwa członkowskie odpowiedzialne są za drukowanie formularzy . formularze można również drukować w drukarniach wyznaczonych przez państwo członkowskie , w którym są ustanowione . w ostatnim przypadku na każdym formularzu musi znaleźć się odniesienie do tego postanowienia państwa członkowskiego . na każdym formularzu znajduje się informacja dotycząca nazwy i adresu drukarni lub znak umożliwiający jej identyfikację . \" ;",
"targetSegment": "10 . member states shall be responsible for having the forms printed . the forms may also be printed by printers appointed by the member state in which they are established . in the latter case , reference to the appointment by the member state must appear on each form . each form shall bear an indication of the printer 's name and address or a mark enabling the printer to be identified . '",
"targetFragments": [
[257, 264]
]
}, {
"id": 1782145,
"matchedExampleStart": 326,
"matchedExampleEnd": 335,
"sourceSegment": "10 . państwa członkowskie odpowiedzialne są za drukowanie formularzy . formularze można również drukować w drukarniach wyznaczonych przez państwo członkowskie , w którym są ustanowione . w ostatnim przypadku na każdym formularzu musi znaleźć się odniesienie do tego postanowienia państwa członkowskiego . na każdym formularzu znajduje się informacja dotycząca nazwy i adresu drukarni lub znak umożliwiający jej identyfikację . \" ;",
"targetSegment": "10 . member states shall be responsible for having the forms printed . the forms may also be printed by printers appointed by the member state in which they are established . in the latter case , reference to the appointment by the member state must appear on each form . each form shall bear an indication of the printer 's name and address or a mark enabling the printer to be identified . '",
"targetFragments": [
[300, 315]
]
}]
}, {
"matchedPatternStart": 9,
"matchedPatternEnd": 47,
"occurences": [{
"id": 1623941,
"matchedExampleStart": 54,
"matchedExampleEnd": 93,
"sourceSegment": "wszelkie spory między albo islandią , albo norwegią a państwem członkowskim unii europejskiej dotyczące interpretacji lub stosowania niniejszej umowy mogą być przekazywane przez stronę sporu na posiedzeniu przedstawicieli rządów państw członkowskich unii europejskiej oraz islandii i norwegii w celu rozstrzygnięcia sporu w terminie sześciu miesięcy .",
"targetSegment": "any dispute between either iceland or norway and a member state of the european union regarding the interpretation or the application of this agreement may be referred by a party to the dispute to a meeting of representatives of the governments of the member states of the european union and of iceland and norway , with a view to its settlement within six months .",
"targetFragments": [
[51, 85],
[96, 99]
]
}]
}]
}
}