multiple results
This commit is contained in:
parent
c3826919ba
commit
31e4f091ad
@ -174,17 +174,18 @@ int main(int argc, char** argv) {
|
|||||||
std::cout << "\tSearching for pattern: \"" << pattern <<
|
std::cout << "\tSearching for pattern: \"" << pattern <<
|
||||||
"\"" << std::endl;
|
"\"" << std::endl;
|
||||||
time_start = boost::posix_time::microsec_clock::local_time();
|
time_start = boost::posix_time::microsec_clock::local_time();
|
||||||
std::vector<MatchedPatternFragment> result =
|
MatchedPatternFragment result =
|
||||||
concordia.simpleSearch(pattern);
|
concordia.simpleSearch(pattern);
|
||||||
time_end = boost::posix_time::microsec_clock::local_time();
|
time_end = boost::posix_time::microsec_clock::local_time();
|
||||||
msdiff = time_end - time_start;
|
msdiff = time_end - time_start;
|
||||||
std::cout << "\tFound: " << result.size() << " matches. "
|
std::cout << "\tFound: " << result.getOccurences().size()
|
||||||
<< "Search took: " <<
|
<< " matches. " << "Search took: "
|
||||||
msdiff.total_milliseconds() << "ms." << std::endl;
|
<< msdiff.total_milliseconds() << "ms." << std::endl;
|
||||||
if (!cli.count("silent")) {
|
if (!cli.count("silent")) {
|
||||||
BOOST_FOREACH(MatchedPatternFragment occurence, result) {
|
BOOST_FOREACH(SubstringOccurence occurence,
|
||||||
|
result.getOccurences()) {
|
||||||
std::cout << "\t\tfound match in sentence number: "
|
std::cout << "\t\tfound match in sentence number: "
|
||||||
<< occurence.getExampleId() << std::endl;
|
<< occurence.getId() << std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (cli.count("anubis-search")) {
|
} else if (cli.count("anubis-search")) {
|
||||||
@ -234,10 +235,9 @@ int main(int argc, char** argv) {
|
|||||||
result->getBestOverlay()) {
|
result->getBestOverlay()) {
|
||||||
std::cout << "\t\tfragment [" << fragment.getStart()
|
std::cout << "\t\tfragment [" << fragment.getStart()
|
||||||
<< "," << fragment.getEnd()
|
<< "," << fragment.getEnd()
|
||||||
<< "] (exampleId, exampleOffset,"
|
<< "] (exampleCount,"
|
||||||
<< " patternOffset, length): "
|
<< " patternOffset, length): "
|
||||||
<< fragment.getExampleId() << ","
|
<< fragment.getOccurences().size() << ","
|
||||||
<< fragment.getExampleOffset() << ","
|
|
||||||
<< fragment.getPatternOffset() << ","
|
<< fragment.getPatternOffset() << ","
|
||||||
<< fragment.getMatchedLength()
|
<< fragment.getMatchedLength()
|
||||||
<< std::endl;
|
<< std::endl;
|
||||||
@ -248,10 +248,9 @@ int main(int argc, char** argv) {
|
|||||||
result->getFragments()) {
|
result->getFragments()) {
|
||||||
std::cout << "\t\tfragment [" << fragment.getStart()
|
std::cout << "\t\tfragment [" << fragment.getStart()
|
||||||
<< "," << fragment.getEnd()
|
<< "," << fragment.getEnd()
|
||||||
<< "] (exampleId, exampleOffset,"
|
<< "] (exampleCount,"
|
||||||
<< " patternOffset, length): "
|
<< " patternOffset, length): "
|
||||||
<< fragment.getExampleId() << ","
|
<< fragment.getOccurences().size() << ","
|
||||||
<< fragment.getExampleOffset() << ","
|
|
||||||
<< fragment.getPatternOffset() << ","
|
<< fragment.getPatternOffset() << ","
|
||||||
<< fragment.getMatchedLength()
|
<< fragment.getMatchedLength()
|
||||||
<< std::endl;
|
<< std::endl;
|
||||||
|
@ -25,7 +25,7 @@ typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
|
|||||||
// sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES store the suffix offset
|
// sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES store the suffix offset
|
||||||
// and the last SUFFIX_MARKER_SENTENCE_BYTES store the sentence length.
|
// and the last SUFFIX_MARKER_SENTENCE_BYTES store the sentence length.
|
||||||
|
|
||||||
#define CONCORDIA_SEARCH_MAX_RESULTS 3
|
#define CONCORDIA_SEARCH_MAX_RESULTS 5
|
||||||
|
|
||||||
#define WORD_MAP_FILE_NAME "word_map.bin"
|
#define WORD_MAP_FILE_NAME "word_map.bin"
|
||||||
#define MARKERS_FILE_NAME "markers.bin"
|
#define MARKERS_FILE_NAME "markers.bin"
|
||||||
|
@ -69,17 +69,19 @@ std::vector<TokenizedSentence> Concordia::tokenizeAll(
|
|||||||
bool generateCodes)
|
bool generateCodes)
|
||||||
throw(ConcordiaException) {
|
throw(ConcordiaException) {
|
||||||
std::vector<TokenizedSentence> result;
|
std::vector<TokenizedSentence> result;
|
||||||
|
|
||||||
if (generateCodes) {
|
if (generateCodes) {
|
||||||
BOOST_FOREACH(std::string sentence, sentences) {
|
BOOST_FOREACH(std::string sentence, sentences) {
|
||||||
result.push_back(_hashGenerator->generateHash(sentence, byWhitespace));
|
result.push_back(_hashGenerator->generateHash(sentence,
|
||||||
|
byWhitespace));
|
||||||
}
|
}
|
||||||
|
|
||||||
_hashGenerator->serializeWordMap();
|
_hashGenerator->serializeWordMap();
|
||||||
} else {
|
} else {
|
||||||
BOOST_FOREACH(std::string sentence, sentences) {
|
BOOST_FOREACH(std::string sentence, sentences) {
|
||||||
result.push_back(_hashGenerator->generateTokens(sentence, byWhitespace));
|
result.push_back(_hashGenerator->generateTokens(sentence,
|
||||||
}
|
byWhitespace));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
@ -208,14 +210,14 @@ SUFFIX_MARKER_TYPE Concordia::countOccurences(const std::string & pattern)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
std::vector<MatchedPatternFragment> Concordia::simpleSearch(
|
MatchedPatternFragment Concordia::simpleSearch(
|
||||||
const std::string & pattern)
|
const std::string & pattern)
|
||||||
throw(ConcordiaException) {
|
throw(ConcordiaException) {
|
||||||
if (_T->size() > 0) {
|
if (_T->size() > 0) {
|
||||||
return _searcher->simpleSearch(_hashGenerator, _T,
|
return _searcher->simpleSearch(_hashGenerator, _T,
|
||||||
_markers, _SA, pattern);
|
_markers, _SA, pattern);
|
||||||
} else {
|
} else {
|
||||||
std::vector<MatchedPatternFragment> result;
|
MatchedPatternFragment result(0, 0);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -269,4 +271,3 @@ std::string Concordia::_getHashedIndexFilePath() {
|
|||||||
std::string Concordia::_getMarkersFilePath() {
|
std::string Concordia::_getMarkersFilePath() {
|
||||||
return _indexPath+"/"+MARKERS_FILE_NAME;
|
return _indexPath+"/"+MARKERS_FILE_NAME;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -126,12 +126,11 @@ public:
|
|||||||
/*! Performs a simple substring lookup on the index.
|
/*! Performs a simple substring lookup on the index.
|
||||||
For more info see \ref tutorial1_2.
|
For more info see \ref tutorial1_2.
|
||||||
\param pattern pattern to be searched in the index
|
\param pattern pattern to be searched in the index
|
||||||
\returns vector of matched results
|
\returns matched pattern fragment containing vector of occurences
|
||||||
\throws ConcordiaException
|
\throws ConcordiaException
|
||||||
*/
|
*/
|
||||||
std::vector<MatchedPatternFragment> simpleSearch(
|
MatchedPatternFragment simpleSearch(const std::string & pattern)
|
||||||
const std::string & pattern)
|
throw(ConcordiaException);
|
||||||
throw(ConcordiaException);
|
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern)
|
SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
@ -160,7 +159,7 @@ public:
|
|||||||
|
|
||||||
/*! Loads HDD stored index files to RAM and generates
|
/*! Loads HDD stored index files to RAM and generates
|
||||||
suffix array based on RAM stored data structures.
|
suffix array based on RAM stored data structures.
|
||||||
For more info see \ref tutorial2.
|
For more info see \ref tutorial2.
|
||||||
\throws ConcordiaException
|
\throws ConcordiaException
|
||||||
*/
|
*/
|
||||||
void loadRAMIndexFromDisk() throw(ConcordiaException);
|
void loadRAMIndexFromDisk() throw(ConcordiaException);
|
||||||
|
@ -16,7 +16,7 @@
|
|||||||
- list of longest matched fragments sorted in descending order by length
|
- list of longest matched fragments sorted in descending order by length
|
||||||
- the best overlay
|
- the best overlay
|
||||||
- the score of the best overlay.
|
- the score of the best overlay.
|
||||||
|
|
||||||
For more info about concordia searching see \ref tutorial1_3.
|
For more info about concordia searching see \ref tutorial1_3.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
@ -75,6 +75,24 @@ public:
|
|||||||
return _bestOverlayScore;
|
return _bestOverlayScore;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
friend std::ostream & operator << (std::ostream & o,
|
||||||
|
const ConcordiaSearchResult & result) {
|
||||||
|
o << "Best overlay {" << std::endl;
|
||||||
|
BOOST_FOREACH(MatchedPatternFragment fragment,
|
||||||
|
result.getBestOverlay()) {
|
||||||
|
o << fragment << std::endl;
|
||||||
|
}
|
||||||
|
o << "}" << std::endl;
|
||||||
|
o << "All fragments {" << std::endl;
|
||||||
|
BOOST_FOREACH(MatchedPatternFragment fragment,
|
||||||
|
result.getFragments()) {
|
||||||
|
o << fragment << std::endl;
|
||||||
|
}
|
||||||
|
o << "}";
|
||||||
|
return o;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void _checkPossibleOverlays(
|
void _checkPossibleOverlays(
|
||||||
std::vector<MatchedPatternFragment> currentOverlay,
|
std::vector<MatchedPatternFragment> currentOverlay,
|
||||||
|
@ -36,12 +36,14 @@ void ConcordiaSearcher::concordiaSearch(
|
|||||||
std::vector<SubstringOccurence> occurences =
|
std::vector<SubstringOccurence> occurences =
|
||||||
lcpSearch(T, markers, SA, currentPattern, lcpLength);
|
lcpSearch(T, markers, SA, currentPattern, lcpLength);
|
||||||
|
|
||||||
BOOST_FOREACH(SubstringOccurence occurence, occurences) {
|
if (occurences.size() > 0) {
|
||||||
result->addFragment(MatchedPatternFragment(
|
MatchedPatternFragment fragment(offset,
|
||||||
occurence.getId(),
|
lcpLength / sizeof(INDEX_CHARACTER_TYPE));
|
||||||
occurence.getOffset(),
|
|
||||||
offset,
|
BOOST_FOREACH(SubstringOccurence occurence, occurences) {
|
||||||
lcpLength / sizeof(INDEX_CHARACTER_TYPE)));
|
fragment.addOccurence(occurence);
|
||||||
|
}
|
||||||
|
result->addFragment(fragment);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -13,14 +13,12 @@ IndexSearcher::IndexSearcher() {
|
|||||||
IndexSearcher::~IndexSearcher() {
|
IndexSearcher::~IndexSearcher() {
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
|
MatchedPatternFragment IndexSearcher::simpleSearch(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
const std::string & pattern) throw(ConcordiaException) {
|
const std::string & pattern) throw(ConcordiaException) {
|
||||||
std::vector<MatchedPatternFragment> result;
|
|
||||||
|
|
||||||
int left;
|
int left;
|
||||||
std::vector<INDEX_CHARACTER_TYPE> hash =
|
std::vector<INDEX_CHARACTER_TYPE> hash =
|
||||||
hashGenerator->generateHash(pattern).getCodes();
|
hashGenerator->generateHash(pattern).getCodes();
|
||||||
@ -30,6 +28,7 @@ std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
|
|||||||
int size = sa_search(T->data(), (saidx_t) T->size(),
|
int size = sa_search(T->data(), (saidx_t) T->size(),
|
||||||
(const sauchar_t *) patternArray, patternLength,
|
(const sauchar_t *) patternArray, patternLength,
|
||||||
SA->data(), (saidx_t) SA->size(), &left);
|
SA->data(), (saidx_t) SA->size(), &left);
|
||||||
|
MatchedPatternFragment result(0, hash.size());
|
||||||
for (int i = 0; i < size; ++i) {
|
for (int i = 0; i < size; ++i) {
|
||||||
saidx_t resultPos = SA->at(left + i);
|
saidx_t resultPos = SA->at(left + i);
|
||||||
if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
|
if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
|
||||||
@ -40,12 +39,11 @@ std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
|
|||||||
// removes these accidental results.
|
// removes these accidental results.
|
||||||
saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
|
saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
|
||||||
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);
|
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);
|
||||||
result.push_back(MatchedPatternFragment(
|
|
||||||
Utils::getIdFromMarker(marker),
|
SubstringOccurence occurence;
|
||||||
Utils::getOffsetFromMarker(marker),
|
occurence.enterDataFromMarker(marker);
|
||||||
0,
|
result.addOccurence(occurence);
|
||||||
hash.size()));
|
if (result.getOccurences().size() >= CONCORDIA_SEARCH_MAX_RESULTS) {
|
||||||
if (result.size() >= CONCORDIA_SEARCH_MAX_RESULTS) {
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -18,7 +18,7 @@
|
|||||||
/*!
|
/*!
|
||||||
Class for searching the index with a sentence. In all searches the sentence
|
Class for searching the index with a sentence. In all searches the sentence
|
||||||
is first hashed and then used as a query.
|
is first hashed and then used as a query.
|
||||||
|
|
||||||
IndexSearcher performs the simpleSearch on its own, but uses a
|
IndexSearcher performs the simpleSearch on its own, but uses a
|
||||||
ConcordiaSearcher object to carry out concordiaSearch.
|
ConcordiaSearcher object to carry out concordiaSearch.
|
||||||
|
|
||||||
@ -42,10 +42,10 @@ public:
|
|||||||
\param markers markers array for the needs of searching
|
\param markers markers array for the needs of searching
|
||||||
\param SA suffix array for the needs of searching
|
\param SA suffix array for the needs of searching
|
||||||
\param pattern string pattern to be searched in the index.
|
\param pattern string pattern to be searched in the index.
|
||||||
\returns vector of occurences of the pattern in the index
|
\returns matched pattern fragment, containing occurences of the pattern in the index
|
||||||
\throws ConcordiaException
|
\throws ConcordiaException
|
||||||
*/
|
*/
|
||||||
std::vector<MatchedPatternFragment> simpleSearch(
|
MatchedPatternFragment simpleSearch(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
@ -1,14 +1,10 @@
|
|||||||
#include "concordia/matched_pattern_fragment.hpp"
|
#include "concordia/matched_pattern_fragment.hpp"
|
||||||
|
|
||||||
MatchedPatternFragment::MatchedPatternFragment(
|
MatchedPatternFragment::MatchedPatternFragment(
|
||||||
const SUFFIX_MARKER_TYPE & exampleId,
|
|
||||||
const SUFFIX_MARKER_TYPE & exampleOffset,
|
|
||||||
const SUFFIX_MARKER_TYPE & patternOffset,
|
const SUFFIX_MARKER_TYPE & patternOffset,
|
||||||
const SUFFIX_MARKER_TYPE & matchedLength):
|
const SUFFIX_MARKER_TYPE & matchedLength):
|
||||||
Interval(patternOffset,
|
Interval(patternOffset,
|
||||||
patternOffset + matchedLength),
|
patternOffset + matchedLength),
|
||||||
_exampleId(exampleId),
|
|
||||||
_exampleOffset(exampleOffset),
|
|
||||||
_patternOffset(patternOffset),
|
_patternOffset(patternOffset),
|
||||||
_matchedLength(matchedLength) {
|
_matchedLength(matchedLength) {
|
||||||
}
|
}
|
||||||
@ -16,3 +12,7 @@ MatchedPatternFragment::MatchedPatternFragment(
|
|||||||
MatchedPatternFragment::~MatchedPatternFragment() {
|
MatchedPatternFragment::~MatchedPatternFragment() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void MatchedPatternFragment::addOccurence(
|
||||||
|
const SubstringOccurence & occurence) {
|
||||||
|
_occurences.push_back(occurence);
|
||||||
|
}
|
||||||
|
@ -3,11 +3,15 @@
|
|||||||
|
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
#include "concordia/interval.hpp"
|
#include "concordia/interval.hpp"
|
||||||
|
#include "concordia/substring_occurence.hpp"
|
||||||
|
#include <vector>
|
||||||
|
#include <iostream>
|
||||||
|
#include <boost/foreach.hpp>
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class representing matched pattern fragment in concordia search.
|
Class representing matched pattern fragment in concordia search.
|
||||||
This fragment can be seen as a word interval of the pattern.
|
This fragment can be seen as a word interval of the pattern.
|
||||||
|
|
||||||
This class holds information about:
|
This class holds information about:
|
||||||
- where the pattern fragment was matched (example id and example offset)
|
- where the pattern fragment was matched (example id and example offset)
|
||||||
- where the fragment is located within the pattern
|
- where the fragment is located within the pattern
|
||||||
@ -17,32 +21,26 @@
|
|||||||
class MatchedPatternFragment : public Interval {
|
class MatchedPatternFragment : public Interval {
|
||||||
public:
|
public:
|
||||||
/*! Constructor.
|
/*! Constructor.
|
||||||
\param exampleId id of the example where the pattern fragment was matched
|
|
||||||
\param exampleOffset offset of the matched fragment in the example
|
|
||||||
\param patternOffset offset of the matched fragment in the pattern
|
\param patternOffset offset of the matched fragment in the pattern
|
||||||
\param matchedLength length of the matched pattern
|
\param matchedLength length of the matched pattern
|
||||||
*/
|
*/
|
||||||
MatchedPatternFragment(const SUFFIX_MARKER_TYPE & exampleId,
|
MatchedPatternFragment(const SUFFIX_MARKER_TYPE & patternOffset,
|
||||||
const SUFFIX_MARKER_TYPE & exampleOffset,
|
|
||||||
const SUFFIX_MARKER_TYPE & patternOffset,
|
|
||||||
const SUFFIX_MARKER_TYPE & matchedLength);
|
const SUFFIX_MARKER_TYPE & matchedLength);
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
virtual ~MatchedPatternFragment();
|
virtual ~MatchedPatternFragment();
|
||||||
|
|
||||||
/*! Getter for example id.
|
/*! Getter for occurences.
|
||||||
\returns example id
|
\returns occurences
|
||||||
*/
|
*/
|
||||||
SUFFIX_MARKER_TYPE getExampleId() const {
|
std::vector<SubstringOccurence> getOccurences() const {
|
||||||
return _exampleId;
|
return _occurences;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*! Getter for example offset.
|
/*! Adds an occurence to the list.
|
||||||
\returns example offset
|
\param occurence the occurence to be added
|
||||||
*/
|
*/
|
||||||
SUFFIX_MARKER_TYPE getExampleOffset() const {
|
void addOccurence(const SubstringOccurence & occurence);
|
||||||
return _exampleOffset;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*! Getter for pattern offset.
|
/*! Getter for pattern offset.
|
||||||
\returns pattern offset
|
\returns pattern offset
|
||||||
@ -65,10 +63,22 @@ public:
|
|||||||
return (_matchedLength > other.getMatchedLength());
|
return (_matchedLength > other.getMatchedLength());
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
friend std::ostream & operator << (std::ostream & o,
|
||||||
SUFFIX_MARKER_TYPE _exampleId;
|
const MatchedPatternFragment & fragment) {
|
||||||
|
o << "fragment(patternOffset=" << fragment.getPatternOffset()
|
||||||
|
<< ", matchedLength=" << fragment.getMatchedLength() << ") {"
|
||||||
|
<< std::endl;
|
||||||
|
BOOST_FOREACH(SubstringOccurence occurence, fragment.getOccurences()) {
|
||||||
|
o << "\t" << occurence << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE _exampleOffset;
|
o << "}";
|
||||||
|
return o;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::vector<SubstringOccurence> _occurences;
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE _patternOffset;
|
SUFFIX_MARKER_TYPE _patternOffset;
|
||||||
|
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class representing occurence of a searched substring.
|
Class representing occurence of a searched substring.
|
||||||
@ -65,6 +66,13 @@ public:
|
|||||||
*/
|
*/
|
||||||
void enterDataFromMarker(const SUFFIX_MARKER_TYPE & marker);
|
void enterDataFromMarker(const SUFFIX_MARKER_TYPE & marker);
|
||||||
|
|
||||||
|
friend std::ostream & operator << (std::ostream & o,
|
||||||
|
const SubstringOccurence & occurence) {
|
||||||
|
return o << "occurence(exampleId=" << occurence.getId()
|
||||||
|
<< ", offset=" << occurence.getOffset() << ")";
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
SUFFIX_MARKER_TYPE _id;
|
SUFFIX_MARKER_TYPE _id;
|
||||||
|
|
||||||
|
@ -40,53 +40,51 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
|
|||||||
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 11);
|
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 11);
|
||||||
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
|
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
|
||||||
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada");
|
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada");
|
||||||
|
|
||||||
concordia.addExample(Example("Ala posiada rysia",51));
|
concordia.addExample(Example("Ala posiada rysia",51));
|
||||||
concordia.addExample(Example("Marysia posiada rysia",123));
|
concordia.addExample(Example("Marysia posiada rysia",123));
|
||||||
concordia.refreshSAfromRAM();
|
concordia.refreshSAfromRAM();
|
||||||
|
|
||||||
/*The test index contains 3 sentences:
|
/*The test index contains 3 sentences:
|
||||||
14: "Ala posiada kota"
|
14: "Ala posiada kota"
|
||||||
51: "Ala posiada rysia"
|
51: "Ala posiada rysia"
|
||||||
123: "Marysia posiada rysia"
|
123: "Marysia posiada rysia"
|
||||||
|
|
||||||
Test word map:
|
Test word map:
|
||||||
Ala -> 0
|
Ala -> 0
|
||||||
posiada -> 1
|
posiada -> 1
|
||||||
kota -> 2
|
kota -> 2
|
||||||
rysia -> 3
|
rysia -> 3
|
||||||
Marysia -> 4
|
Marysia -> 4
|
||||||
|
|
||||||
Test hashed index:
|
Test hashed index:
|
||||||
n: 0 1 2 3 4 5 6 7 8 9 10 11
|
n: 0 1 2 3 4 5 6 7 8 9 10 11
|
||||||
T[n]: 0 1 2 | 0 1 3 | 4 1 3 |
|
T[n]: 0 1 2 | 0 1 3 | 4 1 3 |
|
||||||
|
|
||||||
Test suffix array:
|
Test suffix array:
|
||||||
n: 0 1 2 3 4 5 6 7 8 9 10 11
|
n: 0 1 2 3 4 5 6 7 8 9 10 11
|
||||||
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
|
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
std::vector<MatchedPatternFragment> searchResult1 = concordia.simpleSearch("posiada rysia");
|
MatchedPatternFragment searchResult1 = concordia.simpleSearch("posiada rysia");
|
||||||
std::vector<MatchedPatternFragment> searchResult2 = concordia.simpleSearch("posiada kota Ala");
|
MatchedPatternFragment searchResult2 = concordia.simpleSearch("posiada kota Ala");
|
||||||
|
|
||||||
concordia.clearIndex();
|
concordia.clearIndex();
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1.size(), 2);
|
BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 2);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 123);
|
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 123);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 1);
|
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 1);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 2);
|
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getId(), 51);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleId(), 51);
|
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getOffset(), 1);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleOffset(), 1);
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(1).getMatchedLength(), 2);
|
|
||||||
|
|
||||||
// Checking pattern spanning over 2 segments
|
// Checking pattern spanning over 2 segments
|
||||||
BOOST_CHECK_EQUAL(searchResult2.size(), 0);
|
BOOST_CHECK_EQUAL(searchResult2.getOccurences().size(), 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
||||||
{
|
{
|
||||||
// modified stop words to avoid anonymization
|
// modified stop words to avoid anonymization
|
||||||
Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
|
Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
|
||||||
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||||
std::vector<Example> testExamples;
|
std::vector<Example> testExamples;
|
||||||
@ -106,12 +104,12 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
|||||||
BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getType(), 1);
|
BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getType(), 1);
|
||||||
BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getValue(), "xjest");
|
BOOST_CHECK_EQUAL(hashedPatterns.at(2).getTokens().at(2).getValue(), "xjest");
|
||||||
|
|
||||||
/*The test index contains 4 sentences:
|
/*The test index contains 4 sentences:
|
||||||
312: "xto xjest okno"
|
312: "xto xjest okno"
|
||||||
202: "czy xjest okno otwarte"
|
202: "czy xjest okno otwarte"
|
||||||
45: "chyba xto xjest xtutaj"
|
45: "chyba xto xjest xtutaj"
|
||||||
29: "xto xjest"
|
29: "xto xjest"
|
||||||
|
|
||||||
Test word map:
|
Test word map:
|
||||||
xto -> 0
|
xto -> 0
|
||||||
xjest -> 1
|
xjest -> 1
|
||||||
@ -120,42 +118,37 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
|||||||
otwarte -> 4
|
otwarte -> 4
|
||||||
chyba -> 5
|
chyba -> 5
|
||||||
xtutaj -> 6
|
xtutaj -> 6
|
||||||
|
|
||||||
Test hashed index:
|
Test hashed index:
|
||||||
n: 0 1 2 3 4 5 6 7 8 9 10 11 12
|
n: 0 1 2 3 4 5 6 7 8 9 10 11 12
|
||||||
T[n]: 0 1 2 3 1 2 4 5 0 1 6 0 1
|
T[n]: 0 1 2 3 1 2 4 5 0 1 6 0 1
|
||||||
|
|
||||||
Test suffix array:
|
Test suffix array:
|
||||||
n: 0 1 2 3 4 5 6 7 8 9 10 11 12
|
n: 0 1 2 3 4 5 6 7 8 9 10 11 12
|
||||||
SA[n]: 11 0 8 12 1 4 9 2 5 3 6 7 10
|
SA[n]: 11 0 8 12 1 4 9 2 5 3 6 7 10
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(),
|
Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(),
|
||||||
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||||
std::vector<MatchedPatternFragment> searchResult1 = concordia2.simpleSearch("xto xjest");
|
MatchedPatternFragment searchResult1 = concordia2.simpleSearch("xto xjest");
|
||||||
std::vector<MatchedPatternFragment> searchResult2 = concordia2.simpleSearch("xjest okno");
|
MatchedPatternFragment searchResult2 = concordia2.simpleSearch("xjest okno");
|
||||||
|
|
||||||
concordia2.clearIndex();
|
concordia2.clearIndex();
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1.size(), 3);
|
BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 3);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 312);
|
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 312);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 0);
|
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 0);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 2);
|
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getId(), 45);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleId(), 45);
|
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(1).getOffset(), 1);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleOffset(), 1);
|
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(2).getId(), 29);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(1).getMatchedLength(), 2);
|
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(2).getOffset(), 0);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(2).getExampleId(), 29);
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(2).getExampleOffset(), 0);
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(2).getMatchedLength(), 2);
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult2.size(), 2);
|
BOOST_CHECK_EQUAL(searchResult2.getOccurences().size(), 2);
|
||||||
BOOST_CHECK_EQUAL(searchResult2.at(0).getExampleId(), 202);
|
BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(0).getId(), 202);
|
||||||
BOOST_CHECK_EQUAL(searchResult2.at(0).getExampleOffset(), 1);
|
BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(0).getOffset(), 1);
|
||||||
BOOST_CHECK_EQUAL(searchResult2.at(0).getMatchedLength(), 2);
|
BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(1).getId(), 312);
|
||||||
BOOST_CHECK_EQUAL(searchResult2.at(1).getExampleId(), 312);
|
BOOST_CHECK_EQUAL(searchResult2.getOccurences().at(1).getOffset(), 1);
|
||||||
BOOST_CHECK_EQUAL(searchResult2.at(1).getExampleOffset(), 1);
|
|
||||||
BOOST_CHECK_EQUAL(searchResult2.at(1).getMatchedLength(), 2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
|
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
|
||||||
@ -166,17 +159,16 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
|
|||||||
testExamples.push_back(Example("2. Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem, z jakiego korzystają obywatele tego państwa.",312));
|
testExamples.push_back(Example("2. Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem, z jakiego korzystają obywatele tego państwa.",312));
|
||||||
testExamples.push_back(Example("czy xjest żółte otwarte",202));
|
testExamples.push_back(Example("czy xjest żółte otwarte",202));
|
||||||
concordia.addAllExamples(testExamples);
|
concordia.addAllExamples(testExamples);
|
||||||
|
|
||||||
Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(),
|
Concordia concordia2 = Concordia(TestResourcesManager::getTempPath(),
|
||||||
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||||
std::vector<MatchedPatternFragment> searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia");
|
MatchedPatternFragment searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia");
|
||||||
|
|
||||||
concordia2.clearIndex();
|
concordia2.clearIndex();
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1.size(), 1);
|
BOOST_CHECK_EQUAL(searchResult1.getOccurences().size(), 1);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 312);
|
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getId(), 312);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 2);
|
BOOST_CHECK_EQUAL(searchResult1.getOccurences().at(0).getOffset(), 2);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 6);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
|
BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
|
||||||
@ -187,15 +179,15 @@ BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
|
|||||||
concordia.addExample(Example("Ala posiada rysia",51));
|
concordia.addExample(Example("Ala posiada rysia",51));
|
||||||
concordia.addExample(Example("Marysia posiada rysia",123));
|
concordia.addExample(Example("Marysia posiada rysia",123));
|
||||||
concordia.refreshSAfromRAM();
|
concordia.refreshSAfromRAM();
|
||||||
|
|
||||||
/*The test index contains 3 sentences:
|
/*The test index contains 3 sentences:
|
||||||
14: "Ala posiada kota"
|
14: "Ala posiada kota"
|
||||||
51: "Ala posiada rysia"
|
51: "Ala posiada rysia"
|
||||||
123: "Marysia posiada rysia"
|
123: "Marysia posiada rysia"
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// the below expectations assume 0.3 anubis threshold
|
// the below expectations assume 0.3 anubis threshold
|
||||||
|
|
||||||
std::vector<AnubisSearchResult> searchResult1 = concordia.anubisSearch("posiada rysia chyba");
|
std::vector<AnubisSearchResult> searchResult1 = concordia.anubisSearch("posiada rysia chyba");
|
||||||
BOOST_CHECK_EQUAL(searchResult1.size(), 2);
|
BOOST_CHECK_EQUAL(searchResult1.size(), 2);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 51);
|
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 51);
|
||||||
@ -235,9 +227,9 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
|
|||||||
concordia.addExample(Example("Gosia chyba posiada rysia też",167));
|
concordia.addExample(Example("Gosia chyba posiada rysia też",167));
|
||||||
concordia.addExample(Example("Ania od wczoraj posiada rysia",45));
|
concordia.addExample(Example("Ania od wczoraj posiada rysia",45));
|
||||||
concordia.refreshSAfromRAM();
|
concordia.refreshSAfromRAM();
|
||||||
|
|
||||||
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("posiada rysia chyba");
|
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("posiada rysia chyba");
|
||||||
// best overlay: [0,2], [2,3], score = 0.829
|
// best overlay: [0,2], [2,3], score = 0.829
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
|
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
|
||||||
BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.829, 0.1);
|
BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.829, 0.1);
|
||||||
@ -247,53 +239,50 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
|
|||||||
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 3);
|
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 3);
|
||||||
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 7);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
addFragment 45,2,0,2
|
adding fragment: offset=0, length=2
|
||||||
addFragment 51,1,0,2
|
adding occurence: example id=167, offset=2
|
||||||
addFragment 123,1,0,2
|
adding occurence: example id=45, offset=3
|
||||||
addFragment 45,3,1,1
|
adding occurence: example id=51, offset=1
|
||||||
addFragment 51,2,1,1
|
adding occurence: example id=123, offset=1
|
||||||
addFragment 123,2,1,1
|
adding fragment: offset=1, length=1
|
||||||
addFragment 167,1,2,1
|
adding occurence: example id=167, offset=3
|
||||||
|
adding occurence: example id=45, offset=4
|
||||||
|
adding occurence: example id=51, offset=2
|
||||||
|
adding occurence: example id=123, offset=2
|
||||||
|
adding fragment: offset=2, length=1
|
||||||
|
adding occurence: example id=167, offset=1
|
||||||
*/
|
*/
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 167);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 3);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleOffset(), 2);
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 0);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 0);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 2);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 2);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getId(), 167);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getOffset(), 2);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(1).getId(), 45);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(1).getOffset(), 3);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getId(), 51);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(2).getOffset(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(3).getId(), 123);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(3).getOffset(), 1);
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleId(), 45);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 1);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleOffset(), 3);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 1);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 0);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(0).getId(), 167);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 2);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(0).getOffset(), 3);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(1).getId(), 45);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(1).getOffset(), 4);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getId(), 51);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(2).getOffset(), 2);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(3).getId(), 123);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getOccurences().at(3).getOffset(), 2);
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleId(), 51);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 2);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleOffset(), 1);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 1);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 0);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getOccurences().at(0).getId(), 167);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 2);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getOccurences().at(0).getOffset(), 1);
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleId(), 167);
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleOffset(), 3);
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getPatternOffset(), 1);
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getMatchedLength(), 1);
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleId(), 45);
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleOffset(), 4);
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getPatternOffset(), 1);
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getMatchedLength(), 1);
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleId(), 51);
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleOffset(), 2);
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getPatternOffset(), 1);
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getMatchedLength(), 1);
|
|
||||||
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getExampleId(), 167);
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getExampleOffset(), 1);
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getPatternOffset(), 2);
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getMatchedLength(), 1);
|
|
||||||
|
|
||||||
concordia.clearIndex();
|
concordia.clearIndex();
|
||||||
}
|
}
|
||||||
@ -322,9 +311,11 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
|
|||||||
concordia.addTokenizedExample(ts, 14);
|
concordia.addTokenizedExample(ts, 14);
|
||||||
|
|
||||||
concordia.refreshSAfromRAM();
|
concordia.refreshSAfromRAM();
|
||||||
|
|
||||||
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers");
|
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers");
|
||||||
// best overlay:
|
// best overlay:
|
||||||
|
|
||||||
|
// std::cout << *searchResult1 << std::endl;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
|
BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2);
|
||||||
BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.655, 0.1);
|
BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.655, 0.1);
|
||||||
@ -344,10 +335,46 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
|
|||||||
Matched pattern fragment found. Pattern fragment: [3,5]321,2,3,2
|
Matched pattern fragment found. Pattern fragment: [3,5]321,2,3,2
|
||||||
Matched pattern fragment found. Pattern fragment: [7,9]14,9,7,2
|
Matched pattern fragment found. Pattern fragment: [7,9]14,9,7,2
|
||||||
Matched pattern fragment found. Pattern fragment: [8,9]14,10,8,1
|
Matched pattern fragment found. Pattern fragment: [8,9]14,10,8,1
|
||||||
|
|
||||||
|
Best overlay {
|
||||||
|
fragment(patternOffset=1, matchedLength=4) {
|
||||||
|
occurence(exampleId=321, offset=0)
|
||||||
|
}
|
||||||
|
fragment(patternOffset=5, matchedLength=4) {
|
||||||
|
occurence(exampleId=14, offset=7)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
All fragments {
|
||||||
|
fragment(patternOffset=4, matchedLength=5) {
|
||||||
|
occurence(exampleId=14, offset=6)
|
||||||
|
}
|
||||||
|
fragment(patternOffset=1, matchedLength=4) {
|
||||||
|
occurence(exampleId=321, offset=0)
|
||||||
|
}
|
||||||
|
fragment(patternOffset=5, matchedLength=4) {
|
||||||
|
occurence(exampleId=14, offset=7)
|
||||||
|
}
|
||||||
|
fragment(patternOffset=2, matchedLength=3) {
|
||||||
|
occurence(exampleId=321, offset=1)
|
||||||
|
}
|
||||||
|
fragment(patternOffset=6, matchedLength=3) {
|
||||||
|
occurence(exampleId=14, offset=8)
|
||||||
|
}
|
||||||
|
fragment(patternOffset=3, matchedLength=2) {
|
||||||
|
occurence(exampleId=321, offset=2)
|
||||||
|
}
|
||||||
|
fragment(patternOffset=7, matchedLength=2) {
|
||||||
|
occurence(exampleId=14, offset=9)
|
||||||
|
}
|
||||||
|
fragment(patternOffset=8, matchedLength=1) {
|
||||||
|
occurence(exampleId=14, offset=10)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 14);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getId(), 14);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleOffset(), 6);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getOccurences().at(0).getOffset(), 6);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 4);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 4);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 5);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 5);
|
||||||
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getStart(), 4);
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getStart(), 4);
|
||||||
@ -373,13 +400,13 @@ BOOST_AUTO_TEST_CASE( Tokenize )
|
|||||||
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 16);
|
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 16);
|
||||||
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
|
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
|
||||||
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada");
|
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada");
|
||||||
|
|
||||||
std::vector<std::string> sentences;
|
std::vector<std::string> sentences;
|
||||||
sentences.push_back("Marysia, ma rysia;");
|
sentences.push_back("Marysia, ma rysia;");
|
||||||
sentences.push_back("Testing complete;");
|
sentences.push_back("Testing complete;");
|
||||||
sentences.push_back("This, is (a) weird;! sentence <>");
|
sentences.push_back("This, is (a) weird;! sentence <>");
|
||||||
std::vector<TokenizedSentence> tokenizedSentences = concordia.tokenizeAll(sentences);
|
std::vector<TokenizedSentence> tokenizedSentences = concordia.tokenizeAll(sentences);
|
||||||
|
|
||||||
|
|
||||||
concordia.clearIndex();
|
concordia.clearIndex();
|
||||||
|
|
||||||
@ -387,7 +414,7 @@ BOOST_AUTO_TEST_CASE( Tokenize )
|
|||||||
BOOST_CHECK_EQUAL(tokenizedSentences.at(0).getTokens().size(), 3);
|
BOOST_CHECK_EQUAL(tokenizedSentences.at(0).getTokens().size(), 3);
|
||||||
BOOST_CHECK_EQUAL(tokenizedSentences.at(1).getTokens().size(), 2);
|
BOOST_CHECK_EQUAL(tokenizedSentences.at(1).getTokens().size(), 2);
|
||||||
BOOST_CHECK_EQUAL(tokenizedSentences.at(2).getTokens().size(), 5);
|
BOOST_CHECK_EQUAL(tokenizedSentences.at(2).getTokens().size(), 5);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences )
|
BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences )
|
||||||
@ -400,30 +427,30 @@ BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences )
|
|||||||
concordia.addExample(Example("Marysia posiada rysia",123));
|
concordia.addExample(Example("Marysia posiada rysia",123));
|
||||||
concordia.addExample(Example("Ala posiada kota i psa",542));
|
concordia.addExample(Example("Ala posiada kota i psa",542));
|
||||||
concordia.refreshSAfromRAM();
|
concordia.refreshSAfromRAM();
|
||||||
|
|
||||||
/*The test index contains 3 sentences:
|
/*The test index contains 3 sentences:
|
||||||
14: "Ala posiada kota"
|
14: "Ala posiada kota"
|
||||||
51: "Ala posiada rysia"
|
51: "Ala posiada rysia"
|
||||||
123: "Marysia posiada rysia"
|
123: "Marysia posiada rysia"
|
||||||
|
|
||||||
Test word map:
|
Test word map:
|
||||||
Ala -> 0
|
Ala -> 0
|
||||||
posiada -> 1
|
posiada -> 1
|
||||||
kota -> 2
|
kota -> 2
|
||||||
rysia -> 3
|
rysia -> 3
|
||||||
Marysia -> 4
|
Marysia -> 4
|
||||||
|
|
||||||
Test hashed index:
|
Test hashed index:
|
||||||
n: 0 1 2 3 4 5 6 7 8 9 10 11
|
n: 0 1 2 3 4 5 6 7 8 9 10 11
|
||||||
T[n]: 0 1 2 | 0 1 3 | 4 1 3 |
|
T[n]: 0 1 2 | 0 1 3 | 4 1 3 |
|
||||||
|
|
||||||
Test suffix array:
|
Test suffix array:
|
||||||
n: 0 1 2 3 4 5 6 7 8 9 10 11
|
n: 0 1 2 3 4 5 6 7 8 9 10 11
|
||||||
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
|
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada"), 0);
|
BOOST_CHECK_EQUAL(concordia.countOccurences("Ala posiada"), 0);
|
||||||
BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada"), 0);
|
BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada"), 0);
|
||||||
BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada rysia"), 1);
|
BOOST_CHECK_EQUAL(concordia.countOccurences("Marysia posiada rysia"), 1);
|
||||||
@ -446,7 +473,7 @@ BOOST_AUTO_TEST_CASE( TokenizeWhitespace )
|
|||||||
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6);
|
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6);
|
||||||
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
|
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
|
||||||
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23");
|
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23");
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11);
|
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11);
|
||||||
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18);
|
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18);
|
||||||
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
|
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
|
||||||
@ -455,7 +482,7 @@ BOOST_AUTO_TEST_CASE( TokenizeWhitespace )
|
|||||||
BOOST_CHECK_EQUAL(ts.getCodes().size(), 7);
|
BOOST_CHECK_EQUAL(ts.getCodes().size(), 7);
|
||||||
|
|
||||||
concordia.clearIndex();
|
concordia.clearIndex();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( TokenizeOnly )
|
BOOST_AUTO_TEST_CASE( TokenizeOnly )
|
||||||
@ -469,7 +496,7 @@ BOOST_AUTO_TEST_CASE( TokenizeOnly )
|
|||||||
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6);
|
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6);
|
||||||
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
|
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
|
||||||
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23");
|
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23");
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11);
|
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11);
|
||||||
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18);
|
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18);
|
||||||
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
|
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
|
||||||
@ -478,7 +505,7 @@ BOOST_AUTO_TEST_CASE( TokenizeOnly )
|
|||||||
BOOST_CHECK_EQUAL(ts.getCodes().size(), 0); //there should be no codes, only tokens
|
BOOST_CHECK_EQUAL(ts.getCodes().size(), 0); //there should be no codes, only tokens
|
||||||
|
|
||||||
concordia.clearIndex();
|
concordia.clearIndex();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE_END()
|
BOOST_AUTO_TEST_SUITE_END()
|
||||||
|
BIN
scripts/concordia_json.zip
Normal file
BIN
scripts/concordia_json.zip
Normal file
Binary file not shown.
22
scripts/responseExplained.txt
Normal file
22
scripts/responseExplained.txt
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
{
|
||||||
|
"status": "success", //status operacji
|
||||||
|
"result": {
|
||||||
|
"bestOverlayScore" : <liczba 0-1> // Concordia podaje score znalezionego przez siebie dopasowania
|
||||||
|
"bestOverlay" : [ // lista fragmentów zdania wejściowego, które znalazły się w pamięci tłumaczeń
|
||||||
|
{ // jeden fragment
|
||||||
|
"matchedPatternStart": 0, // index (character-based) początku fragmentu zdania wejściowego
|
||||||
|
"matchedPatternEnd": 8, // index końca fragmentu (exclusive, przedział prawostronnie otwarty)
|
||||||
|
"occurences": [{ // lista przykładów z pamięci tłumaczeń, w których znalazł się dany fragment zdania wejściowego
|
||||||
|
"id": 1782145, // id przykładu z pamięci tłumaczeń (przykład to para zdań źródłowe-docelowe)
|
||||||
|
"matchedExampleStart": 305, // index początku fragmentu w zdaniu źródłowym przykładu
|
||||||
|
"matchedExampleEnd": 314, // index końca fragmentu w zdaniu źródłowym przykładu (exclusive)
|
||||||
|
"sourceSegment": <text>, // pełny tekst zdania źródłowego przykładu
|
||||||
|
"targetSegment": <text>, // pełny tekst zdania docelowego przykładu
|
||||||
|
"targetFragments": [ // lista fragmentów zdania docelowego, do których urównoleglony jest znaleziony fragment zdania źródłowego
|
||||||
|
[257, 264]
|
||||||
|
]
|
||||||
|
}, .... // mogą być jeszcze kolejne fragmenty
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
5
scripts/sampleRequest.json
Normal file
5
scripts/sampleRequest.json
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
{
|
||||||
|
"operation": "concordiaSearch",
|
||||||
|
"pattern":"Ala ma kota",
|
||||||
|
"tmId":1
|
||||||
|
}
|
43
scripts/sampleResponse.json
Normal file
43
scripts/sampleResponse.json
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
{
|
||||||
|
"status": "success",
|
||||||
|
"result": {
|
||||||
|
"bestOverlayScore": 0.5,
|
||||||
|
"bestOverlay": [{
|
||||||
|
"matchedPatternStart": 0,
|
||||||
|
"matchedPatternEnd": 8,
|
||||||
|
"occurences": [{
|
||||||
|
"id": 1782145,
|
||||||
|
"matchedExampleStart": 305,
|
||||||
|
"matchedExampleEnd": 314,
|
||||||
|
"sourceSegment": "10 . państwa członkowskie odpowiedzialne są za drukowanie formularzy . formularze można również drukować w drukarniach wyznaczonych przez państwo członkowskie , w którym są ustanowione . w ostatnim przypadku na każdym formularzu musi znaleźć się odniesienie do tego postanowienia państwa członkowskiego . na każdym formularzu znajduje się informacja dotycząca nazwy i adresu drukarni lub znak umożliwiający jej identyfikację . \" ;",
|
||||||
|
"targetSegment": "10 . member states shall be responsible for having the forms printed . the forms may also be printed by printers appointed by the member state in which they are established . in the latter case , reference to the appointment by the member state must appear on each form . each form shall bear an indication of the printer 's name and address or a mark enabling the printer to be identified . '",
|
||||||
|
"targetFragments": [
|
||||||
|
[257, 264]
|
||||||
|
]
|
||||||
|
}, {
|
||||||
|
"id": 1782145,
|
||||||
|
"matchedExampleStart": 326,
|
||||||
|
"matchedExampleEnd": 335,
|
||||||
|
"sourceSegment": "10 . państwa członkowskie odpowiedzialne są za drukowanie formularzy . formularze można również drukować w drukarniach wyznaczonych przez państwo członkowskie , w którym są ustanowione . w ostatnim przypadku na każdym formularzu musi znaleźć się odniesienie do tego postanowienia państwa członkowskiego . na każdym formularzu znajduje się informacja dotycząca nazwy i adresu drukarni lub znak umożliwiający jej identyfikację . \" ;",
|
||||||
|
"targetSegment": "10 . member states shall be responsible for having the forms printed . the forms may also be printed by printers appointed by the member state in which they are established . in the latter case , reference to the appointment by the member state must appear on each form . each form shall bear an indication of the printer 's name and address or a mark enabling the printer to be identified . '",
|
||||||
|
"targetFragments": [
|
||||||
|
[300, 315]
|
||||||
|
]
|
||||||
|
}]
|
||||||
|
}, {
|
||||||
|
"matchedPatternStart": 9,
|
||||||
|
"matchedPatternEnd": 47,
|
||||||
|
"occurences": [{
|
||||||
|
"id": 1623941,
|
||||||
|
"matchedExampleStart": 54,
|
||||||
|
"matchedExampleEnd": 93,
|
||||||
|
"sourceSegment": "wszelkie spory między albo islandią , albo norwegią a państwem członkowskim unii europejskiej dotyczące interpretacji lub stosowania niniejszej umowy mogą być przekazywane przez stronę sporu na posiedzeniu przedstawicieli rządów państw członkowskich unii europejskiej oraz islandii i norwegii w celu rozstrzygnięcia sporu w terminie sześciu miesięcy .",
|
||||||
|
"targetSegment": "any dispute between either iceland or norway and a member state of the european union regarding the interpretation or the application of this agreement may be referred by a party to the dispute to a meeting of representatives of the governments of the member states of the european union and of iceland and norway , with a view to its settlement within six months .",
|
||||||
|
"targetFragments": [
|
||||||
|
[51, 85],
|
||||||
|
[96, 99]
|
||||||
|
]
|
||||||
|
}]
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user