fixed lcp search
Former-commit-id: 18192126d134323569bc43205ccc60788d9e6cb6
This commit is contained in:
parent
2533fd5b44
commit
f03b4ad954
@ -18,6 +18,8 @@ set (SUFFIX_MARKER_TYPE_MAX_VALUE "ULLONG_MAX")
|
|||||||
set (SUFFIX_MARKER_SENTENCE_BYTES 2)
|
set (SUFFIX_MARKER_SENTENCE_BYTES 2)
|
||||||
# The above settings assign 4 bytes to sentence id and 2 bytes each for suffix offset and sentence length.
|
# The above settings assign 4 bytes to sentence id and 2 bytes each for suffix offset and sentence length.
|
||||||
# This allows storing 2^32 = 4 294 967 296 sentences no longer than 65536 words.
|
# This allows storing 2^32 = 4 294 967 296 sentences no longer than 65536 words.
|
||||||
|
# After changing these values be sure to adjust tests (as well as the above calculations).
|
||||||
|
# Also, you might want to run TooLongHashTest from test_hash_generator.cpp
|
||||||
|
|
||||||
# =============================== #
|
# =============================== #
|
||||||
# Production paths
|
# Production paths
|
||||||
|
12
TODO.txt
12
TODO.txt
@ -4,6 +4,13 @@ DONE 3. Dzielenie zdań (max 255 tokenów)
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
DONE Anubis search się komplikuje! Przy tworzeniu obiektu tmMatches dla przykładu trzeba podać id przykładu, długość patternu i długość przykładu. Dwa pierwsze mamy, ale niestety nie ma skąd wziąć długości przykładu. Pamiętamy tylko offset sufiksu.
|
||||||
|
|
||||||
|
DONE 1. Bitwise operators (i stałe!) przy rozmiarze index character oraz markerów
|
||||||
|
2. Wykonać anubis search na nowych markerach z długością zdania
|
||||||
|
3. Multi-threading?
|
||||||
|
|
||||||
- concordia-server
|
- concordia-server
|
||||||
- zastanowić się nad empty hash examples (rozwiązanie: w ogóle nie szukać fraz o pustym hashu, rzucać wyjątek).
|
- zastanowić się nad empty hash examples (rozwiązanie: w ogóle nie szukać fraz o pustym hashu, rzucać wyjątek).
|
||||||
- wyłączyć stopWords
|
- wyłączyć stopWords
|
||||||
@ -15,8 +22,3 @@ zastanowić się nad optymalizacją:
|
|||||||
- unordered_map tmMatchesMap
|
- unordered_map tmMatchesMap
|
||||||
- LCP array
|
- LCP array
|
||||||
|
|
||||||
DONE Anubis search się komplikuje! Przy tworzeniu obiektu tmMatches dla przykładu trzeba podać id przykładu, długość patternu i długość przykładu. Dwa pierwsze mamy, ale niestety nie ma skąd wziąć długości przykładu. Pamiętamy tylko offset sufiksu.
|
|
||||||
|
|
||||||
DONE 1. Bitwise operators (i stałe!) przy rozmiarze index character oraz markerów
|
|
||||||
2. Wykonać anubis search na nowych markerach z długością zdania
|
|
||||||
3. Multi-threading?
|
|
||||||
|
@ -1,7 +1,9 @@
|
|||||||
#include "concordia/anubis_searcher.hpp"
|
#include "concordia/anubis_searcher.hpp"
|
||||||
#include "concordia/tm_matches.hpp"
|
#include "concordia/tm_matches.hpp"
|
||||||
|
#include "concordia/common/logging.hpp"
|
||||||
|
|
||||||
#include <boost/ptr_container/ptr_map.hpp>
|
#include <boost/ptr_container/ptr_map.hpp>
|
||||||
|
#include <boost/assign/ptr_map_inserter.hpp>
|
||||||
#include <boost/foreach.hpp>
|
#include <boost/foreach.hpp>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <map>
|
#include <map>
|
||||||
@ -23,6 +25,10 @@ boost::ptr_vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
|
|||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
|
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
|
||||||
throw(ConcordiaException) {
|
throw(ConcordiaException) {
|
||||||
|
SET_LOGGER_FILE("/tmp/concordia.log");
|
||||||
|
SET_LOGGING_LEVEL("ERROR");
|
||||||
|
INFO("AnubisSearcher::anubisSearch");
|
||||||
|
|
||||||
boost::ptr_vector<AnubisSearchResult> result;
|
boost::ptr_vector<AnubisSearchResult> result;
|
||||||
|
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > patternVector =
|
boost::shared_ptr<std::vector<sauchar_t> > patternVector =
|
||||||
@ -33,25 +39,66 @@ boost::ptr_vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
|
|||||||
throw ConcordiaException("Increasing pattern resolution went wrong.");
|
throw ConcordiaException("Increasing pattern resolution went wrong.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
INFO("AnubisSearcher::anubisSearch - about to create tmMatchesMap");
|
||||||
TmMatchesMap tmMatchesMap;
|
TmMatchesMap tmMatchesMap;
|
||||||
for (int offset = 0; offset < pattern->size(); offset++) {
|
for (int offset = 0; offset < pattern->size(); offset++) {
|
||||||
|
INFO("AnubisSearcher::anubisSearch - offset: ");
|
||||||
|
INFO(offset);
|
||||||
|
|
||||||
int highResOffset = offset * sizeof(INDEX_CHARACTER_TYPE);
|
int highResOffset = offset * sizeof(INDEX_CHARACTER_TYPE);
|
||||||
|
INFO("AnubisSearcher::anubisSearch - high res offset: ");
|
||||||
|
INFO(highResOffset);
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > currentPattern =
|
boost::shared_ptr<std::vector<sauchar_t> > currentPattern =
|
||||||
boost::shared_ptr<std::vector<sauchar_t> >
|
boost::shared_ptr<std::vector<sauchar_t> >
|
||||||
(new std::vector<sauchar_t>(
|
(new std::vector<sauchar_t>(
|
||||||
patternVector->begin()+highResOffset, patternVector->end()));
|
patternVector->begin()+highResOffset, patternVector->end()));
|
||||||
SUFFIX_MARKER_TYPE longestPrefixesLength;
|
SUFFIX_MARKER_TYPE highResLongestPrefixesLength;
|
||||||
|
INFO("AnubisSearcher::anubisSearch - about to get longest prefixes");
|
||||||
boost::ptr_vector<SubstringOccurence> longestPrefixes =
|
boost::ptr_vector<SubstringOccurence> longestPrefixes =
|
||||||
lcpSearch(T, markers, SA, currentPattern, longestPrefixesLength);
|
lcpSearch(T, markers, SA, currentPattern, highResLongestPrefixesLength);
|
||||||
|
|
||||||
BOOST_FOREACH(SubstringOccurence & occurence, longestPrefixes) {
|
INFO("AnubisSearcher::anubisSearch - longest prefixes got");
|
||||||
TmMatchesMapIterator mapIterator = tmMatchesMap.find(
|
SUFFIX_MARKER_TYPE longestPrefixesLength = highResLongestPrefixesLength /
|
||||||
occurence.getId());
|
sizeof(INDEX_CHARACTER_TYPE);
|
||||||
if (mapIterator != tmMatchesMap.end()) {
|
INFO("AnubisSearcher::anubisSearch - longest prefixes high res length");
|
||||||
|
INFO(highResLongestPrefixesLength);
|
||||||
|
INFO("AnubisSearcher::anubisSearch - longest prefixes length");
|
||||||
|
INFO(longestPrefixesLength);
|
||||||
|
|
||||||
|
if (longestPrefixesLength > 0) {
|
||||||
|
BOOST_FOREACH(SubstringOccurence & occurence, longestPrefixes) {
|
||||||
|
boost::shared_ptr<TmMatches> tmMatches;
|
||||||
|
|
||||||
|
TmMatchesMapIterator mapIterator = tmMatchesMap.find(
|
||||||
|
occurence.getId());
|
||||||
|
if (mapIterator != tmMatchesMap.end()) {
|
||||||
|
tmMatches = boost::shared_ptr<TmMatches>(
|
||||||
|
mapIterator->second
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
tmMatches = boost::shared_ptr<TmMatches>(
|
||||||
|
new TmMatches(
|
||||||
|
occurence.getId(),
|
||||||
|
occurence.getExampleLength(),
|
||||||
|
patternVector->size()
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
// add intervals to tmMatches
|
||||||
|
tmMatches->addExampleInterval(
|
||||||
|
occurence.getOffset(),
|
||||||
|
occurence.getOffset() + longestPrefixesLength
|
||||||
|
);
|
||||||
|
tmMatches->addPatternInterval(
|
||||||
|
offset,
|
||||||
|
offset + longestPrefixesLength
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// get the tmMatches list sorted descending by score
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -76,7 +123,7 @@ boost::ptr_vector<SubstringOccurence> AnubisSearcher::lcpSearch(
|
|||||||
prevLeft = left;
|
prevLeft = left;
|
||||||
prevSize = size;
|
prevSize = size;
|
||||||
|
|
||||||
patternLength++;
|
patternLength += sizeof(INDEX_CHARACTER_TYPE);
|
||||||
|
|
||||||
saidx_t localLeft;
|
saidx_t localLeft;
|
||||||
size = sa_search(T->data(), (saidx_t) T->size(),
|
size = sa_search(T->data(), (saidx_t) T->size(),
|
||||||
@ -91,7 +138,8 @@ boost::ptr_vector<SubstringOccurence> AnubisSearcher::lcpSearch(
|
|||||||
|
|
||||||
if (size == 0) {
|
if (size == 0) {
|
||||||
// The search managed to find exactly the longest common prefixes.
|
// The search managed to find exactly the longest common prefixes.
|
||||||
length = patternLength - 1;
|
|
||||||
|
length = patternLength - sizeof(INDEX_CHARACTER_TYPE);
|
||||||
if (length > 0) {
|
if (length > 0) {
|
||||||
// Get the results of the previous search
|
// Get the results of the previous search
|
||||||
_collectResults(result, markers, SA, prevLeft, prevSize);
|
_collectResults(result, markers, SA, prevLeft, prevSize);
|
||||||
@ -115,9 +163,10 @@ void AnubisSearcher::_collectResults(
|
|||||||
saidx_t left, saidx_t size) {
|
saidx_t left, saidx_t size) {
|
||||||
for (saidx_t i = 0; i < size; i++) {
|
for (saidx_t i = 0; i < size; i++) {
|
||||||
saidx_t resultPos = SA->at(left + i);
|
saidx_t resultPos = SA->at(left + i);
|
||||||
SUFFIX_MARKER_TYPE marker = markers->at(resultPos);
|
|
||||||
result.push_back(new SubstringOccurence(
|
if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
|
||||||
Utils::getIdFromMarker(marker),
|
SUFFIX_MARKER_TYPE marker = markers->at(resultPos / sizeof(INDEX_CHARACTER_TYPE));
|
||||||
Utils::getOffsetFromMarker(marker)));
|
result.push_back(new SubstringOccurence(marker));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -143,3 +143,15 @@ boost::ptr_vector<SubstringOccurence> Concordia::simpleSearch(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
boost::ptr_vector<AnubisSearchResult> Concordia::anubisSearch(
|
||||||
|
const string & pattern)
|
||||||
|
throw(ConcordiaException) {
|
||||||
|
if (_T->size() > 0) {
|
||||||
|
return _searcher->anubisSearch(_hashGenerator, _T,
|
||||||
|
_markers, _SA, pattern);
|
||||||
|
} else {
|
||||||
|
boost::ptr_vector<AnubisSearchResult> result;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@ -13,6 +13,7 @@
|
|||||||
#include "concordia/concordia_config.hpp"
|
#include "concordia/concordia_config.hpp"
|
||||||
#include "concordia/concordia_index.hpp"
|
#include "concordia/concordia_index.hpp"
|
||||||
#include "concordia/index_searcher.hpp"
|
#include "concordia/index_searcher.hpp"
|
||||||
|
#include "concordia/anubis_search_result.hpp"
|
||||||
#include <divsufsort.h>
|
#include <divsufsort.h>
|
||||||
|
|
||||||
|
|
||||||
@ -47,6 +48,10 @@ public:
|
|||||||
const std::string & pattern)
|
const std::string & pattern)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
|
boost::ptr_vector<AnubisSearchResult> anubisSearch(
|
||||||
|
const std::string & pattern)
|
||||||
|
throw(ConcordiaException);
|
||||||
|
|
||||||
void loadRAMIndexFromDisk() throw(ConcordiaException);
|
void loadRAMIndexFromDisk() throw(ConcordiaException);
|
||||||
|
|
||||||
void refreshSAfromRAM() throw(ConcordiaException);
|
void refreshSAfromRAM() throw(ConcordiaException);
|
||||||
|
@ -40,9 +40,7 @@ boost::ptr_vector<SubstringOccurence> IndexSearcher::simpleSearch(
|
|||||||
saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
|
saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
|
||||||
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);
|
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);
|
||||||
|
|
||||||
result.push_back(new SubstringOccurence(
|
result.push_back(new SubstringOccurence(marker));
|
||||||
Utils::getIdFromMarker(marker),
|
|
||||||
Utils::getOffsetFromMarker(marker)));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,10 +1,21 @@
|
|||||||
#include "concordia/substring_occurence.hpp"
|
#include "concordia/substring_occurence.hpp"
|
||||||
|
#include "concordia/common/utils.hpp"
|
||||||
|
|
||||||
|
|
||||||
SubstringOccurence::SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
|
SubstringOccurence::SubstringOccurence(const SUFFIX_MARKER_TYPE & marker) {
|
||||||
const SUFFIX_MARKER_TYPE & offset):
|
_id = Utils::getIdFromMarker(marker);
|
||||||
_id(id),
|
_offset = Utils::getOffsetFromMarker(marker);
|
||||||
_offset(offset) {
|
_exampleLength = Utils::getLengthFromMarker(marker);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
SubstringOccurence::SubstringOccurence(
|
||||||
|
const SUFFIX_MARKER_TYPE & id,
|
||||||
|
const SUFFIX_MARKER_TYPE & offset,
|
||||||
|
const SUFFIX_MARKER_TYPE & exampleLength):
|
||||||
|
_id(id),
|
||||||
|
_offset(offset),
|
||||||
|
_exampleLength(exampleLength) {
|
||||||
}
|
}
|
||||||
|
|
||||||
SubstringOccurence::~SubstringOccurence() {
|
SubstringOccurence::~SubstringOccurence() {
|
||||||
|
@ -13,9 +13,11 @@ using namespace std;
|
|||||||
|
|
||||||
class SubstringOccurence {
|
class SubstringOccurence {
|
||||||
public:
|
public:
|
||||||
explicit SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
|
explicit SubstringOccurence(const SUFFIX_MARKER_TYPE & marker);
|
||||||
const SUFFIX_MARKER_TYPE & offset);
|
|
||||||
|
|
||||||
|
SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
|
||||||
|
const SUFFIX_MARKER_TYPE & offset,
|
||||||
|
const SUFFIX_MARKER_TYPE & exampleLength);
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
virtual ~SubstringOccurence();
|
virtual ~SubstringOccurence();
|
||||||
@ -28,10 +30,17 @@ public:
|
|||||||
return _offset;
|
return _offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE getExampleLength() const {
|
||||||
|
return _exampleLength;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
SUFFIX_MARKER_TYPE _id;
|
SUFFIX_MARKER_TYPE _id;
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE _offset;
|
SUFFIX_MARKER_TYPE _offset;
|
||||||
|
|
||||||
|
// length of the example (decoded from the suffix marker or passed to the constructor)
|
||||||
|
SUFFIX_MARKER_TYPE _exampleLength;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -21,40 +21,115 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
|||||||
word map: b=1,a=2,n=3,z=4
|
word map: b=1,a=2,n=3,z=4
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
T->push_back(0);
|
||||||
|
T->push_back(0);
|
||||||
|
T->push_back(0);
|
||||||
T->push_back(1);
|
T->push_back(1);
|
||||||
|
|
||||||
|
T->push_back(0);
|
||||||
|
T->push_back(0);
|
||||||
|
T->push_back(0);
|
||||||
T->push_back(2);
|
T->push_back(2);
|
||||||
|
|
||||||
|
T->push_back(0);
|
||||||
|
T->push_back(0);
|
||||||
|
T->push_back(0);
|
||||||
T->push_back(3);
|
T->push_back(3);
|
||||||
|
|
||||||
|
T->push_back(0);
|
||||||
|
T->push_back(0);
|
||||||
|
T->push_back(0);
|
||||||
T->push_back(2);
|
T->push_back(2);
|
||||||
|
|
||||||
|
T->push_back(0);
|
||||||
|
T->push_back(0);
|
||||||
|
T->push_back(0);
|
||||||
T->push_back(3);
|
T->push_back(3);
|
||||||
|
|
||||||
|
T->push_back(0);
|
||||||
|
T->push_back(0);
|
||||||
|
T->push_back(0);
|
||||||
T->push_back(2);
|
T->push_back(2);
|
||||||
|
|
||||||
for(int i=0;i<6;i++) {
|
for(int i=0;i<6;i++) {
|
||||||
markers->push_back(Utils::createMarker(34,i,6));
|
markers->push_back(Utils::createMarker(34,i,6));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pattern->push_back(0);
|
||||||
|
pattern->push_back(0);
|
||||||
|
pattern->push_back(0);
|
||||||
pattern->push_back(2);
|
pattern->push_back(2);
|
||||||
|
|
||||||
|
pattern->push_back(0);
|
||||||
|
pattern->push_back(0);
|
||||||
|
pattern->push_back(0);
|
||||||
pattern->push_back(3);
|
pattern->push_back(3);
|
||||||
pattern->push_back(4);
|
|
||||||
|
pattern->push_back(0);
|
||||||
|
pattern->push_back(0);
|
||||||
|
pattern->push_back(0);
|
||||||
pattern->push_back(4);
|
pattern->push_back(4);
|
||||||
|
|
||||||
/* Suffix array for the hashed index: 1 2 3 2 3 2
|
pattern->push_back(0);
|
||||||
0: 1 2 3 2 3 2
|
pattern->push_back(0);
|
||||||
5: 2
|
pattern->push_back(0);
|
||||||
3: 2 3 2
|
pattern->push_back(4);
|
||||||
1: 2 3 2 3 2
|
|
||||||
4: 3 2
|
/* Suffix array for the hashed index: 0001 0002 0003 0002 0003 0002
|
||||||
2: 3 2 3 2
|
0:000100020003000200030002
|
||||||
|
20:0002
|
||||||
|
12:000200030002
|
||||||
|
4:00020003000200030002
|
||||||
|
16:00030002
|
||||||
|
8:0003000200030002
|
||||||
|
1:00100020003000200030002
|
||||||
|
21:002
|
||||||
|
13:00200030002
|
||||||
|
5:0020003000200030002
|
||||||
|
17:0030002
|
||||||
|
9:003000200030002
|
||||||
|
2:0100020003000200030002
|
||||||
|
22:02
|
||||||
|
14:0200030002
|
||||||
|
6:020003000200030002
|
||||||
|
18:030002
|
||||||
|
10:03000200030002
|
||||||
|
3:100020003000200030002
|
||||||
|
23:2
|
||||||
|
15:200030002
|
||||||
|
7:20003000200030002
|
||||||
|
19:30002
|
||||||
|
11:3000200030002
|
||||||
*/
|
*/
|
||||||
|
|
||||||
SA->push_back(0);
|
SA->push_back(0);
|
||||||
SA->push_back(5);
|
SA->push_back(20);
|
||||||
SA->push_back(3);
|
SA->push_back(12);
|
||||||
SA->push_back(1);
|
|
||||||
SA->push_back(4);
|
SA->push_back(4);
|
||||||
|
SA->push_back(16);
|
||||||
|
SA->push_back(8);
|
||||||
|
SA->push_back(1);
|
||||||
|
SA->push_back(21);
|
||||||
|
SA->push_back(13);
|
||||||
|
SA->push_back(5);
|
||||||
|
SA->push_back(17);
|
||||||
|
SA->push_back(9);
|
||||||
SA->push_back(2);
|
SA->push_back(2);
|
||||||
|
SA->push_back(22);
|
||||||
|
SA->push_back(14);
|
||||||
|
SA->push_back(6);
|
||||||
|
SA->push_back(18);
|
||||||
|
SA->push_back(10);
|
||||||
|
SA->push_back(3);
|
||||||
|
SA->push_back(23);
|
||||||
|
SA->push_back(15);
|
||||||
|
SA->push_back(7);
|
||||||
|
SA->push_back(19);
|
||||||
|
SA->push_back(11);
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE length;
|
SUFFIX_MARKER_TYPE highResLength;
|
||||||
boost::ptr_vector<SubstringOccurence> result = searcher.lcpSearch(T, markers, SA, pattern, length);
|
boost::ptr_vector<SubstringOccurence> result = searcher.lcpSearch(T, markers, SA, pattern, highResLength);
|
||||||
|
SUFFIX_MARKER_TYPE length = highResLength / sizeof(INDEX_CHARACTER_TYPE);
|
||||||
|
|
||||||
/* Expecting to get the following results from SA:
|
/* Expecting to get the following results from SA:
|
||||||
3: ana
|
3: ana
|
||||||
@ -72,21 +147,46 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
|||||||
//--------pattern banana
|
//--------pattern banana
|
||||||
|
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > pattern2(new std::vector<sauchar_t>());
|
boost::shared_ptr<std::vector<sauchar_t> > pattern2(new std::vector<sauchar_t>());
|
||||||
|
pattern2->push_back(0);
|
||||||
|
pattern2->push_back(0);
|
||||||
|
pattern2->push_back(0);
|
||||||
pattern2->push_back(1);
|
pattern2->push_back(1);
|
||||||
pattern2->push_back(2);
|
|
||||||
pattern2->push_back(3);
|
pattern2->push_back(0);
|
||||||
pattern2->push_back(2);
|
pattern2->push_back(0);
|
||||||
pattern2->push_back(3);
|
pattern2->push_back(0);
|
||||||
pattern2->push_back(2);
|
pattern2->push_back(2);
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE length2;
|
pattern2->push_back(0);
|
||||||
boost::ptr_vector<SubstringOccurence> result2 = searcher.lcpSearch(T, markers, SA, pattern2, length2);
|
pattern2->push_back(0);
|
||||||
|
pattern2->push_back(0);
|
||||||
|
pattern2->push_back(3);
|
||||||
|
|
||||||
|
pattern2->push_back(0);
|
||||||
|
pattern2->push_back(0);
|
||||||
|
pattern2->push_back(0);
|
||||||
|
pattern2->push_back(2);
|
||||||
|
|
||||||
|
pattern2->push_back(0);
|
||||||
|
pattern2->push_back(0);
|
||||||
|
pattern2->push_back(0);
|
||||||
|
pattern2->push_back(3);
|
||||||
|
|
||||||
|
pattern2->push_back(0);
|
||||||
|
pattern2->push_back(0);
|
||||||
|
pattern2->push_back(0);
|
||||||
|
pattern2->push_back(2);
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE highResLength2;
|
||||||
|
boost::ptr_vector<SubstringOccurence> result2 = searcher.lcpSearch(T, markers, SA, pattern2, highResLength2);
|
||||||
|
SUFFIX_MARKER_TYPE length2 = highResLength2 / sizeof(INDEX_CHARACTER_TYPE);
|
||||||
|
|
||||||
/* Expecting to get one result from SA:
|
/* Expecting to get one result from SA:
|
||||||
0: banana
|
0: banana
|
||||||
Which is one substring occurence (34,0) with the lcp length = 6;
|
Which is one substring occurence (34,0) with the lcp length = 6;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(result2.size(),1);
|
BOOST_CHECK_EQUAL(result2.size(),1);
|
||||||
BOOST_CHECK_EQUAL(length2,6);
|
BOOST_CHECK_EQUAL(length2,6);
|
||||||
BOOST_CHECK_EQUAL(result2.at(0).getId(),34);
|
BOOST_CHECK_EQUAL(result2.at(0).getId(),34);
|
||||||
@ -95,14 +195,34 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
|||||||
//--------pattern banan
|
//--------pattern banan
|
||||||
|
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > pattern3(new std::vector<sauchar_t>());
|
boost::shared_ptr<std::vector<sauchar_t> > pattern3(new std::vector<sauchar_t>());
|
||||||
|
pattern3->push_back(0);
|
||||||
|
pattern3->push_back(0);
|
||||||
|
pattern3->push_back(0);
|
||||||
pattern3->push_back(1);
|
pattern3->push_back(1);
|
||||||
|
|
||||||
|
pattern3->push_back(0);
|
||||||
|
pattern3->push_back(0);
|
||||||
|
pattern3->push_back(0);
|
||||||
pattern3->push_back(2);
|
pattern3->push_back(2);
|
||||||
pattern3->push_back(3);
|
|
||||||
pattern3->push_back(2);
|
pattern3->push_back(0);
|
||||||
|
pattern3->push_back(0);
|
||||||
|
pattern3->push_back(0);
|
||||||
pattern3->push_back(3);
|
pattern3->push_back(3);
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE length3;
|
pattern3->push_back(0);
|
||||||
boost::ptr_vector<SubstringOccurence> result3 = searcher.lcpSearch(T, markers, SA, pattern3, length3);
|
pattern3->push_back(0);
|
||||||
|
pattern3->push_back(0);
|
||||||
|
pattern3->push_back(2);
|
||||||
|
|
||||||
|
pattern3->push_back(0);
|
||||||
|
pattern3->push_back(0);
|
||||||
|
pattern3->push_back(0);
|
||||||
|
pattern3->push_back(3);
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE highResLength3;
|
||||||
|
boost::ptr_vector<SubstringOccurence> result3 = searcher.lcpSearch(T, markers, SA, pattern3, highResLength3);
|
||||||
|
SUFFIX_MARKER_TYPE length3 = highResLength3 / sizeof(INDEX_CHARACTER_TYPE);
|
||||||
|
|
||||||
/* Expecting to get one result from SA:
|
/* Expecting to get one result from SA:
|
||||||
0: banana
|
0: banana
|
||||||
@ -117,13 +237,29 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
|||||||
//--------pattern nazz
|
//--------pattern nazz
|
||||||
|
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > pattern4(new std::vector<sauchar_t>());
|
boost::shared_ptr<std::vector<sauchar_t> > pattern4(new std::vector<sauchar_t>());
|
||||||
|
pattern4->push_back(0);
|
||||||
|
pattern4->push_back(0);
|
||||||
|
pattern4->push_back(0);
|
||||||
pattern4->push_back(3);
|
pattern4->push_back(3);
|
||||||
|
|
||||||
|
pattern4->push_back(0);
|
||||||
|
pattern4->push_back(0);
|
||||||
|
pattern4->push_back(0);
|
||||||
pattern4->push_back(2);
|
pattern4->push_back(2);
|
||||||
pattern4->push_back(4);
|
|
||||||
|
pattern4->push_back(0);
|
||||||
|
pattern4->push_back(0);
|
||||||
|
pattern4->push_back(0);
|
||||||
pattern4->push_back(4);
|
pattern4->push_back(4);
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE length4;
|
pattern4->push_back(0);
|
||||||
boost::ptr_vector<SubstringOccurence> result4 = searcher.lcpSearch(T, markers, SA, pattern4, length4);
|
pattern4->push_back(0);
|
||||||
|
pattern4->push_back(0);
|
||||||
|
pattern4->push_back(4);
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE highResLength4;
|
||||||
|
boost::ptr_vector<SubstringOccurence> result4 = searcher.lcpSearch(T, markers, SA, pattern4, highResLength4);
|
||||||
|
SUFFIX_MARKER_TYPE length4 = highResLength4 / sizeof(INDEX_CHARACTER_TYPE);
|
||||||
|
|
||||||
/* Expecting to get 2 results from SA:
|
/* Expecting to get 2 results from SA:
|
||||||
4: na
|
4: na
|
||||||
@ -137,6 +273,60 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
|||||||
BOOST_CHECK_EQUAL(result4.at(0).getOffset(),4);
|
BOOST_CHECK_EQUAL(result4.at(0).getOffset(),4);
|
||||||
BOOST_CHECK_EQUAL(result4.at(1).getId(),34);
|
BOOST_CHECK_EQUAL(result4.at(1).getId(),34);
|
||||||
BOOST_CHECK_EQUAL(result4.at(1).getOffset(),2);
|
BOOST_CHECK_EQUAL(result4.at(1).getOffset(),2);
|
||||||
|
|
||||||
|
|
||||||
|
//--------pattern zz
|
||||||
|
|
||||||
|
boost::shared_ptr<std::vector<sauchar_t> > pattern5(new std::vector<sauchar_t>());
|
||||||
|
pattern5->push_back(0);
|
||||||
|
pattern5->push_back(0);
|
||||||
|
pattern5->push_back(0);
|
||||||
|
pattern5->push_back(4);
|
||||||
|
|
||||||
|
pattern5->push_back(0);
|
||||||
|
pattern5->push_back(0);
|
||||||
|
pattern5->push_back(0);
|
||||||
|
pattern5->push_back(4);
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE highResLength5;
|
||||||
|
boost::ptr_vector<SubstringOccurence> result5 = searcher.lcpSearch(T, markers, SA, pattern5, highResLength5);
|
||||||
|
SUFFIX_MARKER_TYPE length5 = highResLength5 / sizeof(INDEX_CHARACTER_TYPE);
|
||||||
|
|
||||||
|
/* Expecting to get 0 results from SA, lcp length = 0;
|
||||||
|
*/
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(result5.size(),0);
|
||||||
|
BOOST_CHECK_EQUAL(length5,0);
|
||||||
|
|
||||||
|
//--------pattern existing in the text but spanning over parts of characters
|
||||||
|
|
||||||
|
boost::shared_ptr<std::vector<sauchar_t> > pattern6(new std::vector<sauchar_t>());
|
||||||
|
pattern6->push_back(0);
|
||||||
|
pattern6->push_back(0);
|
||||||
|
pattern6->push_back(3);
|
||||||
|
|
||||||
|
pattern6->push_back(0);
|
||||||
|
pattern6->push_back(0);
|
||||||
|
pattern6->push_back(0);
|
||||||
|
pattern6->push_back(2);
|
||||||
|
|
||||||
|
pattern6->push_back(0);
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE highResLength6;
|
||||||
|
boost::ptr_vector<SubstringOccurence> result6 = searcher.lcpSearch(T, markers, SA, pattern5, highResLength6);
|
||||||
|
SUFFIX_MARKER_TYPE length6 = highResLength6 / sizeof(INDEX_CHARACTER_TYPE);
|
||||||
|
|
||||||
|
/* Expecting to get 0 results from SA, lcp length = 0;
|
||||||
|
*/
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(result6.size(),0);
|
||||||
|
BOOST_CHECK_EQUAL(length6,0);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( AnubisSearch1 )
|
||||||
|
{
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||||
#include "concordia/concordia.hpp"
|
#include "concordia/concordia.hpp"
|
||||||
|
#include "concordia/anubis_search_result.hpp"
|
||||||
#include "tests/common/test_resources_manager.hpp"
|
#include "tests/common/test_resources_manager.hpp"
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
|
|
||||||
@ -148,5 +149,52 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
|
|||||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 1);
|
BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
|
||||||
|
{
|
||||||
|
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||||
|
concordia.addExample(Example("Ala posiada kota",14));
|
||||||
|
concordia.addExample(Example("Ala posiada rysia",51));
|
||||||
|
concordia.addExample(Example("Marysia posiada rysia",123));
|
||||||
|
concordia.refreshSAfromRAM();
|
||||||
|
|
||||||
|
/*The test index contains 3 sentences:
|
||||||
|
14: "Ala posiada kota"
|
||||||
|
51: "Ala posiada rysia"
|
||||||
|
123: "Marysia posiada rysia"
|
||||||
|
|
||||||
|
Test word map:
|
||||||
|
Ala -> 0
|
||||||
|
posiada -> 1
|
||||||
|
kota -> 2
|
||||||
|
rysia -> 3
|
||||||
|
Marysia -> 4
|
||||||
|
|
||||||
|
Test hashed index:
|
||||||
|
n: 0 1 2 3 4 5 6 7 8 9 10 11
|
||||||
|
T[n]: 0 1 2 | 0 1 3 | 4 1 3 |
|
||||||
|
|
||||||
|
Test suffix array:
|
||||||
|
n: 0 1 2 3 4 5 6 7 8 9 10 11
|
||||||
|
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
|
||||||
|
|
||||||
|
*/
|
||||||
|
boost::ptr_vector<AnubisSearchResult> searchResult1 = concordia.anubisSearch("posiada rysia chyba");
|
||||||
|
boost::ptr_vector<AnubisSearchResult> searchResult2 = concordia.anubisSearch("posiada kota Ala");
|
||||||
|
|
||||||
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
||||||
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
||||||
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
|
||||||
|
|
||||||
|
/*
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1.size(), 2);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 123);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1.at(1).getId(), 51);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1.at(1).getOffset(), 1);
|
||||||
|
|
||||||
|
// Checking pattern spanning over 2 segments
|
||||||
|
BOOST_CHECK_EQUAL(searchResult2.size(), 0);
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE_END()
|
BOOST_AUTO_TEST_SUITE_END()
|
||||||
|
@ -32,6 +32,8 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
|
|||||||
BOOST_CHECK_EQUAL_COLLECTIONS(hash->begin(), hash->end(), expected->begin(), expected->end());
|
BOOST_CHECK_EQUAL_COLLECTIONS(hash->begin(), hash->end(), expected->begin(), expected->end());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Commented out - the test takes too long. Run it once whenever the SUFFIX_MARKER_SENTENCE_BYTES parameter changes.
|
||||||
|
Running it is optional: the maximum sentence size is still covered by a test in test_utils.cpp.
|
||||||
BOOST_AUTO_TEST_CASE( TooLongHashTest )
|
BOOST_AUTO_TEST_CASE( TooLongHashTest )
|
||||||
{
|
{
|
||||||
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
||||||
@ -62,6 +64,7 @@ BOOST_AUTO_TEST_CASE( TooLongHashTest )
|
|||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
||||||
{
|
{
|
||||||
|
Loading…
Reference in New Issue
Block a user