lcp search
Former-commit-id: 925a5de8bc33256b594c369907f202e29f809f47
This commit is contained in:
parent
dd8b27cc23
commit
e8ea5881a5
10
TODO.txt
10
TODO.txt
@ -1,4 +1,6 @@
|
|||||||
1. lokalizowane to_lower (wykorzystać utf8case, naprawić testy)
|
DONE 1. lokalizowane to_lower (wykorzystać utf8case, naprawić testy)
|
||||||
2. anonimizacja zdań
|
DONE 2. anonimizacja zdań
|
||||||
3. Dzielenie zdań (max 255 tokenów)
|
DONE 3. Dzielenie zdań (max 255 tokenów)
|
||||||
4. concordia-server
|
|
||||||
|
- concordia-server
|
||||||
|
- zastanowić się nad empty hash examples
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
#include "concordia/anubis_searcher.hpp"
|
#include "concordia/anubis_searcher.hpp"
|
||||||
|
|
||||||
|
#include<iostream>
|
||||||
|
|
||||||
|
|
||||||
AnubisSearcher::AnubisSearcher() {
|
AnubisSearcher::AnubisSearcher() {
|
||||||
}
|
}
|
||||||
@ -13,9 +15,75 @@ boost::ptr_vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
|
|||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> >)
|
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
|
||||||
throw(ConcordiaException) {
|
throw(ConcordiaException) {
|
||||||
|
|
||||||
boost::ptr_vector<AnubisSearchResult> result;
|
boost::ptr_vector<AnubisSearchResult> result;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Finds the longest prefix of the pattern that occurs in the index and
    returns every occurence of that prefix.

    The pattern is searched one character longer at a time. Each search is
    restricted to the suffix array range matched by the previous, shorter
    prefix, so the range only ever narrows.

    \param T the hashed index text
    \param markers markers parallel to T, looked up by text position
    \param SA suffix array of T
    \param pattern the hashed search pattern
    \param length output: number of leading pattern characters that were
           found in the index (0 when nothing matched)
    \returns occurences of the longest matched prefix; empty when
             length == 0
*/
boost::ptr_vector<SubstringOccurence> AnubisSearcher::lcpSearch(
    boost::shared_ptr<std::vector<sauchar_t> > T,
    boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
    boost::shared_ptr<std::vector<saidx_t> > SA,
    boost::shared_ptr<std::vector<sauchar_t> > pattern,
    SUFFIX_MARKER_TYPE & length)
    throw(ConcordiaException) {
    boost::ptr_vector<SubstringOccurence> result;
    length = 0;

    // An empty pattern or an empty index cannot share any prefix. Leave
    // early so that the loop below never asks sa_search to read the first
    // character of an empty pattern.
    if (pattern->empty() || T->empty() || SA->empty()) {
        return result;
    }

    const saidx_t fullPatternLength = static_cast<saidx_t>(pattern->size());
    const sauchar_t * patternArray = pattern->data();

    // Start of the suffix array range still under consideration.
    const saidx_t * SAleft = SA->data();

    saidx_t patternLength = 0;
    saidx_t size = static_cast<saidx_t>(SA->size());
    saidx_t left = 0;

    // Range matched by the previous (one character shorter) prefix.
    saidx_t prevLeft = 0;
    saidx_t prevSize = 0;

    do {
        prevLeft = left;
        prevSize = size;

        ++patternLength;

        saidx_t localLeft = 0;
        size = sa_search(T->data(), static_cast<saidx_t>(T->size()),
                         patternArray, patternLength,
                         SAleft, size, &localLeft);

        // localLeft is relative to SAleft. Only narrow the range when the
        // longer prefix was actually found.
        if (size > 0) {
            left += localLeft;
            SAleft += localLeft;
        }
    } while (patternLength < fullPatternLength && size > 0);

    if (size <= 0) {
        // The last prefix tried was not found, so the longest common
        // prefix is one character shorter.
        // NOTE(review): a negative size is assumed to be sa_search's error
        // return and is treated like "not found" -- confirm against the
        // libdivsufsort documentation.
        length = patternLength - 1;
        if (length > 0) {
            // Get the results of the previous search
            _collectResults(result, markers, SA, prevLeft, prevSize);
        }
        // If length == 0, then the pattern has no common prefixes
        // with the index.
    } else {
        // The index contains at least one occurence of the whole
        // search pattern.
        length = patternLength;
        _collectResults(result, markers, SA, left, size);
    }

    return result;
}
|
||||||
|
|
||||||
|
/*! Appends one SubstringOccurence to result for each entry of the suffix
    array range [left, left + size).

    Each suffix array entry is a position in the index text. The marker
    stored at that position is split into marker / SUFFIX_MARKER_DIVISOR
    (passed as the occurence id) and marker % SUFFIX_MARKER_DIVISOR
    (passed as the occurence offset).

    \param result container receiving the new occurences
    \param markers markers looked up by text position
    \param SA suffix array
    \param left index of the first suffix array entry to collect
    \param size number of suffix array entries to collect
*/
void AnubisSearcher::_collectResults(
    boost::ptr_vector<SubstringOccurence> & result,
    boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
    boost::shared_ptr<std::vector<saidx_t> > SA,
    saidx_t left, saidx_t size) {
    saidx_t collected = 0;
    while (collected < size) {
        // at() keeps the bounds checking on both lookups.
        const saidx_t textPosition = SA->at(left + collected);
        const SUFFIX_MARKER_TYPE packedMarker = markers->at(textPosition);

        result.push_back(new SubstringOccurence(
            packedMarker / SUFFIX_MARKER_DIVISOR,
            packedMarker % SUFFIX_MARKER_DIVISOR));

        ++collected;
    }
}
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
#include <boost/ptr_container/ptr_vector.hpp>
|
#include <boost/ptr_container/ptr_vector.hpp>
|
||||||
|
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
|
#include "concordia/common/utils.hpp"
|
||||||
#include "concordia/substring_occurence.hpp"
|
#include "concordia/substring_occurence.hpp"
|
||||||
#include "concordia/concordia_exception.hpp"
|
#include "concordia/concordia_exception.hpp"
|
||||||
#include "concordia/anubis_search_result.hpp"
|
#include "concordia/anubis_search_result.hpp"
|
||||||
@ -27,11 +28,24 @@ public:
|
|||||||
virtual ~AnubisSearcher();
|
virtual ~AnubisSearcher();
|
||||||
|
|
||||||
boost::ptr_vector<AnubisSearchResult> anubisSearch(
|
boost::ptr_vector<AnubisSearchResult> anubisSearch(
|
||||||
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
|
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
|
||||||
|
throw(ConcordiaException);
|
||||||
|
|
||||||
|
boost::ptr_vector<SubstringOccurence> lcpSearch(
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> >) throw(ConcordiaException);
|
boost::shared_ptr<std::vector<sauchar_t> > pattern,
|
||||||
|
SUFFIX_MARKER_TYPE & length) throw(ConcordiaException);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
void _collectResults(boost::ptr_vector<SubstringOccurence> & result,
|
||||||
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
|
saidx_t left, saidx_t size);
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
#define UTILS_HDR
|
#define UTILS_HDR
|
||||||
|
|
||||||
#include <boost/shared_ptr.hpp>
|
#include <boost/shared_ptr.hpp>
|
||||||
|
#include <boost/foreach.hpp>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
@ -37,9 +38,19 @@ public:
|
|||||||
boost::shared_ptr<std::vector<sauchar_t> > vector,
|
boost::shared_ptr<std::vector<sauchar_t> > vector,
|
||||||
INDEX_CHARACTER_TYPE character);
|
INDEX_CHARACTER_TYPE character);
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
static void printVector(boost::shared_ptr<std::vector<T> > vector);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static void _insertCharToSaucharArray(sauchar_t * array,
|
static void _insertCharToSaucharArray(sauchar_t * array,
|
||||||
INDEX_CHARACTER_TYPE character, int pos);
|
INDEX_CHARACTER_TYPE character, int pos);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void Utils::printVector(boost::shared_ptr<std::vector<T> > vector) {
|
||||||
|
for (int i = 0; i < vector->size(); i++) {
|
||||||
|
cout << vector->at(i) << " ";
|
||||||
|
}
|
||||||
|
cout << endl;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -25,6 +25,7 @@ boost::ptr_vector<SubstringOccurence> IndexSearcher::simpleSearch(
|
|||||||
hashGenerator->generateHash(pattern);
|
hashGenerator->generateHash(pattern);
|
||||||
saidx_t patternLength = hash->size()*sizeof(INDEX_CHARACTER_TYPE);
|
saidx_t patternLength = hash->size()*sizeof(INDEX_CHARACTER_TYPE);
|
||||||
sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
|
sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
|
||||||
|
|
||||||
int size = sa_search(T->data(), (saidx_t) T->size(),
|
int size = sa_search(T->data(), (saidx_t) T->size(),
|
||||||
(const sauchar_t *) patternArray, patternLength,
|
(const sauchar_t *) patternArray, patternLength,
|
||||||
SA->data(), (saidx_t) SA->size(), &left);
|
SA->data(), (saidx_t) SA->size(), &left);
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
|
|
||||||
SubstringOccurence::SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
|
SubstringOccurence::SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
|
||||||
const int & offset):
|
const SUFFIX_MARKER_TYPE & offset):
|
||||||
_id(id),
|
_id(id),
|
||||||
_offset(offset) {
|
_offset(offset) {
|
||||||
}
|
}
|
||||||
|
@ -14,7 +14,7 @@ using namespace std;
|
|||||||
class SubstringOccurence {
|
class SubstringOccurence {
|
||||||
public:
|
public:
|
||||||
explicit SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
|
explicit SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
|
||||||
const int & offset);
|
const SUFFIX_MARKER_TYPE & offset);
|
||||||
|
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
@ -24,14 +24,14 @@ public:
|
|||||||
return _id;
|
return _id;
|
||||||
}
|
}
|
||||||
|
|
||||||
int getOffset() const {
|
SUFFIX_MARKER_TYPE getOffset() const {
|
||||||
return _offset;
|
return _offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
SUFFIX_MARKER_TYPE _id;
|
SUFFIX_MARKER_TYPE _id;
|
||||||
|
|
||||||
int _offset;
|
SUFFIX_MARKER_TYPE _offset;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
add_library(concordia-tests
|
add_library(concordia-tests
|
||||||
|
test_anubis_searcher.cpp
|
||||||
test_sentence_anonymizer.cpp
|
test_sentence_anonymizer.cpp
|
||||||
test_text_utils.cpp
|
test_text_utils.cpp
|
||||||
test_regex_replacement.cpp
|
test_regex_replacement.cpp
|
||||||
|
143
concordia/t/test_anubis_searcher.cpp
Normal file
143
concordia/t/test_anubis_searcher.cpp
Normal file
@ -0,0 +1,143 @@
|
|||||||
|
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||||
|
#include "concordia/anubis_searcher.hpp"
|
||||||
|
#include "concordia/common/config.hpp"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_SUITE(anubis_searcher)
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( LcpSearch1 )
{
    /* Fixture: the text "banana" hashed with the word map
         b=1, a=2, n=3, z=4
       which gives T = 1 2 3 2 3 2. The whole text is one sentence
       with id=34.
    */
    const int textLength = 6;
    const sauchar_t textCodes[] = {1, 2, 3, 2, 3, 2};

    /* Suffix array for the hashed index: 1 2 3 2 3 2
        0: 1 2 3 2 3 2
        5: 2
        3: 2 3 2
        1: 2 3 2 3 2
        4: 3 2
        2: 3 2 3 2
    */
    const saidx_t suffixOrder[] = {0, 5, 3, 1, 4, 2};

    AnubisSearcher searcher;
    boost::shared_ptr<std::vector<sauchar_t> > T(
        new std::vector<sauchar_t>(textCodes, textCodes + textLength));
    boost::shared_ptr<std::vector<saidx_t> > SA(
        new std::vector<saidx_t>(suffixOrder, suffixOrder + textLength));

    // One marker per text position: consecutive values starting at
    // 34 * SUFFIX_MARKER_DIVISOR.
    boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers(
        new std::vector<SUFFIX_MARKER_TYPE>());
    SUFFIX_MARKER_TYPE nextMarker = 34 * SUFFIX_MARKER_DIVISOR;
    for (int position = 0; position < textLength; ++position) {
        markers->push_back(nextMarker);
        ++nextMarker;
    }

    //--------pattern anzz

    const sauchar_t anzzCodes[] = {2, 3, 4, 4};
    boost::shared_ptr<std::vector<sauchar_t> > pattern(
        new std::vector<sauchar_t>(anzzCodes, anzzCodes + 4));

    SUFFIX_MARKER_TYPE length;
    boost::ptr_vector<SubstringOccurence> result =
        searcher.lcpSearch(T, markers, SA, pattern, length);

    /* Only the prefix "an" occurs in the text, in these suffixes:
        3: ana
        1: anana
       Expected: 2 occurences, (34,3) and (34,1), with lcp length = 2.
    */
    BOOST_CHECK_EQUAL(result.size(),2);
    BOOST_CHECK_EQUAL(length,2);
    BOOST_CHECK_EQUAL(result.at(0).getId(),34);
    BOOST_CHECK_EQUAL(result.at(0).getOffset(),3);
    BOOST_CHECK_EQUAL(result.at(1).getId(),34);
    BOOST_CHECK_EQUAL(result.at(1).getOffset(),1);

    //--------pattern banana

    const sauchar_t bananaCodes[] = {1, 2, 3, 2, 3, 2};
    boost::shared_ptr<std::vector<sauchar_t> > pattern2(
        new std::vector<sauchar_t>(bananaCodes, bananaCodes + 6));

    SUFFIX_MARKER_TYPE length2;
    boost::ptr_vector<SubstringOccurence> result2 =
        searcher.lcpSearch(T, markers, SA, pattern2, length2);

    /* The whole pattern occurs once:
        0: banana
       Expected: 1 occurence, (34,0), with lcp length = 6.
    */
    BOOST_CHECK_EQUAL(result2.size(),1);
    BOOST_CHECK_EQUAL(length2,6);
    BOOST_CHECK_EQUAL(result2.at(0).getId(),34);
    BOOST_CHECK_EQUAL(result2.at(0).getOffset(),0);

    //--------pattern banan

    const sauchar_t bananCodes[] = {1, 2, 3, 2, 3};
    boost::shared_ptr<std::vector<sauchar_t> > pattern3(
        new std::vector<sauchar_t>(bananCodes, bananCodes + 5));

    SUFFIX_MARKER_TYPE length3;
    boost::ptr_vector<SubstringOccurence> result3 =
        searcher.lcpSearch(T, markers, SA, pattern3, length3);

    /* The whole pattern is a prefix of one suffix:
        0: banana
       Expected: 1 occurence, (34,0), with lcp length = 5.
    */
    BOOST_CHECK_EQUAL(result3.size(),1);
    BOOST_CHECK_EQUAL(length3,5);
    BOOST_CHECK_EQUAL(result3.at(0).getId(),34);
    BOOST_CHECK_EQUAL(result3.at(0).getOffset(),0);

    //--------pattern nazz

    const sauchar_t nazzCodes[] = {3, 2, 4, 4};
    boost::shared_ptr<std::vector<sauchar_t> > pattern4(
        new std::vector<sauchar_t>(nazzCodes, nazzCodes + 4));

    SUFFIX_MARKER_TYPE length4;
    boost::ptr_vector<SubstringOccurence> result4 =
        searcher.lcpSearch(T, markers, SA, pattern4, length4);

    /* Only the prefix "na" occurs in the text, in these suffixes:
        4: na
        2: nana
       Expected: 2 occurences, (34,4) and (34,2), with lcp length = 2.
    */
    BOOST_CHECK_EQUAL(result4.size(),2);
    BOOST_CHECK_EQUAL(length4,2);
    BOOST_CHECK_EQUAL(result4.at(0).getId(),34);
    BOOST_CHECK_EQUAL(result4.at(0).getOffset(),4);
    BOOST_CHECK_EQUAL(result4.at(1).getId(),34);
    BOOST_CHECK_EQUAL(result4.at(1).getOffset(),2);
}
|
||||||
|
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_SUITE_END()
|
@ -93,4 +93,36 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest2 )
|
|||||||
BOOST_CHECK_EQUAL_COLLECTIONS(SA->begin(), SA->end(), expectedSA->begin(), expectedSA->end());
|
BOOST_CHECK_EQUAL_COLLECTIONS(SA->begin(), SA->end(), expectedSA->begin(), expectedSA->end());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest3 )
{
    ConcordiaIndex index(
        TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
        TestResourcesManager::getTestFilePath("temp","test_markers.bin"));

    //Test hashed index:
    //    n: 0 1 2 3 4 5
    // T[n]: 1 2 3 2 3 2
    const sauchar_t hashedText[] = {1, 2, 3, 2, 3, 2};
    boost::shared_ptr<std::vector<sauchar_t> > T(
        new std::vector<sauchar_t>(hashedText, hashedText + 6));

    boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(T);

    //Expected suffix array (suffix start positions in sorted order):
    //     n: 0 1 2 3 4 5
    // SA[n]: 0 5 3 1 4 2
    const saidx_t sortedSuffixes[] = {0, 5, 3, 1, 4, 2};
    boost::shared_ptr<std::vector<saidx_t> > expectedSA(
        new std::vector<saidx_t>(sortedSuffixes, sortedSuffixes + 6));

    BOOST_CHECK_EQUAL_COLLECTIONS(SA->begin(), SA->end(), expectedSA->begin(), expectedSA->end());
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE_END()
|
BOOST_AUTO_TEST_SUITE_END()
|
||||||
|
Loading…
Reference in New Issue
Block a user