lcp search
Former-commit-id: 925a5de8bc33256b594c369907f202e29f809f47
This commit is contained in:
parent
dd8b27cc23
commit
e8ea5881a5
10
TODO.txt
10
TODO.txt
@ -1,4 +1,6 @@
|
||||
1. lokalizowane to_lower (wykorzystać utf8case, naprawić testy)
|
||||
2. anonimizacja zdań
|
||||
3. Dzielenie zdań (max 255 tokenów)
|
||||
4. concordia-server
|
||||
DONE 1. lokalizowane to_lower (wykorzystać utf8case, naprawić testy)
|
||||
DONE 2. anonimizacja zdań
|
||||
DONE 3. Dzielenie zdań (max 255 tokenów)
|
||||
|
||||
- concordia-server
|
||||
- zastanowić się nad empty hash examples
|
||||
|
@ -1,5 +1,7 @@
|
||||
#include "concordia/anubis_searcher.hpp"
|
||||
|
||||
#include<iostream>
|
||||
|
||||
|
||||
AnubisSearcher::AnubisSearcher() {
|
||||
}
|
||||
@ -13,9 +15,75 @@ boost::ptr_vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> >)
|
||||
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
|
||||
throw(ConcordiaException) {
|
||||
|
||||
boost::ptr_vector<AnubisSearchResult> result;
|
||||
return result;
|
||||
}
|
||||
|
||||
/*! Finds the occurrences of the longest prefix of the pattern that is
    present in the index.

    The pattern prefix is grown one element at a time. Each round calls
    sa_search on the suffix array range that matched the previous (shorter)
    prefix, so the searched range only ever narrows. The loop stops when
    the prefix can no longer be matched or the whole pattern was matched.

    \param T the text the suffix array was built over
    \param markers markers indexed by text position; turned into
                   SubstringOccurence objects by _collectResults
    \param SA the suffix array of T
    \param pattern the sequence to look up
    \param length output: number of leading pattern elements that were
                  matched (0 when nothing matched or the pattern is empty)
    \returns the occurrences of the matched prefix; empty when length is 0
*/
boost::ptr_vector<SubstringOccurence> AnubisSearcher::lcpSearch(
    boost::shared_ptr<std::vector<sauchar_t> > T,
    boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
    boost::shared_ptr<std::vector<saidx_t> > SA,
    boost::shared_ptr<std::vector<sauchar_t> > pattern,
    SUFFIX_MARKER_TYPE & length)
    throw(ConcordiaException) {
    boost::ptr_vector<SubstringOccurence> result;

    // An empty pattern has no prefix to match. Bail out before the loop,
    // which would otherwise search for one element that does not exist.
    if (pattern->empty()) {
        length = 0;
        return result;
    }

    const saidx_t fullLength = static_cast<saidx_t>(pattern->size());
    const sauchar_t * patternArray = pattern->data();
    const saidx_t * SAleft = SA->data();

    saidx_t patternLength = 0;
    saidx_t size = static_cast<saidx_t>(SA->size());
    saidx_t left = 0;

    // Range matched by the previous (one element shorter) prefix.
    saidx_t prevLeft = left;
    saidx_t prevSize = size;
    do {
        prevLeft = left;
        prevSize = size;

        ++patternLength;

        saidx_t localLeft;
        size = sa_search(T->data(), static_cast<saidx_t>(T->size()),
                         patternArray, patternLength,
                         SAleft, size, &localLeft);

        // localLeft is relative to SAleft. Only narrow the range when
        // something matched; on failure the index is not a usable offset.
        if (size > 0) {
            left += localLeft;
            SAleft += localLeft;
        }
    } while (patternLength < fullLength && size > 0);

    if (size <= 0) {
        // The last round found nothing (or sa_search reported an error),
        // so the longest matched prefix is one element shorter.
        length = patternLength - 1;
        if (length > 0) {
            // Get the results of the previous search
            _collectResults(result, markers, SA, prevLeft, prevSize);
        }
        // If length == 0, then the pattern has no common prefixes
        // with the index.
    } else {
        // The index contains at least one occurrence
        // of the whole search pattern.
        length = patternLength;
        _collectResults(result, markers, SA, left, size);
    }

    return result;
}
|
||||
|
||||
/*! Appends one SubstringOccurence to result for every suffix array entry
    in the range [left, left + size).

    Each suffix array entry is used as an index into markers; the marker
    found there is split with SUFFIX_MARKER_DIVISOR into a quotient and
    a remainder, which are passed to the SubstringOccurence constructor
    in that order.
*/
void AnubisSearcher::_collectResults(
    boost::ptr_vector<SubstringOccurence> & result,
    boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
    boost::shared_ptr<std::vector<saidx_t> > SA,
    saidx_t left, saidx_t size) {
    const saidx_t end = left + size;
    for (saidx_t saIndex = left; saIndex < end; ++saIndex) {
        const SUFFIX_MARKER_TYPE marker = markers->at(SA->at(saIndex));
        const SUFFIX_MARKER_TYPE id = marker / SUFFIX_MARKER_DIVISOR;
        const SUFFIX_MARKER_TYPE offset = marker % SUFFIX_MARKER_DIVISOR;
        result.push_back(new SubstringOccurence(id, offset));
    }
}
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include <boost/ptr_container/ptr_vector.hpp>
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/common/utils.hpp"
|
||||
#include "concordia/substring_occurence.hpp"
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
#include "concordia/anubis_search_result.hpp"
|
||||
@ -30,8 +31,21 @@ public:
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> >) throw(ConcordiaException);
|
||||
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
|
||||
throw(ConcordiaException);
|
||||
|
||||
boost::ptr_vector<SubstringOccurence> lcpSearch(
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > pattern,
|
||||
SUFFIX_MARKER_TYPE & length) throw(ConcordiaException);
|
||||
|
||||
private:
|
||||
void _collectResults(boost::ptr_vector<SubstringOccurence> & result,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
saidx_t left, saidx_t size);
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -2,6 +2,7 @@
|
||||
#define UTILS_HDR
|
||||
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/foreach.hpp>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
@ -37,9 +38,19 @@ public:
|
||||
boost::shared_ptr<std::vector<sauchar_t> > vector,
|
||||
INDEX_CHARACTER_TYPE character);
|
||||
|
||||
template <typename T>
|
||||
static void printVector(boost::shared_ptr<std::vector<T> > vector);
|
||||
|
||||
private:
|
||||
static void _insertCharToSaucharArray(sauchar_t * array,
|
||||
INDEX_CHARACTER_TYPE character, int pos);
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
void Utils::printVector(boost::shared_ptr<std::vector<T> > vector) {
|
||||
for (int i = 0; i < vector->size(); i++) {
|
||||
cout << vector->at(i) << " ";
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
#endif
|
||||
|
@ -25,6 +25,7 @@ boost::ptr_vector<SubstringOccurence> IndexSearcher::simpleSearch(
|
||||
hashGenerator->generateHash(pattern);
|
||||
saidx_t patternLength = hash->size()*sizeof(INDEX_CHARACTER_TYPE);
|
||||
sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
|
||||
|
||||
int size = sa_search(T->data(), (saidx_t) T->size(),
|
||||
(const sauchar_t *) patternArray, patternLength,
|
||||
SA->data(), (saidx_t) SA->size(), &left);
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
|
||||
SubstringOccurence::SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
|
||||
const int & offset):
|
||||
const SUFFIX_MARKER_TYPE & offset):
|
||||
_id(id),
|
||||
_offset(offset) {
|
||||
}
|
||||
|
@ -14,7 +14,7 @@ using namespace std;
|
||||
class SubstringOccurence {
|
||||
public:
|
||||
explicit SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
|
||||
const int & offset);
|
||||
const SUFFIX_MARKER_TYPE & offset);
|
||||
|
||||
/*! Destructor.
|
||||
*/
|
||||
@ -24,14 +24,14 @@ public:
|
||||
return _id;
|
||||
}
|
||||
|
||||
int getOffset() const {
|
||||
SUFFIX_MARKER_TYPE getOffset() const {
|
||||
return _offset;
|
||||
}
|
||||
|
||||
private:
|
||||
SUFFIX_MARKER_TYPE _id;
|
||||
|
||||
int _offset;
|
||||
SUFFIX_MARKER_TYPE _offset;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -1,4 +1,5 @@
|
||||
add_library(concordia-tests
|
||||
test_anubis_searcher.cpp
|
||||
test_sentence_anonymizer.cpp
|
||||
test_text_utils.cpp
|
||||
test_regex_replacement.cpp
|
||||
|
143
concordia/t/test_anubis_searcher.cpp
Normal file
143
concordia/t/test_anubis_searcher.cpp
Normal file
@ -0,0 +1,143 @@
|
||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||
#include "concordia/anubis_searcher.hpp"
|
||||
#include "concordia/common/config.hpp"
|
||||
|
||||
using namespace std;
|
||||
|
||||
BOOST_AUTO_TEST_SUITE(anubis_searcher)
|
||||
|
||||
BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
||||
{
|
||||
AnubisSearcher searcher;
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T(new std::vector<sauchar_t>());
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers(new std::vector<SUFFIX_MARKER_TYPE>());
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA(new std::vector<saidx_t>());
|
||||
boost::shared_ptr<std::vector<sauchar_t> > pattern(new std::vector<sauchar_t>());
|
||||
|
||||
/* Search in text: "banana"
|
||||
T = 123232 (all one sentence id=34)
|
||||
pattern: "anzz" = 2344
|
||||
word map: b=1,a=2,n=3,z=4
|
||||
*/
|
||||
|
||||
T->push_back(1);
|
||||
T->push_back(2);
|
||||
T->push_back(3);
|
||||
T->push_back(2);
|
||||
T->push_back(3);
|
||||
T->push_back(2);
|
||||
|
||||
SUFFIX_MARKER_TYPE marker = 34 * SUFFIX_MARKER_DIVISOR;
|
||||
for(int i=0;i<6;i++) {
|
||||
markers->push_back(marker++);
|
||||
}
|
||||
|
||||
pattern->push_back(2);
|
||||
pattern->push_back(3);
|
||||
pattern->push_back(4);
|
||||
pattern->push_back(4);
|
||||
|
||||
/* Suffix array for the hashed index: 1 2 3 2 3 2
|
||||
0: 1 2 3 2 3 2
|
||||
5: 2
|
||||
3: 2 3 2
|
||||
1: 2 3 2 3 2
|
||||
4: 3 2
|
||||
2: 3 2 3 2
|
||||
*/
|
||||
|
||||
SA->push_back(0);
|
||||
SA->push_back(5);
|
||||
SA->push_back(3);
|
||||
SA->push_back(1);
|
||||
SA->push_back(4);
|
||||
SA->push_back(2);
|
||||
|
||||
SUFFIX_MARKER_TYPE length;
|
||||
boost::ptr_vector<SubstringOccurence> result = searcher.lcpSearch(T, markers, SA, pattern, length);
|
||||
|
||||
/* Expecting to get the following results from SA:
|
||||
3: ana
|
||||
1: anana
|
||||
Which are 2 substring occurences (34,3) and (34,1) with the lcp length = 2;
|
||||
*/
|
||||
|
||||
BOOST_CHECK_EQUAL(result.size(),2);
|
||||
BOOST_CHECK_EQUAL(length,2);
|
||||
BOOST_CHECK_EQUAL(result.at(0).getId(),34);
|
||||
BOOST_CHECK_EQUAL(result.at(0).getOffset(),3);
|
||||
BOOST_CHECK_EQUAL(result.at(1).getId(),34);
|
||||
BOOST_CHECK_EQUAL(result.at(1).getOffset(),1);
|
||||
|
||||
//--------pattern banana
|
||||
|
||||
boost::shared_ptr<std::vector<sauchar_t> > pattern2(new std::vector<sauchar_t>());
|
||||
pattern2->push_back(1);
|
||||
pattern2->push_back(2);
|
||||
pattern2->push_back(3);
|
||||
pattern2->push_back(2);
|
||||
pattern2->push_back(3);
|
||||
pattern2->push_back(2);
|
||||
|
||||
SUFFIX_MARKER_TYPE length2;
|
||||
boost::ptr_vector<SubstringOccurence> result2 = searcher.lcpSearch(T, markers, SA, pattern2, length2);
|
||||
|
||||
/* Expecting to get one result from SA:
|
||||
0: banana
|
||||
Which is one substring occurence (34,0) with the lcp length = 6;
|
||||
*/
|
||||
|
||||
BOOST_CHECK_EQUAL(result2.size(),1);
|
||||
BOOST_CHECK_EQUAL(length2,6);
|
||||
BOOST_CHECK_EQUAL(result2.at(0).getId(),34);
|
||||
BOOST_CHECK_EQUAL(result2.at(0).getOffset(),0);
|
||||
|
||||
//--------pattern banan
|
||||
|
||||
boost::shared_ptr<std::vector<sauchar_t> > pattern3(new std::vector<sauchar_t>());
|
||||
pattern3->push_back(1);
|
||||
pattern3->push_back(2);
|
||||
pattern3->push_back(3);
|
||||
pattern3->push_back(2);
|
||||
pattern3->push_back(3);
|
||||
|
||||
SUFFIX_MARKER_TYPE length3;
|
||||
boost::ptr_vector<SubstringOccurence> result3 = searcher.lcpSearch(T, markers, SA, pattern3, length3);
|
||||
|
||||
/* Expecting to get one result from SA:
|
||||
0: banana
|
||||
Which is one substring occurence (34,0) with the lcp length = 5;
|
||||
*/
|
||||
|
||||
BOOST_CHECK_EQUAL(result3.size(),1);
|
||||
BOOST_CHECK_EQUAL(length3,5);
|
||||
BOOST_CHECK_EQUAL(result3.at(0).getId(),34);
|
||||
BOOST_CHECK_EQUAL(result3.at(0).getOffset(),0);
|
||||
|
||||
//--------pattern nazz
|
||||
|
||||
boost::shared_ptr<std::vector<sauchar_t> > pattern4(new std::vector<sauchar_t>());
|
||||
pattern4->push_back(3);
|
||||
pattern4->push_back(2);
|
||||
pattern4->push_back(4);
|
||||
pattern4->push_back(4);
|
||||
|
||||
SUFFIX_MARKER_TYPE length4;
|
||||
boost::ptr_vector<SubstringOccurence> result4 = searcher.lcpSearch(T, markers, SA, pattern4, length4);
|
||||
|
||||
/* Expecting to get 2 results from SA:
|
||||
4: na
|
||||
2: nana
|
||||
Which are 2 substring occurences (34,4) and (34,2) with the lcp length = 2;
|
||||
*/
|
||||
|
||||
BOOST_CHECK_EQUAL(result4.size(),2);
|
||||
BOOST_CHECK_EQUAL(length4,2);
|
||||
BOOST_CHECK_EQUAL(result4.at(0).getId(),34);
|
||||
BOOST_CHECK_EQUAL(result4.at(0).getOffset(),4);
|
||||
BOOST_CHECK_EQUAL(result4.at(1).getId(),34);
|
||||
BOOST_CHECK_EQUAL(result4.at(1).getOffset(),2);
|
||||
}
|
||||
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
@ -93,4 +93,36 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest2 )
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(SA->begin(), SA->end(), expectedSA->begin(), expectedSA->end());
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest3 )
|
||||
{
|
||||
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
|
||||
TestResourcesManager::getTestFilePath("temp","test_markers.bin"));
|
||||
boost::shared_ptr<vector<sauchar_t> > T = boost::shared_ptr<vector<sauchar_t> >(new vector<sauchar_t>());
|
||||
|
||||
//Test hashed index:
|
||||
// n: 0 1 2 3 4 5
|
||||
// T[n]: 1 2 3 2 3 2
|
||||
T->push_back(1);
|
||||
T->push_back(2);
|
||||
T->push_back(3);
|
||||
T->push_back(2);
|
||||
T->push_back(3);
|
||||
T->push_back(2);
|
||||
|
||||
//Test suffix array:
|
||||
// n: 0 1 2 3 4 5
|
||||
//SA[n]: 0 5 3 1 4 2
|
||||
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(T);
|
||||
|
||||
boost::shared_ptr<vector<saidx_t> > expectedSA = boost::shared_ptr<vector<saidx_t> >(new vector<saidx_t>());
|
||||
expectedSA->push_back(0);
|
||||
expectedSA->push_back(5);
|
||||
expectedSA->push_back(3);
|
||||
expectedSA->push_back(1);
|
||||
expectedSA->push_back(4);
|
||||
expectedSA->push_back(2);
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(SA->begin(), SA->end(), expectedSA->begin(), expectedSA->end());
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
|
Loading…
Reference in New Issue
Block a user