lcp search

Former-commit-id: 925a5de8bc33256b594c369907f202e29f809f47
This commit is contained in:
rjawor 2014-05-15 22:20:31 +02:00
parent dd8b27cc23
commit e8ea5881a5
10 changed files with 283 additions and 11 deletions

View File

@ -1,4 +1,6 @@
1. lokalizowane to_lower (wykorzystać utf8case, naprawić testy)
2. anonimizacja zdań
3. Dzielenie zdań (max 255 tokenów)
4. concordia-server
DONE 1. lokalizowane to_lower (wykorzystać utf8case, naprawić testy)
DONE 2. anonimizacja zdań
DONE 3. Dzielenie zdań (max 255 tokenów)
- concordia-server
- zastanowić się nad empty hash examples

View File

@ -1,5 +1,7 @@
#include "concordia/anubis_searcher.hpp"
#include<iostream>
/*! Default constructor. The body is empty: no state is set up here.
*/
AnubisSearcher::AnubisSearcher() {
}
@ -13,9 +15,75 @@ boost::ptr_vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> >)
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
throw(ConcordiaException) {
boost::ptr_vector<AnubisSearchResult> result;
return result;
}
/*! Finds the occurrences of the longest prefix of the pattern that is
    present in the index.

    The pattern prefix is grown one element at a time. Each prefix is
    looked up with sa_search, but only inside the suffix array range that
    matched the previous (shorter) prefix, so the range narrows
    monotonically. The search stops when the whole pattern has matched or
    when a prefix no longer matches anything.

    NOTE(review): the prefix grows by single sauchar_t units, whereas
    simpleSearch measures its pattern in multiples of
    sizeof(INDEX_CHARACTER_TYPE). Confirm that matching a partial index
    character is acceptable for callers of this method.

    \param T the indexed data, passed to sa_search as the text
    \param markers markers addressed by the values stored in SA
    \param SA suffix array of T
    \param pattern the pattern whose longest matching prefix is sought
    \param length output: number of leading pattern elements that were
           matched; 0 when not even the first element occurs in the
           index, or when the pattern or the suffix array is empty
    \returns one SubstringOccurence per suffix array entry matching the
             prefix of the reported length; empty when length is 0
*/
boost::ptr_vector<SubstringOccurence> AnubisSearcher::lcpSearch(
    boost::shared_ptr<std::vector<sauchar_t> > T,
    boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
    boost::shared_ptr<std::vector<saidx_t> > SA,
    boost::shared_ptr<std::vector<sauchar_t> > pattern,
    SUFFIX_MARKER_TYPE & length)
    throw(ConcordiaException) {
    boost::ptr_vector<SubstringOccurence> result;
    length = 0;

    // Nothing can be matched with an empty pattern or an empty suffix
    // array. Returning early also keeps sa_search from being handed
    // the data() pointer of an empty vector.
    if (pattern->empty() || SA->empty()) {
        return result;
    }

    const saidx_t fullPatternLength = static_cast<saidx_t>(pattern->size());
    saidx_t patternLength = 0;
    saidx_t size = static_cast<saidx_t>(SA->size());
    saidx_t left = 0;
    sauchar_t * patternArray = pattern->data();
    saidx_t * SAleft = SA->data();

    // Range matched by the previous (one element shorter) prefix.
    saidx_t prevLeft = left;
    saidx_t prevSize = size;

    do {
        prevLeft = left;
        prevSize = size;

        patternLength++;

        // Search only within the range matched by the shorter prefix.
        saidx_t localLeft = 0;
        size = sa_search(T->data(), (saidx_t) T->size(),
                         (const sauchar_t *) patternArray, patternLength,
                         SAleft, size, &localLeft);

        if (size > 0) {
            // localLeft is relative to SAleft; keep the absolute
            // position and the narrowed window in sync.
            left += localLeft;
            SAleft += localLeft;
        }
    } while (patternLength < fullPatternLength && size > 0);

    if (size > 0) {
        // The index contains at least one occurrence of the whole
        // search pattern.
        length = patternLength;
        _collectResults(result, markers, SA, left, size);
    } else {
        // The last extension failed (a non-positive count from
        // sa_search is treated as "no match"), so the longest common
        // prefix is one element shorter.
        length = patternLength - 1;
        if (length > 0) {
            // Report the matches of the previous, successful search.
            _collectResults(result, markers, SA, prevLeft, prevSize);
        }
        // If length == 0, the pattern has no common prefix
        // with the index.
    }

    return result;
}
/*! Appends to result one SubstringOccurence for each of the `size`
    consecutive suffix array entries starting at index `left`.

    Each suffix array value is used as an index into markers; the marker
    found there is split into an id (marker / SUFFIX_MARKER_DIVISOR) and
    an offset (marker % SUFFIX_MARKER_DIVISOR). Both lookups go through
    at(), so an out-of-range index raises std::out_of_range.
*/
void AnubisSearcher::_collectResults(
    boost::ptr_vector<SubstringOccurence> & result,
    boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
    boost::shared_ptr<std::vector<saidx_t> > SA,
    saidx_t left, saidx_t size) {
    for (saidx_t step = 0; step < size; ++step) {
        const saidx_t textPosition = SA->at(left + step);
        const SUFFIX_MARKER_TYPE encoded = markers->at(textPosition);

        const SUFFIX_MARKER_TYPE occurenceId =
            encoded / SUFFIX_MARKER_DIVISOR;
        const SUFFIX_MARKER_TYPE occurenceOffset =
            encoded % SUFFIX_MARKER_DIVISOR;

        // result takes ownership of the newly allocated object.
        result.push_back(
            new SubstringOccurence(occurenceId, occurenceOffset));
    }
}

View File

@ -5,6 +5,7 @@
#include <boost/ptr_container/ptr_vector.hpp>
#include "concordia/common/config.hpp"
#include "concordia/common/utils.hpp"
#include "concordia/substring_occurence.hpp"
#include "concordia/concordia_exception.hpp"
#include "concordia/anubis_search_result.hpp"
@ -30,8 +31,21 @@ public:
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> >) throw(ConcordiaException);
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
throw(ConcordiaException);
/*! Finds the occurrences of the longest prefix of the pattern that is
    present in the index. The prefix is extended one element at a time
    and each extension is searched only within the suffix array range
    matched by the previous, shorter prefix.
    \param T the indexed data, passed to sa_search as the text
    \param markers markers addressed by the values stored in SA; each
           marker encodes id * SUFFIX_MARKER_DIVISOR + offset
    \param SA suffix array of T
    \param pattern the pattern whose longest matching prefix is sought
    \param length output: number of leading pattern elements matched
           (0 when the first element does not occur in the index)
    \returns one SubstringOccurence per suffix array entry matching the
             prefix of the reported length; empty when length is 0
*/
boost::ptr_vector<SubstringOccurence> lcpSearch(
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
boost::shared_ptr<std::vector<sauchar_t> > pattern,
SUFFIX_MARKER_TYPE & length) throw(ConcordiaException);
private:
/*! Appends to result one SubstringOccurence for each of the `size`
    consecutive suffix array entries starting at index `left`.
    Each suffix array value selects a marker, which is split into
    an id (marker / SUFFIX_MARKER_DIVISOR) and an offset
    (marker % SUFFIX_MARKER_DIVISOR).
    \param result container the new occurrences are appended to
    \param markers markers addressed by the values stored in SA
    \param SA the suffix array
    \param left index of the first suffix array entry to report
    \param size number of suffix array entries to report
*/
void _collectResults(boost::ptr_vector<SubstringOccurence> & result,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
saidx_t left, saidx_t size);
};
#endif

View File

@ -2,6 +2,7 @@
#define UTILS_HDR
#include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp>
#include <fstream>
#include <iostream>
#include <vector>
@ -37,9 +38,19 @@ public:
boost::shared_ptr<std::vector<sauchar_t> > vector,
INDEX_CHARACTER_TYPE character);
/*! Prints all elements of the vector to standard output on a single
    line, each followed by a space, and ends the line.
    \param vector the vector to print
*/
template <typename T>
static void printVector(boost::shared_ptr<std::vector<T> > vector);
private:
static void _insertCharToSaucharArray(sauchar_t * array,
INDEX_CHARACTER_TYPE character, int pos);
};
/*! Prints all elements of the vector to standard output on a single
    line, each followed by a space, then ends the line (and flushes).
    Intended as a debugging aid.
    \param vector the vector to print
*/
template <typename T>
void Utils::printVector(boost::shared_ptr<std::vector<T> > vector) {
    // Iterators avoid comparing a signed int index with the unsigned
    // size(). The std:: qualification keeps this header template
    // independent of any using-directive at the point of inclusion.
    for (typename std::vector<T>::const_iterator it = vector->begin();
         it != vector->end(); ++it) {
        std::cout << *it << " ";
    }
    std::cout << std::endl;
}
#endif

View File

@ -25,6 +25,7 @@ boost::ptr_vector<SubstringOccurence> IndexSearcher::simpleSearch(
hashGenerator->generateHash(pattern);
saidx_t patternLength = hash->size()*sizeof(INDEX_CHARACTER_TYPE);
sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
int size = sa_search(T->data(), (saidx_t) T->size(),
(const sauchar_t *) patternArray, patternLength,
SA->data(), (saidx_t) SA->size(), &left);

View File

@ -2,7 +2,7 @@
SubstringOccurence::SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
const int & offset):
const SUFFIX_MARKER_TYPE & offset):
_id(id),
_offset(offset) {
}

View File

@ -14,7 +14,7 @@ using namespace std;
class SubstringOccurence {
public:
explicit SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
const int & offset);
const SUFFIX_MARKER_TYPE & offset);
/*! Destructor.
*/
@ -24,14 +24,14 @@ public:
return _id;
}
int getOffset() const {
SUFFIX_MARKER_TYPE getOffset() const {
return _offset;
}
private:
SUFFIX_MARKER_TYPE _id;
int _offset;
SUFFIX_MARKER_TYPE _offset;
};
#endif

View File

@ -1,4 +1,5 @@
add_library(concordia-tests
test_anubis_searcher.cpp
test_sentence_anonymizer.cpp
test_text_utils.cpp
test_regex_replacement.cpp

View File

@ -0,0 +1,143 @@
#include "tests/unit-tests/unit_tests_globals.hpp"
#include "concordia/anubis_searcher.hpp"
#include "concordia/common/config.hpp"
using namespace std;
BOOST_AUTO_TEST_SUITE(anubis_searcher)
BOOST_AUTO_TEST_CASE( LcpSearch1 )
{
    AnubisSearcher searcher;

    /* Index under test: the text "banana" hashed with the word map
       b=1, a=2, n=3 (z=4 occurs only in search patterns):
           T = 1 2 3 2 3 2
       All positions belong to a single sentence with id=34.
    */
    const sauchar_t hashedText[] = {1, 2, 3, 2, 3, 2};
    boost::shared_ptr<std::vector<sauchar_t> > T(
        new std::vector<sauchar_t>(
            hashedText,
            hashedText + sizeof(hashedText) / sizeof(hashedText[0])));

    // One marker per text position: sentence id 34, offsets 0..5.
    boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers(
        new std::vector<SUFFIX_MARKER_TYPE>());
    const SUFFIX_MARKER_TYPE firstMarker = 34 * SUFFIX_MARKER_DIVISOR;
    for (SUFFIX_MARKER_TYPE offset = 0; offset < 6; ++offset) {
        markers->push_back(firstMarker + offset);
    }

    /* Suffix array for the hashed index: 1 2 3 2 3 2
       0: 1 2 3 2 3 2
       5: 2
       3: 2 3 2
       1: 2 3 2 3 2
       4: 3 2
       2: 3 2 3 2
    */
    const saidx_t suffixOrder[] = {0, 5, 3, 1, 4, 2};
    boost::shared_ptr<std::vector<saidx_t> > SA(
        new std::vector<saidx_t>(
            suffixOrder,
            suffixOrder + sizeof(suffixOrder) / sizeof(suffixOrder[0])));

    //--------pattern "anzz" = 2 3 4 4
    const sauchar_t anzz[] = {2, 3, 4, 4};
    boost::shared_ptr<std::vector<sauchar_t> > patternAnzz(
        new std::vector<sauchar_t>(
            anzz, anzz + sizeof(anzz) / sizeof(anzz[0])));

    SUFFIX_MARKER_TYPE lengthAnzz;
    boost::ptr_vector<SubstringOccurence> foundAnzz =
        searcher.lcpSearch(T, markers, SA, patternAnzz, lengthAnzz);

    /* Expected suffix array entries:
       3: ana
       1: anana
       i.e. 2 substring occurrences, (34,3) and (34,1), with lcp length 2.
    */
    BOOST_CHECK_EQUAL(foundAnzz.size(),2);
    BOOST_CHECK_EQUAL(lengthAnzz,2);
    BOOST_CHECK_EQUAL(foundAnzz.at(0).getId(),34);
    BOOST_CHECK_EQUAL(foundAnzz.at(0).getOffset(),3);
    BOOST_CHECK_EQUAL(foundAnzz.at(1).getId(),34);
    BOOST_CHECK_EQUAL(foundAnzz.at(1).getOffset(),1);

    //--------pattern "banana" = 1 2 3 2 3 2
    const sauchar_t banana[] = {1, 2, 3, 2, 3, 2};
    boost::shared_ptr<std::vector<sauchar_t> > patternBanana(
        new std::vector<sauchar_t>(
            banana, banana + sizeof(banana) / sizeof(banana[0])));

    SUFFIX_MARKER_TYPE lengthBanana;
    boost::ptr_vector<SubstringOccurence> foundBanana =
        searcher.lcpSearch(T, markers, SA, patternBanana, lengthBanana);

    /* Expected suffix array entry:
       0: banana
       i.e. 1 substring occurrence, (34,0), with lcp length 6.
    */
    BOOST_CHECK_EQUAL(foundBanana.size(),1);
    BOOST_CHECK_EQUAL(lengthBanana,6);
    BOOST_CHECK_EQUAL(foundBanana.at(0).getId(),34);
    BOOST_CHECK_EQUAL(foundBanana.at(0).getOffset(),0);

    //--------pattern "banan" = 1 2 3 2 3
    const sauchar_t banan[] = {1, 2, 3, 2, 3};
    boost::shared_ptr<std::vector<sauchar_t> > patternBanan(
        new std::vector<sauchar_t>(
            banan, banan + sizeof(banan) / sizeof(banan[0])));

    SUFFIX_MARKER_TYPE lengthBanan;
    boost::ptr_vector<SubstringOccurence> foundBanan =
        searcher.lcpSearch(T, markers, SA, patternBanan, lengthBanan);

    /* Expected suffix array entry:
       0: banana
       i.e. 1 substring occurrence, (34,0), with lcp length 5.
    */
    BOOST_CHECK_EQUAL(foundBanan.size(),1);
    BOOST_CHECK_EQUAL(lengthBanan,5);
    BOOST_CHECK_EQUAL(foundBanan.at(0).getId(),34);
    BOOST_CHECK_EQUAL(foundBanan.at(0).getOffset(),0);

    //--------pattern "nazz" = 3 2 4 4
    const sauchar_t nazz[] = {3, 2, 4, 4};
    boost::shared_ptr<std::vector<sauchar_t> > patternNazz(
        new std::vector<sauchar_t>(
            nazz, nazz + sizeof(nazz) / sizeof(nazz[0])));

    SUFFIX_MARKER_TYPE lengthNazz;
    boost::ptr_vector<SubstringOccurence> foundNazz =
        searcher.lcpSearch(T, markers, SA, patternNazz, lengthNazz);

    /* Expected suffix array entries:
       4: na
       2: nana
       i.e. 2 substring occurrences, (34,4) and (34,2), with lcp length 2.
    */
    BOOST_CHECK_EQUAL(foundNazz.size(),2);
    BOOST_CHECK_EQUAL(lengthNazz,2);
    BOOST_CHECK_EQUAL(foundNazz.at(0).getId(),34);
    BOOST_CHECK_EQUAL(foundNazz.at(0).getOffset(),4);
    BOOST_CHECK_EQUAL(foundNazz.at(1).getId(),34);
    BOOST_CHECK_EQUAL(foundNazz.at(1).getOffset(),2);
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -93,4 +93,36 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest2 )
BOOST_CHECK_EQUAL_COLLECTIONS(SA->begin(), SA->end(), expectedSA->begin(), expectedSA->end());
}
BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest3 )
{
    ConcordiaIndex index(
        TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
        TestResourcesManager::getTestFilePath("temp","test_markers.bin"));

    // Hashed index under test:
    //    n: 0 1 2 3 4 5
    // T[n]: 1 2 3 2 3 2
    const sauchar_t hashedText[] = {1, 2, 3, 2, 3, 2};
    boost::shared_ptr<std::vector<sauchar_t> > T(
        new std::vector<sauchar_t>(
            hashedText,
            hashedText + sizeof(hashedText) / sizeof(hashedText[0])));

    boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(T);

    // Expected suffix array (start positions of the sorted suffixes):
    //     n: 0 1 2 3 4 5
    // SA[n]: 0 5 3 1 4 2
    const saidx_t expectedOrder[] = {0, 5, 3, 1, 4, 2};
    const std::vector<saidx_t> expectedSA(
        expectedOrder,
        expectedOrder + sizeof(expectedOrder) / sizeof(expectedOrder[0]));

    BOOST_CHECK_EQUAL_COLLECTIONS(SA->begin(), SA->end(),
                                  expectedSA.begin(), expectedSA.end());
}
BOOST_AUTO_TEST_SUITE_END()