suffix markers

Former-commit-id: 7426cce771f548dcd4eb7478aafa912fb73784bf
This commit is contained in:
rjawor 2014-02-20 10:49:17 +01:00
parent b318770752
commit fb65cc9c66
24 changed files with 407 additions and 124 deletions

View File

@ -6,9 +6,16 @@ project(concordia C CXX)
set (CONCORDIA_VERSION_MAJOR 0) set (CONCORDIA_VERSION_MAJOR 0)
set (CONCORDIA_VERSION_MINOR 1) set (CONCORDIA_VERSION_MINOR 1)
# Type of the characters in index # Type of the characters in SA
set (INDEX_CHARACTER_TYPE "unsigned int") set (INDEX_CHARACTER_TYPE "unsigned int")
# The above allows for (roughly) 2^32 = 4 294 967 296 words in the corpus.
# Suffix markers
set (SUFFIX_MARKER_TYPE "unsigned int")
set (SUFFIX_MARKER_DIVISOR 256)
# The above settings assign 3 bytes to the sentence id and 1 byte to the suffix offset.
# This allows storing 2^24 = 16 777 216 sentences of at most 256 words each.
# ============================== # # ============================== #
# Production paths # Production paths
@ -26,6 +33,7 @@ set (TEST_RESOURCES_DIRECTORY "${concordia_SOURCE_DIR}/tests/resources")
set (TEST_PUDDLE_TAGSET_PATH "${TEST_RESOURCES_DIRECTORY}/puddle/basic-tagset.txt") set (TEST_PUDDLE_TAGSET_PATH "${TEST_RESOURCES_DIRECTORY}/puddle/basic-tagset.txt")
set (TEMP_WORD_MAP "temp_word_map.bin") set (TEMP_WORD_MAP "temp_word_map.bin")
set (TEMP_HASHED_INDEX "temp_hashed_index.bin") set (TEMP_HASHED_INDEX "temp_hashed_index.bin")
set (TEMP_MARKERS "temp_markers.bin")
set (TEMP_SUFFIX_ARRAY "temp_suffix_array.bin") set (TEMP_SUFFIX_ARRAY "temp_suffix_array.bin")
file(MAKE_DIRECTORY ${TEST_RESOURCES_DIRECTORY}/temp) file(MAKE_DIRECTORY ${TEST_RESOURCES_DIRECTORY}/temp)

View File

@ -3,9 +3,12 @@
#include <boost/program_options.hpp> #include <boost/program_options.hpp>
#include <boost/algorithm/string.hpp> #include <boost/algorithm/string.hpp>
#include <boost/date_time/posix_time/posix_time.hpp> #include <boost/date_time/posix_time/posix_time.hpp>
#include <boost/ptr_container/ptr_vector.hpp>
#include <boost/shared_ptr.hpp> #include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp>
#include "concordia/concordia.hpp" #include "concordia/concordia.hpp"
#include "concordia/substring_occurence.hpp"
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include "concordia/common/utils.hpp" #include "concordia/common/utils.hpp"
#include "build/libdivsufsort/include/divsufsort.h" #include "build/libdivsufsort/include/divsufsort.h"
@ -65,18 +68,17 @@ int main(int argc, char** argv) {
std::cout << "\tSearching for pattern: \"" << pattern << std::cout << "\tSearching for pattern: \"" << pattern <<
"\"" << std::endl; "\"" << std::endl;
time_start = boost::posix_time::microsec_clock::local_time(); time_start = boost::posix_time::microsec_clock::local_time();
boost::shared_ptr<vector<saidx_t> > result = boost::ptr_vector<SubstringOccurence> result =
concordia.simpleSearch(pattern); concordia.simpleSearch(pattern);
time_end = boost::posix_time::microsec_clock::local_time(); time_end = boost::posix_time::microsec_clock::local_time();
msdiff = time_end - time_start; msdiff = time_end - time_start;
std::cout << "\tFound: " << result->size() << " matches. " std::cout << "\tFound: " << result.size() << " matches. "
<< "Search took: " << << "Search took: " <<
msdiff.total_milliseconds() << "ms." << std::endl; msdiff.total_milliseconds() << "ms." << std::endl;
if (!cli.count("silent")) { if (!cli.count("silent")) {
for (vector<saidx_t>::iterator it = result->begin(); BOOST_FOREACH(SubstringOccurence occurence, result) {
it != result->end(); ++it) { std::cout << "\t\tfound match in sentence number: "
std::cout << "\t\tfound match on word number: " << *it << occurence.getId() << std::endl;
<< std::endl;
} }
} }
} else if (cli.count("read-file")) { } else if (cli.count("read-file")) {
@ -87,16 +89,15 @@ int main(int argc, char** argv) {
std::string line; std::string line;
if (text_file.is_open()) { if (text_file.is_open()) {
long lineCount = 0; long lineCount = 0;
boost::shared_ptr<std::vector<std::string> > boost::ptr_vector<Example> buffer;
buffer(new std::vector<std::string>());
boost::posix_time::ptime timeStart = boost::posix_time::ptime timeStart =
boost::posix_time::microsec_clock::local_time(); boost::posix_time::microsec_clock::local_time();
while (getline(text_file, line)) { while (getline(text_file, line)) {
lineCount++; lineCount++;
buffer->push_back(line); buffer.push_back(new Example(line, lineCount));
if (lineCount % READ_BUFFER_LENGTH == 0) { if (lineCount % READ_BUFFER_LENGTH == 0) {
concordia.addAllSentences(buffer); concordia.addAllExamples(buffer);
buffer->clear(); buffer.clear();
boost::posix_time::ptime timeEnd = boost::posix_time::ptime timeEnd =
boost::posix_time::microsec_clock::local_time(); boost::posix_time::microsec_clock::local_time();
boost::posix_time::time_duration msdiff = boost::posix_time::time_duration msdiff =
@ -110,8 +111,8 @@ int main(int argc, char** argv) {
" sentences per second" << std::endl; " sentences per second" << std::endl;
} }
} }
if (buffer->size() > 0) { if (buffer.size() > 0) {
concordia.addAllSentences(buffer); concordia.addAllExamples(buffer);
} }
text_file.close(); text_file.close();
boost::posix_time::ptime timeTotalEnd = boost::posix_time::ptime timeTotalEnd =
@ -156,6 +157,5 @@ int main(int argc, char** argv) {
<< std::endl; << std::endl;
return 1; return 1;
} }
return 0; return 0;
} }

View File

@ -6,6 +6,8 @@ foreach(dir ${ALL_DIRECTORIES})
endforeach(dir) endforeach(dir)
add_library(concordia SHARED add_library(concordia SHARED
substring_occurence.cpp
example.cpp
index_searcher.cpp index_searcher.cpp
concordia_index.cpp concordia_index.cpp
word_map.cpp word_map.cpp

View File

@ -4,6 +4,7 @@
#define TEST_RESOURCES_DIRECTORY "@TEST_RESOURCES_DIRECTORY@" #define TEST_RESOURCES_DIRECTORY "@TEST_RESOURCES_DIRECTORY@"
#define TEMP_WORD_MAP "@TEMP_WORD_MAP@" #define TEMP_WORD_MAP "@TEMP_WORD_MAP@"
#define TEMP_HASHED_INDEX "@TEMP_HASHED_INDEX@" #define TEMP_HASHED_INDEX "@TEMP_HASHED_INDEX@"
#define TEMP_MARKERS "@TEMP_MARKERS@"
#define TEMP_SUFFIX_ARRAY "@TEMP_SUFFIX_ARRAY@" #define TEMP_SUFFIX_ARRAY "@TEMP_SUFFIX_ARRAY@"
#define PROD_RESOURCES_DIRECTORY "@PROD_RESOURCES_DIRECTORY@" #define PROD_RESOURCES_DIRECTORY "@PROD_RESOURCES_DIRECTORY@"
@ -17,4 +18,6 @@
#define LEXICON_FIELD_SEPARATOR "\t" #define LEXICON_FIELD_SEPARATOR "\t"
typedef @INDEX_CHARACTER_TYPE@ INDEX_CHARACTER_TYPE; typedef @INDEX_CHARACTER_TYPE@ INDEX_CHARACTER_TYPE;
typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
#define SUFFIX_MARKER_DIVISOR @SUFFIX_MARKER_DIVISOR@

View File

@ -11,12 +11,23 @@ void Utils::writeIndexCharacter(ofstream & file,
file.write(reinterpret_cast<char *>(&character), sizeof(character)); file.write(reinterpret_cast<char *>(&character), sizeof(character));
} }
/*!
  Appends a single suffix marker to an output stream.
  The marker's raw object representation is written unchanged
  (sizeof(SUFFIX_MARKER_TYPE) bytes), the same way writeIndexCharacter
  stores index characters.
  \param file   stream the marker is written to
  \param marker marker value to store
*/
void Utils::writeMarker(ofstream & file,
                        SUFFIX_MARKER_TYPE marker) {
    // View the local copy of the marker as a plain byte sequence.
    const char * rawBytes = reinterpret_cast<const char *>(&marker);
    file.write(rawBytes, sizeof(SUFFIX_MARKER_TYPE));
}
INDEX_CHARACTER_TYPE Utils::readIndexCharacter(ifstream & file) { INDEX_CHARACTER_TYPE Utils::readIndexCharacter(ifstream & file) {
INDEX_CHARACTER_TYPE character; INDEX_CHARACTER_TYPE character;
file.read(reinterpret_cast<char *>(&character), sizeof(character)); file.read(reinterpret_cast<char *>(&character), sizeof(character));
return character; return character;
} }
/*!
  Reads a single suffix marker from an input stream.
  Exactly sizeof(SUFFIX_MARKER_TYPE) raw bytes are requested, matching
  what writeMarker stores.
  \param file stream the marker is read from
  \return the marker that was read; 0 if nothing could be read (for
          example when the stream is already at end of file). Callers
          that need to tell a stored 0 from a failed read should check
          the stream state after the call.
*/
SUFFIX_MARKER_TYPE Utils::readMarker(ifstream & file) {
    // Start from a defined value: a failed or short read leaves the
    // buffer (partly) untouched, and returning an uninitialized
    // variable would be undefined behaviour.
    SUFFIX_MARKER_TYPE marker = 0;
    file.read(reinterpret_cast<char *>(&marker), sizeof(marker));
    return marker;
}
sauchar_t * Utils::indexVectorToSaucharArray( sauchar_t * Utils::indexVectorToSaucharArray(
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > input) { boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > input) {
const int kArraySize = input->size()*sizeof(INDEX_CHARACTER_TYPE); const int kArraySize = input->size()*sizeof(INDEX_CHARACTER_TYPE);

View File

@ -23,14 +23,20 @@ public:
static void writeIndexCharacter(ofstream & file, static void writeIndexCharacter(ofstream & file,
INDEX_CHARACTER_TYPE character); INDEX_CHARACTER_TYPE character);
static void writeMarker(ofstream & file,
SUFFIX_MARKER_TYPE marker);
static INDEX_CHARACTER_TYPE readIndexCharacter(ifstream & file); static INDEX_CHARACTER_TYPE readIndexCharacter(ifstream & file);
static SUFFIX_MARKER_TYPE readMarker(ifstream & file);
static sauchar_t * indexVectorToSaucharArray( static sauchar_t * indexVectorToSaucharArray(
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > input); boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > input);
static void appendCharToSaucharVector( static void appendCharToSaucharVector(
boost::shared_ptr<std::vector<sauchar_t> > vector, boost::shared_ptr<std::vector<sauchar_t> > vector,
INDEX_CHARACTER_TYPE character); INDEX_CHARACTER_TYPE character);
private: private:
static void _insertCharToSaucharArray(sauchar_t * array, static void _insertCharToSaucharArray(sauchar_t * array,
INDEX_CHARACTER_TYPE character, int pos); INDEX_CHARACTER_TYPE character, int pos);

View File

@ -19,7 +19,8 @@ Concordia::Concordia(const std::string & configFilePath)
_config = boost::shared_ptr<ConcordiaConfig> ( _config = boost::shared_ptr<ConcordiaConfig> (
new ConcordiaConfig(configFilePath)); new ConcordiaConfig(configFilePath));
_index = boost::shared_ptr<ConcordiaIndex>( _index = boost::shared_ptr<ConcordiaIndex>(
new ConcordiaIndex(_config->getHashedIndexFilePath())); new ConcordiaIndex(_config->getHashedIndexFilePath(),
_config->getMarkersFilePath()));
_searcher = boost::shared_ptr<IndexSearcher>(new IndexSearcher()); _searcher = boost::shared_ptr<IndexSearcher>(new IndexSearcher());
_initializeIndex(); _initializeIndex();
} }
@ -42,30 +43,32 @@ std::string _createLibraryVersion() {
} }
// Sentences are written to disk and added to T. // Sentences are written to disk and added to T.
// SA is generated on command by different methods. // SA is generated on command by other methods.
void Concordia::addSentence(const std::string & sentence) // TODO(rjawor): modify SA on the fly
void Concordia::addExample(const Example & example)
throw(ConcordiaException) { throw(ConcordiaException) {
_index->addSentence(_hashGenerator, _T, sentence); _index->addExample(_hashGenerator, _T, _markers, example);
} }
// Sentences are written to disk and added to T. // Sentences are written to disk and added to T.
// SA is generated on command by different methods. // SA is generated on command by other methods.
void Concordia::addAllSentences( // TODO(rjawor): modify SA on the fly
boost::shared_ptr<std::vector<std::string> > sentences) void Concordia::addAllExamples(const boost::ptr_vector<Example > & examples)
throw(ConcordiaException) { throw(ConcordiaException) {
_index->addAllSentences(_hashGenerator, _T, sentences); _index->addAllExamples(_hashGenerator, _T, _markers, examples);
} }
void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) { void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) {
if (boost::filesystem::exists(_config->getWordMapFilePath()) if (boost::filesystem::exists(_config->getWordMapFilePath())
&& boost::filesystem::exists(_config->getHashedIndexFilePath())) { && boost::filesystem::exists(_config->getHashedIndexFilePath())
// reading index from files && boost::filesystem::exists(_config->getMarkersFilePath())) {
// reading index from file
_T->clear(); _T->clear();
ifstream hashedIndexFile; ifstream hashedIndexFile;
hashedIndexFile.open(_config->getHashedIndexFilePath().c_str(), ios::in hashedIndexFile.open(_config->getHashedIndexFilePath().c_str(), ios::in
| ios::ate | ios::binary); | ios::ate | ios::binary);
saidx_t fileSize = hashedIndexFile.tellg(); saidx_t hiFileSize = hashedIndexFile.tellg();
if (fileSize > 0) { if (hiFileSize > 0) {
hashedIndexFile.seekg(0, ios::beg); hashedIndexFile.seekg(0, ios::beg);
while (!hashedIndexFile.eof()) { while (!hashedIndexFile.eof()) {
@ -74,12 +77,32 @@ void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) {
Utils::appendCharToSaucharVector(_T, character); Utils::appendCharToSaucharVector(_T, character);
} }
hashedIndexFile.close(); hashedIndexFile.close();
// generating suffix array
_SA = _index->generateSuffixArray(_hashGenerator, _T);
} else { } else {
hashedIndexFile.close();
throw ConcordiaException("Index corrupt: empty hash index file"); throw ConcordiaException("Index corrupt: empty hash index file");
} }
// reading markers from file
_markers->clear();
ifstream markersFile;
markersFile.open(_config->getMarkersFilePath().c_str(), ios::in
| ios::ate | ios::binary);
saidx_t maFileSize = markersFile.tellg();
if (maFileSize > 0) {
markersFile.seekg(0, ios::beg);
while (!markersFile.eof()) {
SUFFIX_MARKER_TYPE marker =
Utils::readMarker(markersFile);
_markers->push_back(marker);
}
markersFile.close();
} else {
markersFile.close();
throw ConcordiaException("Index corrupt: empty markers file");
}
// generating suffix array
_SA = _index->generateSuffixArray(_hashGenerator, _T);
} else { } else {
throw ConcordiaException("Index corrupt: missing files"); throw ConcordiaException("Index corrupt: missing files");
} }
@ -95,6 +118,8 @@ void Concordia::_initializeIndex() throw(ConcordiaException) {
new HashGenerator(_config->getWordMapFilePath())); new HashGenerator(_config->getWordMapFilePath()));
_T = boost::shared_ptr<std::vector<sauchar_t> >( _T = boost::shared_ptr<std::vector<sauchar_t> >(
new std::vector<sauchar_t>); new std::vector<sauchar_t>);
_markers = boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> >(
new std::vector<SUFFIX_MARKER_TYPE>);
if (boost::filesystem::exists(_config->getWordMapFilePath()) if (boost::filesystem::exists(_config->getWordMapFilePath())
&& boost::filesystem::exists(_config->getHashedIndexFilePath())) { && boost::filesystem::exists(_config->getHashedIndexFilePath())) {
loadRAMIndexFromDisk(); loadRAMIndexFromDisk();
@ -108,16 +133,15 @@ void Concordia::_initializeIndex() throw(ConcordiaException) {
} }
} }
boost::shared_ptr<std::vector<saidx_t> > Concordia::simpleSearch( boost::ptr_vector<SubstringOccurence> Concordia::simpleSearch(
const string & pattern) const string & pattern)
throw(ConcordiaException) { throw(ConcordiaException) {
if (_T->size() > 0) { if (_T->size() > 0) {
return _searcher->simpleSearch(_hashGenerator, _T, _SA, pattern); return _searcher->simpleSearch(_hashGenerator, _T,
_markers, _SA, pattern);
} else { } else {
boost::shared_ptr<std::vector<saidx_t> > result = boost::ptr_vector<SubstringOccurence> result;
boost::shared_ptr<std::vector<saidx_t> >(new std::vector<saidx_t>);
return result; return result;
} }
} }

View File

@ -4,8 +4,12 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include <boost/shared_ptr.hpp> #include <boost/shared_ptr.hpp>
#include <boost/ptr_container/ptr_vector.hpp>
#include <boost/filesystem.hpp> #include <boost/filesystem.hpp>
#include "concordia/common/config.hpp"
#include "concordia/example.hpp"
#include "concordia/substring_occurence.hpp"
#include "concordia/concordia_config.hpp" #include "concordia/concordia_config.hpp"
#include "concordia/concordia_index.hpp" #include "concordia/concordia_index.hpp"
#include "concordia/index_searcher.hpp" #include "concordia/index_searcher.hpp"
@ -34,12 +38,12 @@ public:
*/ */
std::string & getVersion(); std::string & getVersion();
void addSentence(const std::string & sentence) throw(ConcordiaException); void addExample(const Example & example) throw(ConcordiaException);
void addAllSentences(boost::shared_ptr<std::vector<std::string> > sentences) void addAllExamples(const boost::ptr_vector<Example > & examples)
throw(ConcordiaException); throw(ConcordiaException);
boost::shared_ptr<std::vector<saidx_t> > simpleSearch( boost::ptr_vector<SubstringOccurence> simpleSearch(
const std::string & pattern) const std::string & pattern)
throw(ConcordiaException); throw(ConcordiaException);
@ -63,6 +67,8 @@ private:
boost::shared_ptr<std::vector<sauchar_t> > _T; boost::shared_ptr<std::vector<sauchar_t> > _T;
boost::shared_ptr<std::vector<saidx_t> > _SA; boost::shared_ptr<std::vector<saidx_t> > _SA;
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > _markers;
}; };
#endif #endif

View File

@ -5,6 +5,7 @@
#define PUDDLE_TAGSET_PARAM "puddle_tagset_path" #define PUDDLE_TAGSET_PARAM "puddle_tagset_path"
#define WORD_MAP_PARAM "word_map_path" #define WORD_MAP_PARAM "word_map_path"
#define HASHED_INDEX_PARAM "hashed_index_path" #define HASHED_INDEX_PARAM "hashed_index_path"
#define MARKERS_PARAM "markers_path"
#define SUFFIX_ARRAY_PARAM "suffix_array_path" #define SUFFIX_ARRAY_PARAM "suffix_array_path"
ConcordiaConfig::ConcordiaConfig(const string & configFilePath) ConcordiaConfig::ConcordiaConfig(const string & configFilePath)
@ -24,6 +25,8 @@ ConcordiaConfig::ConcordiaConfig(const string & configFilePath)
ConcordiaConfig::_readConfigParameterStr(WORD_MAP_PARAM); ConcordiaConfig::_readConfigParameterStr(WORD_MAP_PARAM);
_hashedIndexFilePath = _hashedIndexFilePath =
ConcordiaConfig::_readConfigParameterStr(HASHED_INDEX_PARAM); ConcordiaConfig::_readConfigParameterStr(HASHED_INDEX_PARAM);
_markersFilePath =
ConcordiaConfig::_readConfigParameterStr(MARKERS_PARAM);
_suffixArrayFilePath = _suffixArrayFilePath =
ConcordiaConfig::_readConfigParameterStr(SUFFIX_ARRAY_PARAM); ConcordiaConfig::_readConfigParameterStr(SUFFIX_ARRAY_PARAM);
} }

View File

@ -42,6 +42,10 @@ public:
return _hashedIndexFilePath; return _hashedIndexFilePath;
} }
/*! Getter for the markers file path.
    Returns a non-const reference to the stored path, which the
    constructor fills from the "markers_path" configuration parameter.
*/
string & getMarkersFilePath() {
return _markersFilePath;
}
string & getSuffixArrayFilePath() { string & getSuffixArrayFilePath() {
return _suffixArrayFilePath; return _suffixArrayFilePath;
} }
@ -55,6 +59,8 @@ private:
string _hashedIndexFilePath; string _hashedIndexFilePath;
string _markersFilePath;
string _suffixArrayFilePath; string _suffixArrayFilePath;
string _readConfigParameterStr(const string & name) string _readConfigParameterStr(const string & name)

View File

@ -2,11 +2,15 @@
#include "concordia/common/utils.hpp" #include "concordia/common/utils.hpp"
#include <boost/filesystem.hpp> #include <boost/filesystem.hpp>
#include <boost/foreach.hpp>
#include <iostream> #include <iostream>
#include <climits>
ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath) ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath,
const string & markersFilePath)
throw(ConcordiaException) : throw(ConcordiaException) :
_hashedIndexFilePath(hashedIndexFilePath) { _hashedIndexFilePath(hashedIndexFilePath),
_markersFilePath(markersFilePath) {
} }
ConcordiaIndex::~ConcordiaIndex() { ConcordiaIndex::~ConcordiaIndex() {
@ -30,45 +34,80 @@ boost::shared_ptr<vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
return result; return result;
} }
void ConcordiaIndex::addSentence(boost::shared_ptr<HashGenerator> hashGenerator, void ConcordiaIndex::addExample(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<vector<sauchar_t> > T, boost::shared_ptr<vector<sauchar_t> > T,
const string & sentence) { boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example) {
ofstream hashedIndexFile; ofstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out| hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
ios::app|ios::binary); ios::app|ios::binary);
_addSingleSentence(hashedIndexFile, hashGenerator, T, sentence); ofstream markersFile;
markersFile.open(_markersFilePath.c_str(), ios::out|
ios::app|ios::binary);
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, example);
hashedIndexFile.close(); hashedIndexFile.close();
markersFile.close();
hashGenerator->serializeWordMap(); hashGenerator->serializeWordMap();
} }
void ConcordiaIndex::addAllSentences( void ConcordiaIndex::addAllExamples(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<vector<sauchar_t> > T, boost::shared_ptr<vector<sauchar_t> > T,
boost::shared_ptr<vector<string> > sentences) { boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
const boost::ptr_vector<Example > & examples) {
ofstream hashedIndexFile; ofstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out| hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
ios::app|ios::binary); ios::app|ios::binary);
for (vector<string>::iterator sent_it = sentences->begin(); ofstream markersFile;
sent_it != sentences->end(); ++sent_it) { markersFile.open(_markersFilePath.c_str(), ios::out|
string sentence = *sent_it; ios::app|ios::binary);
_addSingleSentence(hashedIndexFile, hashGenerator, T, sentence);
BOOST_FOREACH(Example example, examples) {
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, example);
} }
hashedIndexFile.close(); hashedIndexFile.close();
markersFile.close();
hashGenerator->serializeWordMap(); hashGenerator->serializeWordMap();
} }
void ConcordiaIndex::_addSingleSentence( void ConcordiaIndex::_addSingleExample(
ofstream & hashedIndexFile, ofstream & hashedIndexFile,
boost::shared_ptr<HashGenerator> hashGenerator, ofstream & markersFile,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<HashGenerator> hashGenerator,
const string & sentence) { boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example) {
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash
= hashGenerator->generateHash(sentence); = hashGenerator->generateHash(example.getSentence());
int offset = 0;
for (vector<INDEX_CHARACTER_TYPE>::iterator it = hash->begin(); for (vector<INDEX_CHARACTER_TYPE>::iterator it = hash->begin();
it != hash->end(); ++it) { it != hash->end(); ++it) {
INDEX_CHARACTER_TYPE character = *it; INDEX_CHARACTER_TYPE character = *it;
Utils::writeIndexCharacter(hashedIndexFile, character); Utils::writeIndexCharacter(hashedIndexFile, character);
Utils::appendCharToSaucharVector(T, character); Utils::appendCharToSaucharVector(T, character);
// append to markersFile
SUFFIX_MARKER_TYPE marker = offset;
marker += example.getId() * SUFFIX_MARKER_DIVISOR;
Utils::writeMarker(markersFile, marker);
markers->push_back(marker);
offset++;
} }
// append sentence boundary marker
INDEX_CHARACTER_TYPE sentenceBoundaryHI = ULONG_MAX;
Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI);
Utils::appendCharToSaucharVector(T, sentenceBoundaryHI);
SUFFIX_MARKER_TYPE sentenceBoundaryMA = ULONG_MAX;
Utils::writeMarker(markersFile, sentenceBoundaryMA);
markers->push_back(sentenceBoundaryMA);
} }

View File

@ -2,10 +2,13 @@
#define CONCORDIA_INDEX_HDR #define CONCORDIA_INDEX_HDR
#include <boost/shared_ptr.hpp> #include <boost/shared_ptr.hpp>
#include <boost/ptr_container/ptr_vector.hpp>
#include <fstream> #include <fstream>
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include "concordia/common/config.hpp"
#include "concordia/example.hpp"
#include "concordia/hash_generator.hpp" #include "concordia/hash_generator.hpp"
#include "concordia/concordia_exception.hpp" #include "concordia/concordia_exception.hpp"
#include "build/libdivsufsort/include/divsufsort.h" #include "build/libdivsufsort/include/divsufsort.h"
@ -19,35 +22,42 @@ using namespace std;
class ConcordiaIndex { class ConcordiaIndex {
public: public:
explicit ConcordiaIndex(const string & hashedIndexFilePath) explicit ConcordiaIndex(const string & hashedIndexFilePath,
const string & markersFilePath)
throw(ConcordiaException); throw(ConcordiaException);
/*! Destructor. /*! Destructor.
*/ */
virtual ~ConcordiaIndex(); virtual ~ConcordiaIndex();
void addSentence( void addExample(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<vector<sauchar_t> > T, boost::shared_ptr<vector<sauchar_t> > T,
const string & sentence); boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example);
void addAllSentences( void addAllExamples(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<vector<sauchar_t> > T, boost::shared_ptr<vector<sauchar_t> > T,
boost::shared_ptr<vector<string> > sentences); boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
const boost::ptr_vector<Example > & examples);
boost::shared_ptr<vector<saidx_t> > generateSuffixArray( boost::shared_ptr<vector<saidx_t> > generateSuffixArray(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<vector<sauchar_t> > T); boost::shared_ptr<vector<sauchar_t> > T);
private: private:
// Add sentence to disk index and update RAM index. // Add example to disk index and update RAM index.
void _addSingleSentence(ofstream & hashedIndexFile, void _addSingleExample(ofstream & hashedIndexFile,
boost::shared_ptr<HashGenerator> hashGenerator, ofstream & markersFile,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<HashGenerator> hashGenerator,
const string & sentence); boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example);
string _hashedIndexFilePath; string _hashedIndexFilePath;
string _markersFilePath;
}; };
#endif #endif

11
concordia/example.cpp Normal file
View File

@ -0,0 +1,11 @@
#include "concordia/example.hpp"
/*! Constructor.
    Stores a copy of the sentence text together with its id.
    \param sentence text of the sentence
    \param id identifier assigned to the sentence
*/
Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id):
_sentence(sentence),
_id(id) {
}
/*! Destructor.
    Empty: the members need no explicit cleanup.
*/
Example::~Example() {
}

36
concordia/example.hpp Normal file
View File

@ -0,0 +1,36 @@
#ifndef EXAMPLE_HDR
#define EXAMPLE_HDR
#include "concordia/common/config.hpp"
#include <string>
/*!
Class representing a single sentence to be added to the index, along with its id.
*/
using namespace std;
// Pairs a sentence with the id under which it is added to the index.
// ConcordiaIndex combines the id with a per-sentence offset as
// id * SUFFIX_MARKER_DIVISOR + offset when it writes suffix markers.
class Example {
public:
/*! Constructor.
    \param sentence text of the sentence
    \param id identifier of the sentence
*/
explicit Example(const string & sentence, const SUFFIX_MARKER_TYPE & id);
/*! Destructor.
*/
virtual ~Example();
/*! Returns a copy of the sentence text.
*/
string getSentence() const {
return _sentence;
}
/*! Returns the id given at construction.
*/
SUFFIX_MARKER_TYPE getId() const {
return _id;
}
private:
// Text of the sentence.
string _sentence;
// Identifier of the sentence.
SUFFIX_MARKER_TYPE _id;
};
#endif

View File

@ -10,13 +10,13 @@ IndexSearcher::IndexSearcher() {
IndexSearcher::~IndexSearcher() { IndexSearcher::~IndexSearcher() {
} }
boost::shared_ptr<vector<saidx_t> > IndexSearcher::simpleSearch( boost::ptr_vector<SubstringOccurence> IndexSearcher::simpleSearch(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<saidx_t> > SA, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const string & pattern) throw(ConcordiaException) { boost::shared_ptr<std::vector<saidx_t> > SA,
boost::shared_ptr<vector<saidx_t> > result = const string & pattern) throw(ConcordiaException) {
boost::shared_ptr<vector<saidx_t> >(new vector<saidx_t>()); boost::ptr_vector<SubstringOccurence> result;
int left; int left;
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash = boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash =
@ -27,14 +27,19 @@ boost::shared_ptr<vector<saidx_t> > IndexSearcher::simpleSearch(
(const sauchar_t *) patternArray, patternLength, (const sauchar_t *) patternArray, patternLength,
SA->data(), (saidx_t) T->size(), &left); SA->data(), (saidx_t) T->size(), &left);
for (int i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
saidx_t result_pos = SA->at(left + i); saidx_t resultPos = SA->at(left + i);
if (result_pos % sizeof(INDEX_CHARACTER_TYPE) == 0) { if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
// As we are looking for a pattern in an array of higher // As we are looking for a pattern in an array of higher
// resolution than the hashed index file, we might // resolution than the hashed index file, we might
// obtain accidental results exceeding the boundaries // obtain accidental results exceeding the boundaries
// of characters in hashed index. The above check // of characters in hashed index. The above check
// removes these accidental results. // removes these accidental results.
result->push_back(result_pos / sizeof(INDEX_CHARACTER_TYPE)); saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);
result.push_back(new SubstringOccurence(
marker / SUFFIX_MARKER_DIVISOR,
marker % SUFFIX_MARKER_DIVISOR));
} }
} }
@ -42,6 +47,3 @@ boost::shared_ptr<vector<saidx_t> > IndexSearcher::simpleSearch(
return result; return result;
} }

View File

@ -2,11 +2,13 @@
#define INDEX_SEARCHER_HDR #define INDEX_SEARCHER_HDR
#include <boost/shared_ptr.hpp> #include <boost/shared_ptr.hpp>
#include <boost/ptr_container/ptr_vector.hpp>
#include <fstream> #include <fstream>
#include <iostream> #include <iostream>
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include "build/libdivsufsort/include/divsufsort.h" #include "build/libdivsufsort/include/divsufsort.h"
#include "concordia/substring_occurence.hpp"
#include "concordia/hash_generator.hpp" #include "concordia/hash_generator.hpp"
#include "concordia/concordia_exception.hpp" #include "concordia/concordia_exception.hpp"
@ -25,9 +27,10 @@ public:
*/ */
virtual ~IndexSearcher(); virtual ~IndexSearcher();
boost::shared_ptr<vector<saidx_t> > simpleSearch( boost::ptr_vector<SubstringOccurence> simpleSearch(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA, boost::shared_ptr<std::vector<saidx_t> > SA,
const string & pattern) throw(ConcordiaException); const string & pattern) throw(ConcordiaException);
private: private:

View File

@ -0,0 +1,12 @@
#include "concordia/substring_occurence.hpp"
/*! Constructor.
    Stores the given id and offset unchanged.
    \param id identifier of the sentence containing the match
    \param offset offset of the match within that sentence
*/
SubstringOccurence::SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
const int & offset):
_id(id),
_offset(offset) {
}
/*! Destructor.
    Empty: the members need no explicit cleanup.
*/
SubstringOccurence::~SubstringOccurence() {
}

View File

@ -0,0 +1,37 @@
#ifndef SUBSTRING_OCCURENCE_HDR
#define SUBSTRING_OCCURENCE_HDR
#include "concordia/common/config.hpp"
#include <string>
/*!
Class representing an occurrence of a searched substring.
*/
using namespace std;
// A single search hit: the id of the sentence and the offset of the
// match inside it. IndexSearcher::simpleSearch builds these from a suffix
// marker as (marker / SUFFIX_MARKER_DIVISOR, marker % SUFFIX_MARKER_DIVISOR).
class SubstringOccurence {
public:
/*! Constructor.
    \param id identifier of the sentence containing the match
    \param offset offset of the match within that sentence
*/
explicit SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
const int & offset);
/*! Destructor.
*/
virtual ~SubstringOccurence();
/*! Returns the id of the sentence containing the match.
*/
SUFFIX_MARKER_TYPE getId() const {
return _id;
}
/*! Returns the offset of the match within the sentence.
    NOTE(review): presumably counted in words (one per hashed index
    character) - confirm against HashGenerator.
*/
int getOffset() const {
return _offset;
}
private:
// Identifier of the sentence containing the match.
SUFFIX_MARKER_TYPE _id;
// Offset of the match within the sentence.
int _offset;
};
#endif

View File

@ -4,6 +4,7 @@
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include <boost/algorithm/string/predicate.hpp> #include <boost/algorithm/string/predicate.hpp>
#include <boost/ptr_container/ptr_vector.hpp>
#include <boost/filesystem.hpp> #include <boost/filesystem.hpp>
#include <string> #include <string>
@ -19,18 +20,19 @@ BOOST_AUTO_TEST_CASE( ConcordiaVersion )
BOOST_CHECK_EQUAL( version , "0.1"); BOOST_CHECK_EQUAL( version , "0.1");
} }
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 ) BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
{ {
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
concordia.addSentence("Ala ma kota"); concordia.addExample(Example("Ala ma kota",14));
concordia.addSentence("Ala ma rysia"); concordia.addExample(Example("Ala ma rysia",51));
concordia.addSentence("Marysia ma rysia"); concordia.addExample(Example("Marysia ma rysia",123));
concordia.refreshSAfromRAM(); concordia.refreshSAfromRAM();
/*The test index contains 3 sentences: /*The test index contains 3 sentences:
"Ala ma kota" 14: "Ala ma kota"
"Ala ma rysia" 51: "Ala ma rysia"
"Marysia ma rysia" 123: "Marysia ma rysia"
Test word map: Test word map:
Ala -> 0 Ala -> 0
@ -40,44 +42,48 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
Marysia -> 4 Marysia -> 4
Test hashed index: Test hashed index:
n: 0 1 2 3 4 5 6 7 8 n: 0 1 2 3 4 5 6 7 8 9 10 11
T[n]: 0 1 2 0 1 3 4 1 3 T[n]: 0 1 2 | 0 1 3 | 4 1 3 |
Test suffix array: Test suffix array:
n: 0 1 2 3 4 5 6 7 8 n: 0 1 2 3 4 5 6 7 8 9 10 11
SA[n]: 0 3 1 7 4 2 8 5 6 SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
*/ */
boost::shared_ptr<std::vector<saidx_t> > expectedResult1(new std::vector<saidx_t>()); boost::ptr_vector<SubstringOccurence> searchResult1 = concordia.simpleSearch("ma rysia");
expectedResult1->push_back(7); boost::ptr_vector<SubstringOccurence> searchResult2 = concordia.simpleSearch("ma kota Ala");
expectedResult1->push_back(4);
boost::shared_ptr<std::vector<saidx_t> > searchResult1 = concordia.simpleSearch("ma rysia");
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX)); boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1->begin(), searchResult1->end(), BOOST_CHECK_EQUAL(searchResult1.size(), 2);
expectedResult1->begin(), expectedResult1->end()); BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 123);
BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(1).getId(), 51);
BOOST_CHECK_EQUAL(searchResult1.at(1).getOffset(), 1);
// Checking pattern spanning over 2 segments
BOOST_CHECK_EQUAL(searchResult2.size(), 0);
} }
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 ) BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
{ {
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
boost::shared_ptr<vector<string> > testSentences (new vector<string>()); boost::ptr_vector<Example> testExamples;
testSentences->push_back("to jest okno"); testExamples.push_back(new Example("to jest okno",312));
testSentences->push_back("czy jest okno otwarte"); testExamples.push_back(new Example("czy jest okno otwarte",202));
testSentences->push_back("chyba to jest tutaj"); testExamples.push_back(new Example("chyba to jest tutaj",45));
testSentences->push_back("to jest"); testExamples.push_back(new Example("to jest",29));
concordia.addAllSentences(testSentences); concordia.addAllExamples(testExamples);
/*The test index contains 4 sentences: /*The test index contains 4 sentences:
"to jest okno" 312: "to jest okno"
"czy jest okno otwarte" 202: "czy jest okno otwarte"
"chyba to jest tutaj" 45: "chyba to jest tutaj"
"to jest" 29: "to jest"
Test word map: Test word map:
to -> 0 to -> 0
@ -98,27 +104,27 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
*/ */
boost::shared_ptr<vector<saidx_t> > expectedResult1(new vector<saidx_t>());
expectedResult1->push_back(11);
expectedResult1->push_back(0);
expectedResult1->push_back(8);
boost::shared_ptr<vector<saidx_t> > expectedResult2(new vector<saidx_t>());
expectedResult2->push_back(1);
expectedResult2->push_back(4);
Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
boost::shared_ptr<vector<saidx_t> > searchResult1 = concordia2.simpleSearch("to jest"); boost::ptr_vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("to jest");
boost::shared_ptr<vector<saidx_t> > searchResult2 = concordia2.simpleSearch("jest okno"); boost::ptr_vector<SubstringOccurence> searchResult2 = concordia2.simpleSearch("jest okno");
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX)); boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1->begin(), searchResult1->end(), BOOST_CHECK_EQUAL(searchResult1.size(), 3);
expectedResult1->begin(), expectedResult1->end()); BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 312);
BOOST_CHECK_EQUAL_COLLECTIONS(searchResult2->begin(), searchResult2->end(), BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 0);
expectedResult2->begin(), expectedResult2->end()); BOOST_CHECK_EQUAL(searchResult1.at(1).getId(), 45);
BOOST_CHECK_EQUAL(searchResult1.at(1).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(2).getId(), 29);
BOOST_CHECK_EQUAL(searchResult1.at(2).getOffset(), 0);
BOOST_CHECK_EQUAL(searchResult2.size(), 2);
BOOST_CHECK_EQUAL(searchResult2.at(0).getId(), 202);
BOOST_CHECK_EQUAL(searchResult2.at(0).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult2.at(1).getId(), 312);
BOOST_CHECK_EQUAL(searchResult2.at(1).getOffset(), 1);
} }
BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE_END()

View File

@ -17,6 +17,7 @@ BOOST_AUTO_TEST_CASE( ConfigParameters )
BOOST_CHECK_EQUAL( config.getPuddleTagsetFilePath() , "puddle/tagset.txt" ); BOOST_CHECK_EQUAL( config.getPuddleTagsetFilePath() , "puddle/tagset.txt" );
BOOST_CHECK_EQUAL( config.getWordMapFilePath() , "tmp/wm.bin" ); BOOST_CHECK_EQUAL( config.getWordMapFilePath() , "tmp/wm.bin" );
BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "tmp/hi.bin" ); BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "tmp/hi.bin" );
BOOST_CHECK_EQUAL( config.getMarkersFilePath() , "tmp/ma.bin" );
BOOST_CHECK_EQUAL( config.getSuffixArrayFilePath() , "tmp/sa.bin" ); BOOST_CHECK_EQUAL( config.getSuffixArrayFilePath() , "tmp/sa.bin" );
} }

View File

@ -16,7 +16,8 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
{ {
boost::shared_ptr<HashGenerator> hashGenerator (new HashGenerator("nonexistent")); boost::shared_ptr<HashGenerator> hashGenerator (new HashGenerator("nonexistent"));
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin")); ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
TestResourcesManager::getTestFilePath("temp","test_markers.bin"));
boost::shared_ptr<vector<sauchar_t> > T = boost::shared_ptr<vector<sauchar_t> >(new vector<sauchar_t>()); boost::shared_ptr<vector<sauchar_t> > T = boost::shared_ptr<vector<sauchar_t> >(new vector<sauchar_t>());
// Test hashed index: // Test hashed index:
// n: 0 1 2 3 4 5 6 7 8 // n: 0 1 2 3 4 5 6 7 8
@ -50,4 +51,50 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
BOOST_CHECK_EQUAL_COLLECTIONS(SA->begin(), SA->end(), expectedSA->begin(), expectedSA->end()); BOOST_CHECK_EQUAL_COLLECTIONS(SA->begin(), SA->end(), expectedSA->begin(), expectedSA->end());
} }
/*
  Checks suffix array generation for a hashed index in which each group
  of three values is followed by the value 255 (drawn as '|' below).
  NOTE(review): 255 presumably acts as the end-of-sentence marker --
  confirm against the index-building code.

  Test hashed index:
      n:  0  1  2  3  4  5  6  7  8  9 10 11
   T[n]:  0  1  2  |  0  1  3  |  4  1  3  |

  Expected suffix array:
      n:  0  1  2  3  4  5  6  7  8  9 10 11
  SA[n]:  0  4  1  9  5  2 10  6  8 11  3  7
*/
BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest2 )
{
    boost::shared_ptr<HashGenerator> hashGenerator(new HashGenerator("nonexistent"));

    ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
                         TestResourcesManager::getTestFilePath("temp","test_markers.bin"));

    // Input text, listed in index order (see the T[n] row above).
    const sauchar_t hashedValues[] = {0, 1, 2, 255,
                                      0, 1, 3, 255,
                                      4, 1, 3, 255};
    boost::shared_ptr<std::vector<sauchar_t> > T(
        new std::vector<sauchar_t>(
            hashedValues,
            hashedValues + sizeof(hashedValues) / sizeof(hashedValues[0])));

    boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(hashGenerator, T);

    // Expected result, listed in index order (see the SA[n] row above).
    const saidx_t expectedValues[] = {0, 4, 1, 9, 5, 2, 10, 6, 8, 11, 3, 7};
    boost::shared_ptr<std::vector<saidx_t> > expectedSA(
        new std::vector<saidx_t>(
            expectedValues,
            expectedValues + sizeof(expectedValues) / sizeof(expectedValues[0])));

    BOOST_CHECK_EQUAL_COLLECTIONS(SA->begin(), SA->end(),
                                  expectedSA->begin(), expectedSA->end());
}
BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE_END()

View File

@ -18,6 +18,10 @@ word_map_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
hashed_index_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_HASHED_INDEX@" hashed_index_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_HASHED_INDEX@"
#File containing suffix markers (sentence ids and offsets)
markers_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_MARKERS@"
#Binarized suffix array #Binarized suffix array
suffix_array_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_SUFFIX_ARRAY@" suffix_array_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_SUFFIX_ARRAY@"

View File

@ -10,6 +10,8 @@ word_map_path = "tmp/wm.bin"
hashed_index_path = "tmp/hi.bin" hashed_index_path = "tmp/hi.bin"
markers_path = "tmp/ma.bin"
suffix_array_path = "tmp/sa.bin" suffix_array_path = "tmp/sa.bin"
### eof ### eof

View File

@ -18,6 +18,10 @@ word_map_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
hashed_index_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_HASHED_INDEX@" hashed_index_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_HASHED_INDEX@"
#File containing suffix markers (sentence ids and offsets)
markers_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_MARKERS@"
#Binarized suffix array #Binarized suffix array
suffix_array_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_SUFFIX_ARRAY@" suffix_array_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_SUFFIX_ARRAY@"