suffix markers
Former-commit-id: 7426cce771f548dcd4eb7478aafa912fb73784bf
This commit is contained in:
parent
b318770752
commit
fb65cc9c66
@ -6,9 +6,16 @@ project(concordia C CXX)
|
|||||||
set (CONCORDIA_VERSION_MAJOR 0)
|
set (CONCORDIA_VERSION_MAJOR 0)
|
||||||
set (CONCORDIA_VERSION_MINOR 1)
|
set (CONCORDIA_VERSION_MINOR 1)
|
||||||
|
|
||||||
# Type of the characters in index
|
# Type of the characters in SA
|
||||||
|
|
||||||
set (INDEX_CHARACTER_TYPE "unsigned int")
|
set (INDEX_CHARACTER_TYPE "unsigned int")
|
||||||
|
# The above allows for (roughly) 2^32 = 4 294 967 296 words in the corpus.
|
||||||
|
|
||||||
|
# Suffix markers
|
||||||
|
set (SUFFIX_MARKER_TYPE "unsigned int")
|
||||||
|
set (SUFFIX_MARKER_DIVISOR 256)
|
||||||
|
# The above settings assign 3 bytes to sentence id and 1 byte for suffix offset.
|
||||||
|
# This allows storing 2^24 = 16 777 216 sentences, each no longer than 256 words.
|
||||||
|
|
||||||
# ============================== #
|
# ============================== #
|
||||||
# Production paths
|
# Production paths
|
||||||
@ -26,6 +33,7 @@ set (TEST_RESOURCES_DIRECTORY "${concordia_SOURCE_DIR}/tests/resources")
|
|||||||
set (TEST_PUDDLE_TAGSET_PATH "${TEST_RESOURCES_DIRECTORY}/puddle/basic-tagset.txt")
|
set (TEST_PUDDLE_TAGSET_PATH "${TEST_RESOURCES_DIRECTORY}/puddle/basic-tagset.txt")
|
||||||
set (TEMP_WORD_MAP "temp_word_map.bin")
|
set (TEMP_WORD_MAP "temp_word_map.bin")
|
||||||
set (TEMP_HASHED_INDEX "temp_hashed_index.bin")
|
set (TEMP_HASHED_INDEX "temp_hashed_index.bin")
|
||||||
|
set (TEMP_MARKERS "temp_markers.bin")
|
||||||
set (TEMP_SUFFIX_ARRAY "temp_suffix_array.bin")
|
set (TEMP_SUFFIX_ARRAY "temp_suffix_array.bin")
|
||||||
|
|
||||||
file(MAKE_DIRECTORY ${TEST_RESOURCES_DIRECTORY}/temp)
|
file(MAKE_DIRECTORY ${TEST_RESOURCES_DIRECTORY}/temp)
|
||||||
|
@ -3,9 +3,12 @@
|
|||||||
#include <boost/program_options.hpp>
|
#include <boost/program_options.hpp>
|
||||||
#include <boost/algorithm/string.hpp>
|
#include <boost/algorithm/string.hpp>
|
||||||
#include <boost/date_time/posix_time/posix_time.hpp>
|
#include <boost/date_time/posix_time/posix_time.hpp>
|
||||||
|
#include <boost/ptr_container/ptr_vector.hpp>
|
||||||
#include <boost/shared_ptr.hpp>
|
#include <boost/shared_ptr.hpp>
|
||||||
|
#include <boost/foreach.hpp>
|
||||||
|
|
||||||
#include "concordia/concordia.hpp"
|
#include "concordia/concordia.hpp"
|
||||||
|
#include "concordia/substring_occurence.hpp"
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
#include "concordia/common/utils.hpp"
|
#include "concordia/common/utils.hpp"
|
||||||
#include "build/libdivsufsort/include/divsufsort.h"
|
#include "build/libdivsufsort/include/divsufsort.h"
|
||||||
@ -65,18 +68,17 @@ int main(int argc, char** argv) {
|
|||||||
std::cout << "\tSearching for pattern: \"" << pattern <<
|
std::cout << "\tSearching for pattern: \"" << pattern <<
|
||||||
"\"" << std::endl;
|
"\"" << std::endl;
|
||||||
time_start = boost::posix_time::microsec_clock::local_time();
|
time_start = boost::posix_time::microsec_clock::local_time();
|
||||||
boost::shared_ptr<vector<saidx_t> > result =
|
boost::ptr_vector<SubstringOccurence> result =
|
||||||
concordia.simpleSearch(pattern);
|
concordia.simpleSearch(pattern);
|
||||||
time_end = boost::posix_time::microsec_clock::local_time();
|
time_end = boost::posix_time::microsec_clock::local_time();
|
||||||
msdiff = time_end - time_start;
|
msdiff = time_end - time_start;
|
||||||
std::cout << "\tFound: " << result->size() << " matches. "
|
std::cout << "\tFound: " << result.size() << " matches. "
|
||||||
<< "Search took: " <<
|
<< "Search took: " <<
|
||||||
msdiff.total_milliseconds() << "ms." << std::endl;
|
msdiff.total_milliseconds() << "ms." << std::endl;
|
||||||
if (!cli.count("silent")) {
|
if (!cli.count("silent")) {
|
||||||
for (vector<saidx_t>::iterator it = result->begin();
|
BOOST_FOREACH(SubstringOccurence occurence, result) {
|
||||||
it != result->end(); ++it) {
|
std::cout << "\t\tfound match in sentence number: "
|
||||||
std::cout << "\t\tfound match on word number: " << *it
|
<< occurence.getId() << std::endl;
|
||||||
<< std::endl;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (cli.count("read-file")) {
|
} else if (cli.count("read-file")) {
|
||||||
@ -87,16 +89,15 @@ int main(int argc, char** argv) {
|
|||||||
std::string line;
|
std::string line;
|
||||||
if (text_file.is_open()) {
|
if (text_file.is_open()) {
|
||||||
long lineCount = 0;
|
long lineCount = 0;
|
||||||
boost::shared_ptr<std::vector<std::string> >
|
boost::ptr_vector<Example> buffer;
|
||||||
buffer(new std::vector<std::string>());
|
|
||||||
boost::posix_time::ptime timeStart =
|
boost::posix_time::ptime timeStart =
|
||||||
boost::posix_time::microsec_clock::local_time();
|
boost::posix_time::microsec_clock::local_time();
|
||||||
while (getline(text_file, line)) {
|
while (getline(text_file, line)) {
|
||||||
lineCount++;
|
lineCount++;
|
||||||
buffer->push_back(line);
|
buffer.push_back(new Example(line, lineCount));
|
||||||
if (lineCount % READ_BUFFER_LENGTH == 0) {
|
if (lineCount % READ_BUFFER_LENGTH == 0) {
|
||||||
concordia.addAllSentences(buffer);
|
concordia.addAllExamples(buffer);
|
||||||
buffer->clear();
|
buffer.clear();
|
||||||
boost::posix_time::ptime timeEnd =
|
boost::posix_time::ptime timeEnd =
|
||||||
boost::posix_time::microsec_clock::local_time();
|
boost::posix_time::microsec_clock::local_time();
|
||||||
boost::posix_time::time_duration msdiff =
|
boost::posix_time::time_duration msdiff =
|
||||||
@ -110,8 +111,8 @@ int main(int argc, char** argv) {
|
|||||||
" sentences per second" << std::endl;
|
" sentences per second" << std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (buffer->size() > 0) {
|
if (buffer.size() > 0) {
|
||||||
concordia.addAllSentences(buffer);
|
concordia.addAllExamples(buffer);
|
||||||
}
|
}
|
||||||
text_file.close();
|
text_file.close();
|
||||||
boost::posix_time::ptime timeTotalEnd =
|
boost::posix_time::ptime timeTotalEnd =
|
||||||
@ -156,6 +157,5 @@ int main(int argc, char** argv) {
|
|||||||
<< std::endl;
|
<< std::endl;
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -6,6 +6,8 @@ foreach(dir ${ALL_DIRECTORIES})
|
|||||||
endforeach(dir)
|
endforeach(dir)
|
||||||
|
|
||||||
add_library(concordia SHARED
|
add_library(concordia SHARED
|
||||||
|
substring_occurence.cpp
|
||||||
|
example.cpp
|
||||||
index_searcher.cpp
|
index_searcher.cpp
|
||||||
concordia_index.cpp
|
concordia_index.cpp
|
||||||
word_map.cpp
|
word_map.cpp
|
||||||
|
@ -4,6 +4,7 @@
|
|||||||
#define TEST_RESOURCES_DIRECTORY "@TEST_RESOURCES_DIRECTORY@"
|
#define TEST_RESOURCES_DIRECTORY "@TEST_RESOURCES_DIRECTORY@"
|
||||||
#define TEMP_WORD_MAP "@TEMP_WORD_MAP@"
|
#define TEMP_WORD_MAP "@TEMP_WORD_MAP@"
|
||||||
#define TEMP_HASHED_INDEX "@TEMP_HASHED_INDEX@"
|
#define TEMP_HASHED_INDEX "@TEMP_HASHED_INDEX@"
|
||||||
|
#define TEMP_MARKERS "@TEMP_MARKERS@"
|
||||||
#define TEMP_SUFFIX_ARRAY "@TEMP_SUFFIX_ARRAY@"
|
#define TEMP_SUFFIX_ARRAY "@TEMP_SUFFIX_ARRAY@"
|
||||||
|
|
||||||
#define PROD_RESOURCES_DIRECTORY "@PROD_RESOURCES_DIRECTORY@"
|
#define PROD_RESOURCES_DIRECTORY "@PROD_RESOURCES_DIRECTORY@"
|
||||||
@ -17,4 +18,6 @@
|
|||||||
#define LEXICON_FIELD_SEPARATOR "\t"
|
#define LEXICON_FIELD_SEPARATOR "\t"
|
||||||
|
|
||||||
typedef @INDEX_CHARACTER_TYPE@ INDEX_CHARACTER_TYPE;
|
typedef @INDEX_CHARACTER_TYPE@ INDEX_CHARACTER_TYPE;
|
||||||
|
typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
|
||||||
|
|
||||||
|
#define SUFFIX_MARKER_DIVISOR @SUFFIX_MARKER_DIVISOR@
|
||||||
|
@ -11,12 +11,23 @@ void Utils::writeIndexCharacter(ofstream & file,
|
|||||||
file.write(reinterpret_cast<char *>(&character), sizeof(character));
|
file.write(reinterpret_cast<char *>(&character), sizeof(character));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void Utils::writeMarker(ofstream & file,
|
||||||
|
SUFFIX_MARKER_TYPE marker) {
|
||||||
|
file.write(reinterpret_cast<char *>(&marker), sizeof(marker));
|
||||||
|
}
|
||||||
|
|
||||||
INDEX_CHARACTER_TYPE Utils::readIndexCharacter(ifstream & file) {
|
INDEX_CHARACTER_TYPE Utils::readIndexCharacter(ifstream & file) {
|
||||||
INDEX_CHARACTER_TYPE character;
|
INDEX_CHARACTER_TYPE character;
|
||||||
file.read(reinterpret_cast<char *>(&character), sizeof(character));
|
file.read(reinterpret_cast<char *>(&character), sizeof(character));
|
||||||
return character;
|
return character;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE Utils::readMarker(ifstream & file) {
|
||||||
|
SUFFIX_MARKER_TYPE marker;
|
||||||
|
file.read(reinterpret_cast<char *>(&marker), sizeof(marker));
|
||||||
|
return marker;
|
||||||
|
}
|
||||||
|
|
||||||
sauchar_t * Utils::indexVectorToSaucharArray(
|
sauchar_t * Utils::indexVectorToSaucharArray(
|
||||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > input) {
|
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > input) {
|
||||||
const int kArraySize = input->size()*sizeof(INDEX_CHARACTER_TYPE);
|
const int kArraySize = input->size()*sizeof(INDEX_CHARACTER_TYPE);
|
||||||
|
@ -23,14 +23,20 @@ public:
|
|||||||
static void writeIndexCharacter(ofstream & file,
|
static void writeIndexCharacter(ofstream & file,
|
||||||
INDEX_CHARACTER_TYPE character);
|
INDEX_CHARACTER_TYPE character);
|
||||||
|
|
||||||
|
static void writeMarker(ofstream & file,
|
||||||
|
SUFFIX_MARKER_TYPE marker);
|
||||||
|
|
||||||
static INDEX_CHARACTER_TYPE readIndexCharacter(ifstream & file);
|
static INDEX_CHARACTER_TYPE readIndexCharacter(ifstream & file);
|
||||||
|
|
||||||
|
static SUFFIX_MARKER_TYPE readMarker(ifstream & file);
|
||||||
|
|
||||||
static sauchar_t * indexVectorToSaucharArray(
|
static sauchar_t * indexVectorToSaucharArray(
|
||||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > input);
|
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > input);
|
||||||
|
|
||||||
static void appendCharToSaucharVector(
|
static void appendCharToSaucharVector(
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > vector,
|
boost::shared_ptr<std::vector<sauchar_t> > vector,
|
||||||
INDEX_CHARACTER_TYPE character);
|
INDEX_CHARACTER_TYPE character);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static void _insertCharToSaucharArray(sauchar_t * array,
|
static void _insertCharToSaucharArray(sauchar_t * array,
|
||||||
INDEX_CHARACTER_TYPE character, int pos);
|
INDEX_CHARACTER_TYPE character, int pos);
|
||||||
|
@ -19,7 +19,8 @@ Concordia::Concordia(const std::string & configFilePath)
|
|||||||
_config = boost::shared_ptr<ConcordiaConfig> (
|
_config = boost::shared_ptr<ConcordiaConfig> (
|
||||||
new ConcordiaConfig(configFilePath));
|
new ConcordiaConfig(configFilePath));
|
||||||
_index = boost::shared_ptr<ConcordiaIndex>(
|
_index = boost::shared_ptr<ConcordiaIndex>(
|
||||||
new ConcordiaIndex(_config->getHashedIndexFilePath()));
|
new ConcordiaIndex(_config->getHashedIndexFilePath(),
|
||||||
|
_config->getMarkersFilePath()));
|
||||||
_searcher = boost::shared_ptr<IndexSearcher>(new IndexSearcher());
|
_searcher = boost::shared_ptr<IndexSearcher>(new IndexSearcher());
|
||||||
_initializeIndex();
|
_initializeIndex();
|
||||||
}
|
}
|
||||||
@ -42,30 +43,32 @@ std::string _createLibraryVersion() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Sentences are written to disk and added to T.
|
// Sentences are written to disk and added to T.
|
||||||
// SA is generated on command by different methods.
|
// SA is generated on command by other methods.
|
||||||
void Concordia::addSentence(const std::string & sentence)
|
// TODO(rjawor): modify SA on the fly
|
||||||
|
void Concordia::addExample(const Example & example)
|
||||||
throw(ConcordiaException) {
|
throw(ConcordiaException) {
|
||||||
_index->addSentence(_hashGenerator, _T, sentence);
|
_index->addExample(_hashGenerator, _T, _markers, example);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sentences are written to disk and added to T.
|
// Sentences are written to disk and added to T.
|
||||||
// SA is generated on command by different methods.
|
// SA is generated on command by other methods.
|
||||||
void Concordia::addAllSentences(
|
// TODO(rjawor): modify SA on the fly
|
||||||
boost::shared_ptr<std::vector<std::string> > sentences)
|
void Concordia::addAllExamples(const boost::ptr_vector<Example > & examples)
|
||||||
throw(ConcordiaException) {
|
throw(ConcordiaException) {
|
||||||
_index->addAllSentences(_hashGenerator, _T, sentences);
|
_index->addAllExamples(_hashGenerator, _T, _markers, examples);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) {
|
void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) {
|
||||||
if (boost::filesystem::exists(_config->getWordMapFilePath())
|
if (boost::filesystem::exists(_config->getWordMapFilePath())
|
||||||
&& boost::filesystem::exists(_config->getHashedIndexFilePath())) {
|
&& boost::filesystem::exists(_config->getHashedIndexFilePath())
|
||||||
// reading index from files
|
&& boost::filesystem::exists(_config->getMarkersFilePath())) {
|
||||||
|
// reading index from file
|
||||||
_T->clear();
|
_T->clear();
|
||||||
ifstream hashedIndexFile;
|
ifstream hashedIndexFile;
|
||||||
hashedIndexFile.open(_config->getHashedIndexFilePath().c_str(), ios::in
|
hashedIndexFile.open(_config->getHashedIndexFilePath().c_str(), ios::in
|
||||||
| ios::ate | ios::binary);
|
| ios::ate | ios::binary);
|
||||||
saidx_t fileSize = hashedIndexFile.tellg();
|
saidx_t hiFileSize = hashedIndexFile.tellg();
|
||||||
if (fileSize > 0) {
|
if (hiFileSize > 0) {
|
||||||
hashedIndexFile.seekg(0, ios::beg);
|
hashedIndexFile.seekg(0, ios::beg);
|
||||||
|
|
||||||
while (!hashedIndexFile.eof()) {
|
while (!hashedIndexFile.eof()) {
|
||||||
@ -74,12 +77,32 @@ void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) {
|
|||||||
Utils::appendCharToSaucharVector(_T, character);
|
Utils::appendCharToSaucharVector(_T, character);
|
||||||
}
|
}
|
||||||
hashedIndexFile.close();
|
hashedIndexFile.close();
|
||||||
|
|
||||||
// generating suffix array
|
|
||||||
_SA = _index->generateSuffixArray(_hashGenerator, _T);
|
|
||||||
} else {
|
} else {
|
||||||
|
hashedIndexFile.close();
|
||||||
throw ConcordiaException("Index corrupt: empty hash index file");
|
throw ConcordiaException("Index corrupt: empty hash index file");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// reading markers from file
|
||||||
|
_markers->clear();
|
||||||
|
ifstream markersFile;
|
||||||
|
markersFile.open(_config->getMarkersFilePath().c_str(), ios::in
|
||||||
|
| ios::ate | ios::binary);
|
||||||
|
saidx_t maFileSize = markersFile.tellg();
|
||||||
|
if (maFileSize > 0) {
|
||||||
|
markersFile.seekg(0, ios::beg);
|
||||||
|
|
||||||
|
while (!markersFile.eof()) {
|
||||||
|
SUFFIX_MARKER_TYPE marker =
|
||||||
|
Utils::readMarker(markersFile);
|
||||||
|
_markers->push_back(marker);
|
||||||
|
}
|
||||||
|
markersFile.close();
|
||||||
|
} else {
|
||||||
|
markersFile.close();
|
||||||
|
throw ConcordiaException("Index corrupt: empty markers file");
|
||||||
|
}
|
||||||
|
// generating suffix array
|
||||||
|
_SA = _index->generateSuffixArray(_hashGenerator, _T);
|
||||||
} else {
|
} else {
|
||||||
throw ConcordiaException("Index corrupt: missing files");
|
throw ConcordiaException("Index corrupt: missing files");
|
||||||
}
|
}
|
||||||
@ -95,6 +118,8 @@ void Concordia::_initializeIndex() throw(ConcordiaException) {
|
|||||||
new HashGenerator(_config->getWordMapFilePath()));
|
new HashGenerator(_config->getWordMapFilePath()));
|
||||||
_T = boost::shared_ptr<std::vector<sauchar_t> >(
|
_T = boost::shared_ptr<std::vector<sauchar_t> >(
|
||||||
new std::vector<sauchar_t>);
|
new std::vector<sauchar_t>);
|
||||||
|
_markers = boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> >(
|
||||||
|
new std::vector<SUFFIX_MARKER_TYPE>);
|
||||||
if (boost::filesystem::exists(_config->getWordMapFilePath())
|
if (boost::filesystem::exists(_config->getWordMapFilePath())
|
||||||
&& boost::filesystem::exists(_config->getHashedIndexFilePath())) {
|
&& boost::filesystem::exists(_config->getHashedIndexFilePath())) {
|
||||||
loadRAMIndexFromDisk();
|
loadRAMIndexFromDisk();
|
||||||
@ -108,16 +133,15 @@ void Concordia::_initializeIndex() throw(ConcordiaException) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
boost::shared_ptr<std::vector<saidx_t> > Concordia::simpleSearch(
|
boost::ptr_vector<SubstringOccurence> Concordia::simpleSearch(
|
||||||
const string & pattern)
|
const string & pattern)
|
||||||
throw(ConcordiaException) {
|
throw(ConcordiaException) {
|
||||||
if (_T->size() > 0) {
|
if (_T->size() > 0) {
|
||||||
return _searcher->simpleSearch(_hashGenerator, _T, _SA, pattern);
|
return _searcher->simpleSearch(_hashGenerator, _T,
|
||||||
|
_markers, _SA, pattern);
|
||||||
} else {
|
} else {
|
||||||
boost::shared_ptr<std::vector<saidx_t> > result =
|
boost::ptr_vector<SubstringOccurence> result;
|
||||||
boost::shared_ptr<std::vector<saidx_t> >(new std::vector<saidx_t>);
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -4,8 +4,12 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <boost/shared_ptr.hpp>
|
#include <boost/shared_ptr.hpp>
|
||||||
|
#include <boost/ptr_container/ptr_vector.hpp>
|
||||||
#include <boost/filesystem.hpp>
|
#include <boost/filesystem.hpp>
|
||||||
|
|
||||||
|
#include "concordia/common/config.hpp"
|
||||||
|
#include "concordia/example.hpp"
|
||||||
|
#include "concordia/substring_occurence.hpp"
|
||||||
#include "concordia/concordia_config.hpp"
|
#include "concordia/concordia_config.hpp"
|
||||||
#include "concordia/concordia_index.hpp"
|
#include "concordia/concordia_index.hpp"
|
||||||
#include "concordia/index_searcher.hpp"
|
#include "concordia/index_searcher.hpp"
|
||||||
@ -34,12 +38,12 @@ public:
|
|||||||
*/
|
*/
|
||||||
std::string & getVersion();
|
std::string & getVersion();
|
||||||
|
|
||||||
void addSentence(const std::string & sentence) throw(ConcordiaException);
|
void addExample(const Example & example) throw(ConcordiaException);
|
||||||
|
|
||||||
void addAllSentences(boost::shared_ptr<std::vector<std::string> > sentences)
|
void addAllExamples(const boost::ptr_vector<Example > & examples)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
boost::shared_ptr<std::vector<saidx_t> > simpleSearch(
|
boost::ptr_vector<SubstringOccurence> simpleSearch(
|
||||||
const std::string & pattern)
|
const std::string & pattern)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
@ -63,6 +67,8 @@ private:
|
|||||||
boost::shared_ptr<std::vector<sauchar_t> > _T;
|
boost::shared_ptr<std::vector<sauchar_t> > _T;
|
||||||
|
|
||||||
boost::shared_ptr<std::vector<saidx_t> > _SA;
|
boost::shared_ptr<std::vector<saidx_t> > _SA;
|
||||||
|
|
||||||
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > _markers;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
#define PUDDLE_TAGSET_PARAM "puddle_tagset_path"
|
#define PUDDLE_TAGSET_PARAM "puddle_tagset_path"
|
||||||
#define WORD_MAP_PARAM "word_map_path"
|
#define WORD_MAP_PARAM "word_map_path"
|
||||||
#define HASHED_INDEX_PARAM "hashed_index_path"
|
#define HASHED_INDEX_PARAM "hashed_index_path"
|
||||||
|
#define MARKERS_PARAM "markers_path"
|
||||||
#define SUFFIX_ARRAY_PARAM "suffix_array_path"
|
#define SUFFIX_ARRAY_PARAM "suffix_array_path"
|
||||||
|
|
||||||
ConcordiaConfig::ConcordiaConfig(const string & configFilePath)
|
ConcordiaConfig::ConcordiaConfig(const string & configFilePath)
|
||||||
@ -24,6 +25,8 @@ ConcordiaConfig::ConcordiaConfig(const string & configFilePath)
|
|||||||
ConcordiaConfig::_readConfigParameterStr(WORD_MAP_PARAM);
|
ConcordiaConfig::_readConfigParameterStr(WORD_MAP_PARAM);
|
||||||
_hashedIndexFilePath =
|
_hashedIndexFilePath =
|
||||||
ConcordiaConfig::_readConfigParameterStr(HASHED_INDEX_PARAM);
|
ConcordiaConfig::_readConfigParameterStr(HASHED_INDEX_PARAM);
|
||||||
|
_markersFilePath =
|
||||||
|
ConcordiaConfig::_readConfigParameterStr(MARKERS_PARAM);
|
||||||
_suffixArrayFilePath =
|
_suffixArrayFilePath =
|
||||||
ConcordiaConfig::_readConfigParameterStr(SUFFIX_ARRAY_PARAM);
|
ConcordiaConfig::_readConfigParameterStr(SUFFIX_ARRAY_PARAM);
|
||||||
}
|
}
|
||||||
|
@ -42,6 +42,10 @@ public:
|
|||||||
return _hashedIndexFilePath;
|
return _hashedIndexFilePath;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
string & getMarkersFilePath() {
|
||||||
|
return _markersFilePath;
|
||||||
|
}
|
||||||
|
|
||||||
string & getSuffixArrayFilePath() {
|
string & getSuffixArrayFilePath() {
|
||||||
return _suffixArrayFilePath;
|
return _suffixArrayFilePath;
|
||||||
}
|
}
|
||||||
@ -55,6 +59,8 @@ private:
|
|||||||
|
|
||||||
string _hashedIndexFilePath;
|
string _hashedIndexFilePath;
|
||||||
|
|
||||||
|
string _markersFilePath;
|
||||||
|
|
||||||
string _suffixArrayFilePath;
|
string _suffixArrayFilePath;
|
||||||
|
|
||||||
string _readConfigParameterStr(const string & name)
|
string _readConfigParameterStr(const string & name)
|
||||||
|
@ -2,11 +2,15 @@
|
|||||||
|
|
||||||
#include "concordia/common/utils.hpp"
|
#include "concordia/common/utils.hpp"
|
||||||
#include <boost/filesystem.hpp>
|
#include <boost/filesystem.hpp>
|
||||||
|
#include <boost/foreach.hpp>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
#include <climits>
|
||||||
|
|
||||||
ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath)
|
ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath,
|
||||||
|
const string & markersFilePath)
|
||||||
throw(ConcordiaException) :
|
throw(ConcordiaException) :
|
||||||
_hashedIndexFilePath(hashedIndexFilePath) {
|
_hashedIndexFilePath(hashedIndexFilePath),
|
||||||
|
_markersFilePath(markersFilePath) {
|
||||||
}
|
}
|
||||||
|
|
||||||
ConcordiaIndex::~ConcordiaIndex() {
|
ConcordiaIndex::~ConcordiaIndex() {
|
||||||
@ -30,45 +34,80 @@ boost::shared_ptr<vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ConcordiaIndex::addSentence(boost::shared_ptr<HashGenerator> hashGenerator,
|
void ConcordiaIndex::addExample(
|
||||||
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<vector<sauchar_t> > T,
|
boost::shared_ptr<vector<sauchar_t> > T,
|
||||||
const string & sentence) {
|
boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
const Example & example) {
|
||||||
ofstream hashedIndexFile;
|
ofstream hashedIndexFile;
|
||||||
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
|
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
|
||||||
ios::app|ios::binary);
|
ios::app|ios::binary);
|
||||||
_addSingleSentence(hashedIndexFile, hashGenerator, T, sentence);
|
ofstream markersFile;
|
||||||
|
markersFile.open(_markersFilePath.c_str(), ios::out|
|
||||||
|
ios::app|ios::binary);
|
||||||
|
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
|
||||||
|
T, markers, example);
|
||||||
hashedIndexFile.close();
|
hashedIndexFile.close();
|
||||||
|
markersFile.close();
|
||||||
hashGenerator->serializeWordMap();
|
hashGenerator->serializeWordMap();
|
||||||
}
|
}
|
||||||
|
|
||||||
void ConcordiaIndex::addAllSentences(
|
void ConcordiaIndex::addAllExamples(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<vector<sauchar_t> > T,
|
boost::shared_ptr<vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<vector<string> > sentences) {
|
boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
const boost::ptr_vector<Example > & examples) {
|
||||||
ofstream hashedIndexFile;
|
ofstream hashedIndexFile;
|
||||||
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
|
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
|
||||||
ios::app|ios::binary);
|
ios::app|ios::binary);
|
||||||
for (vector<string>::iterator sent_it = sentences->begin();
|
ofstream markersFile;
|
||||||
sent_it != sentences->end(); ++sent_it) {
|
markersFile.open(_markersFilePath.c_str(), ios::out|
|
||||||
string sentence = *sent_it;
|
ios::app|ios::binary);
|
||||||
_addSingleSentence(hashedIndexFile, hashGenerator, T, sentence);
|
|
||||||
|
BOOST_FOREACH(Example example, examples) {
|
||||||
|
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
|
||||||
|
T, markers, example);
|
||||||
}
|
}
|
||||||
|
|
||||||
hashedIndexFile.close();
|
hashedIndexFile.close();
|
||||||
|
markersFile.close();
|
||||||
hashGenerator->serializeWordMap();
|
hashGenerator->serializeWordMap();
|
||||||
}
|
}
|
||||||
|
|
||||||
void ConcordiaIndex::_addSingleSentence(
|
void ConcordiaIndex::_addSingleExample(
|
||||||
ofstream & hashedIndexFile,
|
ofstream & hashedIndexFile,
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
ofstream & markersFile,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
const string & sentence) {
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
|
boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
const Example & example) {
|
||||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash
|
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash
|
||||||
= hashGenerator->generateHash(sentence);
|
= hashGenerator->generateHash(example.getSentence());
|
||||||
|
int offset = 0;
|
||||||
for (vector<INDEX_CHARACTER_TYPE>::iterator it = hash->begin();
|
for (vector<INDEX_CHARACTER_TYPE>::iterator it = hash->begin();
|
||||||
it != hash->end(); ++it) {
|
it != hash->end(); ++it) {
|
||||||
INDEX_CHARACTER_TYPE character = *it;
|
INDEX_CHARACTER_TYPE character = *it;
|
||||||
Utils::writeIndexCharacter(hashedIndexFile, character);
|
Utils::writeIndexCharacter(hashedIndexFile, character);
|
||||||
Utils::appendCharToSaucharVector(T, character);
|
Utils::appendCharToSaucharVector(T, character);
|
||||||
|
|
||||||
|
// append to markersFile
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE marker = offset;
|
||||||
|
marker += example.getId() * SUFFIX_MARKER_DIVISOR;
|
||||||
|
|
||||||
|
Utils::writeMarker(markersFile, marker);
|
||||||
|
markers->push_back(marker);
|
||||||
|
|
||||||
|
offset++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// append sentence boundary marker
|
||||||
|
INDEX_CHARACTER_TYPE sentenceBoundaryHI = ULONG_MAX;
|
||||||
|
Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI);
|
||||||
|
Utils::appendCharToSaucharVector(T, sentenceBoundaryHI);
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE sentenceBoundaryMA = ULONG_MAX;
|
||||||
|
Utils::writeMarker(markersFile, sentenceBoundaryMA);
|
||||||
|
markers->push_back(sentenceBoundaryMA);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2,10 +2,13 @@
|
|||||||
#define CONCORDIA_INDEX_HDR
|
#define CONCORDIA_INDEX_HDR
|
||||||
|
|
||||||
#include <boost/shared_ptr.hpp>
|
#include <boost/shared_ptr.hpp>
|
||||||
|
#include <boost/ptr_container/ptr_vector.hpp>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
|
|
||||||
|
#include "concordia/common/config.hpp"
|
||||||
|
#include "concordia/example.hpp"
|
||||||
#include "concordia/hash_generator.hpp"
|
#include "concordia/hash_generator.hpp"
|
||||||
#include "concordia/concordia_exception.hpp"
|
#include "concordia/concordia_exception.hpp"
|
||||||
#include "build/libdivsufsort/include/divsufsort.h"
|
#include "build/libdivsufsort/include/divsufsort.h"
|
||||||
@ -19,35 +22,42 @@ using namespace std;
|
|||||||
|
|
||||||
class ConcordiaIndex {
|
class ConcordiaIndex {
|
||||||
public:
|
public:
|
||||||
explicit ConcordiaIndex(const string & hashedIndexFilePath)
|
explicit ConcordiaIndex(const string & hashedIndexFilePath,
|
||||||
|
const string & markersFilePath)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
virtual ~ConcordiaIndex();
|
virtual ~ConcordiaIndex();
|
||||||
|
|
||||||
void addSentence(
|
void addExample(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<vector<sauchar_t> > T,
|
boost::shared_ptr<vector<sauchar_t> > T,
|
||||||
const string & sentence);
|
boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
const Example & example);
|
||||||
|
|
||||||
void addAllSentences(
|
void addAllExamples(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<vector<sauchar_t> > T,
|
boost::shared_ptr<vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<vector<string> > sentences);
|
boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
const boost::ptr_vector<Example > & examples);
|
||||||
|
|
||||||
boost::shared_ptr<vector<saidx_t> > generateSuffixArray(
|
boost::shared_ptr<vector<saidx_t> > generateSuffixArray(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<vector<sauchar_t> > T);
|
boost::shared_ptr<vector<sauchar_t> > T);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Add sentence to disk index and update RAM index.
|
// Add example to disk index and update RAM index.
|
||||||
void _addSingleSentence(ofstream & hashedIndexFile,
|
void _addSingleExample(ofstream & hashedIndexFile,
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
ofstream & markersFile,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
const string & sentence);
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
|
boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
const Example & example);
|
||||||
|
|
||||||
string _hashedIndexFilePath;
|
string _hashedIndexFilePath;
|
||||||
|
|
||||||
|
string _markersFilePath;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
11
concordia/example.cpp
Normal file
11
concordia/example.cpp
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
#include "concordia/example.hpp"
|
||||||
|
|
||||||
|
|
||||||
|
Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id):
|
||||||
|
_sentence(sentence),
|
||||||
|
_id(id) {
|
||||||
|
}
|
||||||
|
|
||||||
|
Example::~Example() {
|
||||||
|
}
|
||||||
|
|
36
concordia/example.hpp
Normal file
36
concordia/example.hpp
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
#ifndef EXAMPLE_HDR
|
||||||
|
#define EXAMPLE_HDR
|
||||||
|
|
||||||
|
#include "concordia/common/config.hpp"
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Class representing a single sentence to be added to the index along with its id.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
class Example {
|
||||||
|
public:
|
||||||
|
explicit Example(const string & sentence, const SUFFIX_MARKER_TYPE & id);
|
||||||
|
|
||||||
|
/*! Destructor.
|
||||||
|
*/
|
||||||
|
virtual ~Example();
|
||||||
|
|
||||||
|
string getSentence() const {
|
||||||
|
return _sentence;
|
||||||
|
}
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE getId() const {
|
||||||
|
return _id;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
string _sentence;
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE _id;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
@ -10,13 +10,13 @@ IndexSearcher::IndexSearcher() {
|
|||||||
IndexSearcher::~IndexSearcher() {
|
IndexSearcher::~IndexSearcher() {
|
||||||
}
|
}
|
||||||
|
|
||||||
boost::shared_ptr<vector<saidx_t> > IndexSearcher::simpleSearch(
|
boost::ptr_vector<SubstringOccurence> IndexSearcher::simpleSearch(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
const string & pattern) throw(ConcordiaException) {
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
boost::shared_ptr<vector<saidx_t> > result =
|
const string & pattern) throw(ConcordiaException) {
|
||||||
boost::shared_ptr<vector<saidx_t> >(new vector<saidx_t>());
|
boost::ptr_vector<SubstringOccurence> result;
|
||||||
|
|
||||||
int left;
|
int left;
|
||||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash =
|
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash =
|
||||||
@ -27,14 +27,19 @@ boost::shared_ptr<vector<saidx_t> > IndexSearcher::simpleSearch(
|
|||||||
(const sauchar_t *) patternArray, patternLength,
|
(const sauchar_t *) patternArray, patternLength,
|
||||||
SA->data(), (saidx_t) T->size(), &left);
|
SA->data(), (saidx_t) T->size(), &left);
|
||||||
for (int i = 0; i < size; ++i) {
|
for (int i = 0; i < size; ++i) {
|
||||||
saidx_t result_pos = SA->at(left + i);
|
saidx_t resultPos = SA->at(left + i);
|
||||||
if (result_pos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
|
if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
|
||||||
// As we are looking for a pattern in an array of higher
|
// As we are looking for a pattern in an array of higher
|
||||||
// resolution than the hashed index file, we might
|
// resolution than the hashed index file, we might
|
||||||
// obtain accidental results exceeding the boundaries
|
// obtain accidental results exceeding the boundaries
|
||||||
// of characters in hashed index. The above check
|
// of characters in hashed index. The above check
|
||||||
// removes these accidental results.
|
// removes these accidental results.
|
||||||
result->push_back(result_pos / sizeof(INDEX_CHARACTER_TYPE));
|
saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
|
||||||
|
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);
|
||||||
|
|
||||||
|
result.push_back(new SubstringOccurence(
|
||||||
|
marker / SUFFIX_MARKER_DIVISOR,
|
||||||
|
marker % SUFFIX_MARKER_DIVISOR));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -42,6 +47,3 @@ boost::shared_ptr<vector<saidx_t> > IndexSearcher::simpleSearch(
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -2,11 +2,13 @@
|
|||||||
#define INDEX_SEARCHER_HDR
|
#define INDEX_SEARCHER_HDR
|
||||||
|
|
||||||
#include <boost/shared_ptr.hpp>
|
#include <boost/shared_ptr.hpp>
|
||||||
|
#include <boost/ptr_container/ptr_vector.hpp>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
#include "build/libdivsufsort/include/divsufsort.h"
|
#include "build/libdivsufsort/include/divsufsort.h"
|
||||||
|
#include "concordia/substring_occurence.hpp"
|
||||||
#include "concordia/hash_generator.hpp"
|
#include "concordia/hash_generator.hpp"
|
||||||
#include "concordia/concordia_exception.hpp"
|
#include "concordia/concordia_exception.hpp"
|
||||||
|
|
||||||
@ -25,9 +27,10 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~IndexSearcher();
|
virtual ~IndexSearcher();
|
||||||
|
|
||||||
boost::shared_ptr<vector<saidx_t> > simpleSearch(
|
boost::ptr_vector<SubstringOccurence> simpleSearch(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
const string & pattern) throw(ConcordiaException);
|
const string & pattern) throw(ConcordiaException);
|
||||||
private:
|
private:
|
||||||
|
12
concordia/substring_occurence.cpp
Normal file
12
concordia/substring_occurence.cpp
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
#include "concordia/substring_occurence.hpp"
|
||||||
|
|
||||||
|
|
||||||
|
// Records where a match was found by copying the given id and offset
// into the object.
SubstringOccurence::SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
                                       const int & offset)
    : _id(id), _offset(offset) {}
|
||||||
|
|
||||||
|
// Intentionally empty: the destructor performs no explicit cleanup.
SubstringOccurence::~SubstringOccurence() {}
|
||||||
|
|
37
concordia/substring_occurence.hpp
Normal file
37
concordia/substring_occurence.hpp
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
#ifndef SUBSTRING_OCCURENCE_HDR
|
||||||
|
#define SUBSTRING_OCCURENCE_HDR
|
||||||
|
|
||||||
|
#include "concordia/common/config.hpp"
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Class representing an occurrence of a searched substring.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
class SubstringOccurence {
|
||||||
|
public:
|
||||||
|
explicit SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
|
||||||
|
const int & offset);
|
||||||
|
|
||||||
|
/*! Destructor.
|
||||||
|
*/
|
||||||
|
virtual ~SubstringOccurence();
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE getId() const {
|
||||||
|
return _id;
|
||||||
|
}
|
||||||
|
|
||||||
|
int getOffset() const {
|
||||||
|
return _offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
SUFFIX_MARKER_TYPE _id;
|
||||||
|
|
||||||
|
int _offset;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
@ -4,6 +4,7 @@
|
|||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
|
|
||||||
#include <boost/algorithm/string/predicate.hpp>
|
#include <boost/algorithm/string/predicate.hpp>
|
||||||
|
#include <boost/ptr_container/ptr_vector.hpp>
|
||||||
#include <boost/filesystem.hpp>
|
#include <boost/filesystem.hpp>
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
@ -19,18 +20,19 @@ BOOST_AUTO_TEST_CASE( ConcordiaVersion )
|
|||||||
BOOST_CHECK_EQUAL( version , "0.1");
|
BOOST_CHECK_EQUAL( version , "0.1");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
|
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
|
||||||
{
|
{
|
||||||
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||||
concordia.addSentence("Ala ma kota");
|
concordia.addExample(Example("Ala ma kota",14));
|
||||||
concordia.addSentence("Ala ma rysia");
|
concordia.addExample(Example("Ala ma rysia",51));
|
||||||
concordia.addSentence("Marysia ma rysia");
|
concordia.addExample(Example("Marysia ma rysia",123));
|
||||||
concordia.refreshSAfromRAM();
|
concordia.refreshSAfromRAM();
|
||||||
|
|
||||||
/*The test index contains 3 sentences:
|
/*The test index contains 3 sentences:
|
||||||
"Ala ma kota"
|
14: "Ala ma kota"
|
||||||
"Ala ma rysia"
|
51: "Ala ma rysia"
|
||||||
"Marysia ma rysia"
|
123: "Marysia ma rysia"
|
||||||
|
|
||||||
Test word map:
|
Test word map:
|
||||||
Ala -> 0
|
Ala -> 0
|
||||||
@ -40,44 +42,48 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
|
|||||||
Marysia -> 4
|
Marysia -> 4
|
||||||
|
|
||||||
Test hashed index:
|
Test hashed index:
|
||||||
n: 0 1 2 3 4 5 6 7 8
|
n: 0 1 2 3 4 5 6 7 8 9 10 11
|
||||||
T[n]: 0 1 2 0 1 3 4 1 3
|
T[n]: 0 1 2 | 0 1 3 | 4 1 3 |
|
||||||
|
|
||||||
Test suffix array:
|
Test suffix array:
|
||||||
n: 0 1 2 3 4 5 6 7 8
|
n: 0 1 2 3 4 5 6 7 8 9 10 11
|
||||||
SA[n]: 0 3 1 7 4 2 8 5 6
|
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
boost::shared_ptr<std::vector<saidx_t> > expectedResult1(new std::vector<saidx_t>());
|
boost::ptr_vector<SubstringOccurence> searchResult1 = concordia.simpleSearch("ma rysia");
|
||||||
expectedResult1->push_back(7);
|
boost::ptr_vector<SubstringOccurence> searchResult2 = concordia.simpleSearch("ma kota Ala");
|
||||||
expectedResult1->push_back(4);
|
|
||||||
|
|
||||||
boost::shared_ptr<std::vector<saidx_t> > searchResult1 = concordia.simpleSearch("ma rysia");
|
|
||||||
|
|
||||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
||||||
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
||||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1->begin(), searchResult1->end(),
|
BOOST_CHECK_EQUAL(searchResult1.size(), 2);
|
||||||
expectedResult1->begin(), expectedResult1->end());
|
BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 123);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1.at(1).getId(), 51);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1.at(1).getOffset(), 1);
|
||||||
|
|
||||||
|
// Checking pattern spanning over 2 segments
|
||||||
|
BOOST_CHECK_EQUAL(searchResult2.size(), 0);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
||||||
{
|
{
|
||||||
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||||
boost::shared_ptr<vector<string> > testSentences (new vector<string>());
|
boost::ptr_vector<Example> testExamples;
|
||||||
testSentences->push_back("to jest okno");
|
testExamples.push_back(new Example("to jest okno",312));
|
||||||
testSentences->push_back("czy jest okno otwarte");
|
testExamples.push_back(new Example("czy jest okno otwarte",202));
|
||||||
testSentences->push_back("chyba to jest tutaj");
|
testExamples.push_back(new Example("chyba to jest tutaj",45));
|
||||||
testSentences->push_back("to jest");
|
testExamples.push_back(new Example("to jest",29));
|
||||||
concordia.addAllSentences(testSentences);
|
concordia.addAllExamples(testExamples);
|
||||||
|
|
||||||
/*The test index contains 4 sentences:
|
/*The test index contains 4 sentences:
|
||||||
"to jest okno"
|
312: "to jest okno"
|
||||||
"czy jest okno otwarte"
|
202: "czy jest okno otwarte"
|
||||||
"chyba to jest tutaj"
|
45: "chyba to jest tutaj"
|
||||||
"to jest"
|
29: "to jest"
|
||||||
|
|
||||||
Test word map:
|
Test word map:
|
||||||
to -> 0
|
to -> 0
|
||||||
@ -98,27 +104,27 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
|||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
boost::shared_ptr<vector<saidx_t> > expectedResult1(new vector<saidx_t>());
|
|
||||||
expectedResult1->push_back(11);
|
|
||||||
expectedResult1->push_back(0);
|
|
||||||
expectedResult1->push_back(8);
|
|
||||||
|
|
||||||
boost::shared_ptr<vector<saidx_t> > expectedResult2(new vector<saidx_t>());
|
|
||||||
expectedResult2->push_back(1);
|
|
||||||
expectedResult2->push_back(4);
|
|
||||||
|
|
||||||
Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||||
boost::shared_ptr<vector<saidx_t> > searchResult1 = concordia2.simpleSearch("to jest");
|
boost::ptr_vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("to jest");
|
||||||
boost::shared_ptr<vector<saidx_t> > searchResult2 = concordia2.simpleSearch("jest okno");
|
boost::ptr_vector<SubstringOccurence> searchResult2 = concordia2.simpleSearch("jest okno");
|
||||||
|
|
||||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
||||||
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
||||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1->begin(), searchResult1->end(),
|
BOOST_CHECK_EQUAL(searchResult1.size(), 3);
|
||||||
expectedResult1->begin(), expectedResult1->end());
|
BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 312);
|
||||||
BOOST_CHECK_EQUAL_COLLECTIONS(searchResult2->begin(), searchResult2->end(),
|
BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 0);
|
||||||
expectedResult2->begin(), expectedResult2->end());
|
BOOST_CHECK_EQUAL(searchResult1.at(1).getId(), 45);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1.at(1).getOffset(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1.at(2).getId(), 29);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1.at(2).getOffset(), 0);
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(searchResult2.size(), 2);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult2.at(0).getId(), 202);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult2.at(0).getOffset(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult2.at(1).getId(), 312);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult2.at(1).getOffset(), 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE_END()
|
BOOST_AUTO_TEST_SUITE_END()
|
||||||
|
@ -17,6 +17,7 @@ BOOST_AUTO_TEST_CASE( ConfigParameters )
|
|||||||
BOOST_CHECK_EQUAL( config.getPuddleTagsetFilePath() , "puddle/tagset.txt" );
|
BOOST_CHECK_EQUAL( config.getPuddleTagsetFilePath() , "puddle/tagset.txt" );
|
||||||
BOOST_CHECK_EQUAL( config.getWordMapFilePath() , "tmp/wm.bin" );
|
BOOST_CHECK_EQUAL( config.getWordMapFilePath() , "tmp/wm.bin" );
|
||||||
BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "tmp/hi.bin" );
|
BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "tmp/hi.bin" );
|
||||||
|
BOOST_CHECK_EQUAL( config.getMarkersFilePath() , "tmp/ma.bin" );
|
||||||
BOOST_CHECK_EQUAL( config.getSuffixArrayFilePath() , "tmp/sa.bin" );
|
BOOST_CHECK_EQUAL( config.getSuffixArrayFilePath() , "tmp/sa.bin" );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -16,7 +16,8 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
|
|||||||
{
|
{
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator (new HashGenerator("nonexistent"));
|
boost::shared_ptr<HashGenerator> hashGenerator (new HashGenerator("nonexistent"));
|
||||||
|
|
||||||
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"));
|
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
|
||||||
|
TestResourcesManager::getTestFilePath("temp","test_markers.bin"));
|
||||||
boost::shared_ptr<vector<sauchar_t> > T = boost::shared_ptr<vector<sauchar_t> >(new vector<sauchar_t>());
|
boost::shared_ptr<vector<sauchar_t> > T = boost::shared_ptr<vector<sauchar_t> >(new vector<sauchar_t>());
|
||||||
// Test hashed index:
|
// Test hashed index:
|
||||||
// n: 0 1 2 3 4 5 6 7 8
|
// n: 0 1 2 3 4 5 6 7 8
|
||||||
@ -50,4 +51,50 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
|
|||||||
BOOST_CHECK_EQUAL_COLLECTIONS(SA->begin(), SA->end(), expectedSA->begin(), expectedSA->end());
|
BOOST_CHECK_EQUAL_COLLECTIONS(SA->begin(), SA->end(), expectedSA->begin(), expectedSA->end());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest2 )
{
    // Hashed index fed to the generator. The value 255 occupies the
    // positions that are drawn as '|' in the compact notation
    // "0 1 2 | 0 1 3 | 4 1 3 |".
    //    n:   0   1   2   3   4   5   6   7   8   9  10  11
    // T[n]:   0   1   2 255   0   1   3 255   4   1   3 255
    const sauchar_t hashedIndex[] =
        {0, 1, 2, 255, 0, 1, 3, 255, 4, 1, 3, 255};

    // Suffix array expected for the input above.
    //     n:  0  1  2  3  4  5  6  7  8  9 10 11
    // SA[n]:  0  4  1  9  5  2 10  6  8 11  3  7
    const saidx_t sortedSuffixes[] =
        {0, 4, 1, 9, 5, 2, 10, 6, 8, 11, 3, 7};

    boost::shared_ptr<HashGenerator> hashGenerator(new HashGenerator("nonexistent"));

    ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
                         TestResourcesManager::getTestFilePath("temp","test_markers.bin"));

    // Fill T in one step from the literal table instead of element by element.
    boost::shared_ptr<vector<sauchar_t> > T(
        new vector<sauchar_t>(
            hashedIndex,
            hashedIndex + sizeof(hashedIndex) / sizeof(hashedIndex[0])));

    boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(hashGenerator, T);

    boost::shared_ptr<vector<saidx_t> > expectedSA(
        new vector<saidx_t>(
            sortedSuffixes,
            sortedSuffixes + sizeof(sortedSuffixes) / sizeof(sortedSuffixes[0])));

    BOOST_CHECK_EQUAL_COLLECTIONS(SA->begin(), SA->end(), expectedSA->begin(), expectedSA->end());
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE_END()
|
BOOST_AUTO_TEST_SUITE_END()
|
||||||
|
@ -18,6 +18,10 @@ word_map_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
|
|||||||
|
|
||||||
hashed_index_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_HASHED_INDEX@"
|
hashed_index_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_HASHED_INDEX@"
|
||||||
|
|
||||||
|
#File containing suffix markers (sentence ids and offsets)
|
||||||
|
|
||||||
|
markers_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_MARKERS@"
|
||||||
|
|
||||||
#Binarized suffix array
|
#Binarized suffix array
|
||||||
|
|
||||||
suffix_array_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_SUFFIX_ARRAY@"
|
suffix_array_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_SUFFIX_ARRAY@"
|
||||||
|
@ -10,6 +10,8 @@ word_map_path = "tmp/wm.bin"
|
|||||||
|
|
||||||
hashed_index_path = "tmp/hi.bin"
|
hashed_index_path = "tmp/hi.bin"
|
||||||
|
|
||||||
|
markers_path = "tmp/ma.bin"
|
||||||
|
|
||||||
suffix_array_path = "tmp/sa.bin"
|
suffix_array_path = "tmp/sa.bin"
|
||||||
|
|
||||||
### eof
|
### eof
|
||||||
|
@ -18,6 +18,10 @@ word_map_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
|
|||||||
|
|
||||||
hashed_index_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_HASHED_INDEX@"
|
hashed_index_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_HASHED_INDEX@"
|
||||||
|
|
||||||
|
#File containing suffix markers (sentence ids and offsets)
|
||||||
|
|
||||||
|
markers_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_MARKERS@"
|
||||||
|
|
||||||
#Binarized suffix array
|
#Binarized suffix array
|
||||||
|
|
||||||
suffix_array_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_SUFFIX_ARRAY@"
|
suffix_array_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_SUFFIX_ARRAY@"
|
||||||
|
Loading…
Reference in New Issue
Block a user