From fb65cc9c66bde016b66b4c34557bf888b2e1a776 Mon Sep 17 00:00:00 2001 From: rjawor Date: Thu, 20 Feb 2014 10:49:17 +0100 Subject: [PATCH] suffix markers Former-commit-id: 7426cce771f548dcd4eb7478aafa912fb73784bf --- CMakeLists.txt | 10 ++- concordia-console/concordia-console.cpp | 28 +++--- concordia/CMakeLists.txt | 2 + concordia/common/config.hpp.in | 3 + concordia/common/utils.cpp | 11 +++ concordia/common/utils.hpp | 6 ++ concordia/concordia.cpp | 64 ++++++++----- concordia/concordia.hpp | 12 ++- concordia/concordia_config.cpp | 3 + concordia/concordia_config.hpp | 6 ++ concordia/concordia_index.cpp | 77 ++++++++++++---- concordia/concordia_index.hpp | 30 ++++--- concordia/example.cpp | 11 +++ concordia/example.hpp | 36 ++++++++ concordia/index_searcher.cpp | 28 +++--- concordia/index_searcher.hpp | 5 +- concordia/substring_occurence.cpp | 12 +++ concordia/substring_occurence.hpp | 37 ++++++++ concordia/t/test_concordia.cpp | 90 ++++++++++--------- concordia/t/test_concordia_config.cpp | 1 + concordia/t/test_concordia_index.cpp | 49 +++++++++- .../concordia-config/concordia.cfg.in | 4 + .../concordia-config/concordia-mock.cfg | 2 + .../concordia-config/concordia.cfg.in | 4 + 24 files changed, 407 insertions(+), 124 deletions(-) create mode 100644 concordia/example.cpp create mode 100644 concordia/example.hpp create mode 100644 concordia/substring_occurence.cpp create mode 100644 concordia/substring_occurence.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 063cf1f..862b1b1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,9 +6,16 @@ project(concordia C CXX) set (CONCORDIA_VERSION_MAJOR 0) set (CONCORDIA_VERSION_MINOR 1) -# Type of the characters in index +# Type of the characters in SA set (INDEX_CHARACTER_TYPE "unsigned int") +# The above allows for (roughly) 2^32 = 4 294 967 295 words in corpus. + +# Suffix markers +set (SUFFIX_MARKER_TYPE "unsigned int") +set (SUFFIX_MARKER_DIVISOR 256) +# The above settings assign 3 bytes to sentence id and 1 byte for suffix offset. +# This allows to store 2^24 = 16 777 216 sentences no longer than 256 words. # ============================== # # Production paths @@ -26,6 +33,7 @@ set (TEST_RESOURCES_DIRECTORY "${concordia_SOURCE_DIR}/tests/resources") set (TEST_PUDDLE_TAGSET_PATH "${TEST_RESOURCES_DIRECTORY}/puddle/basic-tagset.txt") set (TEMP_WORD_MAP "temp_word_map.bin") set (TEMP_HASHED_INDEX "temp_hashed_index.bin") +set (TEMP_MARKERS "temp_markers.bin") set (TEMP_SUFFIX_ARRAY "temp_suffix_array.bin") file(MAKE_DIRECTORY ${TEST_RESOURCES_DIRECTORY}/temp) diff --git a/concordia-console/concordia-console.cpp b/concordia-console/concordia-console.cpp index ec7c184..ab0e3c0 100644 --- a/concordia-console/concordia-console.cpp +++ b/concordia-console/concordia-console.cpp @@ -3,9 +3,12 @@ #include #include #include +#include #include +#include #include "concordia/concordia.hpp" +#include "concordia/substring_occurence.hpp" #include "concordia/common/config.hpp" #include "concordia/common/utils.hpp" #include "build/libdivsufsort/include/divsufsort.h" @@ -65,18 +68,17 @@ int main(int argc, char** argv) { std::cout << "\tSearching for pattern: \"" << pattern << "\"" << std::endl; time_start = boost::posix_time::microsec_clock::local_time(); - boost::shared_ptr > result = + boost::ptr_vector result = concordia.simpleSearch(pattern); time_end = boost::posix_time::microsec_clock::local_time(); msdiff = time_end - time_start; - std::cout << "\tFound: " << result->size() << " matches. " + std::cout << "\tFound: " << result.size() << " matches. " << "Search took: " << msdiff.total_milliseconds() << "ms." << std::endl; if (!cli.count("silent")) { - for (vector::iterator it = result->begin(); - it != result->end(); ++it) { - std::cout << "\t\tfound match on word number: " << *it - << std::endl; + BOOST_FOREACH(SubstringOccurence occurence, result) { + std::cout << "\t\tfound match in sentence number: " + << occurence.getId() << std::endl; } } } else if (cli.count("read-file")) { @@ -87,16 +89,15 @@ int main(int argc, char** argv) { std::string line; if (text_file.is_open()) { long lineCount = 0; - boost::shared_ptr > - buffer(new std::vector()); + boost::ptr_vector buffer; boost::posix_time::ptime timeStart = boost::posix_time::microsec_clock::local_time(); while (getline(text_file, line)) { lineCount++; - buffer->push_back(line); + buffer.push_back(new Example(line, lineCount)); if (lineCount % READ_BUFFER_LENGTH == 0) { - concordia.addAllSentences(buffer); - buffer->clear(); + concordia.addAllExamples(buffer); + buffer.clear(); boost::posix_time::ptime timeEnd = boost::posix_time::microsec_clock::local_time(); boost::posix_time::time_duration msdiff = @@ -110,8 +111,8 @@ int main(int argc, char** argv) { " sentences per second" << std::endl; } } - if (buffer->size() > 0) { - concordia.addAllSentences(buffer); + if (buffer.size() > 0) { + concordia.addAllExamples(buffer); } text_file.close(); boost::posix_time::ptime timeTotalEnd = @@ -156,6 +157,5 @@ int main(int argc, char** argv) { << std::endl; return 1; } - return 0; } diff --git a/concordia/CMakeLists.txt b/concordia/CMakeLists.txt index 58a79c1..d2f4010 100644 --- a/concordia/CMakeLists.txt +++ b/concordia/CMakeLists.txt @@ -6,6 +6,8 @@ foreach(dir ${ALL_DIRECTORIES}) endforeach(dir) add_library(concordia SHARED + substring_occurence.cpp + example.cpp index_searcher.cpp concordia_index.cpp word_map.cpp diff --git a/concordia/common/config.hpp.in b/concordia/common/config.hpp.in index 0c49ac6..cfbd01e 100644 --- a/concordia/common/config.hpp.in +++ b/concordia/common/config.hpp.in @@ -4,6 +4,7 @@ #define TEST_RESOURCES_DIRECTORY "@TEST_RESOURCES_DIRECTORY@" #define TEMP_WORD_MAP "@TEMP_WORD_MAP@" #define TEMP_HASHED_INDEX "@TEMP_HASHED_INDEX@" +#define TEMP_MARKERS "@TEMP_MARKERS@" #define TEMP_SUFFIX_ARRAY "@TEMP_SUFFIX_ARRAY@" #define PROD_RESOURCES_DIRECTORY "@PROD_RESOURCES_DIRECTORY@" @@ -17,4 +18,6 @@ #define LEXICON_FIELD_SEPARATOR "\t" typedef @INDEX_CHARACTER_TYPE@ INDEX_CHARACTER_TYPE; +typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE; +#define SUFFIX_MARKER_DIVISOR @SUFFIX_MARKER_DIVISOR@ diff --git a/concordia/common/utils.cpp b/concordia/common/utils.cpp index 95c8ef0..00a8fa7 100644 --- a/concordia/common/utils.cpp +++ b/concordia/common/utils.cpp @@ -11,12 +11,23 @@ void Utils::writeIndexCharacter(ofstream & file, file.write(reinterpret_cast(&character), sizeof(character)); } +void Utils::writeMarker(ofstream & file, + SUFFIX_MARKER_TYPE marker) { + file.write(reinterpret_cast(&marker), sizeof(marker)); +} + INDEX_CHARACTER_TYPE Utils::readIndexCharacter(ifstream & file) { INDEX_CHARACTER_TYPE character; file.read(reinterpret_cast(&character), sizeof(character)); return character; } +SUFFIX_MARKER_TYPE Utils::readMarker(ifstream & file) { + SUFFIX_MARKER_TYPE marker; + file.read(reinterpret_cast(&marker), sizeof(marker)); + return marker; +} + sauchar_t * Utils::indexVectorToSaucharArray( boost::shared_ptr > input) { const int kArraySize = input->size()*sizeof(INDEX_CHARACTER_TYPE); diff --git a/concordia/common/utils.hpp b/concordia/common/utils.hpp index 96bbf51..79fbf9b 100644 --- a/concordia/common/utils.hpp +++ b/concordia/common/utils.hpp @@ -23,14 +23,20 @@ public: static void writeIndexCharacter(ofstream & file, INDEX_CHARACTER_TYPE character); + static void writeMarker(ofstream & file, + SUFFIX_MARKER_TYPE marker); + static INDEX_CHARACTER_TYPE readIndexCharacter(ifstream & file); + static SUFFIX_MARKER_TYPE readMarker(ifstream & file); + static sauchar_t * indexVectorToSaucharArray( boost::shared_ptr > input); static void appendCharToSaucharVector( boost::shared_ptr > vector, INDEX_CHARACTER_TYPE character); + private: static void _insertCharToSaucharArray(sauchar_t * array, INDEX_CHARACTER_TYPE character, int pos); diff --git a/concordia/concordia.cpp b/concordia/concordia.cpp index 550c53d..7579dc2 100644 --- a/concordia/concordia.cpp +++ b/concordia/concordia.cpp @@ -19,7 +19,8 @@ Concordia::Concordia(const std::string & configFilePath) _config = boost::shared_ptr ( new ConcordiaConfig(configFilePath)); _index = boost::shared_ptr( - new ConcordiaIndex(_config->getHashedIndexFilePath())); + new ConcordiaIndex(_config->getHashedIndexFilePath(), + _config->getMarkersFilePath())); _searcher = boost::shared_ptr(new IndexSearcher()); _initializeIndex(); } @@ -42,30 +43,32 @@ std::string _createLibraryVersion() { } // Sentences are written to disk and added to T. -// SA is generated on command by different methods. -void Concordia::addSentence(const std::string & sentence) +// SA is generated on command by other methods. +// TODO(rjawor): modify SA on the fly +void Concordia::addExample(const Example & example) throw(ConcordiaException) { - _index->addSentence(_hashGenerator, _T, sentence); + _index->addExample(_hashGenerator, _T, _markers, example); } // Sentences are written to disk and added to T. -// SA is generated on command by different methods. -void Concordia::addAllSentences( - boost::shared_ptr > sentences) +// SA is generated on command by other methods. +// TODO(rjawor): modify SA on the fly +void Concordia::addAllExamples(const boost::ptr_vector & examples) throw(ConcordiaException) { - _index->addAllSentences(_hashGenerator, _T, sentences); + _index->addAllExamples(_hashGenerator, _T, _markers, examples); } void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) { if (boost::filesystem::exists(_config->getWordMapFilePath()) - && boost::filesystem::exists(_config->getHashedIndexFilePath())) { - // reading index from files + && boost::filesystem::exists(_config->getHashedIndexFilePath()) + && boost::filesystem::exists(_config->getMarkersFilePath())) { + // reading index from file _T->clear(); ifstream hashedIndexFile; hashedIndexFile.open(_config->getHashedIndexFilePath().c_str(), ios::in | ios::ate | ios::binary); - saidx_t fileSize = hashedIndexFile.tellg(); - if (fileSize > 0) { + saidx_t hiFileSize = hashedIndexFile.tellg(); + if (hiFileSize > 0) { hashedIndexFile.seekg(0, ios::beg); while (!hashedIndexFile.eof()) { @@ -74,12 +77,32 @@ void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) { Utils::appendCharToSaucharVector(_T, character); } hashedIndexFile.close(); - - // generating suffix array - _SA = _index->generateSuffixArray(_hashGenerator, _T); } else { + hashedIndexFile.close(); throw ConcordiaException("Index corrupt: empty hash index file"); } + + // reading markers from file + _markers->clear(); + ifstream markersFile; + markersFile.open(_config->getMarkersFilePath().c_str(), ios::in + | ios::ate | ios::binary); + saidx_t maFileSize = markersFile.tellg(); + if (maFileSize > 0) { + markersFile.seekg(0, ios::beg); + + while (!markersFile.eof()) { + SUFFIX_MARKER_TYPE marker = + Utils::readMarker(markersFile); + _markers->push_back(marker); + } + markersFile.close(); + } else { + markersFile.close(); + throw ConcordiaException("Index corrupt: empty markers file"); + } + // generating suffix array + _SA = _index->generateSuffixArray(_hashGenerator, _T); } else { throw ConcordiaException("Index corrupt: missing files"); } @@ -95,6 +118,8 @@ void Concordia::_initializeIndex() throw(ConcordiaException) { new HashGenerator(_config->getWordMapFilePath())); _T = boost::shared_ptr >( new std::vector); + _markers = boost::shared_ptr >( + new std::vector); if (boost::filesystem::exists(_config->getWordMapFilePath()) && boost::filesystem::exists(_config->getHashedIndexFilePath())) { loadRAMIndexFromDisk(); @@ -108,16 +133,15 @@ void Concordia::_initializeIndex() throw(ConcordiaException) { } } -boost::shared_ptr > Concordia::simpleSearch( +boost::ptr_vector Concordia::simpleSearch( const string & pattern) throw(ConcordiaException) { if (_T->size() > 0) { - return _searcher->simpleSearch(_hashGenerator, _T, _SA, pattern); + return _searcher->simpleSearch(_hashGenerator, _T, + _markers, _SA, pattern); } else { - boost::shared_ptr > result = - boost::shared_ptr >(new std::vector); + boost::ptr_vector result; return result; } } - diff --git a/concordia/concordia.hpp b/concordia/concordia.hpp index d2dc5fe..3880081 100644 --- a/concordia/concordia.hpp +++ b/concordia/concordia.hpp @@ -4,8 +4,12 @@ #include #include #include +#include #include +#include "concordia/common/config.hpp" +#include "concordia/example.hpp" +#include "concordia/substring_occurence.hpp" #include "concordia/concordia_config.hpp" #include "concordia/concordia_index.hpp" #include "concordia/index_searcher.hpp" @@ -34,12 +38,12 @@ public: */ std::string & getVersion(); - void addSentence(const std::string & sentence) throw(ConcordiaException); + void addExample(const Example & example) throw(ConcordiaException); - void addAllSentences(boost::shared_ptr > sentences) + void addAllExamples(const boost::ptr_vector & examples) throw(ConcordiaException); - boost::shared_ptr > simpleSearch( + boost::ptr_vector simpleSearch( const std::string & pattern) throw(ConcordiaException); @@ -63,6 +67,8 @@ private: boost::shared_ptr > _T; boost::shared_ptr > _SA; + + boost::shared_ptr > _markers; }; #endif diff --git a/concordia/concordia_config.cpp b/concordia/concordia_config.cpp index 8cb6280..9af6338 100644 --- a/concordia/concordia_config.cpp +++ b/concordia/concordia_config.cpp @@ -5,6 +5,7 @@ #define PUDDLE_TAGSET_PARAM "puddle_tagset_path" #define WORD_MAP_PARAM "word_map_path" #define HASHED_INDEX_PARAM "hashed_index_path" +#define MARKERS_PARAM "markers_path" #define SUFFIX_ARRAY_PARAM "suffix_array_path" ConcordiaConfig::ConcordiaConfig(const string & configFilePath) @@ -24,6 +25,8 @@ ConcordiaConfig::ConcordiaConfig(const string & configFilePath) ConcordiaConfig::_readConfigParameterStr(WORD_MAP_PARAM); _hashedIndexFilePath = ConcordiaConfig::_readConfigParameterStr(HASHED_INDEX_PARAM); + _markersFilePath = + ConcordiaConfig::_readConfigParameterStr(MARKERS_PARAM); _suffixArrayFilePath = ConcordiaConfig::_readConfigParameterStr(SUFFIX_ARRAY_PARAM); } diff --git a/concordia/concordia_config.hpp b/concordia/concordia_config.hpp index 8ccb266..7965d17 100644 --- a/concordia/concordia_config.hpp +++ b/concordia/concordia_config.hpp @@ -42,6 +42,10 @@ public: return _hashedIndexFilePath; } + string & getMarkersFilePath() { + return _markersFilePath; + } + string & getSuffixArrayFilePath() { return _suffixArrayFilePath; } @@ -55,6 +59,8 @@ private: string _hashedIndexFilePath; + string _markersFilePath; + string _suffixArrayFilePath; string _readConfigParameterStr(const string & name) diff --git a/concordia/concordia_index.cpp b/concordia/concordia_index.cpp index abb335d..174e986 100644 --- a/concordia/concordia_index.cpp +++ b/concordia/concordia_index.cpp @@ -2,11 +2,15 @@ #include "concordia/common/utils.hpp" #include +#include #include +#include -ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath) +ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath, + const string & markersFilePath) throw(ConcordiaException) : - _hashedIndexFilePath(hashedIndexFilePath) { + _hashedIndexFilePath(hashedIndexFilePath), + _markersFilePath(markersFilePath) { } ConcordiaIndex::~ConcordiaIndex() { @@ -30,45 +34,80 @@ boost::shared_ptr > ConcordiaIndex::generateSuffixArray( return result; } -void ConcordiaIndex::addSentence(boost::shared_ptr hashGenerator, +void ConcordiaIndex::addExample( + boost::shared_ptr hashGenerator, boost::shared_ptr > T, - const string & sentence) { + boost::shared_ptr > markers, + const Example & example) { ofstream hashedIndexFile; hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out| ios::app|ios::binary); - _addSingleSentence(hashedIndexFile, hashGenerator, T, sentence); + ofstream markersFile; + markersFile.open(_markersFilePath.c_str(), ios::out| + ios::app|ios::binary); + _addSingleExample(hashedIndexFile, markersFile, hashGenerator, + T, markers, example); hashedIndexFile.close(); + markersFile.close(); hashGenerator->serializeWordMap(); } -void ConcordiaIndex::addAllSentences( - boost::shared_ptr hashGenerator, - boost::shared_ptr > T, - boost::shared_ptr > sentences) { +void ConcordiaIndex::addAllExamples( + boost::shared_ptr hashGenerator, + boost::shared_ptr > T, + boost::shared_ptr > markers, + const boost::ptr_vector & examples) { ofstream hashedIndexFile; hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out| ios::app|ios::binary); - for (vector::iterator sent_it = sentences->begin(); - sent_it != sentences->end(); ++sent_it) { - string sentence = *sent_it; - _addSingleSentence(hashedIndexFile, hashGenerator, T, sentence); + ofstream markersFile; + markersFile.open(_markersFilePath.c_str(), ios::out| + ios::app|ios::binary); + + BOOST_FOREACH(Example example, examples) { + _addSingleExample(hashedIndexFile, markersFile, hashGenerator, + T, markers, example); } + hashedIndexFile.close(); + markersFile.close(); hashGenerator->serializeWordMap(); } -void ConcordiaIndex::_addSingleSentence( - ofstream & hashedIndexFile, - boost::shared_ptr hashGenerator, - boost::shared_ptr > T, - const string & sentence) { +void ConcordiaIndex::_addSingleExample( + ofstream & hashedIndexFile, + ofstream & markersFile, + boost::shared_ptr hashGenerator, + boost::shared_ptr > T, + boost::shared_ptr > markers, + const Example & example) { boost::shared_ptr > hash - = hashGenerator->generateHash(sentence); + = hashGenerator->generateHash(example.getSentence()); + int offset = 0; for (vector::iterator it = hash->begin(); it != hash->end(); ++it) { INDEX_CHARACTER_TYPE character = *it; Utils::writeIndexCharacter(hashedIndexFile, character); Utils::appendCharToSaucharVector(T, character); + + // append to markersFile + + SUFFIX_MARKER_TYPE marker = offset; + marker += example.getId() * SUFFIX_MARKER_DIVISOR; + + Utils::writeMarker(markersFile, marker); + markers->push_back(marker); + + offset++; } + + // append sentence boundary marker + INDEX_CHARACTER_TYPE sentenceBoundaryHI = ULONG_MAX; + Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI); + Utils::appendCharToSaucharVector(T, sentenceBoundaryHI); + + SUFFIX_MARKER_TYPE sentenceBoundaryMA = ULONG_MAX; + Utils::writeMarker(markersFile, sentenceBoundaryMA); + markers->push_back(sentenceBoundaryMA); } diff --git a/concordia/concordia_index.hpp b/concordia/concordia_index.hpp index 8a47fd6..d08fef3 100644 --- a/concordia/concordia_index.hpp +++ b/concordia/concordia_index.hpp @@ -2,10 +2,13 @@ #define CONCORDIA_INDEX_HDR #include +#include #include #include #include +#include "concordia/common/config.hpp" +#include "concordia/example.hpp" #include "concordia/hash_generator.hpp" #include "concordia/concordia_exception.hpp" #include "build/libdivsufsort/include/divsufsort.h" @@ -19,35 +22,42 @@ using namespace std; class ConcordiaIndex { public: - explicit ConcordiaIndex(const string & hashedIndexFilePath) + explicit ConcordiaIndex(const string & hashedIndexFilePath, + const string & markersFilePath) throw(ConcordiaException); /*! Destructor. */ virtual ~ConcordiaIndex(); - void addSentence( + void addExample( boost::shared_ptr hashGenerator, boost::shared_ptr > T, - const string & sentence); + boost::shared_ptr > markers, + const Example & example); - void addAllSentences( + void addAllExamples( boost::shared_ptr hashGenerator, boost::shared_ptr > T, - boost::shared_ptr > sentences); + boost::shared_ptr > markers, + const boost::ptr_vector & examples); boost::shared_ptr > generateSuffixArray( boost::shared_ptr hashGenerator, boost::shared_ptr > T); private: - // Add sentence to disk index and update RAM index. - void _addSingleSentence(ofstream & hashedIndexFile, - boost::shared_ptr hashGenerator, - boost::shared_ptr > T, - const string & sentence); + // Add example to disk index and update RAM index. + void _addSingleExample(ofstream & hashedIndexFile, + ofstream & markersFile, + boost::shared_ptr hashGenerator, + boost::shared_ptr > T, + boost::shared_ptr > markers, + const Example & example); string _hashedIndexFilePath; + + string _markersFilePath; }; #endif diff --git a/concordia/example.cpp b/concordia/example.cpp new file mode 100644 index 0000000..58cee52 --- /dev/null +++ b/concordia/example.cpp @@ -0,0 +1,11 @@ +#include "concordia/example.hpp" + + +Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id): + _sentence(sentence), + _id(id) { +} + +Example::~Example() { +} + diff --git a/concordia/example.hpp b/concordia/example.hpp new file mode 100644 index 0000000..5e28c56 --- /dev/null +++ b/concordia/example.hpp @@ -0,0 +1,36 @@ +#ifndef EXAMPLE_HDR +#define EXAMPLE_HDR + +#include "concordia/common/config.hpp" +#include + +/*! + Class representing a single sentence to be added into index along with its id. + +*/ + +using namespace std; + +class Example { +public: + explicit Example(const string & sentence, const SUFFIX_MARKER_TYPE & id); + + /*! Destructor. + */ + virtual ~Example(); + + string getSentence() const { + return _sentence; + } + + SUFFIX_MARKER_TYPE getId() const { + return _id; + } + +private: + string _sentence; + + SUFFIX_MARKER_TYPE _id; +}; + +#endif diff --git a/concordia/index_searcher.cpp b/concordia/index_searcher.cpp index 43be3ca..d88efbe 100644 --- a/concordia/index_searcher.cpp +++ b/concordia/index_searcher.cpp @@ -10,13 +10,13 @@ IndexSearcher::IndexSearcher() { IndexSearcher::~IndexSearcher() { } -boost::shared_ptr > IndexSearcher::simpleSearch( - boost::shared_ptr hashGenerator, - boost::shared_ptr > T, - boost::shared_ptr > SA, - const string & pattern) throw(ConcordiaException) { - boost::shared_ptr > result = - boost::shared_ptr >(new vector()); +boost::ptr_vector IndexSearcher::simpleSearch( + boost::shared_ptr hashGenerator, + boost::shared_ptr > T, + boost::shared_ptr > markers, + boost::shared_ptr > SA, + const string & pattern) throw(ConcordiaException) { + boost::ptr_vector result; int left; boost::shared_ptr > hash = @@ -27,14 +27,19 @@ boost::shared_ptr > IndexSearcher::simpleSearch( (const sauchar_t *) patternArray, patternLength, SA->data(), (saidx_t) T->size(), &left); for (int i = 0; i < size; ++i) { - saidx_t result_pos = SA->at(left + i); - if (result_pos % sizeof(INDEX_CHARACTER_TYPE) == 0) { + saidx_t resultPos = SA->at(left + i); + if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) { // As we are looking for a pattern in an array of higher // resolution than the hashed index file, we might // obtain accidental results exceeding the boundaries // of characters in hashed index. The above check // removes these accidental results. - result->push_back(result_pos / sizeof(INDEX_CHARACTER_TYPE)); + saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE); + SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos); + + result.push_back(new SubstringOccurence( + marker / SUFFIX_MARKER_DIVISOR, + marker % SUFFIX_MARKER_DIVISOR)); } } @@ -42,6 +47,3 @@ boost::shared_ptr > IndexSearcher::simpleSearch( return result; } - - - diff --git a/concordia/index_searcher.hpp b/concordia/index_searcher.hpp index f96f33d..419ab82 100644 --- a/concordia/index_searcher.hpp +++ b/concordia/index_searcher.hpp @@ -2,11 +2,13 @@ #define INDEX_SEARCHER_HDR #include +#include #include #include #include "concordia/common/config.hpp" #include "build/libdivsufsort/include/divsufsort.h" +#include "concordia/substring_occurence.hpp" #include "concordia/hash_generator.hpp" #include "concordia/concordia_exception.hpp" @@ -25,9 +27,10 @@ public: */ virtual ~IndexSearcher(); - boost::shared_ptr > simpleSearch( + boost::ptr_vector simpleSearch( boost::shared_ptr hashGenerator, boost::shared_ptr > T, + boost::shared_ptr > markers, boost::shared_ptr > SA, const string & pattern) throw(ConcordiaException); private: diff --git a/concordia/substring_occurence.cpp b/concordia/substring_occurence.cpp new file mode 100644 index 0000000..cef6bc5 --- /dev/null +++ b/concordia/substring_occurence.cpp @@ -0,0 +1,12 @@ +#include "concordia/substring_occurence.hpp" + + +SubstringOccurence::SubstringOccurence(const SUFFIX_MARKER_TYPE & id, + const int & offset): + _id(id), + _offset(offset) { +} + +SubstringOccurence::~SubstringOccurence() { +} + diff --git a/concordia/substring_occurence.hpp b/concordia/substring_occurence.hpp new file mode 100644 index 0000000..2af9535 --- /dev/null +++ b/concordia/substring_occurence.hpp @@ -0,0 +1,37 @@ +#ifndef SUBSTRING_OCCURENCE_HDR +#define SUBSTRING_OCCURENCE_HDR + +#include "concordia/common/config.hpp" +#include + +/*! + Class representing occurence of a searched substring. + +*/ + +using namespace std; + +class SubstringOccurence { +public: + explicit SubstringOccurence(const SUFFIX_MARKER_TYPE & id, + const int & offset); + + /*! Destructor. + */ + virtual ~SubstringOccurence(); + + SUFFIX_MARKER_TYPE getId() const { + return _id; + } + + int getOffset() const { + return _offset; + } + +private: + SUFFIX_MARKER_TYPE _id; + + int _offset; +}; + +#endif diff --git a/concordia/t/test_concordia.cpp b/concordia/t/test_concordia.cpp index 96bcc42..3f05191 100644 --- a/concordia/t/test_concordia.cpp +++ b/concordia/t/test_concordia.cpp @@ -4,6 +4,7 @@ #include "concordia/common/config.hpp" #include +#include #include #include @@ -19,18 +20,19 @@ BOOST_AUTO_TEST_CASE( ConcordiaVersion ) BOOST_CHECK_EQUAL( version , "0.1"); } + BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 ) { Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); - concordia.addSentence("Ala ma kota"); - concordia.addSentence("Ala ma rysia"); - concordia.addSentence("Marysia ma rysia"); + concordia.addExample(Example("Ala ma kota",14)); + concordia.addExample(Example("Ala ma rysia",51)); + concordia.addExample(Example("Marysia ma rysia",123)); concordia.refreshSAfromRAM(); /*The test index contains 3 sentences: - "Ala ma kota" - "Ala ma rysia" - "Marysia ma rysia" + 14: "Ala ma kota" + 51: "Ala ma rysia" + 123: "Marysia ma rysia" Test word map: Ala -> 0 @@ -40,44 +42,48 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 ) Marysia -> 4 Test hashed index: - n: 0 1 2 3 4 5 6 7 8 - T[n]: 0 1 2 0 1 3 4 1 3 + n: 0 1 2 3 4 5 6 7 8 9 10 11 + T[n]: 0 1 2 | 0 1 3 | 4 1 3 | Test suffix array: - n: 0 1 2 3 4 5 6 7 8 - SA[n]: 0 3 1 7 4 2 8 5 6 + n: 0 1 2 3 4 5 6 7 8 9 10 11 + SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7 */ - boost::shared_ptr > expectedResult1(new std::vector()); - expectedResult1->push_back(7); - expectedResult1->push_back(4); - - boost::shared_ptr > searchResult1 = concordia.simpleSearch("ma rysia"); + boost::ptr_vector searchResult1 = concordia.simpleSearch("ma rysia"); + boost::ptr_vector searchResult2 = concordia.simpleSearch("ma kota Ala"); boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); + boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS)); boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX)); - BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1->begin(), searchResult1->end(), - expectedResult1->begin(), expectedResult1->end()); + BOOST_CHECK_EQUAL(searchResult1.size(), 2); + BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 123); + BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 1); + BOOST_CHECK_EQUAL(searchResult1.at(1).getId(), 51); + BOOST_CHECK_EQUAL(searchResult1.at(1).getOffset(), 1); + + // Checking pattern spanning over 2 segments + BOOST_CHECK_EQUAL(searchResult2.size(), 0); } BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 ) { Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); - boost::shared_ptr > testSentences (new vector()); - testSentences->push_back("to jest okno"); - testSentences->push_back("czy jest okno otwarte"); - testSentences->push_back("chyba to jest tutaj"); - testSentences->push_back("to jest"); - concordia.addAllSentences(testSentences); + boost::ptr_vector testExamples; + testExamples.push_back(new Example("to jest okno",312)); + testExamples.push_back(new Example("czy jest okno otwarte",202)); + testExamples.push_back(new Example("chyba to jest tutaj",45)); + testExamples.push_back(new Example("to jest",29)); + concordia.addAllExamples(testExamples); /*The test index contains 4 sentences: - "to jest okno" - "czy jest okno otwarte" - "chyba to jest tutaj" - "to jest" + 312: "to jest okno" + 202: "czy jest okno otwarte" + 45: "chyba to jest tutaj" + 29: "to jest" Test word map: to -> 0 @@ -98,27 +104,27 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 ) */ - boost::shared_ptr > expectedResult1(new vector()); - expectedResult1->push_back(11); - expectedResult1->push_back(0); - expectedResult1->push_back(8); - - boost::shared_ptr > expectedResult2(new vector()); - expectedResult2->push_back(1); - expectedResult2->push_back(4); - Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); - boost::shared_ptr > searchResult1 = concordia2.simpleSearch("to jest"); - boost::shared_ptr > searchResult2 = concordia2.simpleSearch("jest okno"); + boost::ptr_vector searchResult1 = concordia2.simpleSearch("to jest"); + boost::ptr_vector searchResult2 = concordia2.simpleSearch("jest okno"); boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); + boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS)); boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX)); - BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1->begin(), searchResult1->end(), - expectedResult1->begin(), expectedResult1->end()); - BOOST_CHECK_EQUAL_COLLECTIONS(searchResult2->begin(), searchResult2->end(), - expectedResult2->begin(), expectedResult2->end()); + BOOST_CHECK_EQUAL(searchResult1.size(), 3); + BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 312); + BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 0); + BOOST_CHECK_EQUAL(searchResult1.at(1).getId(), 45); + BOOST_CHECK_EQUAL(searchResult1.at(1).getOffset(), 1); + BOOST_CHECK_EQUAL(searchResult1.at(2).getId(), 29); + BOOST_CHECK_EQUAL(searchResult1.at(2).getOffset(), 0); + BOOST_CHECK_EQUAL(searchResult2.size(), 2); + BOOST_CHECK_EQUAL(searchResult2.at(0).getId(), 202); + BOOST_CHECK_EQUAL(searchResult2.at(0).getOffset(), 1); + BOOST_CHECK_EQUAL(searchResult2.at(1).getId(), 312); + BOOST_CHECK_EQUAL(searchResult2.at(1).getOffset(), 1); } BOOST_AUTO_TEST_SUITE_END() diff --git a/concordia/t/test_concordia_config.cpp b/concordia/t/test_concordia_config.cpp index b60155f..4af8602 100644 --- a/concordia/t/test_concordia_config.cpp +++ b/concordia/t/test_concordia_config.cpp @@ -17,6 +17,7 @@ BOOST_AUTO_TEST_CASE( ConfigParameters ) BOOST_CHECK_EQUAL( config.getPuddleTagsetFilePath() , "puddle/tagset.txt" ); BOOST_CHECK_EQUAL( config.getWordMapFilePath() , "tmp/wm.bin" ); BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "tmp/hi.bin" ); + BOOST_CHECK_EQUAL( config.getMarkersFilePath() , "tmp/ma.bin" ); BOOST_CHECK_EQUAL( config.getSuffixArrayFilePath() , "tmp/sa.bin" ); } diff --git a/concordia/t/test_concordia_index.cpp b/concordia/t/test_concordia_index.cpp index 7e8ed44..3efeb64 100644 --- a/concordia/t/test_concordia_index.cpp +++ b/concordia/t/test_concordia_index.cpp @@ -16,7 +16,8 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest ) { boost::shared_ptr hashGenerator (new HashGenerator("nonexistent")); - ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin")); + ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"), + TestResourcesManager::getTestFilePath("temp","test_markers.bin")); boost::shared_ptr > T = boost::shared_ptr >(new vector()); // Test hashed index: // n: 0 1 2 3 4 5 6 7 8 @@ -50,4 +51,50 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest ) BOOST_CHECK_EQUAL_COLLECTIONS(SA->begin(), SA->end(), expectedSA->begin(), expectedSA->end()); } +BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest2 ) +{ + boost::shared_ptr hashGenerator (new HashGenerator("nonexistent")); + + ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"), + TestResourcesManager::getTestFilePath("temp","test_markers.bin")); + boost::shared_ptr > T = boost::shared_ptr >(new vector()); + + //Test hashed index: + // n: 0 1 2 3 4 5 6 7 8 9 10 11 + // T[n]: 0 1 2 | 0 1 3 | 4 1 3 | + T->push_back(0); + T->push_back(1); + T->push_back(2); + T->push_back(255); + T->push_back(0); + T->push_back(1); + T->push_back(3); + T->push_back(255); + T->push_back(4); + T->push_back(1); + T->push_back(3); + T->push_back(255); + + //Test suffix array: + // n: 0 1 2 3 4 5 6 7 8 9 10 11 + //SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7 + + boost::shared_ptr > SA = index.generateSuffixArray(hashGenerator, T); + + boost::shared_ptr > expectedSA = boost::shared_ptr >(new vector()); + expectedSA->push_back(0); + expectedSA->push_back(4); + expectedSA->push_back(1); + expectedSA->push_back(9); + expectedSA->push_back(5); + expectedSA->push_back(2); + expectedSA->push_back(10); + expectedSA->push_back(6); + expectedSA->push_back(8); + expectedSA->push_back(11); + expectedSA->push_back(3); + expectedSA->push_back(7); + BOOST_CHECK_EQUAL_COLLECTIONS(SA->begin(), SA->end(), expectedSA->begin(), expectedSA->end()); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/prod/resources/concordia-config/concordia.cfg.in b/prod/resources/concordia-config/concordia.cfg.in index cd6203f..e5d5e60 100644 --- a/prod/resources/concordia-config/concordia.cfg.in +++ b/prod/resources/concordia-config/concordia.cfg.in @@ -18,6 +18,10 @@ word_map_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@" hashed_index_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_HASHED_INDEX@" +#File containing suffix markers (sentence ids and offsets) + +markers_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_MARKERS@" + #Binarized suffix array suffix_array_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_SUFFIX_ARRAY@" diff --git a/tests/resources/concordia-config/concordia-mock.cfg b/tests/resources/concordia-config/concordia-mock.cfg index 4f0ae12..37bd27b 100644 --- a/tests/resources/concordia-config/concordia-mock.cfg +++ b/tests/resources/concordia-config/concordia-mock.cfg @@ -10,6 +10,8 @@ word_map_path = "tmp/wm.bin" hashed_index_path = "tmp/hi.bin" +markers_path = "tmp/ma.bin" + suffix_array_path = "tmp/sa.bin" ### eof diff --git a/tests/resources/concordia-config/concordia.cfg.in b/tests/resources/concordia-config/concordia.cfg.in index 8bd23d9..7ea214b 100644 --- a/tests/resources/concordia-config/concordia.cfg.in +++ b/tests/resources/concordia-config/concordia.cfg.in @@ -18,6 +18,10 @@ word_map_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@" hashed_index_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_HASHED_INDEX@" +#File containing suffix markers (sentence ids and offsets) + +markers_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_MARKERS@" + #Binarized suffix array suffix_array_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_SUFFIX_ARRAY@"