From 0d8a057278a3a344b21551e0ccff5c7c3c4c4e8d Mon Sep 17 00:00:00 2001 From: rjawor Date: Thu, 28 Nov 2013 16:47:57 +0100 Subject: [PATCH] suffix array simple search --- CMakeLists.txt | 11 +- concordia/CMakeLists.txt | 1 + concordia/common/config.hpp.in | 3 + concordia/concordia.cpp | 33 +++++- concordia/concordia.hpp | 16 +++ concordia/concordia_config.cpp | 9 ++ concordia/concordia_config.hpp | 18 +++ concordia/concordia_index.cpp | 89 ++++++++------- concordia/concordia_index.hpp | 10 +- concordia/index_searcher.cpp | 90 +++++++++++++++ concordia/index_searcher.hpp | 45 ++++++++ concordia/t/CMakeLists.txt | 1 + concordia/t/test_concordia.cpp | 107 ++++++++++++++++++ concordia/t/test_concordia_config.cpp | 5 +- concordia/t/test_concordia_index.cpp | 39 ++++--- concordia/t/test_index_searcher.cpp | 76 +++++++++++++ tests/common/test_resources_manager.cpp | 21 +--- tests/common/test_resources_manager.hpp | 9 +- ...{concordia-test.cfg => concordia-mock.cfg} | 8 +- .../concordia-config/concordia.cfg.in | 18 +++ tests/resources/index-searcher/test_SA.bin | Bin 0 -> 36 bytes .../index-searcher/test_hash_index.bin | Bin 0 -> 9 bytes .../index-searcher/test_word_map.bin | Bin 0 -> 134 bytes 23 files changed, 510 insertions(+), 99 deletions(-) create mode 100644 concordia/index_searcher.cpp create mode 100644 concordia/index_searcher.hpp create mode 100644 concordia/t/test_index_searcher.cpp rename tests/resources/concordia-config/{concordia-test.cfg => concordia-mock.cfg} (51%) create mode 100644 tests/resources/index-searcher/test_SA.bin create mode 100644 tests/resources/index-searcher/test_hash_index.bin create mode 100644 tests/resources/index-searcher/test_word_map.bin diff --git a/CMakeLists.txt b/CMakeLists.txt index 7abfdc6..c7b5eee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,10 +20,16 @@ set (PROD_PUDDLE_TAGSET_PATH "${PROD_RESOURCES_DIRECTORY}/puddle/tagset.txt") set (TEST_RESOURCES_DIRECTORY "${concordia_SOURCE_DIR}/tests/resources") set (TEST_PUDDLE_TAGSET_PATH "${TEST_RESOURCES_DIRECTORY}/puddle/basic-tagset.txt") +set (TEMP_WORD_MAP "temp_word_map.bin") +set (TEMP_HASHED_INDEX "temp_hashed_index.bin") +set (TEMP_SUFFIX_ARRAY "temp_suffix_array.bin") + SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") set(BASE_TARGETS concordia) + + # ================================================ # Third-party libraries @@ -99,11 +105,6 @@ configure_file ( "${concordia_SOURCE_DIR}/concordia/common/config.hpp" ) -configure_file ( - "${concordia_SOURCE_DIR}/prod/resources/concordia-config/concordia.cfg.in" - "${concordia_SOURCE_DIR}/prod/resources/concordia-config/concordia.cfg" - ) - configure_file ( "${concordia_SOURCE_DIR}/tests/resources/concordia-config/concordia.cfg.in" "${concordia_SOURCE_DIR}/tests/resources/concordia-config/concordia.cfg" diff --git a/concordia/CMakeLists.txt b/concordia/CMakeLists.txt index 87a31ce..af8d1f9 100644 --- a/concordia/CMakeLists.txt +++ b/concordia/CMakeLists.txt @@ -6,6 +6,7 @@ foreach(dir ${ALL_DIRECTORIES}) endforeach(dir) add_library(concordia SHARED + index_searcher.cpp concordia_index.cpp word_map.cpp hash_generator.cpp diff --git a/concordia/common/config.hpp.in b/concordia/common/config.hpp.in index 16fbda7..d3eb063 100644 --- a/concordia/common/config.hpp.in +++ b/concordia/common/config.hpp.in @@ -2,6 +2,9 @@ #define CONCORDIA_VERSION_MINOR @CONCORDIA_VERSION_MINOR@ #define TEST_RESOURCES_DIRECTORY "@TEST_RESOURCES_DIRECTORY@" +#define TEMP_WORD_MAP "@TEMP_WORD_MAP@" +#define TEMP_HASHED_INDEX "@TEMP_HASHED_INDEX@" +#define TEMP_SUFFIX_ARRAY "@TEMP_SUFFIX_ARRAY@" #define PROD_RESOURCES_DIRECTORY "@PROD_RESOURCES_DIRECTORY@" diff --git a/concordia/concordia.cpp b/concordia/concordia.cpp index 181d5a0..841eeb6 100644 --- a/concordia/concordia.cpp +++ b/concordia/concordia.cpp @@ -1,8 +1,8 @@ +#include + #include "concordia/concordia.hpp" #include "concordia/common/config.hpp" -#include - // =========================================== std::string _createLibraryVersion(); @@ -13,9 +13,15 @@ std::string Concordia::_libraryVersion = _createLibraryVersion(); // =========================================== -Concordia::Concordia(const string & configFilePath) throw(ConcordiaException) { - boost::shared_ptr _config( +Concordia::Concordia(const std::string & configFilePath) + throw(ConcordiaException) { + _config = boost::shared_ptr ( new ConcordiaConfig(configFilePath)); + _index = boost::shared_ptr( + new ConcordiaIndex(_config->getWordMapFilePath(), + _config->getHashedIndexFilePath(), + _config->getSuffixArrayFilePath())); + _searcher = boost::shared_ptr(new IndexSearcher()); } Concordia::~Concordia() { @@ -35,3 +41,22 @@ std::string _createLibraryVersion() { return version.str(); } +void Concordia::addSentence(const std::string & sentence) + throw(ConcordiaException) { + _index->addSentence(sentence); +} + +void Concordia::generateIndex() throw(ConcordiaException) { + _index->generateSuffixArray(); + _index->serializeWordMap(); + _searcher->loadIndex(_config->getWordMapFilePath(), + _config->getHashedIndexFilePath(), + _config->getSuffixArrayFilePath()); +} + +std::vector Concordia::simpleSearch(const std::string & pattern) + throw(ConcordiaException) { + return _searcher->simpleSearch(pattern); +} + + diff --git a/concordia/concordia.hpp b/concordia/concordia.hpp index e735351..2e4c8ce 100644 --- a/concordia/concordia.hpp +++ b/concordia/concordia.hpp @@ -2,9 +2,14 @@ #define CONCORDIA_HDR #include +#include #include #include "concordia/concordia_config.hpp" +#include "concordia/concordia_index.hpp" +#include "concordia/index_searcher.hpp" +#include + /*! The Concordia class is the main access point to the library. @@ -28,10 +33,21 @@ public: */ std::string & getVersion(); + void addSentence(const std::string & sentence) throw(ConcordiaException); + + void generateIndex() throw(ConcordiaException); + + std::vector simpleSearch(const std::string & pattern) + throw(ConcordiaException); + private: static std::string _libraryVersion; boost::shared_ptr _config; + + boost::shared_ptr _index; + + boost::shared_ptr _searcher; }; #endif diff --git a/concordia/concordia_config.cpp b/concordia/concordia_config.cpp index 8126be8..8cb6280 100644 --- a/concordia/concordia_config.cpp +++ b/concordia/concordia_config.cpp @@ -3,6 +3,9 @@ #include "concordia/common/logging.hpp" #define PUDDLE_TAGSET_PARAM "puddle_tagset_path" +#define WORD_MAP_PARAM "word_map_path" +#define HASHED_INDEX_PARAM "hashed_index_path" +#define SUFFIX_ARRAY_PARAM "suffix_array_path" ConcordiaConfig::ConcordiaConfig(const string & configFilePath) throw(ConcordiaException) { @@ -17,6 +20,12 @@ ConcordiaConfig::ConcordiaConfig(const string & configFilePath) _puddleTagsetFilePath = ConcordiaConfig::_readConfigParameterStr(PUDDLE_TAGSET_PARAM); + _wordMapFilePath = + ConcordiaConfig::_readConfigParameterStr(WORD_MAP_PARAM); + _hashedIndexFilePath = + ConcordiaConfig::_readConfigParameterStr(HASHED_INDEX_PARAM); + _suffixArrayFilePath = + ConcordiaConfig::_readConfigParameterStr(SUFFIX_ARRAY_PARAM); } ConcordiaConfig::~ConcordiaConfig() { diff --git a/concordia/concordia_config.hpp b/concordia/concordia_config.hpp index 557b0e6..8ccb266 100644 --- a/concordia/concordia_config.hpp +++ b/concordia/concordia_config.hpp @@ -34,11 +34,29 @@ public: return _puddleTagsetFilePath; } + string & getWordMapFilePath() { + return _wordMapFilePath; + } + + string & getHashedIndexFilePath() { + return _hashedIndexFilePath; + } + + string & getSuffixArrayFilePath() { + return _suffixArrayFilePath; + } + private: Config _config; string _puddleTagsetFilePath; + string _wordMapFilePath; + + string _hashedIndexFilePath; + + string _suffixArrayFilePath; + string _readConfigParameterStr(const string & name) throw(ConcordiaException); }; diff --git a/concordia/concordia_index.cpp b/concordia/concordia_index.cpp index acf8836..d6226a6 100644 --- a/concordia/concordia_index.cpp +++ b/concordia/concordia_index.cpp @@ -1,45 +1,27 @@ #include "concordia/concordia_index.hpp" #include +#include -ConcordiaIndex::ConcordiaIndex(const string & wordMapFilepath, - const string & hashedIndexFilepath, - const string & suffixArrayFilepath) - throw(ConcordiaException) { - if (boost::filesystem::exists(wordMapFilepath)) { - if (boost::filesystem::exists(hashedIndexFilepath)) { - _hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::out | - ios::app | ios::binary); - if (!_hashedIndexFile.is_open()) { - throw ConcordiaException("E03: Failed to open hashed index " - "file for appending."); - } - } else { +ConcordiaIndex::ConcordiaIndex(const string & wordMapFilePath, + const string & hashedIndexFilePath, + const string & suffixArrayFilePath) + throw(ConcordiaException) : + _hashedIndexFilePath(hashedIndexFilePath), + _suffixArrayFilePath(suffixArrayFilePath) { + if (boost::filesystem::exists(wordMapFilePath)) { + if (!boost::filesystem::exists(hashedIndexFilePath)) { throw ConcordiaException("E01: Word map file exists " "but hashed index file absent."); } } else { // WordMap file does not exist - if (boost::filesystem::exists(hashedIndexFilepath)) { + if (boost::filesystem::exists(hashedIndexFilePath)) { throw ConcordiaException("E02: Hashed index file exists " "but word map file absent."); - } else { - _hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::out | - ios::binary); - if (!_hashedIndexFile.is_open()) { - throw ConcordiaException("E04: Failed to open hashed index " - "file for writing."); - } } } - - _suffixArrayFile.open(suffixArrayFilepath.c_str(), ios::out | - ios::binary); - if (!_hashedIndexFile.is_open()) { - throw ConcordiaException("E05: Failed to open suffix array " - "file for writing."); - } _hashGenerator = boost::shared_ptr( - new HashGenerator(wordMapFilepath)); + new HashGenerator(wordMapFilePath)); } ConcordiaIndex::~ConcordiaIndex() { @@ -50,23 +32,30 @@ void ConcordiaIndex::serializeWordMap() { } void ConcordiaIndex::generateSuffixArray() { - /* Get the file size. */ - long n = _hashedIndexFile.tellg(); + ifstream hashedIndexFile; + hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::in| + ios::ate|ios::binary); + + /* Get the file size. */ + long n = hashedIndexFile.tellg(); - /* Allocate 5blocksize bytes of memory. */ sauchar_t *T; saidx_t *SA; - T = reinterpret_cast (malloc((size_t)n * sizeof(sauchar_t))); - SA = reinterpret_cast (malloc((size_t)n * sizeof(saidx_t))); - if ((T == NULL) || (SA == NULL)) { - throw ConcordiaException("Cannot allocate memory."); - } + T = new sauchar_t[n]; + SA = new saidx_t[n]; /* Read n bytes of data. */ + hashedIndexFile.seekg(0, ios::beg); - _hashedIndexFile.seekg(0, ios::beg); - _hashedIndexFile.read(reinterpret_cast (T), (size_t)n); + sauchar_t buff; + int pos = 0; + while (!hashedIndexFile.eof()) { + hashedIndexFile.read(reinterpret_cast(&buff), + sizeof(sauchar_t)); + T[pos++] = buff; + } + hashedIndexFile.close(); /* Construct the suffix array. */ if (divsufsort(T, SA, (saidx_t)n) != 0) { @@ -74,18 +63,32 @@ void ConcordiaIndex::generateSuffixArray() { } /* Write the suffix array. */ - _suffixArrayFile << *SA; + + ofstream suffixArrayFile; + suffixArrayFile.open(_suffixArrayFilePath.c_str(), ios::out|ios::binary); + + for (int i = 0; i < n; i++) { + suffixArrayFile.write(reinterpret_cast(&SA[i]), + sizeof(saidx_t)); + } + suffixArrayFile.close(); /* Deallocate memory. */ - free(SA); - free(T); + delete[] T; + delete[] SA; } void ConcordiaIndex::addSentence(const string & sentence) { vector hash = _hashGenerator->generateHash(sentence); + ofstream hashedIndexFile; + hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out| + ios::app|ios::binary); for (vector::iterator it = hash.begin(); it != hash.end(); ++it) { - _hashedIndexFile << *it; + sauchar_t buff = *it; + hashedIndexFile.write(reinterpret_cast(&buff), + sizeof(sauchar_t)); } + hashedIndexFile.close(); } diff --git a/concordia/concordia_index.hpp b/concordia/concordia_index.hpp index c92a066..6e69271 100644 --- a/concordia/concordia_index.hpp +++ b/concordia/concordia_index.hpp @@ -19,9 +19,9 @@ using namespace std; class ConcordiaIndex { public: - explicit ConcordiaIndex(const string & wordMapFilepath, - const string & hashedIndexFilepath, - const string & suffixArrayFilepath) + explicit ConcordiaIndex(const string & wordMapFilePath, + const string & hashedIndexFilePath, + const string & suffixArrayFilePath) throw(ConcordiaException); /*! Destructor. @@ -37,9 +37,9 @@ public: private: boost::shared_ptr _hashGenerator; - fstream _hashedIndexFile; + string _hashedIndexFilePath; - ofstream _suffixArrayFile; + string _suffixArrayFilePath; }; #endif diff --git a/concordia/index_searcher.cpp b/concordia/index_searcher.cpp new file mode 100644 index 0000000..59e461d --- /dev/null +++ b/concordia/index_searcher.cpp @@ -0,0 +1,90 @@ +#include "concordia/index_searcher.hpp" + +#include + +IndexSearcher::IndexSearcher(): + _T(NULL), + _SA(NULL), + _n(0) { +} + + +IndexSearcher::~IndexSearcher() { +} + + +void IndexSearcher::loadIndex(const string & wordMapFilepath, + const string & hashedIndexFilepath, + const string & suffixArrayFilepath) + throw(ConcordiaException) { + if (!boost::filesystem::exists(wordMapFilepath)) { + throw ConcordiaException("E06: Failed to open word map " + "file for reading."); + } + + if (!boost::filesystem::exists(hashedIndexFilepath)) { + throw ConcordiaException("E07: Failed to open hashed index file " + "for reading."); + } + + if (!boost::filesystem::exists(suffixArrayFilepath)) { + throw ConcordiaException("E08: Failed to open suffix array file " + "for reading."); + } + + _hashGenerator = boost::shared_ptr( + new HashGenerator(wordMapFilepath)); + + ifstream hashedIndexFile; + hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::in + | ios::ate | ios::binary); + _n = hashedIndexFile.tellg(); + _T = new sauchar_t[_n]; + + hashedIndexFile.seekg(0, ios::beg); + hashedIndexFile.read(reinterpret_cast (_T), _n); + hashedIndexFile.close(); + + _SA = new saidx_t[_n]; + + ifstream suffixArrayFile; + suffixArrayFile.open(suffixArrayFilepath.c_str(), ios::in | ios::binary); + + saidx_t buff; + int pos = 0; + while (!suffixArrayFile.eof() && pos < _n) { + suffixArrayFile.read(reinterpret_cast(&buff), sizeof(saidx_t)); + _SA[pos++] = buff; + } + suffixArrayFile.close(); +} + +vector IndexSearcher::simpleSearch(const string & pattern) + throw(ConcordiaException) { + vector result; + + int left; + vector hash = _hashGenerator->generateHash(pattern); + saidx_t patternLength = hash.size(); + sauchar_t * patternArray = new sauchar_t[patternLength]; + int i = 0; + for (vector::iterator it = hash.begin(); + it != hash.end(); ++it) { + patternArray[i] = *it; + i++; + } + + int size = sa_search(_T, (saidx_t) _n, + (const sauchar_t *) patternArray, patternLength, + _SA, (saidx_t) _n, &left); + for (i = 0; i < size; ++i) { + result.push_back(_SA[left + i]); + } + + delete[] patternArray; + return result; +} + + + + diff --git a/concordia/index_searcher.hpp b/concordia/index_searcher.hpp new file mode 100644 index 0000000..a1ad8ee --- /dev/null +++ b/concordia/index_searcher.hpp @@ -0,0 +1,45 @@ +#ifndef INDEX_SEARCHER_HDR +#define INDEX_SEARCHER_HDR + +#include +#include +#include +#include + +#include "concordia/hash_generator.hpp" +#include "concordia/concordia_exception.hpp" + +/*! + Class for searching the index with a sentence. + +*/ + +using namespace std; + +class IndexSearcher { +public: + explicit IndexSearcher(); + + /*! Destructor. + */ + virtual ~IndexSearcher(); + + void loadIndex(const string & wordMapFilepath, + const string & hashedIndexFilepath, + const string & suffixArrayFilepath) + throw(ConcordiaException); + + vector simpleSearch(const string & pattern) + throw(ConcordiaException); + +private: + boost::shared_ptr _hashGenerator; + + sauchar_t * _T; + + saidx_t * _SA; + + size_t _n; +}; + +#endif diff --git a/concordia/t/CMakeLists.txt b/concordia/t/CMakeLists.txt index 04da115..a677415 100644 --- a/concordia/t/CMakeLists.txt +++ b/concordia/t/CMakeLists.txt @@ -4,6 +4,7 @@ add_library(concordia-tests test_word_map.cpp test_hash_generator.cpp test_concordia_index.cpp + test_index_searcher.cpp ) target_link_libraries(concordia-tests concordia ${LIBCONFIG_LIB} concordia-tests-common) diff --git a/concordia/t/test_concordia.cpp b/concordia/t/test_concordia.cpp index 9248238..1f74e2e 100644 --- a/concordia/t/test_concordia.cpp +++ b/concordia/t/test_concordia.cpp @@ -1,7 +1,10 @@ #include "tests/unit-tests/unit_tests_globals.hpp" #include "concordia/concordia.hpp" #include "tests/common/test_resources_manager.hpp" +#include "concordia/common/config.hpp" +#include +#include #include @@ -16,4 +19,108 @@ BOOST_AUTO_TEST_CASE( ConcordiaVersion ) BOOST_CHECK_EQUAL( version , "0.1"); } +BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 ) +{ + Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); + concordia.addSentence("Ala ma kota"); + concordia.addSentence("Ala ma rysia"); + concordia.addSentence("Marysia ma rysia"); + + concordia.generateIndex(); + + /*The test index contains 3 sentences: + "Ala ma kota" + "Ala ma rysia" + "Marysia ma rysia" + + Test word map: + Ala -> 0 + ma -> 1 + kota -> 2 + rysia -> 3 + Marysia -> 4 + + Test hashed index: + n: 0 1 2 3 4 5 6 7 8 + T[n]: 0 1 2 0 1 3 4 1 3 + + Test suffix array: + n: 0 1 2 3 4 5 6 7 8 + SA[n]: 0 3 1 7 4 2 8 5 6 + + */ + + vector expectedResult1; + expectedResult1.push_back(7); + expectedResult1.push_back(4); + + vector searchResult1 = concordia.simpleSearch("ma rysia"); + + boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); + boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX)); + boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_SUFFIX_ARRAY)); + + BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1.begin(), searchResult1.end(), + expectedResult1.begin(), expectedResult1.end()); + +} + +BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 ) +{ + Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); + concordia.addSentence("to jest okno"); + concordia.addSentence("czy jest okno otwarte"); + concordia.addSentence("chyba to jest tutaj"); + concordia.addSentence("to jest"); + + concordia.generateIndex(); + + /*The test index contains 4 sentences: + "to jest okno" + "czy jest okno otwarte" + "chyba to jest tutaj" + "to jest" + + Test word map: + to -> 0 + jest -> 1 + okno -> 2 + czy -> 3 + otwarte -> 4 + chyba -> 5 + tutaj -> 6 + + Test hashed index: + n: 0 1 2 3 4 5 6 7 8 9 10 11 12 + T[n]: 0 1 2 3 1 2 4 5 0 1 6 0 1 + + Test suffix array: + n: 0 1 2 3 4 5 6 7 8 9 10 11 12 + SA[n]: 11 0 8 12 1 4 9 2 5 3 6 7 10 + + */ + + vector expectedResult1; + expectedResult1.push_back(11); + expectedResult1.push_back(0); + expectedResult1.push_back(8); + + vector expectedResult2; + expectedResult2.push_back(1); + expectedResult2.push_back(4); + + vector searchResult1 = concordia.simpleSearch("to jest"); + vector searchResult2 = concordia.simpleSearch("jest okno"); + + boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); + boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX)); + boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_SUFFIX_ARRAY)); + + BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1.begin(), searchResult1.end(), + expectedResult1.begin(), expectedResult1.end()); + BOOST_CHECK_EQUAL_COLLECTIONS(searchResult2.begin(), searchResult2.end(), + expectedResult2.begin(), expectedResult2.end()); + +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/concordia/t/test_concordia_config.cpp b/concordia/t/test_concordia_config.cpp index 9e6566b..b60155f 100644 --- a/concordia/t/test_concordia_config.cpp +++ b/concordia/t/test_concordia_config.cpp @@ -13,8 +13,11 @@ BOOST_AUTO_TEST_SUITE(concordia_config) BOOST_AUTO_TEST_CASE( ConfigParameters ) { - ConcordiaConfig config(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-test.cfg")); + ConcordiaConfig config(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-mock.cfg")); BOOST_CHECK_EQUAL( config.getPuddleTagsetFilePath() , "puddle/tagset.txt" ); + BOOST_CHECK_EQUAL( config.getWordMapFilePath() , "tmp/wm.bin" ); + BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "tmp/hi.bin" ); + BOOST_CHECK_EQUAL( config.getSuffixArrayFilePath() , "tmp/sa.bin" ); } BOOST_AUTO_TEST_CASE( NonexistentConfigTest ) diff --git a/concordia/t/test_concordia_index.cpp b/concordia/t/test_concordia_index.cpp index bcc9398..c5c309d 100644 --- a/concordia/t/test_concordia_index.cpp +++ b/concordia/t/test_concordia_index.cpp @@ -13,9 +13,9 @@ BOOST_AUTO_TEST_SUITE(concordia_index) BOOST_AUTO_TEST_CASE( ResourcesExistenceTest1 ) { - ConcordiaIndex index(TestResourcesManager::getTestWordMapFilePath("mock_word_map.bin"), - TestResourcesManager::getTestHashIndexFilePath("mock_hash_index.bin"), - TestResourcesManager::getTestSuffixArrayFilePath()); + ConcordiaIndex index(TestResourcesManager::getTestFilePath("concordia-index","mock_word_map.bin"), + TestResourcesManager::getTestFilePath("concordia-index","mock_hash_index.bin"), + TestResourcesManager::getTestFilePath("concordia-index","test_SA.bin")); } @@ -26,9 +26,9 @@ BOOST_AUTO_TEST_CASE( ResourcesExistenceTest2 ) string message = ""; try { - ConcordiaIndex index(TestResourcesManager::getTestWordMapFilePath("mock_word_map.bin"), - TestResourcesManager::getTestHashIndexFilePath("nonexistent.bin"), - TestResourcesManager::getTestSuffixArrayFilePath()); + ConcordiaIndex index(TestResourcesManager::getTestFilePath("concordia-index","mock_word_map.bin"), + TestResourcesManager::getTestFilePath("concordia-index","nonexistent.bin"), + TestResourcesManager::getTestFilePath("concordia-index","test_SA.bin")); } catch (ConcordiaException & e) { exceptionThrown = true; message = e.what(); @@ -44,9 +44,9 @@ BOOST_AUTO_TEST_CASE( ResourcesExistenceTest3 ) string message = ""; try { - ConcordiaIndex index(TestResourcesManager::getTestWordMapFilePath("nonexistent.bin"), - TestResourcesManager::getTestHashIndexFilePath("mock_hash_index.bin"), - TestResourcesManager::getTestSuffixArrayFilePath()); + ConcordiaIndex index(TestResourcesManager::getTestFilePath("concordia-index","nonexistent.bin"), + TestResourcesManager::getTestFilePath("concordia-index","mock_hash_index.bin"), + TestResourcesManager::getTestFilePath("concordia-index","test_SA.bin")); } catch (ConcordiaException & e) { exceptionThrown = true; message = e.what(); @@ -58,20 +58,23 @@ BOOST_AUTO_TEST_CASE( ResourcesExistenceTest3 ) BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest ) { - ConcordiaIndex index(TestResourcesManager::getTestWordMapFilePath("test_word_map.bin"), - TestResourcesManager::getTestHashIndexFilePath("test_hash_index.bin"), - TestResourcesManager::getTestSuffixArrayFilePath()); + ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"), + TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"), + TestResourcesManager::getTestFilePath("temp","test_SA.bin")); index.addSentence("Ala ma kota"); + index.addSentence("Ala ma rysia"); + index.addSentence("Marysia ma rysia"); + index.generateSuffixArray(); index.serializeWordMap(); - BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestWordMapFilePath("test_word_map.bin"))); - BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestHashIndexFilePath("test_hash_index.bin"))); - BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestSuffixArrayFilePath())); + BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"))); + BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"))); + BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_SA.bin"))); - boost::filesystem::remove(TestResourcesManager::getTestWordMapFilePath("test_word_map.bin")); - boost::filesystem::remove(TestResourcesManager::getTestHashIndexFilePath("test_hash_index.bin")); - boost::filesystem::remove(TestResourcesManager::getTestSuffixArrayFilePath()); + boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_word_map.bin")); + boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin")); + boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_SA.bin")); } BOOST_AUTO_TEST_SUITE_END() diff --git a/concordia/t/test_index_searcher.cpp b/concordia/t/test_index_searcher.cpp new file mode 100644 index 0000000..9657ed0 --- /dev/null +++ b/concordia/t/test_index_searcher.cpp @@ -0,0 +1,76 @@ +#include "tests/unit-tests/unit_tests_globals.hpp" + +#include "concordia/index_searcher.hpp" +#include "concordia/concordia_index.hpp" +#include "concordia/concordia_exception.hpp" +#include "tests/common/test_resources_manager.hpp" + +#include +#include + +using namespace std; + +BOOST_AUTO_TEST_SUITE(index_searcher) + + +BOOST_AUTO_TEST_CASE( SimpleSearchTest ) +{ + +ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"), + TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"), + TestResourcesManager::getTestFilePath("temp","test_SA.bin")); + index.addSentence("Ala ma kota"); + index.addSentence("Ala ma rysia"); + index.addSentence("Marysia ma rysia"); + + index.generateSuffixArray(); + index.serializeWordMap(); + + BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"))); + BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"))); + BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_SA.bin"))); + + IndexSearcher searcher; + searcher.loadIndex(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"), + TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"), + TestResourcesManager::getTestFilePath("temp","test_SA.bin")); + + /*The test index contains 3 sentences: + "Ala ma kota" + "Ala ma rysia" + "Marysia ma rysia" + + Test word map: + Ala -> 0 + ma -> 1 + kota -> 2 + rysia -> 3 + Marysia -> 4 + + Test hashed index: + n: 0 1 2 3 4 5 6 7 8 + T[n]: 0 1 2 0 1 3 4 1 3 + + Test suffix array: + n: 0 1 2 3 4 5 6 7 8 + SA[n]: 0 3 1 7 4 2 8 5 6 + + */ + + vector expectedResult1; + expectedResult1.push_back(7); + expectedResult1.push_back(4); + + vector searchResult1 = searcher.simpleSearch("ma rysia"); + + boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_word_map.bin")); + boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin")); + boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_SA.bin")); + + BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1.begin(), searchResult1.end(), + expectedResult1.begin(), expectedResult1.end()); + + +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/common/test_resources_manager.cpp b/tests/common/test_resources_manager.cpp index c6b6f32..7f1501d 100644 --- a/tests/common/test_resources_manager.cpp +++ b/tests/common/test_resources_manager.cpp @@ -3,7 +3,6 @@ #define PUDDLE_TEST_DIRECTORY "puddle" #define CONCORDIA_TAGSET_DIRECTORY "concordia-tagset" #define CONCORDIA_CONFIG_DIRECTORY "concordia-config" -#define CONCORDIA_INDEX_DIRECTORY "concordia-index" string TestResourcesManager::getPuddleFilePath(const string & filename) { string result = string(TEST_RESOURCES_DIRECTORY); @@ -16,23 +15,13 @@ string TestResourcesManager::getTestConcordiaConfigFilePath(const string & filen return result + "/" + CONCORDIA_CONFIG_DIRECTORY + "/" + filename; } -string TestResourcesManager::getTestWordMapFilePath(const string & filename) { - string result = string(TEST_RESOURCES_DIRECTORY); - return result + "/" + CONCORDIA_INDEX_DIRECTORY + "/" + filename; -} - -string TestResourcesManager::getTestHashIndexFilePath(const string & filename) { - string result = string(TEST_RESOURCES_DIRECTORY); - return result + "/" + CONCORDIA_INDEX_DIRECTORY + "/" + filename; -} - -string TestResourcesManager::getTestSuffixArrayFilePath() { - string result = string(TEST_RESOURCES_DIRECTORY); - return result + "/" + CONCORDIA_INDEX_DIRECTORY + "/test_SA.bin"; -} - string TestResourcesManager::getProdConcordiaConfigFilePath(const string & filename) { string result = string(PROD_RESOURCES_DIRECTORY); return result + "/" + CONCORDIA_CONFIG_DIRECTORY + "/" + filename; } +string TestResourcesManager::getTestFilePath(const string & module, const string & filename) { + string result = string(TEST_RESOURCES_DIRECTORY); + return result + "/" + module + "/" + filename; +} + diff --git a/tests/common/test_resources_manager.hpp b/tests/common/test_resources_manager.hpp index 846647d..ac4e523 100644 --- a/tests/common/test_resources_manager.hpp +++ b/tests/common/test_resources_manager.hpp @@ -14,13 +14,10 @@ public: static string getTestConcordiaConfigFilePath(const string & filename); - static string getTestWordMapFilePath(const string & filename); - - static string getTestHashIndexFilePath(const string & filename); - - static string getTestSuffixArrayFilePath(); - static string getProdConcordiaConfigFilePath(const string & filename); + + static string getTestFilePath(const string & module, const string & filename); + }; #endif diff --git a/tests/resources/concordia-config/concordia-test.cfg b/tests/resources/concordia-config/concordia-mock.cfg similarity index 51% rename from tests/resources/concordia-config/concordia-test.cfg rename to tests/resources/concordia-config/concordia-mock.cfg index a6f535c..4f0ae12 100644 --- a/tests/resources/concordia-config/concordia-test.cfg +++ b/tests/resources/concordia-config/concordia-mock.cfg @@ -1,9 +1,15 @@ #---------------------------- -# Concordia configuration file +# Concordia mock configuration file #--------------------------- # #Path to the Puddle tagset puddle_tagset_path = "puddle/tagset.txt"; +word_map_path = "tmp/wm.bin" + +hashed_index_path = "tmp/hi.bin" + +suffix_array_path = "tmp/sa.bin" + ### eof diff --git a/tests/resources/concordia-config/concordia.cfg.in b/tests/resources/concordia-config/concordia.cfg.in index 1eb1254..8bd23d9 100644 --- a/tests/resources/concordia-config/concordia.cfg.in +++ b/tests/resources/concordia-config/concordia.cfg.in @@ -6,4 +6,22 @@ #Path to the Puddle tagset puddle_tagset_path = "@TEST_PUDDLE_TAGSET_PATH@"; +#------------------------------------------------------------------------------- +#Word map, hashed index and suffix array files are in a temporary directory +#and should be deleted at the end of each test procedure. + +#Word map file containing unique codes for tokens + +word_map_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@" + +#File containing the "text" for suffix array searching, i.e. sequence of codes + +hashed_index_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_HASHED_INDEX@" + +#Binarized suffix array + +suffix_array_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_SUFFIX_ARRAY@" +#------------------------------------------------------------------------------- + + ### eof diff --git a/tests/resources/index-searcher/test_SA.bin b/tests/resources/index-searcher/test_SA.bin new file mode 100644 index 0000000000000000000000000000000000000000..b43acb9c42cd94edd319ed8abb67c696e0a61b61 GIT binary patch literal 36 ecmWN=0TKW(1i-Kw#{XZexeGu#JT5=4+usKQSO6se literal 0 HcmV?d00001 diff --git a/tests/resources/index-searcher/test_hash_index.bin b/tests/resources/index-searcher/test_hash_index.bin new file mode 100644 index 0000000000000000000000000000000000000000..3c73acacf97d0184871d7a3ca672e2c6933e8469 GIT binary patch literal 9 QcmZQzWMW`sW?^Io002P%5C8xG literal 0 HcmV?d00001 diff --git a/tests/resources/index-searcher/test_word_map.bin b/tests/resources/index-searcher/test_word_map.bin new file mode 100644 index 0000000000000000000000000000000000000000..5a6a7181f62c7956da7f10ebc0f33f5fa9592966 GIT binary patch literal 134 zcmWe*fPmuEqRhmc%&Nqa%=|nntHh$@jLfoBE(R7377j+J8VJRT!UJ-cp#qLMi45#e hwr^rlWpQRA3k#H=onMm3#02H!CNjbd0jpwW1ppzX5x@Wd literal 0 HcmV?d00001