suffix array simple search

This commit is contained in:
rjawor 2013-11-28 16:47:57 +01:00
parent d3cccff654
commit 0d8a057278
23 changed files with 510 additions and 99 deletions

View File

@ -20,11 +20,17 @@ set (PROD_PUDDLE_TAGSET_PATH "${PROD_RESOURCES_DIRECTORY}/puddle/tagset.txt")
set (TEST_RESOURCES_DIRECTORY "${concordia_SOURCE_DIR}/tests/resources") set (TEST_RESOURCES_DIRECTORY "${concordia_SOURCE_DIR}/tests/resources")
set (TEST_PUDDLE_TAGSET_PATH "${TEST_RESOURCES_DIRECTORY}/puddle/basic-tagset.txt") set (TEST_PUDDLE_TAGSET_PATH "${TEST_RESOURCES_DIRECTORY}/puddle/basic-tagset.txt")
set (TEMP_WORD_MAP "temp_word_map.bin")
set (TEMP_HASHED_INDEX "temp_hashed_index.bin")
set (TEMP_SUFFIX_ARRAY "temp_suffix_array.bin")
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
set(BASE_TARGETS concordia) set(BASE_TARGETS concordia)
# ================================================ # ================================================
# Third-party libraries # Third-party libraries
# ================================================ # ================================================
@ -99,11 +105,6 @@ configure_file (
"${concordia_SOURCE_DIR}/concordia/common/config.hpp" "${concordia_SOURCE_DIR}/concordia/common/config.hpp"
) )
configure_file (
"${concordia_SOURCE_DIR}/prod/resources/concordia-config/concordia.cfg.in"
"${concordia_SOURCE_DIR}/prod/resources/concordia-config/concordia.cfg"
)
configure_file ( configure_file (
"${concordia_SOURCE_DIR}/tests/resources/concordia-config/concordia.cfg.in" "${concordia_SOURCE_DIR}/tests/resources/concordia-config/concordia.cfg.in"
"${concordia_SOURCE_DIR}/tests/resources/concordia-config/concordia.cfg" "${concordia_SOURCE_DIR}/tests/resources/concordia-config/concordia.cfg"

View File

@ -6,6 +6,7 @@ foreach(dir ${ALL_DIRECTORIES})
endforeach(dir) endforeach(dir)
add_library(concordia SHARED add_library(concordia SHARED
index_searcher.cpp
concordia_index.cpp concordia_index.cpp
word_map.cpp word_map.cpp
hash_generator.cpp hash_generator.cpp

View File

@ -2,6 +2,9 @@
#define CONCORDIA_VERSION_MINOR @CONCORDIA_VERSION_MINOR@ #define CONCORDIA_VERSION_MINOR @CONCORDIA_VERSION_MINOR@
#define TEST_RESOURCES_DIRECTORY "@TEST_RESOURCES_DIRECTORY@" #define TEST_RESOURCES_DIRECTORY "@TEST_RESOURCES_DIRECTORY@"
#define TEMP_WORD_MAP "@TEMP_WORD_MAP@"
#define TEMP_HASHED_INDEX "@TEMP_HASHED_INDEX@"
#define TEMP_SUFFIX_ARRAY "@TEMP_SUFFIX_ARRAY@"
#define PROD_RESOURCES_DIRECTORY "@PROD_RESOURCES_DIRECTORY@" #define PROD_RESOURCES_DIRECTORY "@PROD_RESOURCES_DIRECTORY@"

View File

@ -1,8 +1,8 @@
#include <sstream>
#include "concordia/concordia.hpp" #include "concordia/concordia.hpp"
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include <sstream>
// =========================================== // ===========================================
std::string _createLibraryVersion(); std::string _createLibraryVersion();
@ -13,9 +13,15 @@ std::string Concordia::_libraryVersion = _createLibraryVersion();
// =========================================== // ===========================================
Concordia::Concordia(const string & configFilePath) throw(ConcordiaException) { Concordia::Concordia(const std::string & configFilePath)
boost::shared_ptr<ConcordiaConfig> _config( throw(ConcordiaException) {
_config = boost::shared_ptr<ConcordiaConfig> (
new ConcordiaConfig(configFilePath)); new ConcordiaConfig(configFilePath));
_index = boost::shared_ptr<ConcordiaIndex>(
new ConcordiaIndex(_config->getWordMapFilePath(),
_config->getHashedIndexFilePath(),
_config->getSuffixArrayFilePath()));
_searcher = boost::shared_ptr<IndexSearcher>(new IndexSearcher());
} }
Concordia::~Concordia() { Concordia::~Concordia() {
@ -35,3 +41,22 @@ std::string _createLibraryVersion() {
return version.str(); return version.str();
} }
void Concordia::addSentence(const std::string & sentence)
throw(ConcordiaException) {
_index->addSentence(sentence);
}
void Concordia::generateIndex() throw(ConcordiaException) {
_index->generateSuffixArray();
_index->serializeWordMap();
_searcher->loadIndex(_config->getWordMapFilePath(),
_config->getHashedIndexFilePath(),
_config->getSuffixArrayFilePath());
}
std::vector<saidx_t> Concordia::simpleSearch(const std::string & pattern)
throw(ConcordiaException) {
return _searcher->simpleSearch(pattern);
}

View File

@ -2,9 +2,14 @@
#define CONCORDIA_HDR #define CONCORDIA_HDR
#include <string> #include <string>
#include <vector>
#include <boost/shared_ptr.hpp> #include <boost/shared_ptr.hpp>
#include "concordia/concordia_config.hpp" #include "concordia/concordia_config.hpp"
#include "concordia/concordia_index.hpp"
#include "concordia/index_searcher.hpp"
#include <divsufsort.h>
/*! /*!
The Concordia class is the main access point to the library. The Concordia class is the main access point to the library.
@ -28,10 +33,21 @@ public:
*/ */
std::string & getVersion(); std::string & getVersion();
void addSentence(const std::string & sentence) throw(ConcordiaException);
void generateIndex() throw(ConcordiaException);
std::vector<saidx_t> simpleSearch(const std::string & pattern)
throw(ConcordiaException);
private: private:
static std::string _libraryVersion; static std::string _libraryVersion;
boost::shared_ptr<ConcordiaConfig> _config; boost::shared_ptr<ConcordiaConfig> _config;
boost::shared_ptr<ConcordiaIndex> _index;
boost::shared_ptr<IndexSearcher> _searcher;
}; };
#endif #endif

View File

@ -3,6 +3,9 @@
#include "concordia/common/logging.hpp" #include "concordia/common/logging.hpp"
#define PUDDLE_TAGSET_PARAM "puddle_tagset_path" #define PUDDLE_TAGSET_PARAM "puddle_tagset_path"
#define WORD_MAP_PARAM "word_map_path"
#define HASHED_INDEX_PARAM "hashed_index_path"
#define SUFFIX_ARRAY_PARAM "suffix_array_path"
ConcordiaConfig::ConcordiaConfig(const string & configFilePath) ConcordiaConfig::ConcordiaConfig(const string & configFilePath)
throw(ConcordiaException) { throw(ConcordiaException) {
@ -17,6 +20,12 @@ ConcordiaConfig::ConcordiaConfig(const string & configFilePath)
_puddleTagsetFilePath = _puddleTagsetFilePath =
ConcordiaConfig::_readConfigParameterStr(PUDDLE_TAGSET_PARAM); ConcordiaConfig::_readConfigParameterStr(PUDDLE_TAGSET_PARAM);
_wordMapFilePath =
ConcordiaConfig::_readConfigParameterStr(WORD_MAP_PARAM);
_hashedIndexFilePath =
ConcordiaConfig::_readConfigParameterStr(HASHED_INDEX_PARAM);
_suffixArrayFilePath =
ConcordiaConfig::_readConfigParameterStr(SUFFIX_ARRAY_PARAM);
} }
ConcordiaConfig::~ConcordiaConfig() { ConcordiaConfig::~ConcordiaConfig() {

View File

@ -34,11 +34,29 @@ public:
return _puddleTagsetFilePath; return _puddleTagsetFilePath;
} }
/*! Getter for the word map file path.
  \returns reference to the stored word map file path
*/
string & getWordMapFilePath() {
    return this->_wordMapFilePath;
}
/*! Getter for the hashed index file path.
  \returns reference to the stored hashed index file path
*/
string & getHashedIndexFilePath() {
    return this->_hashedIndexFilePath;
}
/*! Getter for the suffix array file path.
  \returns reference to the stored suffix array file path
*/
string & getSuffixArrayFilePath() {
    return this->_suffixArrayFilePath;
}
private: private:
Config _config; Config _config;
string _puddleTagsetFilePath; string _puddleTagsetFilePath;
string _wordMapFilePath;
string _hashedIndexFilePath;
string _suffixArrayFilePath;
string _readConfigParameterStr(const string & name) string _readConfigParameterStr(const string & name)
throw(ConcordiaException); throw(ConcordiaException);
}; };

View File

@ -1,45 +1,27 @@
#include "concordia/concordia_index.hpp" #include "concordia/concordia_index.hpp"
#include <boost/filesystem.hpp> #include <boost/filesystem.hpp>
#include <iostream>
ConcordiaIndex::ConcordiaIndex(const string & wordMapFilepath, ConcordiaIndex::ConcordiaIndex(const string & wordMapFilePath,
const string & hashedIndexFilepath, const string & hashedIndexFilePath,
const string & suffixArrayFilepath) const string & suffixArrayFilePath)
throw(ConcordiaException) { throw(ConcordiaException) :
if (boost::filesystem::exists(wordMapFilepath)) { _hashedIndexFilePath(hashedIndexFilePath),
if (boost::filesystem::exists(hashedIndexFilepath)) { _suffixArrayFilePath(suffixArrayFilePath) {
_hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::out | if (boost::filesystem::exists(wordMapFilePath)) {
ios::app | ios::binary); if (!boost::filesystem::exists(hashedIndexFilePath)) {
if (!_hashedIndexFile.is_open()) {
throw ConcordiaException("E03: Failed to open hashed index "
"file for appending.");
}
} else {
throw ConcordiaException("E01: Word map file exists " throw ConcordiaException("E01: Word map file exists "
"but hashed index file absent."); "but hashed index file absent.");
} }
} else { // WordMap file does not exist } else { // WordMap file does not exist
if (boost::filesystem::exists(hashedIndexFilepath)) { if (boost::filesystem::exists(hashedIndexFilePath)) {
throw ConcordiaException("E02: Hashed index file exists " throw ConcordiaException("E02: Hashed index file exists "
"but word map file absent."); "but word map file absent.");
} else {
_hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::out |
ios::binary);
if (!_hashedIndexFile.is_open()) {
throw ConcordiaException("E04: Failed to open hashed index "
"file for writing.");
} }
} }
}
_suffixArrayFile.open(suffixArrayFilepath.c_str(), ios::out |
ios::binary);
if (!_hashedIndexFile.is_open()) {
throw ConcordiaException("E05: Failed to open suffix array "
"file for writing.");
}
_hashGenerator = boost::shared_ptr<HashGenerator>( _hashGenerator = boost::shared_ptr<HashGenerator>(
new HashGenerator(wordMapFilepath)); new HashGenerator(wordMapFilePath));
} }
ConcordiaIndex::~ConcordiaIndex() { ConcordiaIndex::~ConcordiaIndex() {
@ -50,23 +32,30 @@ void ConcordiaIndex::serializeWordMap() {
} }
void ConcordiaIndex::generateSuffixArray() { void ConcordiaIndex::generateSuffixArray() {
/* Get the file size. */ ifstream hashedIndexFile;
long n = _hashedIndexFile.tellg(); hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::in|
ios::ate|ios::binary);
/* Get the file size. */
long n = hashedIndexFile.tellg();
/* Allocate 5blocksize bytes of memory. */
sauchar_t *T; sauchar_t *T;
saidx_t *SA; saidx_t *SA;
T = reinterpret_cast<sauchar_t *> (malloc((size_t)n * sizeof(sauchar_t))); T = new sauchar_t[n];
SA = reinterpret_cast<saidx_t *> (malloc((size_t)n * sizeof(saidx_t))); SA = new saidx_t[n];
if ((T == NULL) || (SA == NULL)) {
throw ConcordiaException("Cannot allocate memory.");
}
/* Read n bytes of data. */ /* Read n bytes of data. */
hashedIndexFile.seekg(0, ios::beg);
_hashedIndexFile.seekg(0, ios::beg); sauchar_t buff;
_hashedIndexFile.read(reinterpret_cast<char*> (T), (size_t)n); int pos = 0;
while (!hashedIndexFile.eof()) {
hashedIndexFile.read(reinterpret_cast<char *>(&buff),
sizeof(sauchar_t));
T[pos++] = buff;
}
hashedIndexFile.close();
/* Construct the suffix array. */ /* Construct the suffix array. */
if (divsufsort(T, SA, (saidx_t)n) != 0) { if (divsufsort(T, SA, (saidx_t)n) != 0) {
@ -74,18 +63,32 @@ void ConcordiaIndex::generateSuffixArray() {
} }
/* Write the suffix array. */ /* Write the suffix array. */
_suffixArrayFile << *SA;
ofstream suffixArrayFile;
suffixArrayFile.open(_suffixArrayFilePath.c_str(), ios::out|ios::binary);
for (int i = 0; i < n; i++) {
suffixArrayFile.write(reinterpret_cast<char *>(&SA[i]),
sizeof(saidx_t));
}
suffixArrayFile.close();
/* Deallocate memory. */ /* Deallocate memory. */
free(SA); delete[] T;
free(T); delete[] SA;
} }
void ConcordiaIndex::addSentence(const string & sentence) { void ConcordiaIndex::addSentence(const string & sentence) {
vector<sauchar_t> hash = _hashGenerator->generateHash(sentence); vector<sauchar_t> hash = _hashGenerator->generateHash(sentence);
ofstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
ios::app|ios::binary);
for (vector<sauchar_t>::iterator it = hash.begin(); for (vector<sauchar_t>::iterator it = hash.begin();
it != hash.end(); ++it) { it != hash.end(); ++it) {
_hashedIndexFile << *it; sauchar_t buff = *it;
hashedIndexFile.write(reinterpret_cast<char *>(&buff),
sizeof(sauchar_t));
} }
hashedIndexFile.close();
} }

View File

@ -19,9 +19,9 @@ using namespace std;
class ConcordiaIndex { class ConcordiaIndex {
public: public:
explicit ConcordiaIndex(const string & wordMapFilepath, explicit ConcordiaIndex(const string & wordMapFilePath,
const string & hashedIndexFilepath, const string & hashedIndexFilePath,
const string & suffixArrayFilepath) const string & suffixArrayFilePath)
throw(ConcordiaException); throw(ConcordiaException);
/*! Destructor. /*! Destructor.
@ -37,9 +37,9 @@ public:
private: private:
boost::shared_ptr<HashGenerator> _hashGenerator; boost::shared_ptr<HashGenerator> _hashGenerator;
fstream _hashedIndexFile; string _hashedIndexFilePath;
ofstream _suffixArrayFile; string _suffixArrayFilePath;
}; };
#endif #endif

View File

@ -0,0 +1,90 @@
#include "concordia/index_searcher.hpp"
#include <boost/filesystem.hpp>
/*! Constructor. Creates a searcher with no index loaded: both buffer
    pointers are null and the element count is zero.
*/
IndexSearcher::IndexSearcher(): _T(NULL), _SA(NULL), _n(0) {
}
/*! Destructor. Releases the text and suffix array buffers that
    loadIndex() allocates with new[]. Both pointers start out as NULL
    (see the constructor), and delete[] on a null pointer is a no-op, so
    this is safe even if no index was ever loaded.
    NOTE(review): the class has no user-defined copy operations; a copied
    IndexSearcher would release the same buffers twice. No copy is made in
    the code visible here - confirm for other callers.
*/
IndexSearcher::~IndexSearcher() {
    delete[] _T;
    delete[] _SA;
}
void IndexSearcher::loadIndex(const string & wordMapFilepath,
const string & hashedIndexFilepath,
const string & suffixArrayFilepath)
throw(ConcordiaException) {
if (!boost::filesystem::exists(wordMapFilepath)) {
throw ConcordiaException("E06: Failed to open word map "
"file for reading.");
}
if (!boost::filesystem::exists(hashedIndexFilepath)) {
throw ConcordiaException("E07: Failed to open hashed index file "
"for reading.");
}
if (!boost::filesystem::exists(suffixArrayFilepath)) {
throw ConcordiaException("E08: Failed to open suffix array file "
"for reading.");
}
_hashGenerator = boost::shared_ptr<HashGenerator>(
new HashGenerator(wordMapFilepath));
ifstream hashedIndexFile;
hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::in
| ios::ate | ios::binary);
_n = hashedIndexFile.tellg();
_T = new sauchar_t[_n];
hashedIndexFile.seekg(0, ios::beg);
hashedIndexFile.read(reinterpret_cast<char*> (_T), _n);
hashedIndexFile.close();
_SA = new saidx_t[_n];
ifstream suffixArrayFile;
suffixArrayFile.open(suffixArrayFilepath.c_str(), ios::in | ios::binary);
saidx_t buff;
int pos = 0;
while (!suffixArrayFile.eof() && pos < _n) {
suffixArrayFile.read(reinterpret_cast<char *>(&buff), sizeof(saidx_t));
_SA[pos++] = buff;
}
suffixArrayFile.close();
}
/*! Searches the loaded index for the given pattern.

    The pattern is hashed with the hash generator and looked up with
    sa_search(); the result holds the suffix array entries of the range
    sa_search() reports, in suffix array order.

  \param pattern pattern to be searched for
  \returns suffix array entries for all reported matches (empty if none)
  \throws ConcordiaException if no index has been loaded yet
*/
vector<saidx_t> IndexSearcher::simpleSearch(const string & pattern)
                                                 throw(ConcordiaException) {
    // Without a loaded index there is no hash generator to dereference.
    if (!_hashGenerator || _T == NULL || _SA == NULL) {
        throw ConcordiaException("Index not loaded: call loadIndex() "
                                 "before searching.");
    }

    vector<sauchar_t> hash = _hashGenerator->generateHash(pattern);

    // The vector's storage is contiguous, so it is passed to sa_search()
    // directly. An empty vector has no element to take the address of,
    // so a dummy keeps the pattern pointer non-null in that case, as the
    // zero-length array of the previous implementation did.
    sauchar_t emptyPattern = 0;
    const sauchar_t * patternData = hash.empty() ? &emptyPattern : &hash[0];

    saidx_t left = 0;
    const saidx_t matchCount = sa_search(_T, static_cast<saidx_t>(_n),
                                         patternData,
                                         static_cast<saidx_t>(hash.size()),
                                         _SA, static_cast<saidx_t>(_n),
                                         &left);

    vector<saidx_t> result;
    if (matchCount > 0) {
        result.assign(_SA + left, _SA + left + matchCount);
    }
    return result;
}

View File

@ -0,0 +1,45 @@
#ifndef INDEX_SEARCHER_HDR
#define INDEX_SEARCHER_HDR
#include <divsufsort.h>
#include <boost/shared_ptr.hpp>
#include <fstream>
#include <iostream>
#include "concordia/hash_generator.hpp"
#include "concordia/concordia_exception.hpp"
/*!
Class for searching the index with a sentence.
*/
using namespace std;
class IndexSearcher {
public:
    /*! Constructor. Creates a searcher with no index loaded.
    */
    explicit IndexSearcher();

    /*! Destructor.
    */
    virtual ~IndexSearcher();

    /*! Loads the word map, hashed index and suffix array from files.
      \param wordMapFilepath path of the word map file
      \param hashedIndexFilepath path of the hashed index file
      \param suffixArrayFilepath path of the suffix array file
      \throws ConcordiaException
    */
    void loadIndex(const string & wordMapFilepath,
                   const string & hashedIndexFilepath,
                   const string & suffixArrayFilepath)
                                             throw(ConcordiaException);

    /*! Searches the loaded index for the given pattern.
      \param pattern pattern to be searched for
      \returns suffix array entries for the matches found
      \throws ConcordiaException
    */
    vector<saidx_t> simpleSearch(const string & pattern)
                                             throw(ConcordiaException);

private:
    // The object holds raw pointers to buffers it allocates itself, so a
    // member-wise copy would alias them. Copying is disabled: these two
    // are declared but intentionally never defined.
    IndexSearcher(const IndexSearcher &);
    IndexSearcher & operator=(const IndexSearcher &);

    // Turns a pattern into the code sequence that is searched for.
    boost::shared_ptr<HashGenerator> _hashGenerator;

    // Contents of the hashed index file (the searched text).
    sauchar_t * _T;

    // Suffix array entries read from the suffix array file.
    saidx_t * _SA;

    // Number of elements in _T and in _SA.
    size_t _n;
};
#endif

View File

@ -4,6 +4,7 @@ add_library(concordia-tests
test_word_map.cpp test_word_map.cpp
test_hash_generator.cpp test_hash_generator.cpp
test_concordia_index.cpp test_concordia_index.cpp
test_index_searcher.cpp
) )
target_link_libraries(concordia-tests concordia ${LIBCONFIG_LIB} concordia-tests-common) target_link_libraries(concordia-tests concordia ${LIBCONFIG_LIB} concordia-tests-common)

View File

@ -1,7 +1,10 @@
#include "tests/unit-tests/unit_tests_globals.hpp" #include "tests/unit-tests/unit_tests_globals.hpp"
#include "concordia/concordia.hpp" #include "concordia/concordia.hpp"
#include "tests/common/test_resources_manager.hpp" #include "tests/common/test_resources_manager.hpp"
#include "concordia/common/config.hpp"
#include <boost/algorithm/string/predicate.hpp>
#include <boost/filesystem.hpp>
#include <string> #include <string>
@ -16,4 +19,108 @@ BOOST_AUTO_TEST_CASE( ConcordiaVersion )
BOOST_CHECK_EQUAL( version , "0.1"); BOOST_CHECK_EQUAL( version , "0.1");
} }
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
{
    Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));

    const char * sentences[] = {"Ala ma kota",
                                "Ala ma rysia",
                                "Marysia ma rysia"};
    for (size_t i = 0; i < 3; ++i) {
        concordia.addSentence(sentences[i]);
    }
    concordia.generateIndex();

    /* Expected state of the index built from the three sentences above.

       Word map:
         Ala -> 0, ma -> 1, kota -> 2, rysia -> 3, Marysia -> 4

       Hashed index:
         n:    0 1 2 3 4 5 6 7 8
         T[n]: 0 1 2 0 1 3 4 1 3

       Suffix array:
         n:     0 1 2 3 4 5 6 7 8
         SA[n]: 0 3 1 7 4 2 8 5 6

       "ma rysia" hashes to (1 3), which starts at positions 7 and 4.
    */
    const saidx_t expectedPositions[] = {7, 4};
    const vector<saidx_t> expectedResult1(expectedPositions,
                                          expectedPositions + 2);

    vector<saidx_t> searchResult1 = concordia.simpleSearch("ma rysia");

    // Remove the temporary index files before checking the results.
    const char * tempFiles[] = {TEMP_WORD_MAP,
                                TEMP_HASHED_INDEX,
                                TEMP_SUFFIX_ARRAY};
    for (size_t i = 0; i < 3; ++i) {
        boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp", tempFiles[i]));
    }

    BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1.begin(), searchResult1.end(),
                                  expectedResult1.begin(), expectedResult1.end());
}
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
{
    Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));

    const char * sentences[] = {"to jest okno",
                                "czy jest okno otwarte",
                                "chyba to jest tutaj",
                                "to jest"};
    for (size_t i = 0; i < 4; ++i) {
        concordia.addSentence(sentences[i]);
    }
    concordia.generateIndex();

    /* Expected state of the index built from the four sentences above.

       Word map:
         to -> 0, jest -> 1, okno -> 2, czy -> 3,
         otwarte -> 4, chyba -> 5, tutaj -> 6

       Hashed index:
         n:    0 1 2 3 4 5 6 7 8 9 10 11 12
         T[n]: 0 1 2 3 1 2 4 5 0 1  6  0  1

       Suffix array:
         n:     0  1 2  3 4 5 6 7 8 9 10 11 12
         SA[n]: 11 0 8 12 1 4 9 2 5 3  6  7 10

       "to jest" hashes to (0 1): positions 11, 0 and 8.
       "jest okno" hashes to (1 2): positions 1 and 4.
    */
    const saidx_t toJestPositions[] = {11, 0, 8};
    const vector<saidx_t> expectedResult1(toJestPositions,
                                          toJestPositions + 3);

    const saidx_t jestOknoPositions[] = {1, 4};
    const vector<saidx_t> expectedResult2(jestOknoPositions,
                                          jestOknoPositions + 2);

    vector<saidx_t> searchResult1 = concordia.simpleSearch("to jest");
    vector<saidx_t> searchResult2 = concordia.simpleSearch("jest okno");

    // Remove the temporary index files before checking the results.
    const char * tempFiles[] = {TEMP_WORD_MAP,
                                TEMP_HASHED_INDEX,
                                TEMP_SUFFIX_ARRAY};
    for (size_t i = 0; i < 3; ++i) {
        boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp", tempFiles[i]));
    }

    BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1.begin(), searchResult1.end(),
                                  expectedResult1.begin(), expectedResult1.end());
    BOOST_CHECK_EQUAL_COLLECTIONS(searchResult2.begin(), searchResult2.end(),
                                  expectedResult2.begin(), expectedResult2.end());
}
BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE_END()

View File

@ -13,8 +13,11 @@ BOOST_AUTO_TEST_SUITE(concordia_config)
BOOST_AUTO_TEST_CASE( ConfigParameters ) BOOST_AUTO_TEST_CASE( ConfigParameters )
{ {
ConcordiaConfig config(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-test.cfg")); ConcordiaConfig config(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-mock.cfg"));
BOOST_CHECK_EQUAL( config.getPuddleTagsetFilePath() , "puddle/tagset.txt" ); BOOST_CHECK_EQUAL( config.getPuddleTagsetFilePath() , "puddle/tagset.txt" );
BOOST_CHECK_EQUAL( config.getWordMapFilePath() , "tmp/wm.bin" );
BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "tmp/hi.bin" );
BOOST_CHECK_EQUAL( config.getSuffixArrayFilePath() , "tmp/sa.bin" );
} }
BOOST_AUTO_TEST_CASE( NonexistentConfigTest ) BOOST_AUTO_TEST_CASE( NonexistentConfigTest )

View File

@ -13,9 +13,9 @@ BOOST_AUTO_TEST_SUITE(concordia_index)
BOOST_AUTO_TEST_CASE( ResourcesExistenceTest1 ) BOOST_AUTO_TEST_CASE( ResourcesExistenceTest1 )
{ {
ConcordiaIndex index(TestResourcesManager::getTestWordMapFilePath("mock_word_map.bin"), ConcordiaIndex index(TestResourcesManager::getTestFilePath("concordia-index","mock_word_map.bin"),
TestResourcesManager::getTestHashIndexFilePath("mock_hash_index.bin"), TestResourcesManager::getTestFilePath("concordia-index","mock_hash_index.bin"),
TestResourcesManager::getTestSuffixArrayFilePath()); TestResourcesManager::getTestFilePath("concordia-index","test_SA.bin"));
} }
@ -26,9 +26,9 @@ BOOST_AUTO_TEST_CASE( ResourcesExistenceTest2 )
string message = ""; string message = "";
try { try {
ConcordiaIndex index(TestResourcesManager::getTestWordMapFilePath("mock_word_map.bin"), ConcordiaIndex index(TestResourcesManager::getTestFilePath("concordia-index","mock_word_map.bin"),
TestResourcesManager::getTestHashIndexFilePath("nonexistent.bin"), TestResourcesManager::getTestFilePath("concordia-index","nonexistent.bin"),
TestResourcesManager::getTestSuffixArrayFilePath()); TestResourcesManager::getTestFilePath("concordia-index","test_SA.bin"));
} catch (ConcordiaException & e) { } catch (ConcordiaException & e) {
exceptionThrown = true; exceptionThrown = true;
message = e.what(); message = e.what();
@ -44,9 +44,9 @@ BOOST_AUTO_TEST_CASE( ResourcesExistenceTest3 )
string message = ""; string message = "";
try { try {
ConcordiaIndex index(TestResourcesManager::getTestWordMapFilePath("nonexistent.bin"), ConcordiaIndex index(TestResourcesManager::getTestFilePath("concordia-index","nonexistent.bin"),
TestResourcesManager::getTestHashIndexFilePath("mock_hash_index.bin"), TestResourcesManager::getTestFilePath("concordia-index","mock_hash_index.bin"),
TestResourcesManager::getTestSuffixArrayFilePath()); TestResourcesManager::getTestFilePath("concordia-index","test_SA.bin"));
} catch (ConcordiaException & e) { } catch (ConcordiaException & e) {
exceptionThrown = true; exceptionThrown = true;
message = e.what(); message = e.what();
@ -58,20 +58,23 @@ BOOST_AUTO_TEST_CASE( ResourcesExistenceTest3 )
BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest ) BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
{ {
ConcordiaIndex index(TestResourcesManager::getTestWordMapFilePath("test_word_map.bin"), ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"),
TestResourcesManager::getTestHashIndexFilePath("test_hash_index.bin"), TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
TestResourcesManager::getTestSuffixArrayFilePath()); TestResourcesManager::getTestFilePath("temp","test_SA.bin"));
index.addSentence("Ala ma kota"); index.addSentence("Ala ma kota");
index.addSentence("Ala ma rysia");
index.addSentence("Marysia ma rysia");
index.generateSuffixArray(); index.generateSuffixArray();
index.serializeWordMap(); index.serializeWordMap();
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestWordMapFilePath("test_word_map.bin"))); BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_word_map.bin")));
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestHashIndexFilePath("test_hash_index.bin"))); BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin")));
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestSuffixArrayFilePath())); BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_SA.bin")));
boost::filesystem::remove(TestResourcesManager::getTestWordMapFilePath("test_word_map.bin")); boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"));
boost::filesystem::remove(TestResourcesManager::getTestHashIndexFilePath("test_hash_index.bin")); boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"));
boost::filesystem::remove(TestResourcesManager::getTestSuffixArrayFilePath()); boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_SA.bin"));
} }
BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE_END()

View File

@ -0,0 +1,76 @@
#include "tests/unit-tests/unit_tests_globals.hpp"
#include "concordia/index_searcher.hpp"
#include "concordia/concordia_index.hpp"
#include "concordia/concordia_exception.hpp"
#include "tests/common/test_resources_manager.hpp"
#include <boost/algorithm/string/predicate.hpp>
#include <boost/filesystem.hpp>
using namespace std;
BOOST_AUTO_TEST_SUITE(index_searcher)
BOOST_AUTO_TEST_CASE( SimpleSearchTest )
{
    // The three temporary files this test creates, uses and removes.
    const string wordMapPath = TestResourcesManager::getTestFilePath("temp","test_word_map.bin");
    const string hashedIndexPath = TestResourcesManager::getTestFilePath("temp","test_hash_index.bin");
    const string suffixArrayPath = TestResourcesManager::getTestFilePath("temp","test_SA.bin");

    ConcordiaIndex index(wordMapPath, hashedIndexPath, suffixArrayPath);
    index.addSentence("Ala ma kota");
    index.addSentence("Ala ma rysia");
    index.addSentence("Marysia ma rysia");

    index.generateSuffixArray();
    index.serializeWordMap();

    BOOST_CHECK(boost::filesystem::exists(wordMapPath));
    BOOST_CHECK(boost::filesystem::exists(hashedIndexPath));
    BOOST_CHECK(boost::filesystem::exists(suffixArrayPath));

    IndexSearcher searcher;
    searcher.loadIndex(wordMapPath, hashedIndexPath, suffixArrayPath);

    /* Expected state of the index built from the three sentences above.

       Word map:
         Ala -> 0, ma -> 1, kota -> 2, rysia -> 3, Marysia -> 4

       Hashed index:
         n:    0 1 2 3 4 5 6 7 8
         T[n]: 0 1 2 0 1 3 4 1 3

       Suffix array:
         n:     0 1 2 3 4 5 6 7 8
         SA[n]: 0 3 1 7 4 2 8 5 6

       "ma rysia" hashes to (1 3), which starts at positions 7 and 4.
    */
    const saidx_t expectedPositions[] = {7, 4};
    const vector<saidx_t> expectedResult1(expectedPositions,
                                          expectedPositions + 2);

    vector<saidx_t> searchResult1 = searcher.simpleSearch("ma rysia");

    // Remove the temporary index files before checking the results.
    boost::filesystem::remove(wordMapPath);
    boost::filesystem::remove(hashedIndexPath);
    boost::filesystem::remove(suffixArrayPath);

    BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1.begin(), searchResult1.end(),
                                  expectedResult1.begin(), expectedResult1.end());
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -3,7 +3,6 @@
#define PUDDLE_TEST_DIRECTORY "puddle" #define PUDDLE_TEST_DIRECTORY "puddle"
#define CONCORDIA_TAGSET_DIRECTORY "concordia-tagset" #define CONCORDIA_TAGSET_DIRECTORY "concordia-tagset"
#define CONCORDIA_CONFIG_DIRECTORY "concordia-config" #define CONCORDIA_CONFIG_DIRECTORY "concordia-config"
#define CONCORDIA_INDEX_DIRECTORY "concordia-index"
string TestResourcesManager::getPuddleFilePath(const string & filename) { string TestResourcesManager::getPuddleFilePath(const string & filename) {
string result = string(TEST_RESOURCES_DIRECTORY); string result = string(TEST_RESOURCES_DIRECTORY);
@ -16,23 +15,13 @@ string TestResourcesManager::getTestConcordiaConfigFilePath(const string & filen
return result + "/" + CONCORDIA_CONFIG_DIRECTORY + "/" + filename; return result + "/" + CONCORDIA_CONFIG_DIRECTORY + "/" + filename;
} }
string TestResourcesManager::getTestWordMapFilePath(const string & filename) {
string result = string(TEST_RESOURCES_DIRECTORY);
return result + "/" + CONCORDIA_INDEX_DIRECTORY + "/" + filename;
}
string TestResourcesManager::getTestHashIndexFilePath(const string & filename) {
string result = string(TEST_RESOURCES_DIRECTORY);
return result + "/" + CONCORDIA_INDEX_DIRECTORY + "/" + filename;
}
string TestResourcesManager::getTestSuffixArrayFilePath() {
string result = string(TEST_RESOURCES_DIRECTORY);
return result + "/" + CONCORDIA_INDEX_DIRECTORY + "/test_SA.bin";
}
string TestResourcesManager::getProdConcordiaConfigFilePath(const string & filename) { string TestResourcesManager::getProdConcordiaConfigFilePath(const string & filename) {
string result = string(PROD_RESOURCES_DIRECTORY); string result = string(PROD_RESOURCES_DIRECTORY);
return result + "/" + CONCORDIA_CONFIG_DIRECTORY + "/" + filename; return result + "/" + CONCORDIA_CONFIG_DIRECTORY + "/" + filename;
} }
// Builds "<TEST_RESOURCES_DIRECTORY>/<module>/<filename>".
string TestResourcesManager::getTestFilePath(const string & module, const string & filename) {
    string path(TEST_RESOURCES_DIRECTORY);
    path += "/";
    path += module;
    path += "/";
    path += filename;
    return path;
}

View File

@ -14,13 +14,10 @@ public:
static string getTestConcordiaConfigFilePath(const string & filename); static string getTestConcordiaConfigFilePath(const string & filename);
static string getTestWordMapFilePath(const string & filename);
static string getTestHashIndexFilePath(const string & filename);
static string getTestSuffixArrayFilePath();
static string getProdConcordiaConfigFilePath(const string & filename); static string getProdConcordiaConfigFilePath(const string & filename);
static string getTestFilePath(const string & module, const string & filename);
}; };
#endif #endif

View File

@ -1,9 +1,15 @@
#---------------------------- #----------------------------
# Concordia configuration file # Concordia mock configuration file
#--------------------------- #---------------------------
# #
#Path to the Puddle tagset #Path to the Puddle tagset
puddle_tagset_path = "puddle/tagset.txt"; puddle_tagset_path = "puddle/tagset.txt";
word_map_path = "tmp/wm.bin"
hashed_index_path = "tmp/hi.bin"
suffix_array_path = "tmp/sa.bin"
### eof ### eof

View File

@ -6,4 +6,22 @@
#Path to the Puddle tagset #Path to the Puddle tagset
puddle_tagset_path = "@TEST_PUDDLE_TAGSET_PATH@"; puddle_tagset_path = "@TEST_PUDDLE_TAGSET_PATH@";
#-------------------------------------------------------------------------------
#Word map, hashed index and suffix array files are in a temporary directory
#and should be deleted at the end of each test procedure.
#Word map file containing unique codes for tokens
word_map_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
#File containing the "text" for suffix array searching, i.e. sequence of codes
hashed_index_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_HASHED_INDEX@"
#Binarized suffix array
suffix_array_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_SUFFIX_ARRAY@"
#-------------------------------------------------------------------------------
### eof ### eof

Binary file not shown.

Binary file not shown.

Binary file not shown.