suffix array simple search
This commit is contained in:
parent
d3cccff654
commit
0d8a057278
@ -20,10 +20,16 @@ set (PROD_PUDDLE_TAGSET_PATH "${PROD_RESOURCES_DIRECTORY}/puddle/tagset.txt")
|
||||
|
||||
set (TEST_RESOURCES_DIRECTORY "${concordia_SOURCE_DIR}/tests/resources")
|
||||
set (TEST_PUDDLE_TAGSET_PATH "${TEST_RESOURCES_DIRECTORY}/puddle/basic-tagset.txt")
|
||||
set (TEMP_WORD_MAP "temp_word_map.bin")
|
||||
set (TEMP_HASHED_INDEX "temp_hashed_index.bin")
|
||||
set (TEMP_SUFFIX_ARRAY "temp_suffix_array.bin")
|
||||
|
||||
|
||||
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
|
||||
|
||||
set(BASE_TARGETS concordia)
|
||||
|
||||
|
||||
|
||||
# ================================================
|
||||
# Third-party libraries
|
||||
@ -99,11 +105,6 @@ configure_file (
|
||||
"${concordia_SOURCE_DIR}/concordia/common/config.hpp"
|
||||
)
|
||||
|
||||
configure_file (
|
||||
"${concordia_SOURCE_DIR}/prod/resources/concordia-config/concordia.cfg.in"
|
||||
"${concordia_SOURCE_DIR}/prod/resources/concordia-config/concordia.cfg"
|
||||
)
|
||||
|
||||
configure_file (
|
||||
"${concordia_SOURCE_DIR}/tests/resources/concordia-config/concordia.cfg.in"
|
||||
"${concordia_SOURCE_DIR}/tests/resources/concordia-config/concordia.cfg"
|
||||
|
@ -6,6 +6,7 @@ foreach(dir ${ALL_DIRECTORIES})
|
||||
endforeach(dir)
|
||||
|
||||
add_library(concordia SHARED
|
||||
index_searcher.cpp
|
||||
concordia_index.cpp
|
||||
word_map.cpp
|
||||
hash_generator.cpp
|
||||
|
@ -2,6 +2,9 @@
|
||||
#define CONCORDIA_VERSION_MINOR @CONCORDIA_VERSION_MINOR@
|
||||
|
||||
#define TEST_RESOURCES_DIRECTORY "@TEST_RESOURCES_DIRECTORY@"
|
||||
#define TEMP_WORD_MAP "@TEMP_WORD_MAP@"
|
||||
#define TEMP_HASHED_INDEX "@TEMP_HASHED_INDEX@"
|
||||
#define TEMP_SUFFIX_ARRAY "@TEMP_SUFFIX_ARRAY@"
|
||||
|
||||
#define PROD_RESOURCES_DIRECTORY "@PROD_RESOURCES_DIRECTORY@"
|
||||
|
||||
|
@ -1,8 +1,8 @@
|
||||
#include <sstream>
|
||||
|
||||
#include "concordia/concordia.hpp"
|
||||
#include "concordia/common/config.hpp"
|
||||
|
||||
#include <sstream>
|
||||
|
||||
// ===========================================
|
||||
|
||||
std::string _createLibraryVersion();
|
||||
@ -13,9 +13,15 @@ std::string Concordia::_libraryVersion = _createLibraryVersion();
|
||||
|
||||
// ===========================================
|
||||
|
||||
Concordia::Concordia(const string & configFilePath) throw(ConcordiaException) {
|
||||
boost::shared_ptr<ConcordiaConfig> _config(
|
||||
Concordia::Concordia(const std::string & configFilePath)
|
||||
throw(ConcordiaException) {
|
||||
_config = boost::shared_ptr<ConcordiaConfig> (
|
||||
new ConcordiaConfig(configFilePath));
|
||||
_index = boost::shared_ptr<ConcordiaIndex>(
|
||||
new ConcordiaIndex(_config->getWordMapFilePath(),
|
||||
_config->getHashedIndexFilePath(),
|
||||
_config->getSuffixArrayFilePath()));
|
||||
_searcher = boost::shared_ptr<IndexSearcher>(new IndexSearcher());
|
||||
}
|
||||
|
||||
Concordia::~Concordia() {
|
||||
@ -35,3 +41,22 @@ std::string _createLibraryVersion() {
|
||||
return version.str();
|
||||
}
|
||||
|
||||
void Concordia::addSentence(const std::string & sentence)
|
||||
throw(ConcordiaException) {
|
||||
_index->addSentence(sentence);
|
||||
}
|
||||
|
||||
void Concordia::generateIndex() throw(ConcordiaException) {
|
||||
_index->generateSuffixArray();
|
||||
_index->serializeWordMap();
|
||||
_searcher->loadIndex(_config->getWordMapFilePath(),
|
||||
_config->getHashedIndexFilePath(),
|
||||
_config->getSuffixArrayFilePath());
|
||||
}
|
||||
|
||||
std::vector<saidx_t> Concordia::simpleSearch(const std::string & pattern)
|
||||
throw(ConcordiaException) {
|
||||
return _searcher->simpleSearch(pattern);
|
||||
}
|
||||
|
||||
|
||||
|
@ -2,9 +2,14 @@
|
||||
#define CONCORDIA_HDR
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <boost/shared_ptr.hpp>
|
||||
|
||||
#include "concordia/concordia_config.hpp"
|
||||
#include "concordia/concordia_index.hpp"
|
||||
#include "concordia/index_searcher.hpp"
|
||||
#include <divsufsort.h>
|
||||
|
||||
|
||||
/*!
|
||||
The Concordia class is the main access point to the library.
|
||||
@ -28,10 +33,21 @@ public:
|
||||
*/
|
||||
std::string & getVersion();
|
||||
|
||||
void addSentence(const std::string & sentence) throw(ConcordiaException);
|
||||
|
||||
void generateIndex() throw(ConcordiaException);
|
||||
|
||||
std::vector<saidx_t> simpleSearch(const std::string & pattern)
|
||||
throw(ConcordiaException);
|
||||
|
||||
private:
|
||||
static std::string _libraryVersion;
|
||||
|
||||
boost::shared_ptr<ConcordiaConfig> _config;
|
||||
|
||||
boost::shared_ptr<ConcordiaIndex> _index;
|
||||
|
||||
boost::shared_ptr<IndexSearcher> _searcher;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -3,6 +3,9 @@
|
||||
#include "concordia/common/logging.hpp"
|
||||
|
||||
#define PUDDLE_TAGSET_PARAM "puddle_tagset_path"
|
||||
#define WORD_MAP_PARAM "word_map_path"
|
||||
#define HASHED_INDEX_PARAM "hashed_index_path"
|
||||
#define SUFFIX_ARRAY_PARAM "suffix_array_path"
|
||||
|
||||
ConcordiaConfig::ConcordiaConfig(const string & configFilePath)
|
||||
throw(ConcordiaException) {
|
||||
@ -17,6 +20,12 @@ ConcordiaConfig::ConcordiaConfig(const string & configFilePath)
|
||||
|
||||
_puddleTagsetFilePath =
|
||||
ConcordiaConfig::_readConfigParameterStr(PUDDLE_TAGSET_PARAM);
|
||||
_wordMapFilePath =
|
||||
ConcordiaConfig::_readConfigParameterStr(WORD_MAP_PARAM);
|
||||
_hashedIndexFilePath =
|
||||
ConcordiaConfig::_readConfigParameterStr(HASHED_INDEX_PARAM);
|
||||
_suffixArrayFilePath =
|
||||
ConcordiaConfig::_readConfigParameterStr(SUFFIX_ARRAY_PARAM);
|
||||
}
|
||||
|
||||
ConcordiaConfig::~ConcordiaConfig() {
|
||||
|
@ -34,11 +34,29 @@ public:
|
||||
return _puddleTagsetFilePath;
|
||||
}
|
||||
|
||||
string & getWordMapFilePath() {
|
||||
return _wordMapFilePath;
|
||||
}
|
||||
|
||||
string & getHashedIndexFilePath() {
|
||||
return _hashedIndexFilePath;
|
||||
}
|
||||
|
||||
string & getSuffixArrayFilePath() {
|
||||
return _suffixArrayFilePath;
|
||||
}
|
||||
|
||||
private:
|
||||
Config _config;
|
||||
|
||||
string _puddleTagsetFilePath;
|
||||
|
||||
string _wordMapFilePath;
|
||||
|
||||
string _hashedIndexFilePath;
|
||||
|
||||
string _suffixArrayFilePath;
|
||||
|
||||
string _readConfigParameterStr(const string & name)
|
||||
throw(ConcordiaException);
|
||||
};
|
||||
|
@ -1,45 +1,27 @@
|
||||
#include "concordia/concordia_index.hpp"
|
||||
|
||||
#include <boost/filesystem.hpp>
|
||||
#include <iostream>
|
||||
|
||||
ConcordiaIndex::ConcordiaIndex(const string & wordMapFilepath,
|
||||
const string & hashedIndexFilepath,
|
||||
const string & suffixArrayFilepath)
|
||||
throw(ConcordiaException) {
|
||||
if (boost::filesystem::exists(wordMapFilepath)) {
|
||||
if (boost::filesystem::exists(hashedIndexFilepath)) {
|
||||
_hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::out |
|
||||
ios::app | ios::binary);
|
||||
if (!_hashedIndexFile.is_open()) {
|
||||
throw ConcordiaException("E03: Failed to open hashed index "
|
||||
"file for appending.");
|
||||
}
|
||||
} else {
|
||||
ConcordiaIndex::ConcordiaIndex(const string & wordMapFilePath,
|
||||
const string & hashedIndexFilePath,
|
||||
const string & suffixArrayFilePath)
|
||||
throw(ConcordiaException) :
|
||||
_hashedIndexFilePath(hashedIndexFilePath),
|
||||
_suffixArrayFilePath(suffixArrayFilePath) {
|
||||
if (boost::filesystem::exists(wordMapFilePath)) {
|
||||
if (!boost::filesystem::exists(hashedIndexFilePath)) {
|
||||
throw ConcordiaException("E01: Word map file exists "
|
||||
"but hashed index file absent.");
|
||||
}
|
||||
} else { // WordMap file does not exist
|
||||
if (boost::filesystem::exists(hashedIndexFilepath)) {
|
||||
if (boost::filesystem::exists(hashedIndexFilePath)) {
|
||||
throw ConcordiaException("E02: Hashed index file exists "
|
||||
"but word map file absent.");
|
||||
} else {
|
||||
_hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::out |
|
||||
ios::binary);
|
||||
if (!_hashedIndexFile.is_open()) {
|
||||
throw ConcordiaException("E04: Failed to open hashed index "
|
||||
"file for writing.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
_suffixArrayFile.open(suffixArrayFilepath.c_str(), ios::out |
|
||||
ios::binary);
|
||||
if (!_hashedIndexFile.is_open()) {
|
||||
throw ConcordiaException("E05: Failed to open suffix array "
|
||||
"file for writing.");
|
||||
}
|
||||
_hashGenerator = boost::shared_ptr<HashGenerator>(
|
||||
new HashGenerator(wordMapFilepath));
|
||||
new HashGenerator(wordMapFilePath));
|
||||
}
|
||||
|
||||
ConcordiaIndex::~ConcordiaIndex() {
|
||||
@ -50,23 +32,30 @@ void ConcordiaIndex::serializeWordMap() {
|
||||
}
|
||||
|
||||
void ConcordiaIndex::generateSuffixArray() {
|
||||
/* Get the file size. */
|
||||
long n = _hashedIndexFile.tellg();
|
||||
ifstream hashedIndexFile;
|
||||
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::in|
|
||||
ios::ate|ios::binary);
|
||||
|
||||
/* Get the file size. */
|
||||
long n = hashedIndexFile.tellg();
|
||||
|
||||
/* Allocate n elements for the text T and n elements for the suffix array SA. */
|
||||
sauchar_t *T;
|
||||
saidx_t *SA;
|
||||
|
||||
T = reinterpret_cast<sauchar_t *> (malloc((size_t)n * sizeof(sauchar_t)));
|
||||
SA = reinterpret_cast<saidx_t *> (malloc((size_t)n * sizeof(saidx_t)));
|
||||
if ((T == NULL) || (SA == NULL)) {
|
||||
throw ConcordiaException("Cannot allocate memory.");
|
||||
}
|
||||
T = new sauchar_t[n];
|
||||
SA = new saidx_t[n];
|
||||
|
||||
/* Read n bytes of data. */
|
||||
hashedIndexFile.seekg(0, ios::beg);
|
||||
|
||||
_hashedIndexFile.seekg(0, ios::beg);
|
||||
_hashedIndexFile.read(reinterpret_cast<char*> (T), (size_t)n);
|
||||
sauchar_t buff;
|
||||
int pos = 0;
|
||||
while (!hashedIndexFile.eof()) {
|
||||
hashedIndexFile.read(reinterpret_cast<char *>(&buff),
|
||||
sizeof(sauchar_t));
|
||||
T[pos++] = buff;
|
||||
}
|
||||
hashedIndexFile.close();
|
||||
|
||||
/* Construct the suffix array. */
|
||||
if (divsufsort(T, SA, (saidx_t)n) != 0) {
|
||||
@ -74,18 +63,32 @@ void ConcordiaIndex::generateSuffixArray() {
|
||||
}
|
||||
|
||||
/* Write the suffix array. */
|
||||
_suffixArrayFile << *SA;
|
||||
|
||||
ofstream suffixArrayFile;
|
||||
suffixArrayFile.open(_suffixArrayFilePath.c_str(), ios::out|ios::binary);
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
suffixArrayFile.write(reinterpret_cast<char *>(&SA[i]),
|
||||
sizeof(saidx_t));
|
||||
}
|
||||
suffixArrayFile.close();
|
||||
|
||||
/* Deallocate memory. */
|
||||
free(SA);
|
||||
free(T);
|
||||
delete[] T;
|
||||
delete[] SA;
|
||||
}
|
||||
|
||||
void ConcordiaIndex::addSentence(const string & sentence) {
|
||||
vector<sauchar_t> hash = _hashGenerator->generateHash(sentence);
|
||||
ofstream hashedIndexFile;
|
||||
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
|
||||
ios::app|ios::binary);
|
||||
for (vector<sauchar_t>::iterator it = hash.begin();
|
||||
it != hash.end(); ++it) {
|
||||
_hashedIndexFile << *it;
|
||||
sauchar_t buff = *it;
|
||||
hashedIndexFile.write(reinterpret_cast<char *>(&buff),
|
||||
sizeof(sauchar_t));
|
||||
}
|
||||
hashedIndexFile.close();
|
||||
}
|
||||
|
||||
|
@ -19,9 +19,9 @@ using namespace std;
|
||||
|
||||
class ConcordiaIndex {
|
||||
public:
|
||||
explicit ConcordiaIndex(const string & wordMapFilepath,
|
||||
const string & hashedIndexFilepath,
|
||||
const string & suffixArrayFilepath)
|
||||
explicit ConcordiaIndex(const string & wordMapFilePath,
|
||||
const string & hashedIndexFilePath,
|
||||
const string & suffixArrayFilePath)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*! Destructor.
|
||||
@ -37,9 +37,9 @@ public:
|
||||
private:
|
||||
boost::shared_ptr<HashGenerator> _hashGenerator;
|
||||
|
||||
fstream _hashedIndexFile;
|
||||
string _hashedIndexFilePath;
|
||||
|
||||
ofstream _suffixArrayFile;
|
||||
string _suffixArrayFilePath;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
90
concordia/index_searcher.cpp
Normal file
90
concordia/index_searcher.cpp
Normal file
@ -0,0 +1,90 @@
|
||||
#include "concordia/index_searcher.hpp"
|
||||
|
||||
#include <boost/filesystem.hpp>
|
||||
|
||||
IndexSearcher::IndexSearcher():
|
||||
_T(NULL),
|
||||
_SA(NULL),
|
||||
_n(0) {
|
||||
}
|
||||
|
||||
|
||||
/*! Destructor.
    Releases the text (_T) and suffix array (_SA) buffers that loadIndex()
    allocates with new[]. The constructor initialises both pointers to NULL,
    so the deletes are harmless when no index was ever loaded.
*/
IndexSearcher::~IndexSearcher() {
    // NOTE(review): the class owns raw arrays but still has the
    // compiler-generated copy operations; instances must not be copied,
    // otherwise both copies would free the same buffers.
    delete[] _SA;
    delete[] _T;
}
|
||||
|
||||
|
||||
void IndexSearcher::loadIndex(const string & wordMapFilepath,
|
||||
const string & hashedIndexFilepath,
|
||||
const string & suffixArrayFilepath)
|
||||
throw(ConcordiaException) {
|
||||
if (!boost::filesystem::exists(wordMapFilepath)) {
|
||||
throw ConcordiaException("E06: Failed to open word map "
|
||||
"file for reading.");
|
||||
}
|
||||
|
||||
if (!boost::filesystem::exists(hashedIndexFilepath)) {
|
||||
throw ConcordiaException("E07: Failed to open hashed index file "
|
||||
"for reading.");
|
||||
}
|
||||
|
||||
if (!boost::filesystem::exists(suffixArrayFilepath)) {
|
||||
throw ConcordiaException("E08: Failed to open suffix array file "
|
||||
"for reading.");
|
||||
}
|
||||
|
||||
_hashGenerator = boost::shared_ptr<HashGenerator>(
|
||||
new HashGenerator(wordMapFilepath));
|
||||
|
||||
ifstream hashedIndexFile;
|
||||
hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::in
|
||||
| ios::ate | ios::binary);
|
||||
_n = hashedIndexFile.tellg();
|
||||
_T = new sauchar_t[_n];
|
||||
|
||||
hashedIndexFile.seekg(0, ios::beg);
|
||||
hashedIndexFile.read(reinterpret_cast<char*> (_T), _n);
|
||||
hashedIndexFile.close();
|
||||
|
||||
_SA = new saidx_t[_n];
|
||||
|
||||
ifstream suffixArrayFile;
|
||||
suffixArrayFile.open(suffixArrayFilepath.c_str(), ios::in | ios::binary);
|
||||
|
||||
saidx_t buff;
|
||||
int pos = 0;
|
||||
while (!suffixArrayFile.eof() && pos < _n) {
|
||||
suffixArrayFile.read(reinterpret_cast<char *>(&buff), sizeof(saidx_t));
|
||||
_SA[pos++] = buff;
|
||||
}
|
||||
suffixArrayFile.close();
|
||||
}
|
||||
|
||||
/*! Finds all occurrences of a pattern in the loaded index.
    The pattern is converted to its hash (sequence of word codes) with the
    hash generator and looked up in the suffix array with sa_search().

    \param pattern sentence fragment to look for
    \returns positions in the hashed index at which the pattern occurs, in
             suffix array order; empty if sa_search() reports no match or
             an error
    \throws ConcordiaException if no index has been loaded yet
*/
vector<saidx_t> IndexSearcher::simpleSearch(const string & pattern)
                                             throw(ConcordiaException) {
    // All three members are only set by loadIndex(); searching before that
    // would dereference null pointers.
    if (_hashGenerator.get() == NULL || _T == NULL || _SA == NULL) {
        throw ConcordiaException("Index has not been loaded; "
                                 "call loadIndex() before searching.");
    }

    vector<sauchar_t> hash = _hashGenerator->generateHash(pattern);
    const saidx_t patternLength = static_cast<saidx_t>(hash.size());

    // Search directly in the vector's storage instead of copying it into a
    // separately allocated array. An empty hash still needs a valid,
    // non-null pointer to hand to sa_search().
    sauchar_t emptyPattern = 0;
    const sauchar_t * patternData = hash.empty() ? &emptyPattern : &hash[0];

    saidx_t left = 0;
    const saidx_t size = sa_search(_T, static_cast<saidx_t>(_n),
                                   patternData, patternLength,
                                   _SA, static_cast<saidx_t>(_n), &left);

    vector<saidx_t> result;
    // A negative size makes the loop a no-op.
    for (saidx_t i = 0; i < size; ++i) {
        result.push_back(_SA[left + i]);
    }
    return result;
}
|
||||
|
||||
|
||||
|
||||
|
45
concordia/index_searcher.hpp
Normal file
45
concordia/index_searcher.hpp
Normal file
@ -0,0 +1,45 @@
|
||||
#ifndef INDEX_SEARCHER_HDR
|
||||
#define INDEX_SEARCHER_HDR
|
||||
|
||||
#include <divsufsort.h>
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
|
||||
#include "concordia/hash_generator.hpp"
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
|
||||
/*!
|
||||
Class for searching the index with a sentence.
|
||||
|
||||
*/
|
||||
|
||||
using namespace std;
|
||||
|
||||
class IndexSearcher {
|
||||
public:
|
||||
explicit IndexSearcher();
|
||||
|
||||
/*! Destructor.
|
||||
*/
|
||||
virtual ~IndexSearcher();
|
||||
|
||||
void loadIndex(const string & wordMapFilepath,
|
||||
const string & hashedIndexFilepath,
|
||||
const string & suffixArrayFilepath)
|
||||
throw(ConcordiaException);
|
||||
|
||||
vector<saidx_t> simpleSearch(const string & pattern)
|
||||
throw(ConcordiaException);
|
||||
|
||||
private:
|
||||
boost::shared_ptr<HashGenerator> _hashGenerator;
|
||||
|
||||
sauchar_t * _T;
|
||||
|
||||
saidx_t * _SA;
|
||||
|
||||
size_t _n;
|
||||
};
|
||||
|
||||
#endif
|
@ -4,6 +4,7 @@ add_library(concordia-tests
|
||||
test_word_map.cpp
|
||||
test_hash_generator.cpp
|
||||
test_concordia_index.cpp
|
||||
test_index_searcher.cpp
|
||||
)
|
||||
|
||||
target_link_libraries(concordia-tests concordia ${LIBCONFIG_LIB} concordia-tests-common)
|
||||
|
@ -1,7 +1,10 @@
|
||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||
#include "concordia/concordia.hpp"
|
||||
#include "tests/common/test_resources_manager.hpp"
|
||||
#include "concordia/common/config.hpp"
|
||||
|
||||
#include <boost/algorithm/string/predicate.hpp>
|
||||
#include <boost/filesystem.hpp>
|
||||
|
||||
#include <string>
|
||||
|
||||
@ -16,4 +19,108 @@ BOOST_AUTO_TEST_CASE( ConcordiaVersion )
|
||||
BOOST_CHECK_EQUAL( version , "0.1");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
|
||||
{
|
||||
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||
concordia.addSentence("Ala ma kota");
|
||||
concordia.addSentence("Ala ma rysia");
|
||||
concordia.addSentence("Marysia ma rysia");
|
||||
|
||||
concordia.generateIndex();
|
||||
|
||||
/*The test index contains 3 sentences:
|
||||
"Ala ma kota"
|
||||
"Ala ma rysia"
|
||||
"Marysia ma rysia"
|
||||
|
||||
Test word map:
|
||||
Ala -> 0
|
||||
ma -> 1
|
||||
kota -> 2
|
||||
rysia -> 3
|
||||
Marysia -> 4
|
||||
|
||||
Test hashed index:
|
||||
n: 0 1 2 3 4 5 6 7 8
|
||||
T[n]: 0 1 2 0 1 3 4 1 3
|
||||
|
||||
Test suffix array:
|
||||
n: 0 1 2 3 4 5 6 7 8
|
||||
SA[n]: 0 3 1 7 4 2 8 5 6
|
||||
|
||||
*/
|
||||
|
||||
vector<saidx_t> expectedResult1;
|
||||
expectedResult1.push_back(7);
|
||||
expectedResult1.push_back(4);
|
||||
|
||||
vector<saidx_t> searchResult1 = concordia.simpleSearch("ma rysia");
|
||||
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_SUFFIX_ARRAY));
|
||||
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1.begin(), searchResult1.end(),
|
||||
expectedResult1.begin(), expectedResult1.end());
|
||||
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
||||
{
|
||||
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||
concordia.addSentence("to jest okno");
|
||||
concordia.addSentence("czy jest okno otwarte");
|
||||
concordia.addSentence("chyba to jest tutaj");
|
||||
concordia.addSentence("to jest");
|
||||
|
||||
concordia.generateIndex();
|
||||
|
||||
/*The test index contains 4 sentences:
|
||||
"to jest okno"
|
||||
"czy jest okno otwarte"
|
||||
"chyba to jest tutaj"
|
||||
"to jest"
|
||||
|
||||
Test word map:
|
||||
to -> 0
|
||||
jest -> 1
|
||||
okno -> 2
|
||||
czy -> 3
|
||||
otwarte -> 4
|
||||
chyba -> 5
|
||||
tutaj -> 6
|
||||
|
||||
Test hashed index:
|
||||
n: 0 1 2 3 4 5 6 7 8 9 10 11 12
|
||||
T[n]: 0 1 2 3 1 2 4 5 0 1 6 0 1
|
||||
|
||||
Test suffix array:
|
||||
n: 0 1 2 3 4 5 6 7 8 9 10 11 12
|
||||
SA[n]: 11 0 8 12 1 4 9 2 5 3 6 7 10
|
||||
|
||||
*/
|
||||
|
||||
vector<saidx_t> expectedResult1;
|
||||
expectedResult1.push_back(11);
|
||||
expectedResult1.push_back(0);
|
||||
expectedResult1.push_back(8);
|
||||
|
||||
vector<saidx_t> expectedResult2;
|
||||
expectedResult2.push_back(1);
|
||||
expectedResult2.push_back(4);
|
||||
|
||||
vector<saidx_t> searchResult1 = concordia.simpleSearch("to jest");
|
||||
vector<saidx_t> searchResult2 = concordia.simpleSearch("jest okno");
|
||||
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_SUFFIX_ARRAY));
|
||||
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1.begin(), searchResult1.end(),
|
||||
expectedResult1.begin(), expectedResult1.end());
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(searchResult2.begin(), searchResult2.end(),
|
||||
expectedResult2.begin(), expectedResult2.end());
|
||||
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
|
@ -13,8 +13,11 @@ BOOST_AUTO_TEST_SUITE(concordia_config)
|
||||
|
||||
BOOST_AUTO_TEST_CASE( ConfigParameters )
|
||||
{
|
||||
ConcordiaConfig config(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-test.cfg"));
|
||||
ConcordiaConfig config(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-mock.cfg"));
|
||||
BOOST_CHECK_EQUAL( config.getPuddleTagsetFilePath() , "puddle/tagset.txt" );
|
||||
BOOST_CHECK_EQUAL( config.getWordMapFilePath() , "tmp/wm.bin" );
|
||||
BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "tmp/hi.bin" );
|
||||
BOOST_CHECK_EQUAL( config.getSuffixArrayFilePath() , "tmp/sa.bin" );
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( NonexistentConfigTest )
|
||||
|
@ -13,9 +13,9 @@ BOOST_AUTO_TEST_SUITE(concordia_index)
|
||||
|
||||
BOOST_AUTO_TEST_CASE( ResourcesExistenceTest1 )
|
||||
{
|
||||
ConcordiaIndex index(TestResourcesManager::getTestWordMapFilePath("mock_word_map.bin"),
|
||||
TestResourcesManager::getTestHashIndexFilePath("mock_hash_index.bin"),
|
||||
TestResourcesManager::getTestSuffixArrayFilePath());
|
||||
ConcordiaIndex index(TestResourcesManager::getTestFilePath("concordia-index","mock_word_map.bin"),
|
||||
TestResourcesManager::getTestFilePath("concordia-index","mock_hash_index.bin"),
|
||||
TestResourcesManager::getTestFilePath("concordia-index","test_SA.bin"));
|
||||
|
||||
}
|
||||
|
||||
@ -26,9 +26,9 @@ BOOST_AUTO_TEST_CASE( ResourcesExistenceTest2 )
|
||||
string message = "";
|
||||
|
||||
try {
|
||||
ConcordiaIndex index(TestResourcesManager::getTestWordMapFilePath("mock_word_map.bin"),
|
||||
TestResourcesManager::getTestHashIndexFilePath("nonexistent.bin"),
|
||||
TestResourcesManager::getTestSuffixArrayFilePath());
|
||||
ConcordiaIndex index(TestResourcesManager::getTestFilePath("concordia-index","mock_word_map.bin"),
|
||||
TestResourcesManager::getTestFilePath("concordia-index","nonexistent.bin"),
|
||||
TestResourcesManager::getTestFilePath("concordia-index","test_SA.bin"));
|
||||
} catch (ConcordiaException & e) {
|
||||
exceptionThrown = true;
|
||||
message = e.what();
|
||||
@ -44,9 +44,9 @@ BOOST_AUTO_TEST_CASE( ResourcesExistenceTest3 )
|
||||
string message = "";
|
||||
|
||||
try {
|
||||
ConcordiaIndex index(TestResourcesManager::getTestWordMapFilePath("nonexistent.bin"),
|
||||
TestResourcesManager::getTestHashIndexFilePath("mock_hash_index.bin"),
|
||||
TestResourcesManager::getTestSuffixArrayFilePath());
|
||||
ConcordiaIndex index(TestResourcesManager::getTestFilePath("concordia-index","nonexistent.bin"),
|
||||
TestResourcesManager::getTestFilePath("concordia-index","mock_hash_index.bin"),
|
||||
TestResourcesManager::getTestFilePath("concordia-index","test_SA.bin"));
|
||||
} catch (ConcordiaException & e) {
|
||||
exceptionThrown = true;
|
||||
message = e.what();
|
||||
@ -58,20 +58,23 @@ BOOST_AUTO_TEST_CASE( ResourcesExistenceTest3 )
|
||||
|
||||
BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
|
||||
{
|
||||
ConcordiaIndex index(TestResourcesManager::getTestWordMapFilePath("test_word_map.bin"),
|
||||
TestResourcesManager::getTestHashIndexFilePath("test_hash_index.bin"),
|
||||
TestResourcesManager::getTestSuffixArrayFilePath());
|
||||
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"),
|
||||
TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
|
||||
TestResourcesManager::getTestFilePath("temp","test_SA.bin"));
|
||||
index.addSentence("Ala ma kota");
|
||||
index.addSentence("Ala ma rysia");
|
||||
index.addSentence("Marysia ma rysia");
|
||||
|
||||
index.generateSuffixArray();
|
||||
index.serializeWordMap();
|
||||
|
||||
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestWordMapFilePath("test_word_map.bin")));
|
||||
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestHashIndexFilePath("test_hash_index.bin")));
|
||||
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestSuffixArrayFilePath()));
|
||||
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_word_map.bin")));
|
||||
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin")));
|
||||
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_SA.bin")));
|
||||
|
||||
boost::filesystem::remove(TestResourcesManager::getTestWordMapFilePath("test_word_map.bin"));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestHashIndexFilePath("test_hash_index.bin"));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestSuffixArrayFilePath());
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_SA.bin"));
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
|
76
concordia/t/test_index_searcher.cpp
Normal file
76
concordia/t/test_index_searcher.cpp
Normal file
@ -0,0 +1,76 @@
|
||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||
|
||||
#include "concordia/index_searcher.hpp"
|
||||
#include "concordia/concordia_index.hpp"
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
#include "tests/common/test_resources_manager.hpp"
|
||||
|
||||
#include <boost/algorithm/string/predicate.hpp>
|
||||
#include <boost/filesystem.hpp>
|
||||
|
||||
using namespace std;
|
||||
|
||||
BOOST_AUTO_TEST_SUITE(index_searcher)
|
||||
|
||||
|
||||
BOOST_AUTO_TEST_CASE( SimpleSearchTest )
|
||||
{
|
||||
|
||||
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"),
|
||||
TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
|
||||
TestResourcesManager::getTestFilePath("temp","test_SA.bin"));
|
||||
index.addSentence("Ala ma kota");
|
||||
index.addSentence("Ala ma rysia");
|
||||
index.addSentence("Marysia ma rysia");
|
||||
|
||||
index.generateSuffixArray();
|
||||
index.serializeWordMap();
|
||||
|
||||
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_word_map.bin")));
|
||||
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin")));
|
||||
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_SA.bin")));
|
||||
|
||||
IndexSearcher searcher;
|
||||
searcher.loadIndex(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"),
|
||||
TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
|
||||
TestResourcesManager::getTestFilePath("temp","test_SA.bin"));
|
||||
|
||||
/*The test index contains 3 sentences:
|
||||
"Ala ma kota"
|
||||
"Ala ma rysia"
|
||||
"Marysia ma rysia"
|
||||
|
||||
Test word map:
|
||||
Ala -> 0
|
||||
ma -> 1
|
||||
kota -> 2
|
||||
rysia -> 3
|
||||
Marysia -> 4
|
||||
|
||||
Test hashed index:
|
||||
n: 0 1 2 3 4 5 6 7 8
|
||||
T[n]: 0 1 2 0 1 3 4 1 3
|
||||
|
||||
Test suffix array:
|
||||
n: 0 1 2 3 4 5 6 7 8
|
||||
SA[n]: 0 3 1 7 4 2 8 5 6
|
||||
|
||||
*/
|
||||
|
||||
vector<saidx_t> expectedResult1;
|
||||
expectedResult1.push_back(7);
|
||||
expectedResult1.push_back(4);
|
||||
|
||||
vector<saidx_t> searchResult1 = searcher.simpleSearch("ma rysia");
|
||||
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_SA.bin"));
|
||||
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1.begin(), searchResult1.end(),
|
||||
expectedResult1.begin(), expectedResult1.end());
|
||||
|
||||
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
@ -3,7 +3,6 @@
|
||||
#define PUDDLE_TEST_DIRECTORY "puddle"
|
||||
#define CONCORDIA_TAGSET_DIRECTORY "concordia-tagset"
|
||||
#define CONCORDIA_CONFIG_DIRECTORY "concordia-config"
|
||||
#define CONCORDIA_INDEX_DIRECTORY "concordia-index"
|
||||
|
||||
string TestResourcesManager::getPuddleFilePath(const string & filename) {
|
||||
string result = string(TEST_RESOURCES_DIRECTORY);
|
||||
@ -16,23 +15,13 @@ string TestResourcesManager::getTestConcordiaConfigFilePath(const string & filen
|
||||
return result + "/" + CONCORDIA_CONFIG_DIRECTORY + "/" + filename;
|
||||
}
|
||||
|
||||
string TestResourcesManager::getTestWordMapFilePath(const string & filename) {
|
||||
string result = string(TEST_RESOURCES_DIRECTORY);
|
||||
return result + "/" + CONCORDIA_INDEX_DIRECTORY + "/" + filename;
|
||||
}
|
||||
|
||||
string TestResourcesManager::getTestHashIndexFilePath(const string & filename) {
|
||||
string result = string(TEST_RESOURCES_DIRECTORY);
|
||||
return result + "/" + CONCORDIA_INDEX_DIRECTORY + "/" + filename;
|
||||
}
|
||||
|
||||
string TestResourcesManager::getTestSuffixArrayFilePath() {
|
||||
string result = string(TEST_RESOURCES_DIRECTORY);
|
||||
return result + "/" + CONCORDIA_INDEX_DIRECTORY + "/test_SA.bin";
|
||||
}
|
||||
|
||||
string TestResourcesManager::getProdConcordiaConfigFilePath(const string & filename) {
|
||||
string result = string(PROD_RESOURCES_DIRECTORY);
|
||||
return result + "/" + CONCORDIA_CONFIG_DIRECTORY + "/" + filename;
|
||||
}
|
||||
|
||||
// Builds the path "<TEST_RESOURCES_DIRECTORY>/<module>/<filename>" for a
// test resource that lives in the given module subdirectory.
string TestResourcesManager::getTestFilePath(const string & module, const string & filename) {
    string path(TEST_RESOURCES_DIRECTORY);
    path += "/";
    path += module;
    path += "/";
    path += filename;
    return path;
}
|
||||
|
||||
|
@ -14,13 +14,10 @@ public:
|
||||
|
||||
static string getTestConcordiaConfigFilePath(const string & filename);
|
||||
|
||||
static string getTestWordMapFilePath(const string & filename);
|
||||
|
||||
static string getTestHashIndexFilePath(const string & filename);
|
||||
|
||||
static string getTestSuffixArrayFilePath();
|
||||
|
||||
static string getProdConcordiaConfigFilePath(const string & filename);
|
||||
|
||||
static string getTestFilePath(const string & module, const string & filename);
|
||||
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -1,9 +1,15 @@
|
||||
#----------------------------
|
||||
# Concordia configuration file
|
||||
# Concordia mock configuration file
|
||||
#---------------------------
|
||||
#
|
||||
|
||||
#Path to the Puddle tagset
|
||||
puddle_tagset_path = "puddle/tagset.txt";
|
||||
|
||||
word_map_path = "tmp/wm.bin"
|
||||
|
||||
hashed_index_path = "tmp/hi.bin"
|
||||
|
||||
suffix_array_path = "tmp/sa.bin"
|
||||
|
||||
### eof
|
@ -6,4 +6,22 @@
|
||||
#Path to the Puddle tagset
|
||||
puddle_tagset_path = "@TEST_PUDDLE_TAGSET_PATH@";
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
#Word map, hashed index and suffix array files are in a temporary directory
|
||||
#and should be deleted at the end of each test procedure.
|
||||
|
||||
#Word map file containing unique codes for tokens
|
||||
|
||||
word_map_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
|
||||
|
||||
#File containing the "text" for suffix array searching, i.e. sequence of codes
|
||||
|
||||
hashed_index_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_HASHED_INDEX@"
|
||||
|
||||
#Binarized suffix array
|
||||
|
||||
suffix_array_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_SUFFIX_ARRAY@"
|
||||
#-------------------------------------------------------------------------------
|
||||
|
||||
|
||||
### eof
|
||||
|
BIN
tests/resources/index-searcher/test_SA.bin
Normal file
BIN
tests/resources/index-searcher/test_SA.bin
Normal file
Binary file not shown.
BIN
tests/resources/index-searcher/test_hash_index.bin
Normal file
BIN
tests/resources/index-searcher/test_hash_index.bin
Normal file
Binary file not shown.
BIN
tests/resources/index-searcher/test_word_map.bin
Normal file
BIN
tests/resources/index-searcher/test_word_map.bin
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user