redesigned project

Former-commit-id: d35841126fda627a04a1a16a26b91943401b6fcf
This commit is contained in:
rjawor 2013-12-14 15:23:17 +01:00
parent 9ff5f05205
commit b318770752
19 changed files with 331 additions and 555 deletions

View File

@ -3,6 +3,7 @@
#include <boost/program_options.hpp> #include <boost/program_options.hpp>
#include <boost/algorithm/string.hpp> #include <boost/algorithm/string.hpp>
#include <boost/date_time/posix_time/posix_time.hpp> #include <boost/date_time/posix_time/posix_time.hpp>
#include <boost/shared_ptr.hpp>
#include "concordia/concordia.hpp" #include "concordia/concordia.hpp"
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
@ -20,8 +21,6 @@ int main(int argc, char** argv) {
("help,h", "Display this message") ("help,h", "Display this message")
("config,c", boost::program_options::value<std::string>(), ("config,c", boost::program_options::value<std::string>(),
"Concordia configuration file (required)") "Concordia configuration file (required)")
("generate-index,g", "Generate suffix array based index out of "
"added sentences")
("simple-search,s", boost::program_options::value<std::string>(), ("simple-search,s", boost::program_options::value<std::string>(),
"Pattern to be searched in the index") "Pattern to be searched in the index")
("silent,n", "While searching, do not output search results") ("silent,n", "While searching, do not output search results")
@ -48,43 +47,34 @@ int main(int argc, char** argv) {
try { try {
std::cout << "\tInitializing concordia..." << std::endl;
boost::posix_time::ptime time_start =
boost::posix_time::microsec_clock::local_time();
Concordia concordia(configFile); Concordia concordia(configFile);
std::cout << "Welcome to Concordia. Version = "
<< concordia.getVersion() << std::endl;
if (cli.count("generate-index")) {
std::cout << "\tGenerating index..." << std::endl;
boost::posix_time::ptime time_start =
boost::posix_time::microsec_clock::local_time();
concordia.generateIndex();
boost::posix_time::ptime time_end = boost::posix_time::ptime time_end =
boost::posix_time::microsec_clock::local_time(); boost::posix_time::microsec_clock::local_time();
boost::posix_time::time_duration msdiff = time_end - time_start; boost::posix_time::time_duration msdiff = time_end - time_start;
std::cout << "\tIndex generated in: " << std::cout << "\tInitialization (loading index from disk "
msdiff.total_milliseconds() << "ms." << std::endl; << "and regenerating SA) took: "
} else if (cli.count("simple-search")) { << msdiff.total_milliseconds() << "ms." << std::endl;
std::cout << "\tLoading index..." << std::endl;
boost::posix_time::ptime time_start =
boost::posix_time::microsec_clock::local_time();
concordia.loadIndex();
boost::posix_time::ptime time_end =
boost::posix_time::microsec_clock::local_time();
boost::posix_time::time_duration msdiff = time_end - time_start;
std::cout << "\tIndex loaded in: " <<
msdiff.total_milliseconds() << "ms." << std::endl;
std::cout << "\tWelcome to Concordia. Version = "
<< concordia.getVersion() << std::endl;
if (cli.count("simple-search")) {
std::string pattern = cli["simple-search"].as<std::string>(); std::string pattern = cli["simple-search"].as<std::string>();
std::cout << "\tSearching for pattern: \"" << pattern << std::cout << "\tSearching for pattern: \"" << pattern <<
"\"" << std::endl; "\"" << std::endl;
time_start = boost::posix_time::microsec_clock::local_time(); time_start = boost::posix_time::microsec_clock::local_time();
vector<saidx_t> result = concordia.simpleSearch(pattern); boost::shared_ptr<vector<saidx_t> > result =
concordia.simpleSearch(pattern);
time_end = boost::posix_time::microsec_clock::local_time(); time_end = boost::posix_time::microsec_clock::local_time();
msdiff = time_end - time_start; msdiff = time_end - time_start;
std::cout << "\tFound: " << result.size() << " matches. " std::cout << "\tFound: " << result->size() << " matches. "
<< "Search took: " << << "Search took: " <<
msdiff.total_milliseconds() << "ms." << std::endl; msdiff.total_milliseconds() << "ms." << std::endl;
if (!cli.count("silent")) { if (!cli.count("silent")) {
for (vector<saidx_t>::iterator it = result.begin(); for (vector<saidx_t>::iterator it = result->begin();
it != result.end(); ++it) { it != result->end(); ++it) {
std::cout << "\t\tfound match on word number: " << *it std::cout << "\t\tfound match on word number: " << *it
<< std::endl; << std::endl;
} }
@ -97,15 +87,16 @@ int main(int argc, char** argv) {
std::string line; std::string line;
if (text_file.is_open()) { if (text_file.is_open()) {
long lineCount = 0; long lineCount = 0;
vector<std::string> buffer; boost::shared_ptr<std::vector<std::string> >
buffer(new std::vector<std::string>());
boost::posix_time::ptime timeStart = boost::posix_time::ptime timeStart =
boost::posix_time::microsec_clock::local_time(); boost::posix_time::microsec_clock::local_time();
while (getline(text_file, line)) { while (getline(text_file, line)) {
lineCount++; lineCount++;
buffer.push_back(line); buffer->push_back(line);
if (lineCount % READ_BUFFER_LENGTH == 0) { if (lineCount % READ_BUFFER_LENGTH == 0) {
concordia.addAllSentences(buffer); concordia.addAllSentences(buffer);
buffer.clear(); buffer->clear();
boost::posix_time::ptime timeEnd = boost::posix_time::ptime timeEnd =
boost::posix_time::microsec_clock::local_time(); boost::posix_time::microsec_clock::local_time();
boost::posix_time::time_duration msdiff = boost::posix_time::time_duration msdiff =
@ -119,7 +110,7 @@ int main(int argc, char** argv) {
" sentences per second" << std::endl; " sentences per second" << std::endl;
} }
} }
if (buffer.size() > 0) { if (buffer->size() > 0) {
concordia.addAllSentences(buffer); concordia.addAllSentences(buffer);
} }
text_file.close(); text_file.close();
@ -146,7 +137,7 @@ int main(int argc, char** argv) {
return 1; return 1;
} }
std::cout << "Concordia operation completed without errors." std::cout << "\tConcordia operation completed without errors."
<< std::endl; << std::endl;
} catch(ConcordiaException & e) { } catch(ConcordiaException & e) {
std::cerr << "ConcordiaException caught with message: " std::cerr << "ConcordiaException caught with message: "

View File

@ -1,15 +1,18 @@
#!/bin/sh
# Runs Concordia on the large test corpus: decompresses the corpus, clears
# the temp directory, indexes the corpus, runs two silent pattern searches
# and finally removes the decompressed corpus again.

CONSOLE=./build/concordia-console/concordia-console
CONFIG=prod/resources/concordia-config/concordia.cfg
CORPUS=prod/resources/text-files/large.txt

echo "CONCORDIA RUNNER: Decompressing test file"
bunzip2 --keep "$CORPUS.bz2"

echo "CONCORDIA RUNNER: Running Concordia"
# -f: an already-empty temp directory must not make rm report an error.
rm -f prod/resources/temp/*

echo "CONCORDIA RUNNER: reading from file"
"$CONSOLE" -c "$CONFIG" -r "$CORPUS"

echo "CONCORDIA RUNNER: searching for pattern: \"drawn from his own\""
"$CONSOLE" -c "$CONFIG" -s "drawn from his own" -n

echo "CONCORDIA RUNNER: searching for pattern: \"it is\""
"$CONSOLE" -c "$CONFIG" -s "it is" -n

rm "$CORPUS"

View File

@ -1,9 +1,11 @@
#!/bin/sh
# Runs Concordia on the medium test corpus: clears the temp directory,
# indexes the corpus and runs two pattern searches (the first one prints
# its matches, the second one is silent).

CONSOLE=./build/concordia-console/concordia-console
CONFIG=prod/resources/concordia-config/concordia.cfg
CORPUS=prod/resources/text-files/medium.txt

echo "CONCORDIA RUNNER: Running Concordia"
# -f: an already-empty temp directory must not make rm report an error.
rm -f prod/resources/temp/*

echo "CONCORDIA RUNNER: reading from file"
"$CONSOLE" -c "$CONFIG" -r "$CORPUS"

echo "CONCORDIA RUNNER: searching for pattern: \"drawn from his own\""
"$CONSOLE" -c "$CONFIG" -s "drawn from his own"

echo "CONCORDIA RUNNER: searching for pattern: \"it is\""
"$CONSOLE" -c "$CONFIG" -s "it is" -n

View File

@ -18,20 +18,29 @@ INDEX_CHARACTER_TYPE Utils::readIndexCharacter(ifstream & file) {
} }
// Serializes a vector of index characters into a freshly allocated byte
// array of input->size() * sizeof(INDEX_CHARACTER_TYPE) elements.
// The array is allocated with new[] and is not released here; the caller
// receives the raw pointer.
sauchar_t * Utils::indexVectorToSaucharArray(
        boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > input) {
    const int kArraySize = input->size()*sizeof(INDEX_CHARACTER_TYPE);
    sauchar_t * bytes = new sauchar_t[kArraySize];

    // Each character occupies sizeof(INDEX_CHARACTER_TYPE) consecutive slots.
    int offset = 0;
    vector<INDEX_CHARACTER_TYPE>::iterator cur = input->begin();
    while (cur != input->end()) {
        _insertCharToSaucharArray(bytes, *cur, offset);
        offset += sizeof(INDEX_CHARACTER_TYPE);
        ++cur;
    }
    return bytes;
}
void Utils::insertCharToSaucharArray(sauchar_t * array, void Utils::appendCharToSaucharVector(
boost::shared_ptr<std::vector<sauchar_t> > vector,
INDEX_CHARACTER_TYPE character) {
sauchar_t * characterArray = reinterpret_cast<sauchar_t *>(&character);
for (int i = 0; i < sizeof(character); i++) {
vector->push_back(characterArray[i]);
}
}
void Utils::_insertCharToSaucharArray(sauchar_t * array,
INDEX_CHARACTER_TYPE character, int pos) { INDEX_CHARACTER_TYPE character, int pos) {
sauchar_t * characterArray = reinterpret_cast<sauchar_t *>(&character); sauchar_t * characterArray = reinterpret_cast<sauchar_t *>(&character);
for (int i = pos; i < pos+sizeof(character); i++) { for (int i = pos; i < pos+sizeof(character); i++) {

View File

@ -26,11 +26,14 @@ public:
static INDEX_CHARACTER_TYPE readIndexCharacter(ifstream & file); static INDEX_CHARACTER_TYPE readIndexCharacter(ifstream & file);
static sauchar_t * indexVectorToSaucharArray( static sauchar_t * indexVectorToSaucharArray(
vector<INDEX_CHARACTER_TYPE> & input); boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > input);
static void insertCharToSaucharArray(sauchar_t * array, static void appendCharToSaucharVector(
INDEX_CHARACTER_TYPE character, int pos); boost::shared_ptr<std::vector<sauchar_t> > vector,
INDEX_CHARACTER_TYPE character);
private: private:
static void _insertCharToSaucharArray(sauchar_t * array,
INDEX_CHARACTER_TYPE character, int pos);
}; };
#endif #endif

View File

@ -2,6 +2,7 @@
#include "concordia/concordia.hpp" #include "concordia/concordia.hpp"
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include "concordia/common/utils.hpp"
// =========================================== // ===========================================
@ -18,10 +19,9 @@ Concordia::Concordia(const std::string & configFilePath)
_config = boost::shared_ptr<ConcordiaConfig> ( _config = boost::shared_ptr<ConcordiaConfig> (
new ConcordiaConfig(configFilePath)); new ConcordiaConfig(configFilePath));
_index = boost::shared_ptr<ConcordiaIndex>( _index = boost::shared_ptr<ConcordiaIndex>(
new ConcordiaIndex(_config->getWordMapFilePath(), new ConcordiaIndex(_config->getHashedIndexFilePath()));
_config->getHashedIndexFilePath(),
_config->getSuffixArrayFilePath()));
_searcher = boost::shared_ptr<IndexSearcher>(new IndexSearcher()); _searcher = boost::shared_ptr<IndexSearcher>(new IndexSearcher());
_initializeIndex();
} }
Concordia::~Concordia() { Concordia::~Concordia() {
@ -41,29 +41,83 @@ std::string _createLibraryVersion() {
return version.str(); return version.str();
} }
// Adds a single sentence to the index.
// The added side of this diff forwards to ConcordiaIndex::addSentence,
// passing the shared hash generator and the in-memory text T along with
// the sentence.
// Sentences are written to disk and added to T.
// SA is generated on command by different methods.
void Concordia::addSentence(const std::string & sentence) void Concordia::addSentence(const std::string & sentence)
throw(ConcordiaException) { throw(ConcordiaException) {
_index->addSentence(sentence); _index->addSentence(_hashGenerator, _T, sentence);
} }
// Adds a batch of sentences to the index.
// The added side of this diff takes the batch as a shared pointer and
// forwards it to ConcordiaIndex::addAllSentences together with the shared
// hash generator and the in-memory text T.
void Concordia::addAllSentences(vector<std::string> & sentences) // Sentences are written to disk and added to T.
// SA is generated on command by different methods.
void Concordia::addAllSentences(
boost::shared_ptr<std::vector<std::string> > sentences)
throw(ConcordiaException) { throw(ConcordiaException) {
_index->addAllSentences(sentences); _index->addAllSentences(_hashGenerator, _T, sentences);
} }
void Concordia::generateIndex() throw(ConcordiaException) { void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) {
_index->generateSuffixArray(); if (boost::filesystem::exists(_config->getWordMapFilePath())
&& boost::filesystem::exists(_config->getHashedIndexFilePath())) {
// reading index from files
_T->clear();
ifstream hashedIndexFile;
hashedIndexFile.open(_config->getHashedIndexFilePath().c_str(), ios::in
| ios::ate | ios::binary);
saidx_t fileSize = hashedIndexFile.tellg();
if (fileSize > 0) {
hashedIndexFile.seekg(0, ios::beg);
while (!hashedIndexFile.eof()) {
INDEX_CHARACTER_TYPE character =
Utils::readIndexCharacter(hashedIndexFile);
Utils::appendCharToSaucharVector(_T, character);
}
hashedIndexFile.close();
// generating suffix array
_SA = _index->generateSuffixArray(_hashGenerator, _T);
} else {
throw ConcordiaException("Index corrupt: empty hash index file");
}
} else {
throw ConcordiaException("Index corrupt: missing files");
}
} }
// Added side of this diff: regenerates the suffix array _SA from the
// current in-memory text _T via ConcordiaIndex::generateSuffixArray.
// Removed side: the former loadIndex(), which asked the searcher to load
// the word map, hashed index and suffix array files.
void Concordia::loadIndex() throw(ConcordiaException) { void Concordia::refreshSAfromRAM() throw(ConcordiaException) {
_searcher->loadIndex(_config->getWordMapFilePath(), _SA = _index->generateSuffixArray(_hashGenerator, _T);
_config->getHashedIndexFilePath(),
_config->getSuffixArrayFilePath());
} }
std::vector<saidx_t> Concordia::simpleSearch(const std::string & pattern)
// Creates the hash generator and an empty text T, then brings the RAM index
// into a usable state:
//   - both index files present -> load T from disk and build SA,
//   - both index files absent  -> start with an empty SA,
//   - exactly one present      -> ConcordiaException("Index corrupt: missing
//                                 files").
void Concordia::_initializeIndex() throw(ConcordiaException) {
    _hashGenerator = boost::shared_ptr<HashGenerator>(
        new HashGenerator(_config->getWordMapFilePath()));
    _T = boost::shared_ptr<std::vector<sauchar_t> >(
        new std::vector<sauchar_t>);

    // Probe each file once so that all three branches below decide on the
    // same snapshot instead of re-querying the file system per condition.
    const bool wordMapPresent =
        boost::filesystem::exists(_config->getWordMapFilePath());
    const bool hashedIndexPresent =
        boost::filesystem::exists(_config->getHashedIndexFilePath());

    if (wordMapPresent && hashedIndexPresent) {
        loadRAMIndexFromDisk();
    } else if (!wordMapPresent && !hashedIndexPresent) {
        // empty index
        _SA = boost::shared_ptr<std::vector<saidx_t> >(
            new std::vector<saidx_t>);
    } else {
        throw ConcordiaException("Index corrupt: missing files");
    }
}
// Looks up a pattern in the index.
// With an empty text T there is nothing to search, so a fresh empty result
// vector is returned; otherwise the call is delegated to the searcher with
// the hash generator, T and SA.
boost::shared_ptr<std::vector<saidx_t> > Concordia::simpleSearch(
        const string & pattern)
        throw(ConcordiaException) {
    if (_T->size() == 0) {
        return boost::shared_ptr<std::vector<saidx_t> >(
            new std::vector<saidx_t>);
    }
    return _searcher->simpleSearch(_hashGenerator, _T, _SA, pattern);
}

View File

@ -4,6 +4,7 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include <boost/shared_ptr.hpp> #include <boost/shared_ptr.hpp>
#include <boost/filesystem.hpp>
#include "concordia/concordia_config.hpp" #include "concordia/concordia_config.hpp"
#include "concordia/concordia_index.hpp" #include "concordia/concordia_index.hpp"
@ -35,17 +36,20 @@ public:
void addSentence(const std::string & sentence) throw(ConcordiaException); void addSentence(const std::string & sentence) throw(ConcordiaException);
void addAllSentences(vector<std::string> & sentences) void addAllSentences(boost::shared_ptr<std::vector<std::string> > sentences)
throw(ConcordiaException); throw(ConcordiaException);
void generateIndex() throw(ConcordiaException); boost::shared_ptr<std::vector<saidx_t> > simpleSearch(
const std::string & pattern)
void loadIndex() throw(ConcordiaException);
std::vector<saidx_t> simpleSearch(const std::string & pattern)
throw(ConcordiaException); throw(ConcordiaException);
void loadRAMIndexFromDisk() throw(ConcordiaException);
void refreshSAfromRAM() throw(ConcordiaException);
private: private:
void _initializeIndex() throw(ConcordiaException);
static std::string _libraryVersion; static std::string _libraryVersion;
boost::shared_ptr<ConcordiaConfig> _config; boost::shared_ptr<ConcordiaConfig> _config;
@ -53,6 +57,12 @@ private:
boost::shared_ptr<ConcordiaIndex> _index; boost::shared_ptr<ConcordiaIndex> _index;
boost::shared_ptr<IndexSearcher> _searcher; boost::shared_ptr<IndexSearcher> _searcher;
boost::shared_ptr<HashGenerator> _hashGenerator;
boost::shared_ptr<std::vector<sauchar_t> > _T;
boost::shared_ptr<std::vector<saidx_t> > _SA;
}; };
#endif #endif

View File

@ -4,118 +4,71 @@
#include <boost/filesystem.hpp> #include <boost/filesystem.hpp>
#include <iostream> #include <iostream>
// Constructor.
// Added side of this diff: takes only the hashed index file path and stores
// it in _hashedIndexFilePath; the body is empty.
// Removed side: also took the word map and suffix array paths, threw E01/E02
// when only one of word map / hashed index existed, and created the
// HashGenerator itself.
ConcordiaIndex::ConcordiaIndex(const string & wordMapFilePath, ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath)
const string & hashedIndexFilePath,
const string & suffixArrayFilePath)
throw(ConcordiaException) : throw(ConcordiaException) :
_hashedIndexFilePath(hashedIndexFilePath), _hashedIndexFilePath(hashedIndexFilePath) {
_suffixArrayFilePath(suffixArrayFilePath) {
if (boost::filesystem::exists(wordMapFilePath)) {
if (!boost::filesystem::exists(hashedIndexFilePath)) {
throw ConcordiaException("E01: Word map file exists "
"but hashed index file absent.");
}
} else { // WordMap file does not exist
if (boost::filesystem::exists(hashedIndexFilePath)) {
throw ConcordiaException("E02: Hashed index file exists "
"but word map file absent.");
}
}
_hashGenerator = boost::shared_ptr<HashGenerator>(
new HashGenerator(wordMapFilePath));
} }
// Destructor; the body is empty on both sides of the diff.
ConcordiaIndex::~ConcordiaIndex() { ConcordiaIndex::~ConcordiaIndex() {
} }
// Builds the suffix array of the byte text T.
//
// hashGenerator - not used by this function; kept for interface
//                 compatibility with the declaration.
// T             - text to index.
//
// Returns a vector with T->size() entries filled in by divsufsort (empty
// when T is empty).
// Throws ConcordiaException("Error creating suffix array.") when divsufsort
// reports a failure.
boost::shared_ptr<vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<vector<sauchar_t> > T) {
    // Let divsufsort write straight into the result vector: no temporary
    // new[] buffer that would leak when the exception below is thrown, and
    // no element-by-element copy afterwards.
    boost::shared_ptr<vector<saidx_t> > result(
        new vector<saidx_t>(T->size()));

    // An empty text has an empty suffix array. Returning early also avoids
    // handing divsufsort the data() pointers of empty vectors, which are
    // allowed to be null.
    if (T->empty()) {
        return result;
    }

    if (divsufsort(T->data(), result->data(),
                   static_cast<saidx_t>(T->size())) != 0) {
        throw ConcordiaException("Error creating suffix array.");
    }
    return result;
}
void ConcordiaIndex::addSentence(const string & sentence) { void ConcordiaIndex::addSentence(boost::shared_ptr<HashGenerator> hashGenerator,
vector<INDEX_CHARACTER_TYPE> hash = _hashGenerator->generateHash(sentence); boost::shared_ptr<vector<sauchar_t> > T,
const string & sentence) {
ofstream hashedIndexFile; ofstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out| hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
ios::app|ios::binary); ios::app|ios::binary);
for (vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin(); _addSingleSentence(hashedIndexFile, hashGenerator, T, sentence);
it != hash.end(); ++it) {
Utils::writeIndexCharacter(hashedIndexFile, *it);
}
hashedIndexFile.close(); hashedIndexFile.close();
_serializeWordMap(); hashGenerator->serializeWordMap();
} }
void ConcordiaIndex::addAllSentences(vector<std::string> & sentences) { void ConcordiaIndex::addAllSentences(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<vector<sauchar_t> > T,
boost::shared_ptr<vector<string> > sentences) {
ofstream hashedIndexFile; ofstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out| hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
ios::app|ios::binary); ios::app|ios::binary);
for (vector<string>::iterator sent_it = sentences.begin(); for (vector<string>::iterator sent_it = sentences->begin();
sent_it != sentences.end(); ++sent_it) { sent_it != sentences->end(); ++sent_it) {
string sentence = *sent_it; string sentence = *sent_it;
vector<INDEX_CHARACTER_TYPE> hash = _addSingleSentence(hashedIndexFile, hashGenerator, T, sentence);
_hashGenerator->generateHash(sentence);
for (vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
it != hash.end(); ++it) {
Utils::writeIndexCharacter(hashedIndexFile, *it);
}
} }
hashedIndexFile.close(); hashedIndexFile.close();
_serializeWordMap(); hashGenerator->serializeWordMap();
}
// Hashes one sentence and stores every resulting index character twice:
// in the already opened hashed index file and at the end of the in-memory
// text T.
void ConcordiaIndex::_addSingleSentence(
        ofstream & hashedIndexFile,
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        const string & sentence) {
    boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > codes
        = hashGenerator->generateHash(sentence);

    vector<INDEX_CHARACTER_TYPE>::iterator cur = codes->begin();
    while (cur != codes->end()) {
        INDEX_CHARACTER_TYPE code = *cur;
        // Disk first, then RAM.
        Utils::writeIndexCharacter(hashedIndexFile, code);
        Utils::appendCharToSaucharVector(T, code);
        ++cur;
    }
}

View File

@ -19,29 +19,35 @@ using namespace std;
// Declaration of ConcordiaIndex as changed by this commit.
// Added side: the class keeps only the hashed index file path; the hash
// generator and the in-memory text T are passed into every operation, and
// generateSuffixArray returns the suffix array instead of returning void.
// Removed side: the class owned a HashGenerator and a suffix array path.
class ConcordiaIndex { class ConcordiaIndex {
public: public:
// Constructor; the added side takes the hashed index file path only.
explicit ConcordiaIndex(const string & wordMapFilePath, explicit ConcordiaIndex(const string & hashedIndexFilePath)
const string & hashedIndexFilePath,
const string & suffixArrayFilePath)
throw(ConcordiaException); throw(ConcordiaException);
/*! Destructor. /*! Destructor.
*/ */
virtual ~ConcordiaIndex(); virtual ~ConcordiaIndex();
// Adds one sentence to the disk index and to T.
void addSentence(const string & sentence); void addSentence(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<vector<sauchar_t> > T,
const string & sentence);
// Adds a batch of sentences to the disk index and to T.
void addAllSentences(vector<string> & sentences); void addAllSentences(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<vector<sauchar_t> > T,
boost::shared_ptr<vector<string> > sentences);
// Builds and returns the suffix array of T (added side).
void generateSuffixArray(); boost::shared_ptr<vector<saidx_t> > generateSuffixArray(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<vector<sauchar_t> > T);
private: private:
void _serializeWordMap(); // Add sentence to disk index and update RAM index.
void _addSingleSentence(ofstream & hashedIndexFile,
boost::shared_ptr<HashGenerator> _hashGenerator; boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
const string & sentence);
// Path of the hashed index file that sentences are appended to.
string _hashedIndexFilePath; string _hashedIndexFilePath;
string _suffixArrayFilePath;
}; };
#endif #endif

View File

@ -20,17 +20,18 @@ HashGenerator::HashGenerator(const string & wordMapFilePath)
// Destructor; the body is empty on both sides of the diff.
HashGenerator::~HashGenerator() { HashGenerator::~HashGenerator() {
} }
vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash( boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > HashGenerator::generateHash(
const string & sentence) { const string & sentence) {
vector<INDEX_CHARACTER_TYPE> result; boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> >
vector<string> tokenTexts; result(new vector<INDEX_CHARACTER_TYPE>());
boost::split(tokenTexts, sentence, boost::is_any_of(" ")); boost::shared_ptr<vector<string> > tokenTexts(new vector<string>());
boost::split(*tokenTexts, sentence, boost::is_any_of(" "));
for (vector<string>::iterator it = tokenTexts.begin(); for (vector<string>::iterator it = tokenTexts->begin();
it != tokenTexts.end(); ++it) { it != tokenTexts->end(); ++it) {
string token = *it; string token = *it;
INDEX_CHARACTER_TYPE code = _wordMap->getWordCode(token); INDEX_CHARACTER_TYPE code = _wordMap->getWordCode(token);
result.push_back(code); result->push_back(code);
} }
return result; return result;

View File

@ -26,7 +26,8 @@ public:
*/ */
virtual ~HashGenerator(); virtual ~HashGenerator();
vector<INDEX_CHARACTER_TYPE> generateHash(const string & sentence); boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> >
generateHash(const string & sentence);
void serializeWordMap(); void serializeWordMap();

View File

@ -3,89 +3,38 @@
#include "concordia/common/utils.hpp" #include "concordia/common/utils.hpp"
#include <boost/filesystem.hpp> #include <boost/filesystem.hpp>
// Constructor.
// Added side of this diff: empty body with no member initializers.
// Removed side: initialized _T and _SA to NULL and _n to 0.
IndexSearcher::IndexSearcher(): IndexSearcher::IndexSearcher() {
_T(NULL),
_SA(NULL),
_n(0) {
} }
// Destructor; the body is empty on both sides of the diff.
IndexSearcher::~IndexSearcher() { IndexSearcher::~IndexSearcher() {
} }
boost::shared_ptr<vector<saidx_t> > IndexSearcher::simpleSearch(
void IndexSearcher::loadIndex(const string & wordMapFilepath, boost::shared_ptr<HashGenerator> hashGenerator,
const string & hashedIndexFilepath, boost::shared_ptr<std::vector<sauchar_t> > T,
const string & suffixArrayFilepath) boost::shared_ptr<std::vector<saidx_t> > SA,
throw(ConcordiaException) { const string & pattern) throw(ConcordiaException) {
if (!boost::filesystem::exists(wordMapFilepath)) { boost::shared_ptr<vector<saidx_t> > result =
throw ConcordiaException("E06: Failed to open word map " boost::shared_ptr<vector<saidx_t> >(new vector<saidx_t>());
"file for reading.");
}
if (!boost::filesystem::exists(hashedIndexFilepath)) {
throw ConcordiaException("E07: Failed to open hashed index file "
"for reading.");
}
if (!boost::filesystem::exists(suffixArrayFilepath)) {
throw ConcordiaException("E08: Failed to open suffix array file "
"for reading.");
}
_hashGenerator = boost::shared_ptr<HashGenerator>(
new HashGenerator(wordMapFilepath));
ifstream hashedIndexFile;
hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::in
| ios::ate | ios::binary);
_n = hashedIndexFile.tellg();
hashedIndexFile.seekg(0, ios::beg);
_T = new sauchar_t[_n];
int pos = 0;
while (!hashedIndexFile.eof()) {
INDEX_CHARACTER_TYPE character =
Utils::readIndexCharacter(hashedIndexFile);
Utils::insertCharToSaucharArray(_T, character, pos);
pos+=sizeof(character);
}
hashedIndexFile.close();
_SA = new saidx_t[_n];
ifstream suffixArrayFile;
suffixArrayFile.open(suffixArrayFilepath.c_str(), ios::in | ios::binary);
saidx_t saidx_buff;
pos = 0;
while (!suffixArrayFile.eof() && pos < _n) {
suffixArrayFile.read(reinterpret_cast<char *>(&saidx_buff),
sizeof(saidx_t));
_SA[pos++] = saidx_buff;
}
suffixArrayFile.close();
}
vector<saidx_t> IndexSearcher::simpleSearch(const string & pattern)
throw(ConcordiaException) {
vector<saidx_t> result;
int left; int left;
vector<INDEX_CHARACTER_TYPE> hash = _hashGenerator->generateHash(pattern); boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash =
saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE); hashGenerator->generateHash(pattern);
saidx_t patternLength = hash->size()*sizeof(INDEX_CHARACTER_TYPE);
sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash); sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
int size = sa_search(_T, (saidx_t) _n, int size = sa_search(T->data(), (saidx_t) T->size(),
(const sauchar_t *) patternArray, patternLength, (const sauchar_t *) patternArray, patternLength,
_SA, (saidx_t) _n, &left); SA->data(), (saidx_t) T->size(), &left);
for (int i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
saidx_t result_pos = _SA[left + i]; saidx_t result_pos = SA->at(left + i);
if (result_pos % sizeof(INDEX_CHARACTER_TYPE) == 0) { if (result_pos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
// As we are looking for a pattern in an array of higher // As we are looking for a pattern in an array of higher
// resolution than the hashed index file, we might // resolution than the hashed index file, we might
// obtain accidental results exceeding the boundaries // obtain accidental results exceeding the boundaries
// of characters in hashed index. The above check // of characters in hashed index. The above check
// removes these accidental results. // removes these accidental results.
result.push_back(result_pos / sizeof(INDEX_CHARACTER_TYPE)); result->push_back(result_pos / sizeof(INDEX_CHARACTER_TYPE));
} }
} }

View File

@ -25,22 +25,12 @@ public:
*/ */
virtual ~IndexSearcher(); virtual ~IndexSearcher();
void loadIndex(const string & wordMapFilepath, boost::shared_ptr<vector<saidx_t> > simpleSearch(
const string & hashedIndexFilepath, boost::shared_ptr<HashGenerator> hashGenerator,
const string & suffixArrayFilepath) boost::shared_ptr<std::vector<sauchar_t> > T,
throw(ConcordiaException); boost::shared_ptr<std::vector<saidx_t> > SA,
const string & pattern) throw(ConcordiaException);
vector<saidx_t> simpleSearch(const string & pattern)
throw(ConcordiaException);
private: private:
boost::shared_ptr<HashGenerator> _hashGenerator;
sauchar_t * _T;
saidx_t * _SA;
saidx_t _n;
}; };
#endif #endif

View File

@ -4,7 +4,6 @@ add_library(concordia-tests
test_word_map.cpp test_word_map.cpp
test_hash_generator.cpp test_hash_generator.cpp
test_concordia_index.cpp test_concordia_index.cpp
test_index_searcher.cpp
test_concordia_config.cpp test_concordia_config.cpp
test_concordia.cpp test_concordia.cpp
) )

View File

@ -25,8 +25,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
concordia.addSentence("Ala ma kota"); concordia.addSentence("Ala ma kota");
concordia.addSentence("Ala ma rysia"); concordia.addSentence("Ala ma rysia");
concordia.addSentence("Marysia ma rysia"); concordia.addSentence("Marysia ma rysia");
concordia.refreshSAfromRAM();
concordia.generateIndex();
/*The test index contains 3 sentences: /*The test index contains 3 sentences:
"Ala ma kota" "Ala ma kota"
@ -50,34 +49,30 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
*/ */
vector<saidx_t> expectedResult1; boost::shared_ptr<std::vector<saidx_t> > expectedResult1(new std::vector<saidx_t>());
expectedResult1.push_back(7); expectedResult1->push_back(7);
expectedResult1.push_back(4); expectedResult1->push_back(4);
concordia.loadIndex(); boost::shared_ptr<std::vector<saidx_t> > searchResult1 = concordia.simpleSearch("ma rysia");
vector<saidx_t> searchResult1 = concordia.simpleSearch("ma rysia");
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX)); boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_SUFFIX_ARRAY));
BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1.begin(), searchResult1.end(), BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1->begin(), searchResult1->end(),
expectedResult1.begin(), expectedResult1.end()); expectedResult1->begin(), expectedResult1->end());
} }
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 ) BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
{ {
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
vector<string> testSentences; boost::shared_ptr<vector<string> > testSentences (new vector<string>());
testSentences.push_back("to jest okno"); testSentences->push_back("to jest okno");
testSentences.push_back("czy jest okno otwarte"); testSentences->push_back("czy jest okno otwarte");
testSentences.push_back("chyba to jest tutaj"); testSentences->push_back("chyba to jest tutaj");
testSentences.push_back("to jest"); testSentences->push_back("to jest");
concordia.addAllSentences(testSentences); concordia.addAllSentences(testSentences);
concordia.generateIndex();
/*The test index contains 4 sentences: /*The test index contains 4 sentences:
"to jest okno" "to jest okno"
"czy jest okno otwarte" "czy jest okno otwarte"
@ -103,27 +98,26 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
*/ */
vector<saidx_t> expectedResult1; boost::shared_ptr<vector<saidx_t> > expectedResult1(new vector<saidx_t>());
expectedResult1.push_back(11); expectedResult1->push_back(11);
expectedResult1.push_back(0); expectedResult1->push_back(0);
expectedResult1.push_back(8); expectedResult1->push_back(8);
vector<saidx_t> expectedResult2; boost::shared_ptr<vector<saidx_t> > expectedResult2(new vector<saidx_t>());
expectedResult2.push_back(1); expectedResult2->push_back(1);
expectedResult2.push_back(4); expectedResult2->push_back(4);
concordia.loadIndex(); Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
vector<saidx_t> searchResult1 = concordia.simpleSearch("to jest"); boost::shared_ptr<vector<saidx_t> > searchResult1 = concordia2.simpleSearch("to jest");
vector<saidx_t> searchResult2 = concordia.simpleSearch("jest okno"); boost::shared_ptr<vector<saidx_t> > searchResult2 = concordia2.simpleSearch("jest okno");
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX)); boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_SUFFIX_ARRAY));
BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1.begin(), searchResult1.end(), BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1->begin(), searchResult1->end(),
expectedResult1.begin(), expectedResult1.end()); expectedResult1->begin(), expectedResult1->end());
BOOST_CHECK_EQUAL_COLLECTIONS(searchResult2.begin(), searchResult2.end(), BOOST_CHECK_EQUAL_COLLECTIONS(searchResult2->begin(), searchResult2->end(),
expectedResult2.begin(), expectedResult2.end()); expectedResult2->begin(), expectedResult2->end());
} }

View File

@ -12,60 +12,42 @@ using namespace std;
BOOST_AUTO_TEST_SUITE(concordia_index) BOOST_AUTO_TEST_SUITE(concordia_index)
BOOST_AUTO_TEST_CASE( ResourcesExistenceTest1 )
{
bool exceptionThrown = false;
string message = "";
try {
ConcordiaIndex index(TestResourcesManager::getTestFilePath("concordia-index","mock_word_map.bin"),
TestResourcesManager::getTestFilePath("concordia-index","nonexistent.bin"),
TestResourcesManager::getTestFilePath("concordia-index","test_SA.bin"));
} catch (ConcordiaException & e) {
exceptionThrown = true;
message = e.what();
}
BOOST_CHECK(exceptionThrown);
BOOST_CHECK_EQUAL(boost::starts_with(message, "E01"), true);
}
BOOST_AUTO_TEST_CASE( ResourcesExistenceTest2 )
{
bool exceptionThrown = false;
string message = "";
try {
ConcordiaIndex index(TestResourcesManager::getTestFilePath("concordia-index","nonexistent.bin"),
TestResourcesManager::getTestFilePath("concordia-index","mock_hash_index.bin"),
TestResourcesManager::getTestFilePath("concordia-index","test_SA.bin"));
} catch (ConcordiaException & e) {
exceptionThrown = true;
message = e.what();
}
BOOST_CHECK(exceptionThrown);
BOOST_CHECK_EQUAL(boost::starts_with(message, "E02"), true);
}
BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest ) BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
{ {
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"), boost::shared_ptr<HashGenerator> hashGenerator (new HashGenerator("nonexistent"));
TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
TestResourcesManager::getTestFilePath("temp","test_SA.bin"));
index.addSentence("Ala ma kota");
index.addSentence("Ala ma rysia");
index.addSentence("Marysia ma rysia");
index.generateSuffixArray(); ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"));
boost::shared_ptr<vector<sauchar_t> > T = boost::shared_ptr<vector<sauchar_t> >(new vector<sauchar_t>());
// Test hashed index:
// n: 0 1 2 3 4 5 6 7 8
// T[n]: 0 1 2 0 1 3 4 1 3
T->push_back(0);
T->push_back(1);
T->push_back(2);
T->push_back(0);
T->push_back(1);
T->push_back(3);
T->push_back(4);
T->push_back(1);
T->push_back(3);
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"))); // Test suffix array:
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"))); // n: 0 1 2 3 4 5 6 7 8
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_SA.bin"))); //SA[n]: 0 3 1 7 4 2 8 5 6
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_word_map.bin")); boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(hashGenerator, T);
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_SA.bin")); boost::shared_ptr<vector<saidx_t> > expectedSA = boost::shared_ptr<vector<saidx_t> >(new vector<saidx_t>());
expectedSA->push_back(0);
expectedSA->push_back(3);
expectedSA->push_back(1);
expectedSA->push_back(7);
expectedSA->push_back(4);
expectedSA->push_back(2);
expectedSA->push_back(8);
expectedSA->push_back(5);
expectedSA->push_back(6);
BOOST_CHECK_EQUAL_COLLECTIONS(SA->begin(), SA->end(), expectedSA->begin(), expectedSA->end());
} }
BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE_END()

View File

@ -19,13 +19,13 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH); HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH);
vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala ma kota"); boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash = hashGenerator.generateHash("Ala ma kota");
vector<INDEX_CHARACTER_TYPE> expected; boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected(new vector<INDEX_CHARACTER_TYPE>());
expected.push_back(0); expected->push_back(0);
expected.push_back(1); expected->push_back(1);
expected.push_back(2); expected->push_back(2);
BOOST_CHECK_EQUAL_COLLECTIONS(hash.begin(), hash.end(), expected.begin(), expected.end()); BOOST_CHECK_EQUAL_COLLECTIONS(hash->begin(), hash->end(), expected->begin(), expected->end());
} }
BOOST_AUTO_TEST_CASE( HashSerializationTest ) BOOST_AUTO_TEST_CASE( HashSerializationTest )
@ -35,22 +35,22 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
} }
HashGenerator hashGenerator1 = HashGenerator(TEST_WORD_MAP_PATH); HashGenerator hashGenerator1 = HashGenerator(TEST_WORD_MAP_PATH);
vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala ma kota"); boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash1 = hashGenerator1.generateHash("Ala ma kota");
vector<INDEX_CHARACTER_TYPE> expected1; boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected1(new vector<INDEX_CHARACTER_TYPE>());
expected1.push_back(0); expected1->push_back(0);
expected1.push_back(1); expected1->push_back(1);
expected1.push_back(2); expected1->push_back(2);
BOOST_CHECK_EQUAL_COLLECTIONS(hash1.begin(), hash1.end(), expected1.begin(), expected1.end()); BOOST_CHECK_EQUAL_COLLECTIONS(hash1->begin(), hash1->end(), expected1->begin(), expected1->end());
hashGenerator1.serializeWordMap(); hashGenerator1.serializeWordMap();
HashGenerator hashGenerator2 = HashGenerator(TEST_WORD_MAP_PATH); HashGenerator hashGenerator2 = HashGenerator(TEST_WORD_MAP_PATH);
vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala ma psa"); boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash2 = hashGenerator2.generateHash("Ala ma psa");
vector<INDEX_CHARACTER_TYPE> expected2; boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected2(new vector<INDEX_CHARACTER_TYPE>());
expected2.push_back(0); expected2->push_back(0);
expected2.push_back(1); expected2->push_back(1);
expected2.push_back(3); expected2->push_back(3);
BOOST_CHECK_EQUAL_COLLECTIONS(hash2.begin(), hash2.end(), expected2.begin(), expected2.end()); BOOST_CHECK_EQUAL_COLLECTIONS(hash2->begin(), hash2->end(), expected2->begin(), expected2->end());
boost::filesystem::remove(TEST_WORD_MAP_PATH); boost::filesystem::remove(TEST_WORD_MAP_PATH);
} }

View File

@ -1,75 +0,0 @@
#include "tests/unit-tests/unit_tests_globals.hpp"
#include "concordia/index_searcher.hpp"
#include "concordia/concordia_index.hpp"
#include "concordia/concordia_exception.hpp"
#include "tests/common/test_resources_manager.hpp"
#include <boost/algorithm/string/predicate.hpp>
#include <boost/filesystem.hpp>
using namespace std;
BOOST_AUTO_TEST_SUITE(index_searcher)
BOOST_AUTO_TEST_CASE( SimpleSearchTest )
{
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"),
TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
TestResourcesManager::getTestFilePath("temp","test_SA.bin"));
index.addSentence("Ala ma kota");
index.addSentence("Ala ma rysia");
index.addSentence("Marysia ma rysia");
index.generateSuffixArray();
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_word_map.bin")));
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin")));
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_SA.bin")));
IndexSearcher searcher;
searcher.loadIndex(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"),
TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
TestResourcesManager::getTestFilePath("temp","test_SA.bin"));
/*The test index contains 3 sentences:
"Ala ma kota"
"Ala ma rysia"
"Marysia ma rysia"
Test word map:
Ala -> 0
ma -> 1
kota -> 2
rysia -> 3
Marysia -> 4
Test hashed index:
n: 0 1 2 3 4 5 6 7 8
T[n]: 0 1 2 0 1 3 4 1 3
Test suffix array:
n: 0 1 2 3 4 5 6 7 8
SA[n]: 0 3 1 7 4 2 8 5 6
*/
vector<saidx_t> expectedResult1;
expectedResult1.push_back(7);
expectedResult1.push_back(4);
vector<saidx_t> searchResult1 = searcher.simpleSearch("ma rysia");
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_SA.bin"));
BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1.begin(), searchResult1.end(),
expectedResult1.begin(), expectedResult1.end());
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -11,7 +11,7 @@ using namespace std;
BOOST_AUTO_TEST_SUITE(utils) BOOST_AUTO_TEST_SUITE(utils)
BOOST_AUTO_TEST_CASE( UtilsTest1 ) BOOST_AUTO_TEST_CASE( WriteReadSingleCharacter )
{ {
ofstream testFileOutput; ofstream testFileOutput;
testFileOutput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(), testFileOutput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),
@ -29,133 +29,37 @@ BOOST_AUTO_TEST_CASE( UtilsTest1 )
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","temp_file.bin")); boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","temp_file.bin"));
} }
BOOST_AUTO_TEST_CASE( UtilsTest2 ) BOOST_AUTO_TEST_CASE( IndexVectorToSaucharArray )
{ {
ofstream testFileOutput; boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash(new vector<INDEX_CHARACTER_TYPE>());
testFileOutput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(), hash->push_back(123456789); // in hex: 75BCD15
ios::out|ios::binary); // in memory: 15 cd 5b 07
Utils::writeIndexCharacter(testFileOutput,123456789); //in hex: 75BCD15 // in memory DEC: 21 205 91 7
//in memory: 15 cd 5b 07
// in DEC: 21 205 91 7
Utils::writeIndexCharacter(testFileOutput,987654321); //in hex: 3ADE68B1
//in memory: b1 68 de 3a
// in DEC: 177 104 222 58
testFileOutput.close();
sauchar_t * dataArray = new sauchar_t[8];
ifstream testFileInput;
testFileInput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),ios::in|ios::binary);
INDEX_CHARACTER_TYPE retrievedCharacter1 = Utils::readIndexCharacter(testFileInput);
BOOST_CHECK_EQUAL(retrievedCharacter1, 123456789);
Utils::insertCharToSaucharArray(dataArray, retrievedCharacter1, 0);
INDEX_CHARACTER_TYPE retrievedCharacter2 = Utils::readIndexCharacter(testFileInput);
BOOST_CHECK_EQUAL(retrievedCharacter2, 987654321);
Utils::insertCharToSaucharArray(dataArray, retrievedCharacter2, 4);
testFileInput.close();
vector<INDEX_CHARACTER_TYPE> expected;
expected.push_back(21);
expected.push_back(205);
expected.push_back(91);
expected.push_back(7);
expected.push_back(177);
expected.push_back(104);
expected.push_back(222);
expected.push_back(58);
vector<INDEX_CHARACTER_TYPE> result;
for (int i=0;i<8;i++) {
INDEX_CHARACTER_TYPE a = dataArray[i];
result.push_back(a);
}
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","temp_file.bin"));
BOOST_CHECK_EQUAL_COLLECTIONS(result.begin(), result.end(), expected.begin(), expected.end());
}
BOOST_AUTO_TEST_CASE( UtilsTest3 )
{
vector<INDEX_CHARACTER_TYPE> hash;
hash.push_back(123456789);
hash.push_back(987654321);
hash->push_back(987654321); // in hex: 3ADE68B1
// in memory: b1 68 de 3a
// in memory DEC: 177 104 222 58
sauchar_t * dataArray = Utils::indexVectorToSaucharArray(hash); sauchar_t * dataArray = Utils::indexVectorToSaucharArray(hash);
vector<INDEX_CHARACTER_TYPE> result; boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > result(new vector<INDEX_CHARACTER_TYPE>());
for (int i=0;i<8;i++) { for (int i=0;i<8;i++) {
INDEX_CHARACTER_TYPE a = dataArray[i]; INDEX_CHARACTER_TYPE a = dataArray[i];
result.push_back(a); result->push_back(a);
} }
vector<INDEX_CHARACTER_TYPE> expected; boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected(new vector<INDEX_CHARACTER_TYPE>());
expected.push_back(21); expected->push_back(21);
expected.push_back(205); expected->push_back(205);
expected.push_back(91); expected->push_back(91);
expected.push_back(7); expected->push_back(7);
expected.push_back(177); expected->push_back(177);
expected.push_back(104); expected->push_back(104);
expected.push_back(222); expected->push_back(222);
expected.push_back(58); expected->push_back(58);
BOOST_CHECK_EQUAL_COLLECTIONS(result.begin(), result.end(), expected.begin(), expected.end()); BOOST_CHECK_EQUAL_COLLECTIONS(result->begin(), result->end(), expected->begin(), expected->end());
} }
/*
BOOST_AUTO_TEST_CASE( UtilsTest4 )
{
ofstream testFileOutput;
testFileOutput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),
ios::out|ios::binary);
Utils::writeIndexCharacter(testFileOutput,123456789); //in hex: 75BCD15
//in memory: 15 cd 5b 07
// in DEC: 21 205 91 7
Utils::writeIndexCharacter(testFileOutput,987654321); //in hex: 3ADE68B1
//in memory: b1 68 de 3a
// in DEC: 177 104 222 58
testFileOutput.close();
sauchar_t * dataArray = Utils::readIndexFromFile(
ifstream testFileInput;
testFileInput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),ios::in|ios::binary);
INDEX_CHARACTER_TYPE retrievedCharacter1 = Utils::readIndexCharacter(testFileInput);
BOOST_CHECK_EQUAL(retrievedCharacter1, 123456789);
Utils::insertCharToSaucharArray(dataArray, retrievedCharacter1, 0);
INDEX_CHARACTER_TYPE retrievedCharacter2 = Utils::readIndexCharacter(testFileInput);
BOOST_CHECK_EQUAL(retrievedCharacter2, 987654321);
Utils::insertCharToSaucharArray(dataArray, retrievedCharacter2, 4);
testFileInput.close();
vector<INDEX_CHARACTER_TYPE> expected;
expected.push_back(21);
expected.push_back(205);
expected.push_back(91);
expected.push_back(7);
expected.push_back(177);
expected.push_back(104);
expected.push_back(222);
expected.push_back(58);
vector<INDEX_CHARACTER_TYPE> result;
for (int i=0;i<8;i++) {
INDEX_CHARACTER_TYPE a = dataArray[i];
result.push_back(a);
}
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","temp_file.bin"));
BOOST_CHECK_EQUAL_COLLECTIONS(result.begin(), result.end(), expected.begin(), expected.end());
}
*/
BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE_END()