redesigned project
Former-commit-id: d35841126fda627a04a1a16a26b91943401b6fcf
This commit is contained in:
parent
9ff5f05205
commit
b318770752
@ -3,6 +3,7 @@
|
||||
#include <boost/program_options.hpp>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <boost/date_time/posix_time/posix_time.hpp>
|
||||
#include <boost/shared_ptr.hpp>
|
||||
|
||||
#include "concordia/concordia.hpp"
|
||||
#include "concordia/common/config.hpp"
|
||||
@ -20,8 +21,6 @@ int main(int argc, char** argv) {
|
||||
("help,h", "Display this message")
|
||||
("config,c", boost::program_options::value<std::string>(),
|
||||
"Concordia configuration file (required)")
|
||||
("generate-index,g", "Generate suffix array based index out of "
|
||||
"added sentences")
|
||||
("simple-search,s", boost::program_options::value<std::string>(),
|
||||
"Pattern to be searched in the index")
|
||||
("silent,n", "While searching, do not output search results")
|
||||
@ -48,43 +47,34 @@ int main(int argc, char** argv) {
|
||||
|
||||
|
||||
try {
|
||||
std::cout << "\tInitializing concordia..." << std::endl;
|
||||
boost::posix_time::ptime time_start =
|
||||
boost::posix_time::microsec_clock::local_time();
|
||||
Concordia concordia(configFile);
|
||||
std::cout << "Welcome to Concordia. Version = "
|
||||
<< concordia.getVersion() << std::endl;
|
||||
if (cli.count("generate-index")) {
|
||||
std::cout << "\tGenerating index..." << std::endl;
|
||||
boost::posix_time::ptime time_start =
|
||||
boost::posix_time::microsec_clock::local_time();
|
||||
concordia.generateIndex();
|
||||
boost::posix_time::ptime time_end =
|
||||
boost::posix_time::microsec_clock::local_time();
|
||||
boost::posix_time::time_duration msdiff = time_end - time_start;
|
||||
std::cout << "\tIndex generated in: " <<
|
||||
msdiff.total_milliseconds() << "ms." << std::endl;
|
||||
} else if (cli.count("simple-search")) {
|
||||
std::cout << "\tLoading index..." << std::endl;
|
||||
boost::posix_time::ptime time_start =
|
||||
boost::posix_time::microsec_clock::local_time();
|
||||
concordia.loadIndex();
|
||||
boost::posix_time::ptime time_end =
|
||||
boost::posix_time::microsec_clock::local_time();
|
||||
boost::posix_time::time_duration msdiff = time_end - time_start;
|
||||
std::cout << "\tIndex loaded in: " <<
|
||||
msdiff.total_milliseconds() << "ms." << std::endl;
|
||||
std::cout << "\tInitialization (loading index from disk "
|
||||
<< "and regenerating SA) took: "
|
||||
<< msdiff.total_milliseconds() << "ms." << std::endl;
|
||||
|
||||
std::cout << "\tWelcome to Concordia. Version = "
|
||||
<< concordia.getVersion() << std::endl;
|
||||
if (cli.count("simple-search")) {
|
||||
std::string pattern = cli["simple-search"].as<std::string>();
|
||||
std::cout << "\tSearching for pattern: \"" << pattern <<
|
||||
"\"" << std::endl;
|
||||
time_start = boost::posix_time::microsec_clock::local_time();
|
||||
vector<saidx_t> result = concordia.simpleSearch(pattern);
|
||||
boost::shared_ptr<vector<saidx_t> > result =
|
||||
concordia.simpleSearch(pattern);
|
||||
time_end = boost::posix_time::microsec_clock::local_time();
|
||||
msdiff = time_end - time_start;
|
||||
std::cout << "\tFound: " << result.size() << " matches. "
|
||||
std::cout << "\tFound: " << result->size() << " matches. "
|
||||
<< "Search took: " <<
|
||||
msdiff.total_milliseconds() << "ms." << std::endl;
|
||||
if (!cli.count("silent")) {
|
||||
for (vector<saidx_t>::iterator it = result.begin();
|
||||
it != result.end(); ++it) {
|
||||
for (vector<saidx_t>::iterator it = result->begin();
|
||||
it != result->end(); ++it) {
|
||||
std::cout << "\t\tfound match on word number: " << *it
|
||||
<< std::endl;
|
||||
}
|
||||
@ -97,15 +87,16 @@ int main(int argc, char** argv) {
|
||||
std::string line;
|
||||
if (text_file.is_open()) {
|
||||
long lineCount = 0;
|
||||
vector<std::string> buffer;
|
||||
boost::shared_ptr<std::vector<std::string> >
|
||||
buffer(new std::vector<std::string>());
|
||||
boost::posix_time::ptime timeStart =
|
||||
boost::posix_time::microsec_clock::local_time();
|
||||
while (getline(text_file, line)) {
|
||||
lineCount++;
|
||||
buffer.push_back(line);
|
||||
buffer->push_back(line);
|
||||
if (lineCount % READ_BUFFER_LENGTH == 0) {
|
||||
concordia.addAllSentences(buffer);
|
||||
buffer.clear();
|
||||
buffer->clear();
|
||||
boost::posix_time::ptime timeEnd =
|
||||
boost::posix_time::microsec_clock::local_time();
|
||||
boost::posix_time::time_duration msdiff =
|
||||
@ -119,7 +110,7 @@ int main(int argc, char** argv) {
|
||||
" sentences per second" << std::endl;
|
||||
}
|
||||
}
|
||||
if (buffer.size() > 0) {
|
||||
if (buffer->size() > 0) {
|
||||
concordia.addAllSentences(buffer);
|
||||
}
|
||||
text_file.close();
|
||||
@ -146,7 +137,7 @@ int main(int argc, char** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::cout << "Concordia operation completed without errors."
|
||||
std::cout << "\tConcordia operation completed without errors."
|
||||
<< std::endl;
|
||||
} catch(ConcordiaException & e) {
|
||||
std::cerr << "ConcordiaException caught with message: "
|
||||
|
@ -1,15 +1,18 @@
|
||||
#!/bin/sh
|
||||
|
||||
echo "Decompressing test file"
|
||||
echo "CONCORDIA RUNNER: Decompressing test file"
|
||||
|
||||
bunzip2 --keep prod/resources/text-files/large.txt.bz2
|
||||
|
||||
echo "Running Concordia"
|
||||
echo "CONCORDIA RUNNER: Running Concordia"
|
||||
|
||||
rm prod/resources/temp/*
|
||||
|
||||
echo "CONCORDIA RUNNER: reading from file"
|
||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -r prod/resources/text-files/large.txt
|
||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -g
|
||||
echo "CONCORDIA RUNNER: searching for pattern: \"drawn from his own\""
|
||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "drawn from his own" -n
|
||||
echo "CONCORDIA RUNNER: searching for pattern: \"it is\""
|
||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "it is" -n
|
||||
|
||||
rm prod/resources/text-files/large.txt
|
||||
|
@ -1,9 +1,11 @@
|
||||
#!/bin/sh
|
||||
|
||||
echo "Running Concordia"
|
||||
echo "CONCORDIA RUNNER: Running Concordia"
|
||||
|
||||
rm prod/resources/temp/*
|
||||
echo "CONCORDIA RUNNER: reading from file"
|
||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -r prod/resources/text-files/medium.txt
|
||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -g
|
||||
echo "CONCORDIA RUNNER: searching for pattern: \"drawn from his own\""
|
||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "drawn from his own"
|
||||
echo "CONCORDIA RUNNER: searching for pattern: \"it is\""
|
||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "it is" -n
|
||||
|
@ -18,20 +18,29 @@ INDEX_CHARACTER_TYPE Utils::readIndexCharacter(ifstream & file) {
|
||||
}
|
||||
|
||||
sauchar_t * Utils::indexVectorToSaucharArray(
|
||||
vector<INDEX_CHARACTER_TYPE> & input) {
|
||||
const int kArraySize = input.size()*sizeof(INDEX_CHARACTER_TYPE);
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > input) {
|
||||
const int kArraySize = input->size()*sizeof(INDEX_CHARACTER_TYPE);
|
||||
sauchar_t * patternArray =
|
||||
new sauchar_t[kArraySize];
|
||||
int pos = 0;
|
||||
for (vector<INDEX_CHARACTER_TYPE>::iterator it = input.begin();
|
||||
it != input.end(); ++it) {
|
||||
insertCharToSaucharArray(patternArray, *it, pos);
|
||||
for (vector<INDEX_CHARACTER_TYPE>::iterator it = input->begin();
|
||||
it != input->end(); ++it) {
|
||||
_insertCharToSaucharArray(patternArray, *it, pos);
|
||||
pos += sizeof(INDEX_CHARACTER_TYPE);
|
||||
}
|
||||
return patternArray;
|
||||
}
|
||||
|
||||
void Utils::insertCharToSaucharArray(sauchar_t * array,
|
||||
// Appends one index character to the end of a sauchar_t vector.
// The character's storage is viewed as sizeof(character) consecutive
// sauchar_t values (in the order they lie in memory) and all of them are
// added after the vector's current last element.
//   vector    - destination vector; dereferenced unconditionally
//   character - index character whose storage is appended
void Utils::appendCharToSaucharVector(
        boost::shared_ptr<std::vector<sauchar_t> > vector,
        INDEX_CHARACTER_TYPE character) {
    const sauchar_t * characterArray =
        reinterpret_cast<const sauchar_t *>(&character);
    // A single range insert replaces the element-by-element push_back loop;
    // it also gets rid of the signed/unsigned comparison between an int
    // loop index and sizeof(character).
    vector->insert(vector->end(),
                   characterArray,
                   characterArray + sizeof(character));
}
|
||||
|
||||
void Utils::_insertCharToSaucharArray(sauchar_t * array,
|
||||
INDEX_CHARACTER_TYPE character, int pos) {
|
||||
sauchar_t * characterArray = reinterpret_cast<sauchar_t *>(&character);
|
||||
for (int i = pos; i < pos+sizeof(character); i++) {
|
||||
|
@ -26,11 +26,14 @@ public:
|
||||
static INDEX_CHARACTER_TYPE readIndexCharacter(ifstream & file);
|
||||
|
||||
static sauchar_t * indexVectorToSaucharArray(
|
||||
vector<INDEX_CHARACTER_TYPE> & input);
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > input);
|
||||
|
||||
static void insertCharToSaucharArray(sauchar_t * array,
|
||||
INDEX_CHARACTER_TYPE character, int pos);
|
||||
static void appendCharToSaucharVector(
|
||||
boost::shared_ptr<std::vector<sauchar_t> > vector,
|
||||
INDEX_CHARACTER_TYPE character);
|
||||
private:
|
||||
static void _insertCharToSaucharArray(sauchar_t * array,
|
||||
INDEX_CHARACTER_TYPE character, int pos);
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
#include "concordia/concordia.hpp"
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/common/utils.hpp"
|
||||
|
||||
// ===========================================
|
||||
|
||||
@ -18,10 +19,9 @@ Concordia::Concordia(const std::string & configFilePath)
|
||||
_config = boost::shared_ptr<ConcordiaConfig> (
|
||||
new ConcordiaConfig(configFilePath));
|
||||
_index = boost::shared_ptr<ConcordiaIndex>(
|
||||
new ConcordiaIndex(_config->getWordMapFilePath(),
|
||||
_config->getHashedIndexFilePath(),
|
||||
_config->getSuffixArrayFilePath()));
|
||||
new ConcordiaIndex(_config->getHashedIndexFilePath()));
|
||||
_searcher = boost::shared_ptr<IndexSearcher>(new IndexSearcher());
|
||||
_initializeIndex();
|
||||
}
|
||||
|
||||
Concordia::~Concordia() {
|
||||
@ -41,29 +41,83 @@ std::string _createLibraryVersion() {
|
||||
return version.str();
|
||||
}
|
||||
|
||||
// Sentences are written to disk and added to T.
|
||||
// SA is generated on command by different methods.
|
||||
void Concordia::addSentence(const std::string & sentence)
|
||||
throw(ConcordiaException) {
|
||||
_index->addSentence(sentence);
|
||||
_index->addSentence(_hashGenerator, _T, sentence);
|
||||
}
|
||||
|
||||
void Concordia::addAllSentences(vector<std::string> & sentences)
|
||||
// Sentences are written to disk and added to T.
|
||||
// SA is generated on command by different methods.
|
||||
void Concordia::addAllSentences(
|
||||
boost::shared_ptr<std::vector<std::string> > sentences)
|
||||
throw(ConcordiaException) {
|
||||
_index->addAllSentences(sentences);
|
||||
_index->addAllSentences(_hashGenerator, _T, sentences);
|
||||
}
|
||||
|
||||
void Concordia::generateIndex() throw(ConcordiaException) {
|
||||
_index->generateSuffixArray();
|
||||
void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) {
|
||||
if (boost::filesystem::exists(_config->getWordMapFilePath())
|
||||
&& boost::filesystem::exists(_config->getHashedIndexFilePath())) {
|
||||
// reading index from files
|
||||
_T->clear();
|
||||
ifstream hashedIndexFile;
|
||||
hashedIndexFile.open(_config->getHashedIndexFilePath().c_str(), ios::in
|
||||
| ios::ate | ios::binary);
|
||||
saidx_t fileSize = hashedIndexFile.tellg();
|
||||
if (fileSize > 0) {
|
||||
hashedIndexFile.seekg(0, ios::beg);
|
||||
|
||||
while (!hashedIndexFile.eof()) {
|
||||
INDEX_CHARACTER_TYPE character =
|
||||
Utils::readIndexCharacter(hashedIndexFile);
|
||||
Utils::appendCharToSaucharVector(_T, character);
|
||||
}
|
||||
hashedIndexFile.close();
|
||||
|
||||
// generating suffix array
|
||||
_SA = _index->generateSuffixArray(_hashGenerator, _T);
|
||||
} else {
|
||||
throw ConcordiaException("Index corrupt: empty hash index file");
|
||||
}
|
||||
} else {
|
||||
throw ConcordiaException("Index corrupt: missing files");
|
||||
}
|
||||
}
|
||||
|
||||
void Concordia::loadIndex() throw(ConcordiaException) {
|
||||
_searcher->loadIndex(_config->getWordMapFilePath(),
|
||||
_config->getHashedIndexFilePath(),
|
||||
_config->getSuffixArrayFilePath());
|
||||
void Concordia::refreshSAfromRAM() throw(ConcordiaException) {
|
||||
_SA = _index->generateSuffixArray(_hashGenerator, _T);
|
||||
}
|
||||
|
||||
std::vector<saidx_t> Concordia::simpleSearch(const std::string & pattern)
|
||||
|
||||
void Concordia::_initializeIndex() throw(ConcordiaException) {
|
||||
_hashGenerator = boost::shared_ptr<HashGenerator>(
|
||||
new HashGenerator(_config->getWordMapFilePath()));
|
||||
_T = boost::shared_ptr<std::vector<sauchar_t> >(
|
||||
new std::vector<sauchar_t>);
|
||||
if (boost::filesystem::exists(_config->getWordMapFilePath())
|
||||
&& boost::filesystem::exists(_config->getHashedIndexFilePath())) {
|
||||
loadRAMIndexFromDisk();
|
||||
} else if (!boost::filesystem::exists(_config->getWordMapFilePath())
|
||||
&& !boost::filesystem::exists(_config->getHashedIndexFilePath())) {
|
||||
// empty index
|
||||
_SA = boost::shared_ptr<std::vector<saidx_t> >(
|
||||
new std::vector<saidx_t>);
|
||||
} else {
|
||||
throw ConcordiaException("Index corrupt: missing files");
|
||||
}
|
||||
}
|
||||
|
||||
boost::shared_ptr<std::vector<saidx_t> > Concordia::simpleSearch(
|
||||
const string & pattern)
|
||||
throw(ConcordiaException) {
|
||||
return _searcher->simpleSearch(pattern);
|
||||
if (_T->size() > 0) {
|
||||
return _searcher->simpleSearch(_hashGenerator, _T, _SA, pattern);
|
||||
} else {
|
||||
boost::shared_ptr<std::vector<saidx_t> > result =
|
||||
boost::shared_ptr<std::vector<saidx_t> >(new std::vector<saidx_t>);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -4,6 +4,7 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/filesystem.hpp>
|
||||
|
||||
#include "concordia/concordia_config.hpp"
|
||||
#include "concordia/concordia_index.hpp"
|
||||
@ -35,17 +36,20 @@ public:
|
||||
|
||||
void addSentence(const std::string & sentence) throw(ConcordiaException);
|
||||
|
||||
void addAllSentences(vector<std::string> & sentences)
|
||||
void addAllSentences(boost::shared_ptr<std::vector<std::string> > sentences)
|
||||
throw(ConcordiaException);
|
||||
|
||||
void generateIndex() throw(ConcordiaException);
|
||||
|
||||
void loadIndex() throw(ConcordiaException);
|
||||
|
||||
std::vector<saidx_t> simpleSearch(const std::string & pattern)
|
||||
boost::shared_ptr<std::vector<saidx_t> > simpleSearch(
|
||||
const std::string & pattern)
|
||||
throw(ConcordiaException);
|
||||
|
||||
void loadRAMIndexFromDisk() throw(ConcordiaException);
|
||||
|
||||
void refreshSAfromRAM() throw(ConcordiaException);
|
||||
|
||||
private:
|
||||
void _initializeIndex() throw(ConcordiaException);
|
||||
|
||||
static std::string _libraryVersion;
|
||||
|
||||
boost::shared_ptr<ConcordiaConfig> _config;
|
||||
@ -53,6 +57,12 @@ private:
|
||||
boost::shared_ptr<ConcordiaIndex> _index;
|
||||
|
||||
boost::shared_ptr<IndexSearcher> _searcher;
|
||||
|
||||
boost::shared_ptr<HashGenerator> _hashGenerator;
|
||||
|
||||
boost::shared_ptr<std::vector<sauchar_t> > _T;
|
||||
|
||||
boost::shared_ptr<std::vector<saidx_t> > _SA;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -4,118 +4,71 @@
|
||||
#include <boost/filesystem.hpp>
|
||||
#include <iostream>
|
||||
|
||||
ConcordiaIndex::ConcordiaIndex(const string & wordMapFilePath,
|
||||
const string & hashedIndexFilePath,
|
||||
const string & suffixArrayFilePath)
|
||||
ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath)
|
||||
throw(ConcordiaException) :
|
||||
_hashedIndexFilePath(hashedIndexFilePath),
|
||||
_suffixArrayFilePath(suffixArrayFilePath) {
|
||||
if (boost::filesystem::exists(wordMapFilePath)) {
|
||||
if (!boost::filesystem::exists(hashedIndexFilePath)) {
|
||||
throw ConcordiaException("E01: Word map file exists "
|
||||
"but hashed index file absent.");
|
||||
}
|
||||
} else { // WordMap file does not exist
|
||||
if (boost::filesystem::exists(hashedIndexFilePath)) {
|
||||
throw ConcordiaException("E02: Hashed index file exists "
|
||||
"but word map file absent.");
|
||||
}
|
||||
}
|
||||
_hashGenerator = boost::shared_ptr<HashGenerator>(
|
||||
new HashGenerator(wordMapFilePath));
|
||||
_hashedIndexFilePath(hashedIndexFilePath) {
|
||||
}
|
||||
|
||||
// Destructor; the body is empty.
ConcordiaIndex::~ConcordiaIndex() {
|
||||
}
|
||||
|
||||
void ConcordiaIndex::_serializeWordMap() {
|
||||
_hashGenerator->serializeWordMap();
|
||||
}
|
||||
|
||||
void ConcordiaIndex::generateSuffixArray() {
|
||||
if (boost::filesystem::exists(_hashedIndexFilePath.c_str())) {
|
||||
ifstream hashedIndexFile;
|
||||
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::in|
|
||||
ios::ate|ios::binary);
|
||||
|
||||
/* Get the file size. */
|
||||
saidx_t n = hashedIndexFile.tellg();
|
||||
if (n > 0) {
|
||||
sauchar_t *T;
|
||||
saidx_t *SA;
|
||||
|
||||
/* Read n bytes of data. */
|
||||
hashedIndexFile.seekg(0, ios::beg);
|
||||
T = new sauchar_t[n];
|
||||
int pos = 0;
|
||||
while (!hashedIndexFile.eof()) {
|
||||
INDEX_CHARACTER_TYPE character =
|
||||
Utils::readIndexCharacter(hashedIndexFile);
|
||||
Utils::insertCharToSaucharArray(T, character, pos);
|
||||
pos+=sizeof(character);
|
||||
}
|
||||
hashedIndexFile.close();
|
||||
|
||||
SA = new saidx_t[n];
|
||||
|
||||
/* Construct the suffix array. */
|
||||
if (divsufsort(T, SA, (saidx_t)n) != 0) {
|
||||
boost::shared_ptr<vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<vector<sauchar_t> > T) {
|
||||
saidx_t * SA_array = new saidx_t[T->size()];
|
||||
if (divsufsort(T->data(), SA_array, (saidx_t) T->size()) != 0) {
|
||||
throw ConcordiaException("Error creating suffix array.");
|
||||
}
|
||||
|
||||
/* Write the suffix array. */
|
||||
|
||||
ofstream suffixArrayFile;
|
||||
suffixArrayFile.open(_suffixArrayFilePath.c_str(),
|
||||
ios::out|ios::binary);
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
suffixArrayFile.write(reinterpret_cast<char *>(&SA[i]),
|
||||
sizeof(saidx_t));
|
||||
boost::shared_ptr<vector<saidx_t> > result =
|
||||
boost::shared_ptr<vector<saidx_t> >(new vector<saidx_t>);
|
||||
for (int i = 0; i < T->size(); i++) {
|
||||
result->push_back(SA_array[i]);
|
||||
}
|
||||
suffixArrayFile.close();
|
||||
|
||||
/* Deallocate memory. */
|
||||
delete[] T;
|
||||
delete[] SA;
|
||||
} else {
|
||||
throw ConcordiaException("Can not generate suffix array: "
|
||||
"hashed index file is empty");
|
||||
}
|
||||
} else {
|
||||
throw ConcordiaException("Can not generate suffix array: "
|
||||
"hashed index file does not exist");
|
||||
}
|
||||
delete[] SA_array;
|
||||
return result;
|
||||
}
|
||||
|
||||
void ConcordiaIndex::addSentence(const string & sentence) {
|
||||
vector<INDEX_CHARACTER_TYPE> hash = _hashGenerator->generateHash(sentence);
|
||||
void ConcordiaIndex::addSentence(boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<vector<sauchar_t> > T,
|
||||
const string & sentence) {
|
||||
ofstream hashedIndexFile;
|
||||
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
|
||||
ios::app|ios::binary);
|
||||
for (vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
|
||||
it != hash.end(); ++it) {
|
||||
Utils::writeIndexCharacter(hashedIndexFile, *it);
|
||||
}
|
||||
_addSingleSentence(hashedIndexFile, hashGenerator, T, sentence);
|
||||
hashedIndexFile.close();
|
||||
_serializeWordMap();
|
||||
hashGenerator->serializeWordMap();
|
||||
}
|
||||
|
||||
void ConcordiaIndex::addAllSentences(vector<std::string> & sentences) {
|
||||
void ConcordiaIndex::addAllSentences(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<vector<sauchar_t> > T,
|
||||
boost::shared_ptr<vector<string> > sentences) {
|
||||
ofstream hashedIndexFile;
|
||||
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
|
||||
ios::app|ios::binary);
|
||||
for (vector<string>::iterator sent_it = sentences.begin();
|
||||
sent_it != sentences.end(); ++sent_it) {
|
||||
for (vector<string>::iterator sent_it = sentences->begin();
|
||||
sent_it != sentences->end(); ++sent_it) {
|
||||
string sentence = *sent_it;
|
||||
vector<INDEX_CHARACTER_TYPE> hash =
|
||||
_hashGenerator->generateHash(sentence);
|
||||
for (vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
|
||||
it != hash.end(); ++it) {
|
||||
Utils::writeIndexCharacter(hashedIndexFile, *it);
|
||||
}
|
||||
_addSingleSentence(hashedIndexFile, hashGenerator, T, sentence);
|
||||
}
|
||||
hashedIndexFile.close();
|
||||
_serializeWordMap();
|
||||
hashGenerator->serializeWordMap();
|
||||
}
|
||||
|
||||
// Hashes one sentence and stores every character of the hash twice: it is
// written to the already opened hashed index file and appended to the
// in-memory text vector T.
//   hashedIndexFile - output stream of the hashed index file
//   hashGenerator   - generator used to turn the sentence into a hash
//   T               - in-memory text vector that receives the characters
//   sentence        - sentence to add
void ConcordiaIndex::_addSingleSentence(
        ofstream & hashedIndexFile,
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        const string & sentence) {
    boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > sentenceHash =
        hashGenerator->generateHash(sentence);

    vector<INDEX_CHARACTER_TYPE>::iterator current = sentenceHash->begin();
    const vector<INDEX_CHARACTER_TYPE>::iterator last = sentenceHash->end();
    while (current != last) {
        INDEX_CHARACTER_TYPE code = *current;
        Utils::writeIndexCharacter(hashedIndexFile, code);
        Utils::appendCharToSaucharVector(T, code);
        ++current;
    }
}
|
||||
|
||||
|
@ -19,29 +19,35 @@ using namespace std;
|
||||
|
||||
class ConcordiaIndex {
|
||||
public:
|
||||
explicit ConcordiaIndex(const string & wordMapFilePath,
|
||||
const string & hashedIndexFilePath,
|
||||
const string & suffixArrayFilePath)
|
||||
explicit ConcordiaIndex(const string & hashedIndexFilePath)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*! Destructor.
|
||||
*/
|
||||
virtual ~ConcordiaIndex();
|
||||
|
||||
void addSentence(const string & sentence);
|
||||
void addSentence(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<vector<sauchar_t> > T,
|
||||
const string & sentence);
|
||||
|
||||
void addAllSentences(vector<string> & sentences);
|
||||
void addAllSentences(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<vector<sauchar_t> > T,
|
||||
boost::shared_ptr<vector<string> > sentences);
|
||||
|
||||
void generateSuffixArray();
|
||||
boost::shared_ptr<vector<saidx_t> > generateSuffixArray(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<vector<sauchar_t> > T);
|
||||
|
||||
private:
|
||||
void _serializeWordMap();
|
||||
|
||||
boost::shared_ptr<HashGenerator> _hashGenerator;
|
||||
// Add sentence to disk index and update RAM index.
|
||||
void _addSingleSentence(ofstream & hashedIndexFile,
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
const string & sentence);
|
||||
|
||||
string _hashedIndexFilePath;
|
||||
|
||||
string _suffixArrayFilePath;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -20,17 +20,18 @@ HashGenerator::HashGenerator(const string & wordMapFilePath)
|
||||
// Destructor; the body is empty.
HashGenerator::~HashGenerator() {
|
||||
}
|
||||
|
||||
vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > HashGenerator::generateHash(
|
||||
const string & sentence) {
|
||||
vector<INDEX_CHARACTER_TYPE> result;
|
||||
vector<string> tokenTexts;
|
||||
boost::split(tokenTexts, sentence, boost::is_any_of(" "));
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> >
|
||||
result(new vector<INDEX_CHARACTER_TYPE>());
|
||||
boost::shared_ptr<vector<string> > tokenTexts(new vector<string>());
|
||||
boost::split(*tokenTexts, sentence, boost::is_any_of(" "));
|
||||
|
||||
for (vector<string>::iterator it = tokenTexts.begin();
|
||||
it != tokenTexts.end(); ++it) {
|
||||
for (vector<string>::iterator it = tokenTexts->begin();
|
||||
it != tokenTexts->end(); ++it) {
|
||||
string token = *it;
|
||||
INDEX_CHARACTER_TYPE code = _wordMap->getWordCode(token);
|
||||
result.push_back(code);
|
||||
result->push_back(code);
|
||||
}
|
||||
|
||||
return result;
|
||||
|
@ -26,7 +26,8 @@ public:
|
||||
*/
|
||||
virtual ~HashGenerator();
|
||||
|
||||
vector<INDEX_CHARACTER_TYPE> generateHash(const string & sentence);
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> >
|
||||
generateHash(const string & sentence);
|
||||
|
||||
void serializeWordMap();
|
||||
|
||||
|
@ -3,89 +3,38 @@
|
||||
#include "concordia/common/utils.hpp"
|
||||
#include <boost/filesystem.hpp>
|
||||
|
||||
IndexSearcher::IndexSearcher():
|
||||
_T(NULL),
|
||||
_SA(NULL),
|
||||
_n(0) {
|
||||
IndexSearcher::IndexSearcher() {
|
||||
}
|
||||
|
||||
|
||||
// Destructor; the body is empty.
IndexSearcher::~IndexSearcher() {
|
||||
}
|
||||
|
||||
|
||||
void IndexSearcher::loadIndex(const string & wordMapFilepath,
|
||||
const string & hashedIndexFilepath,
|
||||
const string & suffixArrayFilepath)
|
||||
throw(ConcordiaException) {
|
||||
if (!boost::filesystem::exists(wordMapFilepath)) {
|
||||
throw ConcordiaException("E06: Failed to open word map "
|
||||
"file for reading.");
|
||||
}
|
||||
|
||||
if (!boost::filesystem::exists(hashedIndexFilepath)) {
|
||||
throw ConcordiaException("E07: Failed to open hashed index file "
|
||||
"for reading.");
|
||||
}
|
||||
|
||||
if (!boost::filesystem::exists(suffixArrayFilepath)) {
|
||||
throw ConcordiaException("E08: Failed to open suffix array file "
|
||||
"for reading.");
|
||||
}
|
||||
|
||||
_hashGenerator = boost::shared_ptr<HashGenerator>(
|
||||
new HashGenerator(wordMapFilepath));
|
||||
|
||||
ifstream hashedIndexFile;
|
||||
hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::in
|
||||
| ios::ate | ios::binary);
|
||||
_n = hashedIndexFile.tellg();
|
||||
hashedIndexFile.seekg(0, ios::beg);
|
||||
_T = new sauchar_t[_n];
|
||||
int pos = 0;
|
||||
while (!hashedIndexFile.eof()) {
|
||||
INDEX_CHARACTER_TYPE character =
|
||||
Utils::readIndexCharacter(hashedIndexFile);
|
||||
Utils::insertCharToSaucharArray(_T, character, pos);
|
||||
pos+=sizeof(character);
|
||||
}
|
||||
hashedIndexFile.close();
|
||||
|
||||
_SA = new saidx_t[_n];
|
||||
|
||||
ifstream suffixArrayFile;
|
||||
suffixArrayFile.open(suffixArrayFilepath.c_str(), ios::in | ios::binary);
|
||||
|
||||
saidx_t saidx_buff;
|
||||
pos = 0;
|
||||
while (!suffixArrayFile.eof() && pos < _n) {
|
||||
suffixArrayFile.read(reinterpret_cast<char *>(&saidx_buff),
|
||||
sizeof(saidx_t));
|
||||
_SA[pos++] = saidx_buff;
|
||||
}
|
||||
suffixArrayFile.close();
|
||||
}
|
||||
|
||||
vector<saidx_t> IndexSearcher::simpleSearch(const string & pattern)
|
||||
throw(ConcordiaException) {
|
||||
vector<saidx_t> result;
|
||||
boost::shared_ptr<vector<saidx_t> > IndexSearcher::simpleSearch(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
const string & pattern) throw(ConcordiaException) {
|
||||
boost::shared_ptr<vector<saidx_t> > result =
|
||||
boost::shared_ptr<vector<saidx_t> >(new vector<saidx_t>());
|
||||
|
||||
int left;
|
||||
vector<INDEX_CHARACTER_TYPE> hash = _hashGenerator->generateHash(pattern);
|
||||
saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash =
|
||||
hashGenerator->generateHash(pattern);
|
||||
saidx_t patternLength = hash->size()*sizeof(INDEX_CHARACTER_TYPE);
|
||||
sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
|
||||
int size = sa_search(_T, (saidx_t) _n,
|
||||
int size = sa_search(T->data(), (saidx_t) T->size(),
|
||||
(const sauchar_t *) patternArray, patternLength,
|
||||
_SA, (saidx_t) _n, &left);
|
||||
SA->data(), (saidx_t) T->size(), &left);
|
||||
for (int i = 0; i < size; ++i) {
|
||||
saidx_t result_pos = _SA[left + i];
|
||||
saidx_t result_pos = SA->at(left + i);
|
||||
if (result_pos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
|
||||
// As we are looking for a pattern in an array of higher
|
||||
// resolution than the hashed index file, we might
|
||||
// obtain accidental results exceeding the boundaries
|
||||
// of characters in hashed index. The above check
|
||||
// removes these accidental results.
|
||||
result.push_back(result_pos / sizeof(INDEX_CHARACTER_TYPE));
|
||||
result->push_back(result_pos / sizeof(INDEX_CHARACTER_TYPE));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -25,22 +25,12 @@ public:
|
||||
*/
|
||||
virtual ~IndexSearcher();
|
||||
|
||||
void loadIndex(const string & wordMapFilepath,
|
||||
const string & hashedIndexFilepath,
|
||||
const string & suffixArrayFilepath)
|
||||
throw(ConcordiaException);
|
||||
|
||||
vector<saidx_t> simpleSearch(const string & pattern)
|
||||
throw(ConcordiaException);
|
||||
|
||||
boost::shared_ptr<vector<saidx_t> > simpleSearch(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
const string & pattern) throw(ConcordiaException);
|
||||
private:
|
||||
boost::shared_ptr<HashGenerator> _hashGenerator;
|
||||
|
||||
sauchar_t * _T;
|
||||
|
||||
saidx_t * _SA;
|
||||
|
||||
saidx_t _n;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -4,7 +4,6 @@ add_library(concordia-tests
|
||||
test_word_map.cpp
|
||||
test_hash_generator.cpp
|
||||
test_concordia_index.cpp
|
||||
test_index_searcher.cpp
|
||||
test_concordia_config.cpp
|
||||
test_concordia.cpp
|
||||
)
|
||||
|
@ -25,8 +25,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
|
||||
concordia.addSentence("Ala ma kota");
|
||||
concordia.addSentence("Ala ma rysia");
|
||||
concordia.addSentence("Marysia ma rysia");
|
||||
|
||||
concordia.generateIndex();
|
||||
concordia.refreshSAfromRAM();
|
||||
|
||||
/*The test index contains 3 sentences:
|
||||
"Ala ma kota"
|
||||
@ -50,34 +49,30 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
|
||||
|
||||
*/
|
||||
|
||||
vector<saidx_t> expectedResult1;
|
||||
expectedResult1.push_back(7);
|
||||
expectedResult1.push_back(4);
|
||||
boost::shared_ptr<std::vector<saidx_t> > expectedResult1(new std::vector<saidx_t>());
|
||||
expectedResult1->push_back(7);
|
||||
expectedResult1->push_back(4);
|
||||
|
||||
concordia.loadIndex();
|
||||
vector<saidx_t> searchResult1 = concordia.simpleSearch("ma rysia");
|
||||
boost::shared_ptr<std::vector<saidx_t> > searchResult1 = concordia.simpleSearch("ma rysia");
|
||||
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_SUFFIX_ARRAY));
|
||||
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1.begin(), searchResult1.end(),
|
||||
expectedResult1.begin(), expectedResult1.end());
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1->begin(), searchResult1->end(),
|
||||
expectedResult1->begin(), expectedResult1->end());
|
||||
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
||||
{
|
||||
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||
vector<string> testSentences;
|
||||
testSentences.push_back("to jest okno");
|
||||
testSentences.push_back("czy jest okno otwarte");
|
||||
testSentences.push_back("chyba to jest tutaj");
|
||||
testSentences.push_back("to jest");
|
||||
boost::shared_ptr<vector<string> > testSentences (new vector<string>());
|
||||
testSentences->push_back("to jest okno");
|
||||
testSentences->push_back("czy jest okno otwarte");
|
||||
testSentences->push_back("chyba to jest tutaj");
|
||||
testSentences->push_back("to jest");
|
||||
concordia.addAllSentences(testSentences);
|
||||
|
||||
concordia.generateIndex();
|
||||
|
||||
/*The test index contains 4 sentences:
|
||||
"to jest okno"
|
||||
"czy jest okno otwarte"
|
||||
@ -103,27 +98,26 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
||||
|
||||
*/
|
||||
|
||||
vector<saidx_t> expectedResult1;
|
||||
expectedResult1.push_back(11);
|
||||
expectedResult1.push_back(0);
|
||||
expectedResult1.push_back(8);
|
||||
boost::shared_ptr<vector<saidx_t> > expectedResult1(new vector<saidx_t>());
|
||||
expectedResult1->push_back(11);
|
||||
expectedResult1->push_back(0);
|
||||
expectedResult1->push_back(8);
|
||||
|
||||
vector<saidx_t> expectedResult2;
|
||||
expectedResult2.push_back(1);
|
||||
expectedResult2.push_back(4);
|
||||
boost::shared_ptr<vector<saidx_t> > expectedResult2(new vector<saidx_t>());
|
||||
expectedResult2->push_back(1);
|
||||
expectedResult2->push_back(4);
|
||||
|
||||
concordia.loadIndex();
|
||||
vector<saidx_t> searchResult1 = concordia.simpleSearch("to jest");
|
||||
vector<saidx_t> searchResult2 = concordia.simpleSearch("jest okno");
|
||||
Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||
boost::shared_ptr<vector<saidx_t> > searchResult1 = concordia2.simpleSearch("to jest");
|
||||
boost::shared_ptr<vector<saidx_t> > searchResult2 = concordia2.simpleSearch("jest okno");
|
||||
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_SUFFIX_ARRAY));
|
||||
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1.begin(), searchResult1.end(),
|
||||
expectedResult1.begin(), expectedResult1.end());
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(searchResult2.begin(), searchResult2.end(),
|
||||
expectedResult2.begin(), expectedResult2.end());
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1->begin(), searchResult1->end(),
|
||||
expectedResult1->begin(), expectedResult1->end());
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(searchResult2->begin(), searchResult2->end(),
|
||||
expectedResult2->begin(), expectedResult2->end());
|
||||
|
||||
}
|
||||
|
||||
|
@ -12,60 +12,42 @@ using namespace std;
|
||||
BOOST_AUTO_TEST_SUITE(concordia_index)
|
||||
|
||||
|
||||
BOOST_AUTO_TEST_CASE( ResourcesExistenceTest1 )
|
||||
{
|
||||
bool exceptionThrown = false;
|
||||
string message = "";
|
||||
|
||||
try {
|
||||
ConcordiaIndex index(TestResourcesManager::getTestFilePath("concordia-index","mock_word_map.bin"),
|
||||
TestResourcesManager::getTestFilePath("concordia-index","nonexistent.bin"),
|
||||
TestResourcesManager::getTestFilePath("concordia-index","test_SA.bin"));
|
||||
} catch (ConcordiaException & e) {
|
||||
exceptionThrown = true;
|
||||
message = e.what();
|
||||
}
|
||||
|
||||
BOOST_CHECK(exceptionThrown);
|
||||
BOOST_CHECK_EQUAL(boost::starts_with(message, "E01"), true);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( ResourcesExistenceTest2 )
|
||||
{
|
||||
bool exceptionThrown = false;
|
||||
string message = "";
|
||||
|
||||
try {
|
||||
ConcordiaIndex index(TestResourcesManager::getTestFilePath("concordia-index","nonexistent.bin"),
|
||||
TestResourcesManager::getTestFilePath("concordia-index","mock_hash_index.bin"),
|
||||
TestResourcesManager::getTestFilePath("concordia-index","test_SA.bin"));
|
||||
} catch (ConcordiaException & e) {
|
||||
exceptionThrown = true;
|
||||
message = e.what();
|
||||
}
|
||||
|
||||
BOOST_CHECK(exceptionThrown);
|
||||
BOOST_CHECK_EQUAL(boost::starts_with(message, "E02"), true);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
|
||||
{
|
||||
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"),
|
||||
TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
|
||||
TestResourcesManager::getTestFilePath("temp","test_SA.bin"));
|
||||
index.addSentence("Ala ma kota");
|
||||
index.addSentence("Ala ma rysia");
|
||||
index.addSentence("Marysia ma rysia");
|
||||
boost::shared_ptr<HashGenerator> hashGenerator (new HashGenerator("nonexistent"));
|
||||
|
||||
index.generateSuffixArray();
|
||||
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"));
|
||||
boost::shared_ptr<vector<sauchar_t> > T = boost::shared_ptr<vector<sauchar_t> >(new vector<sauchar_t>());
|
||||
// Test hashed index:
|
||||
// n: 0 1 2 3 4 5 6 7 8
|
||||
// T[n]: 0 1 2 0 1 3 4 1 3
|
||||
T->push_back(0);
|
||||
T->push_back(1);
|
||||
T->push_back(2);
|
||||
T->push_back(0);
|
||||
T->push_back(1);
|
||||
T->push_back(3);
|
||||
T->push_back(4);
|
||||
T->push_back(1);
|
||||
T->push_back(3);
|
||||
|
||||
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_word_map.bin")));
|
||||
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin")));
|
||||
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_SA.bin")));
|
||||
// Test suffix array:
|
||||
// n: 0 1 2 3 4 5 6 7 8
|
||||
//SA[n]: 0 3 1 7 4 2 8 5 6
|
||||
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_SA.bin"));
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(hashGenerator, T);
|
||||
|
||||
boost::shared_ptr<vector<saidx_t> > expectedSA = boost::shared_ptr<vector<saidx_t> >(new vector<saidx_t>());
|
||||
expectedSA->push_back(0);
|
||||
expectedSA->push_back(3);
|
||||
expectedSA->push_back(1);
|
||||
expectedSA->push_back(7);
|
||||
expectedSA->push_back(4);
|
||||
expectedSA->push_back(2);
|
||||
expectedSA->push_back(8);
|
||||
expectedSA->push_back(5);
|
||||
expectedSA->push_back(6);
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(SA->begin(), SA->end(), expectedSA->begin(), expectedSA->end());
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
|
@ -19,13 +19,13 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
|
||||
|
||||
HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH);
|
||||
|
||||
vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala ma kota");
|
||||
vector<INDEX_CHARACTER_TYPE> expected;
|
||||
expected.push_back(0);
|
||||
expected.push_back(1);
|
||||
expected.push_back(2);
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash = hashGenerator.generateHash("Ala ma kota");
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected(new vector<INDEX_CHARACTER_TYPE>());
|
||||
expected->push_back(0);
|
||||
expected->push_back(1);
|
||||
expected->push_back(2);
|
||||
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(hash.begin(), hash.end(), expected.begin(), expected.end());
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(hash->begin(), hash->end(), expected->begin(), expected->end());
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
||||
@ -35,22 +35,22 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
||||
}
|
||||
|
||||
HashGenerator hashGenerator1 = HashGenerator(TEST_WORD_MAP_PATH);
|
||||
vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala ma kota");
|
||||
vector<INDEX_CHARACTER_TYPE> expected1;
|
||||
expected1.push_back(0);
|
||||
expected1.push_back(1);
|
||||
expected1.push_back(2);
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(hash1.begin(), hash1.end(), expected1.begin(), expected1.end());
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash1 = hashGenerator1.generateHash("Ala ma kota");
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected1(new vector<INDEX_CHARACTER_TYPE>());
|
||||
expected1->push_back(0);
|
||||
expected1->push_back(1);
|
||||
expected1->push_back(2);
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(hash1->begin(), hash1->end(), expected1->begin(), expected1->end());
|
||||
|
||||
hashGenerator1.serializeWordMap();
|
||||
|
||||
HashGenerator hashGenerator2 = HashGenerator(TEST_WORD_MAP_PATH);
|
||||
vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala ma psa");
|
||||
vector<INDEX_CHARACTER_TYPE> expected2;
|
||||
expected2.push_back(0);
|
||||
expected2.push_back(1);
|
||||
expected2.push_back(3);
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(hash2.begin(), hash2.end(), expected2.begin(), expected2.end());
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash2 = hashGenerator2.generateHash("Ala ma psa");
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected2(new vector<INDEX_CHARACTER_TYPE>());
|
||||
expected2->push_back(0);
|
||||
expected2->push_back(1);
|
||||
expected2->push_back(3);
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(hash2->begin(), hash2->end(), expected2->begin(), expected2->end());
|
||||
|
||||
boost::filesystem::remove(TEST_WORD_MAP_PATH);
|
||||
}
|
||||
|
@ -1,75 +0,0 @@
|
||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||
|
||||
#include "concordia/index_searcher.hpp"
|
||||
#include "concordia/concordia_index.hpp"
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
#include "tests/common/test_resources_manager.hpp"
|
||||
|
||||
#include <boost/algorithm/string/predicate.hpp>
|
||||
#include <boost/filesystem.hpp>
|
||||
|
||||
using namespace std;
|
||||
|
||||
BOOST_AUTO_TEST_SUITE(index_searcher)
|
||||
|
||||
|
||||
BOOST_AUTO_TEST_CASE( SimpleSearchTest )
|
||||
{
|
||||
|
||||
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"),
|
||||
TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
|
||||
TestResourcesManager::getTestFilePath("temp","test_SA.bin"));
|
||||
index.addSentence("Ala ma kota");
|
||||
index.addSentence("Ala ma rysia");
|
||||
index.addSentence("Marysia ma rysia");
|
||||
|
||||
index.generateSuffixArray();
|
||||
|
||||
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_word_map.bin")));
|
||||
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin")));
|
||||
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_SA.bin")));
|
||||
|
||||
IndexSearcher searcher;
|
||||
searcher.loadIndex(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"),
|
||||
TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
|
||||
TestResourcesManager::getTestFilePath("temp","test_SA.bin"));
|
||||
|
||||
/*The test index contains 3 sentences:
|
||||
"Ala ma kota"
|
||||
"Ala ma rysia"
|
||||
"Marysia ma rysia"
|
||||
|
||||
Test word map:
|
||||
Ala -> 0
|
||||
ma -> 1
|
||||
kota -> 2
|
||||
rysia -> 3
|
||||
Marysia -> 4
|
||||
|
||||
Test hashed index:
|
||||
n: 0 1 2 3 4 5 6 7 8
|
||||
T[n]: 0 1 2 0 1 3 4 1 3
|
||||
|
||||
Test suffix array:
|
||||
n: 0 1 2 3 4 5 6 7 8
|
||||
SA[n]: 0 3 1 7 4 2 8 5 6
|
||||
|
||||
*/
|
||||
|
||||
vector<saidx_t> expectedResult1;
|
||||
expectedResult1.push_back(7);
|
||||
expectedResult1.push_back(4);
|
||||
|
||||
vector<saidx_t> searchResult1 = searcher.simpleSearch("ma rysia");
|
||||
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","test_SA.bin"));
|
||||
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(searchResult1.begin(), searchResult1.end(),
|
||||
expectedResult1.begin(), expectedResult1.end());
|
||||
|
||||
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
@ -11,7 +11,7 @@ using namespace std;
|
||||
|
||||
BOOST_AUTO_TEST_SUITE(utils)
|
||||
|
||||
BOOST_AUTO_TEST_CASE( UtilsTest1 )
|
||||
BOOST_AUTO_TEST_CASE( WriteReadSingleCharacter )
|
||||
{
|
||||
ofstream testFileOutput;
|
||||
testFileOutput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),
|
||||
@ -29,133 +29,37 @@ BOOST_AUTO_TEST_CASE( UtilsTest1 )
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","temp_file.bin"));
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( UtilsTest2 )
|
||||
BOOST_AUTO_TEST_CASE( IndexVectorToSaucharArray )
|
||||
{
|
||||
ofstream testFileOutput;
|
||||
testFileOutput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),
|
||||
ios::out|ios::binary);
|
||||
Utils::writeIndexCharacter(testFileOutput,123456789); //in hex: 75BCD15
|
||||
//in memory: 15 cd 5b 07
|
||||
// in DEC: 21 205 91 7
|
||||
|
||||
Utils::writeIndexCharacter(testFileOutput,987654321); //in hex: 3ADE68B1
|
||||
//in memory: b1 68 de 3a
|
||||
// in DEC: 177 104 222 58
|
||||
testFileOutput.close();
|
||||
|
||||
sauchar_t * dataArray = new sauchar_t[8];
|
||||
ifstream testFileInput;
|
||||
testFileInput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),ios::in|ios::binary);
|
||||
|
||||
INDEX_CHARACTER_TYPE retrievedCharacter1 = Utils::readIndexCharacter(testFileInput);
|
||||
BOOST_CHECK_EQUAL(retrievedCharacter1, 123456789);
|
||||
Utils::insertCharToSaucharArray(dataArray, retrievedCharacter1, 0);
|
||||
|
||||
INDEX_CHARACTER_TYPE retrievedCharacter2 = Utils::readIndexCharacter(testFileInput);
|
||||
BOOST_CHECK_EQUAL(retrievedCharacter2, 987654321);
|
||||
Utils::insertCharToSaucharArray(dataArray, retrievedCharacter2, 4);
|
||||
|
||||
testFileInput.close();
|
||||
|
||||
vector<INDEX_CHARACTER_TYPE> expected;
|
||||
expected.push_back(21);
|
||||
expected.push_back(205);
|
||||
expected.push_back(91);
|
||||
expected.push_back(7);
|
||||
expected.push_back(177);
|
||||
expected.push_back(104);
|
||||
expected.push_back(222);
|
||||
expected.push_back(58);
|
||||
|
||||
vector<INDEX_CHARACTER_TYPE> result;
|
||||
for (int i=0;i<8;i++) {
|
||||
INDEX_CHARACTER_TYPE a = dataArray[i];
|
||||
result.push_back(a);
|
||||
}
|
||||
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","temp_file.bin"));
|
||||
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(result.begin(), result.end(), expected.begin(), expected.end());
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( UtilsTest3 )
|
||||
{
|
||||
vector<INDEX_CHARACTER_TYPE> hash;
|
||||
hash.push_back(123456789);
|
||||
hash.push_back(987654321);
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash(new vector<INDEX_CHARACTER_TYPE>());
|
||||
hash->push_back(123456789); // in hex: 75BCD15
|
||||
// in memory: 15 cd 5b 07
|
||||
// in memory DEC: 21 205 91 7
|
||||
|
||||
hash->push_back(987654321); // in hex: 3ADE68B1
|
||||
// in memory: b1 68 de 3a
|
||||
// in memory DEC: 177 104 222 58
|
||||
sauchar_t * dataArray = Utils::indexVectorToSaucharArray(hash);
|
||||
|
||||
vector<INDEX_CHARACTER_TYPE> result;
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > result(new vector<INDEX_CHARACTER_TYPE>());
|
||||
for (int i=0;i<8;i++) {
|
||||
INDEX_CHARACTER_TYPE a = dataArray[i];
|
||||
result.push_back(a);
|
||||
result->push_back(a);
|
||||
}
|
||||
|
||||
vector<INDEX_CHARACTER_TYPE> expected;
|
||||
expected.push_back(21);
|
||||
expected.push_back(205);
|
||||
expected.push_back(91);
|
||||
expected.push_back(7);
|
||||
expected.push_back(177);
|
||||
expected.push_back(104);
|
||||
expected.push_back(222);
|
||||
expected.push_back(58);
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected(new vector<INDEX_CHARACTER_TYPE>());
|
||||
expected->push_back(21);
|
||||
expected->push_back(205);
|
||||
expected->push_back(91);
|
||||
expected->push_back(7);
|
||||
expected->push_back(177);
|
||||
expected->push_back(104);
|
||||
expected->push_back(222);
|
||||
expected->push_back(58);
|
||||
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(result.begin(), result.end(), expected.begin(), expected.end());
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(result->begin(), result->end(), expected->begin(), expected->end());
|
||||
}
|
||||
|
||||
/*
|
||||
BOOST_AUTO_TEST_CASE( UtilsTest4 )
|
||||
{
|
||||
ofstream testFileOutput;
|
||||
testFileOutput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),
|
||||
ios::out|ios::binary);
|
||||
Utils::writeIndexCharacter(testFileOutput,123456789); //in hex: 75BCD15
|
||||
//in memory: 15 cd 5b 07
|
||||
// in DEC: 21 205 91 7
|
||||
|
||||
Utils::writeIndexCharacter(testFileOutput,987654321); //in hex: 3ADE68B1
|
||||
//in memory: b1 68 de 3a
|
||||
// in DEC: 177 104 222 58
|
||||
testFileOutput.close();
|
||||
|
||||
sauchar_t * dataArray = Utils::readIndexFromFile(
|
||||
ifstream testFileInput;
|
||||
testFileInput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),ios::in|ios::binary);
|
||||
|
||||
INDEX_CHARACTER_TYPE retrievedCharacter1 = Utils::readIndexCharacter(testFileInput);
|
||||
BOOST_CHECK_EQUAL(retrievedCharacter1, 123456789);
|
||||
Utils::insertCharToSaucharArray(dataArray, retrievedCharacter1, 0);
|
||||
|
||||
INDEX_CHARACTER_TYPE retrievedCharacter2 = Utils::readIndexCharacter(testFileInput);
|
||||
BOOST_CHECK_EQUAL(retrievedCharacter2, 987654321);
|
||||
Utils::insertCharToSaucharArray(dataArray, retrievedCharacter2, 4);
|
||||
|
||||
testFileInput.close();
|
||||
|
||||
vector<INDEX_CHARACTER_TYPE> expected;
|
||||
expected.push_back(21);
|
||||
expected.push_back(205);
|
||||
expected.push_back(91);
|
||||
expected.push_back(7);
|
||||
expected.push_back(177);
|
||||
expected.push_back(104);
|
||||
expected.push_back(222);
|
||||
expected.push_back(58);
|
||||
|
||||
vector<INDEX_CHARACTER_TYPE> result;
|
||||
for (int i=0;i<8;i++) {
|
||||
INDEX_CHARACTER_TYPE a = dataArray[i];
|
||||
result.push_back(a);
|
||||
}
|
||||
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","temp_file.bin"));
|
||||
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(result.begin(), result.end(), expected.begin(), expected.end());
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
|
Loading…
Reference in New Issue
Block a user