concordia-console, new approach to suffix array - 4 sauchars per one saidx

This commit is contained in:
rjawor 2013-12-06 22:29:25 +01:00
parent 7c1ed7fb6e
commit 47405834a3
28 changed files with 6763 additions and 101 deletions

2
.gitignore vendored
View File

@ -5,5 +5,5 @@ prod/resources/concordia-config/concordia.cfg
concordia/common/config.hpp concordia/common/config.hpp
tests/resources/concordia-config/concordia.cfg tests/resources/concordia-config/concordia.cfg
tests/resources/temp tests/resources/temp
prod/resources/temp

View File

@ -6,6 +6,10 @@ project(concordia C CXX)
set (CONCORDIA_VERSION_MAJOR 0) set (CONCORDIA_VERSION_MAJOR 0)
set (CONCORDIA_VERSION_MINOR 1) set (CONCORDIA_VERSION_MINOR 1)
# Type of the characters in index
set (INDEX_CHARACTER_TYPE "unsigned int")
# ============================== # # ============================== #
# Production paths # Production paths
# ============================== # # ============================== #
@ -25,7 +29,7 @@ set (TEMP_HASHED_INDEX "temp_hashed_index.bin")
set (TEMP_SUFFIX_ARRAY "temp_suffix_array.bin") set (TEMP_SUFFIX_ARRAY "temp_suffix_array.bin")
file(MAKE_DIRECTORY ${TEST_RESOURCES_DIRECTORY}/temp) file(MAKE_DIRECTORY ${TEST_RESOURCES_DIRECTORY}/temp)
file(MAKE_DIRECTORY ${PROD_RESOURCES_DIRECTORY}/temp)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
@ -112,6 +116,10 @@ configure_file (
"${concordia_SOURCE_DIR}/tests/resources/concordia-config/concordia.cfg" "${concordia_SOURCE_DIR}/tests/resources/concordia-config/concordia.cfg"
) )
configure_file (
"${concordia_SOURCE_DIR}/prod/resources/concordia-config/concordia.cfg.in"
"${concordia_SOURCE_DIR}/prod/resources/concordia-config/concordia.cfg"
)
# ================================================ # ================================================
# Concordia: sub-projects # Concordia: sub-projects

View File

@ -2,10 +2,15 @@
#include <fstream> #include <fstream>
#include <boost/program_options.hpp> #include <boost/program_options.hpp>
#include <boost/algorithm/string.hpp> #include <boost/algorithm/string.hpp>
#include <boost/date_time/posix_time/posix_time.hpp>
#include "concordia/concordia.hpp" #include "concordia/concordia.hpp"
#include "concordia/common/config.hpp"
#include "concordia/common/utils.hpp"
#include "build/libdivsufsort/include/divsufsort.h" #include "build/libdivsufsort/include/divsufsort.h"
#define READ_BUFFER_LENGTH 1000
namespace po = boost::program_options; namespace po = boost::program_options;
int main(int argc, char** argv) { int main(int argc, char** argv) {
@ -14,7 +19,14 @@ int main(int argc, char** argv) {
desc.add_options() desc.add_options()
("help,h", "Display this message") ("help,h", "Display this message")
("config,c", boost::program_options::value<std::string>(), ("config,c", boost::program_options::value<std::string>(),
"Concordia configuration file (required)"); "Concordia configuration file (required)")
("generate-index,g", "Generate suffix array based index out of "
"added sentences")
("load-index,l", "Load the generated index for searching")
("simple-search,s", boost::program_options::value<std::string>(),
"Pattern to be searched in the index")
("read-file,r", boost::program_options::value<std::string>(),
"File to be read and added to index");
po::variables_map cli; po::variables_map cli;
po::store(po::parse_command_line(argc, argv, desc), cli); po::store(po::parse_command_line(argc, argv, desc), cli);
@ -38,7 +50,90 @@ int main(int argc, char** argv) {
try { try {
Concordia concordia(configFile); Concordia concordia(configFile);
std::cout << "Welcome to Concordia. Version = " std::cout << "Welcome to Concordia. Version = "
<< concordia.getVersion() << endl; << concordia.getVersion() << std::endl;
if (cli.count("generate-index")) {
std::cout << "\tGenerating index..." << std::endl;
boost::posix_time::ptime time_start =
boost::posix_time::microsec_clock::local_time();
concordia.generateIndex();
boost::posix_time::ptime time_end =
boost::posix_time::microsec_clock::local_time();
boost::posix_time::time_duration msdiff = time_end - time_start;
std::cout << "\tIndex generated in: " <<
msdiff.total_milliseconds() << "ms." << std::endl;
} else if (cli.count("load-index")) {
std::cout << "\tLoading index..." << std::endl;
boost::posix_time::ptime time_start =
boost::posix_time::microsec_clock::local_time();
concordia.loadIndex();
boost::posix_time::ptime time_end =
boost::posix_time::microsec_clock::local_time();
boost::posix_time::time_duration msdiff = time_end - time_start;
std::cout << "\tIndex loaded in: " <<
msdiff.total_milliseconds() << "ms." << std::endl;
} else if (cli.count("simple-search")) {
std::string pattern = cli["simple-search"].as<std::string>();
std::cout << "\tSearching for pattern: \"" << pattern <<
"\"" << std::endl;
} else if (cli.count("read-file")) {
// Read the file named by --read-file line by line and add its lines
// to the index in batches of READ_BUFFER_LENGTH sentences, reporting
// progress after every full batch and once more at the end.
std::string filePath = cli["read-file"].as<std::string>();
std::cout << "\tReading sentences from file: " << filePath <<
    std::endl;
ifstream text_file(filePath.c_str());
std::string line;
if (text_file.is_open()) {
    long lineCount = 0;
    vector<std::string> buffer;
    boost::posix_time::ptime timeStart =
        boost::posix_time::microsec_clock::local_time();
    while (getline(text_file, line)) {
        lineCount++;
        buffer.push_back(line);
        if (lineCount % READ_BUFFER_LENGTH == 0) {
            concordia.addAllSentences(buffer);
            buffer.clear();
            boost::posix_time::ptime timeEnd =
                boost::posix_time::microsec_clock::local_time();
            boost::posix_time::time_duration msdiff =
                timeEnd - timeStart;
            long timeElapsed = msdiff.total_milliseconds();
            // Divide in floating point, and treat an elapsed time
            // below one millisecond as 1 ms: the previous integer
            // division truncated the result and divided by zero
            // whenever timeElapsed was 0.
            double speed = 1000.0 * lineCount /
                (timeElapsed > 0 ? timeElapsed : 1);
            std::cout << "\tRead and added to index " <<
                lineCount << " sentences in " << timeElapsed
                << "ms. Current speed: " << speed <<
                " sentences per second" << std::endl;
        }
    }
    // Flush the last, partially filled batch.
    if (buffer.size() > 0) {
        concordia.addAllSentences(buffer);
    }
    text_file.close();
    boost::posix_time::ptime timeTotalEnd =
        boost::posix_time::microsec_clock::local_time();
    boost::posix_time::time_duration totalMsdiff =
        timeTotalEnd - timeStart;
    long totalTimeElapsed = totalMsdiff.total_milliseconds();
    // Same guard as above; an empty input file used to evaluate
    // 0 / 0 in integer arithmetic here.
    double totalSpeed = 1000.0 * lineCount /
        (totalTimeElapsed > 0 ? totalTimeElapsed : 1);
    std::cout << "\tReading finished. Read and added to index "
        << lineCount << " sentences in " << totalTimeElapsed <<
        "ms. Overall speed: " << totalSpeed <<
        " sentences per second" << std::endl;
} else {
    // Terminate the line so the message is flushed and does not run
    // into whatever is printed next.
    std::cerr << "Unable to open file: "<< filePath << std::endl;
    return 1;
}
} else {
std::cerr << "One of the options: generate-index, simple-search, "
<< "read-file must be provided. See the "
"options specification: "
<< std::endl << desc << std::endl;
return 1;
}
std::cout << "Concordia operation completed without errors."
<< std::endl;
} catch(ConcordiaException & e) { } catch(ConcordiaException & e) {
std::cerr << "ConcordiaException caught with message: " std::cerr << "ConcordiaException caught with message: "
<< std::endl << std::endl
@ -48,7 +143,7 @@ int main(int argc, char** argv) {
<< std::endl; << std::endl;
return 1; return 1;
} catch(exception & e) { } catch(exception & e) {
std::cerr << "Exception caught with message: " std::cerr << "Unexpected exception caught with message: "
<< std::endl << std::endl
<< e.what() << e.what()
<< std::endl << std::endl

View File

@ -2,5 +2,9 @@
echo "Running Concordia" echo "Running Concordia"
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg rm prod/resources/temp/*
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -r prod/resources/text-files/medium.txt
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -g
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -l
#./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Ala ma chyba kota"

View File

@ -14,6 +14,7 @@ add_library(concordia SHARED
concordia_config.cpp concordia_config.cpp
concordia_exception.cpp concordia_exception.cpp
common/logging.cpp common/logging.cpp
common/utils.cpp
) )
add_subdirectory(t) add_subdirectory(t)
@ -22,6 +23,18 @@ add_subdirectory(t)
install(TARGETS concordia DESTINATION lib/) install(TARGETS concordia DESTINATION lib/)
install(FILES concordia.hpp DESTINATION include/concordia/) install(FILES concordia.hpp DESTINATION include/concordia/)
# ----------------------------------------------------
# libconfig
# ----------------------------------------------------
find_library(LIBCONFIG_LIB NAMES config++ REQUIRED)
find_path(LIBCONFIG_INCLUDE libconfig.h++)
if(EXISTS ${LIBCONFIG_LIB} AND EXISTS ${LIBCONFIG_INCLUDE})
message(STATUS "Found Libconfig")
include_directories(${LIBCONFIG_INCLUDE})
link_directories(${LIBCONFIG_LIB})
endif(EXISTS ${LIBCONFIG_LIB} AND EXISTS ${LIBCONFIG_INCLUDE})
target_link_libraries(concordia log4cpp) target_link_libraries(concordia log4cpp)
target_link_libraries(concordia ${LIBSTEMMER_LIB}) target_link_libraries(concordia ${LIBSTEMMER_LIB})
target_link_libraries(concordia ${Boost_LIBRARIES}) target_link_libraries(concordia ${Boost_LIBRARIES})

View File

@ -15,3 +15,6 @@
#define LEXICON_TEXT_FIELD_SEPARATORS "\t " #define LEXICON_TEXT_FIELD_SEPARATORS "\t "
#define LEXICON_FIELD_SEPARATOR "\t" #define LEXICON_FIELD_SEPARATOR "\t"
typedef @INDEX_CHARACTER_TYPE@ INDEX_CHARACTER_TYPE;

View File

@ -0,0 +1,41 @@
#include "concordia/common/utils.hpp"
/*! Constructor. The body is empty: it performs no initialisation.
*/
Utils::Utils() {
}
/*! Destructor. The body is empty: it releases nothing.
*/
Utils::~Utils() {
}
/*! Writes a single index character to an output stream.
    The value is emitted as its raw in-memory bytes,
    sizeof(INDEX_CHARACTER_TYPE) of them, at the stream's current
    write position.
    \param file stream to write to
    \param character value to write
*/
void Utils::writeIndexCharacter(ofstream & file,
                                INDEX_CHARACTER_TYPE character) {
    const char * rawBytes = reinterpret_cast<const char *>(&character);
    file.write(rawBytes, sizeof(INDEX_CHARACTER_TYPE));
}
/*! Reads a single index character from an input stream.
    Consumes sizeof(INDEX_CHARACTER_TYPE) raw bytes from the stream's
    current read position and reinterprets them as one value.
    \param file stream to read from
    \returns the value read. If the stream cannot supply any bytes
             (for instance it is already at end of file) the result
             is 0; callers that need to tell this apart from a stored
             0 should check the stream state after the call.
*/
INDEX_CHARACTER_TYPE Utils::readIndexCharacter(ifstream & file) {
    // Zero-initialised so that a failed read never hands an
    // indeterminate value back to the caller.
    INDEX_CHARACTER_TYPE character = 0;
    file.read(reinterpret_cast<char *>(&character), sizeof(character));
    return character;
}
/*! Flattens a vector of index characters into a newly allocated
    sauchar_t array.
    Each input element occupies sizeof(INDEX_CHARACTER_TYPE) consecutive
    array cells, filled in by insertCharToSaucharArray(), in the same
    order as the elements appear in the vector.
    \param input index characters to convert
    \returns array of input.size()*sizeof(INDEX_CHARACTER_TYPE) cells,
             allocated with new[]; this function does not free it, so
             the caller has to release it with delete[]
*/
sauchar_t * Utils::indexVectorToSaucharArray(
                             vector<INDEX_CHARACTER_TYPE> & input) {
    const int kArraySize = input.size()*sizeof(INDEX_CHARACTER_TYPE);
    sauchar_t * byteArray = new sauchar_t[kArraySize];

    // offset is the array cell at which the next element starts.
    int offset = 0;
    for (vector<INDEX_CHARACTER_TYPE>::size_type idx = 0;
                                          idx < input.size(); ++idx) {
        insertCharToSaucharArray(byteArray, input[idx], offset);
        offset += sizeof(INDEX_CHARACTER_TYPE);
    }
    return byteArray;
}
/*! Copies the raw bytes of one index character into a sauchar_t array.
    Cells array[pos] .. array[pos+sizeof(INDEX_CHARACTER_TYPE)-1] are
    overwritten with the character's in-memory representation; no
    bounds check is made on the destination.
    \param array destination array
    \param character value whose bytes are copied
    \param pos index of the first destination cell
*/
void Utils::insertCharToSaucharArray(sauchar_t * array,
                        INDEX_CHARACTER_TYPE character, int pos) {
    sauchar_t * sourceBytes = reinterpret_cast<sauchar_t *>(&character);
    int target = pos;
    while (target < pos+sizeof(character)) {
        array[target] = sourceBytes[target-pos];
        ++target;
    }
}

View File

@ -0,0 +1,36 @@
#ifndef UTILS_HDR
#define UTILS_HDR
#include <boost/shared_ptr.hpp>
#include <fstream>
#include <iostream>
#include <vector>
#include "concordia/common/config.hpp"
#include "concordia/concordia_exception.hpp"
#include "build/libdivsufsort/include/divsufsort.h"
using namespace std;
/*!
Collection of static helpers for moving index characters
(INDEX_CHARACTER_TYPE) between binary streams and sauchar_t arrays.
The class declares no data members.
*/
class Utils {
public:
/*! Constructor.
*/
explicit Utils();
/*! Destructor.
*/
virtual ~Utils();
/*! Writes one index character to the stream as
    sizeof(INDEX_CHARACTER_TYPE) raw bytes.
    \param file stream to write to
    \param character value to write
*/
static void writeIndexCharacter(ofstream & file,
INDEX_CHARACTER_TYPE character);
/*! Reads sizeof(INDEX_CHARACTER_TYPE) raw bytes from the stream and
    returns them as one index character.
    \param file stream to read from
    \returns the value read
*/
static INDEX_CHARACTER_TYPE readIndexCharacter(ifstream & file);
/*! Converts a vector of index characters into a sauchar_t array in
    which every character occupies sizeof(INDEX_CHARACTER_TYPE) cells.
    \param input index characters to convert
    \returns array allocated with new[]; the function does not free it,
             so the caller releases it with delete[]
*/
static sauchar_t * indexVectorToSaucharArray(
vector<INDEX_CHARACTER_TYPE> & input);
/*! Copies the raw bytes of one index character into array, starting
    at cell pos.
    \param array destination array
    \param character value whose bytes are copied
    \param pos index of the first destination cell
*/
static void insertCharToSaucharArray(sauchar_t * array,
INDEX_CHARACTER_TYPE character, int pos);
private:
};
#endif

View File

@ -46,9 +46,16 @@ void Concordia::addSentence(const std::string & sentence)
_index->addSentence(sentence); _index->addSentence(sentence);
} }
/*! Adds a batch of sentences in one call by forwarding the whole
    vector to addAllSentences() of the underlying index object.
    \param sentences sentences to add
    \throws ConcordiaException the only exception type permitted by the
            exception specification
*/
void Concordia::addAllSentences(vector<std::string> & sentences)
throw(ConcordiaException) {
_index->addAllSentences(sentences);
}
void Concordia::generateIndex() throw(ConcordiaException) { void Concordia::generateIndex() throw(ConcordiaException) {
_index->generateSuffixArray(); _index->generateSuffixArray();
_index->serializeWordMap(); }
void Concordia::loadIndex() throw(ConcordiaException) {
_searcher->loadIndex(_config->getWordMapFilePath(), _searcher->loadIndex(_config->getWordMapFilePath(),
_config->getHashedIndexFilePath(), _config->getHashedIndexFilePath(),
_config->getSuffixArrayFilePath()); _config->getSuffixArrayFilePath());

View File

@ -35,8 +35,13 @@ public:
void addSentence(const std::string & sentence) throw(ConcordiaException); void addSentence(const std::string & sentence) throw(ConcordiaException);
void addAllSentences(vector<std::string> & sentences)
throw(ConcordiaException);
void generateIndex() throw(ConcordiaException); void generateIndex() throw(ConcordiaException);
void loadIndex() throw(ConcordiaException);
std::vector<saidx_t> simpleSearch(const std::string & pattern) std::vector<saidx_t> simpleSearch(const std::string & pattern)
throw(ConcordiaException); throw(ConcordiaException);

View File

@ -1,5 +1,6 @@
#include "concordia/concordia_index.hpp" #include "concordia/concordia_index.hpp"
#include "concordia/common/utils.hpp"
#include <boost/filesystem.hpp> #include <boost/filesystem.hpp>
#include <iostream> #include <iostream>
@ -27,36 +28,36 @@ ConcordiaIndex::ConcordiaIndex(const string & wordMapFilePath,
ConcordiaIndex::~ConcordiaIndex() { ConcordiaIndex::~ConcordiaIndex() {
} }
void ConcordiaIndex::serializeWordMap() { void ConcordiaIndex::_serializeWordMap() {
_hashGenerator->serializeWordMap(); _hashGenerator->serializeWordMap();
} }
void ConcordiaIndex::generateSuffixArray() { void ConcordiaIndex::generateSuffixArray() {
if (boost::filesystem::exists(_hashedIndexFilePath.c_str())) {
ifstream hashedIndexFile; ifstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::in| hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::in|
ios::ate|ios::binary); ios::ate|ios::binary);
/* Get the file size. */ /* Get the file size. */
long n = hashedIndexFile.tellg() / sizeof(sauchar_t); saidx_t n = hashedIndexFile.tellg();
if (n > 0) {
sauchar_t *T; sauchar_t *T;
saidx_t *SA; saidx_t *SA;
T = new sauchar_t[n];
SA = new saidx_t[n];
/* Read n bytes of data. */ /* Read n bytes of data. */
hashedIndexFile.seekg(0, ios::beg); hashedIndexFile.seekg(0, ios::beg);
T = new sauchar_t[n];
sauchar_t buff;
int pos = 0; int pos = 0;
while (!hashedIndexFile.eof()) { while (!hashedIndexFile.eof()) {
hashedIndexFile.read(reinterpret_cast<char *>(&buff), INDEX_CHARACTER_TYPE character =
sizeof(sauchar_t)); Utils::readIndexCharacter(hashedIndexFile);
T[pos++] = buff; Utils::insertCharToSaucharArray(T, character, pos);
pos+=sizeof(character);
} }
hashedIndexFile.close(); hashedIndexFile.close();
SA = new saidx_t[n];
/* Construct the suffix array. */ /* Construct the suffix array. */
if (divsufsort(T, SA, (saidx_t)n) != 0) { if (divsufsort(T, SA, (saidx_t)n) != 0) {
throw ConcordiaException("Error creating suffix array."); throw ConcordiaException("Error creating suffix array.");
@ -65,7 +66,8 @@ void ConcordiaIndex::generateSuffixArray() {
/* Write the suffix array. */ /* Write the suffix array. */
ofstream suffixArrayFile; ofstream suffixArrayFile;
suffixArrayFile.open(_suffixArrayFilePath.c_str(), ios::out|ios::binary); suffixArrayFile.open(_suffixArrayFilePath.c_str(),
ios::out|ios::binary);
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
suffixArrayFile.write(reinterpret_cast<char *>(&SA[i]), suffixArrayFile.write(reinterpret_cast<char *>(&SA[i]),
@ -76,19 +78,44 @@ void ConcordiaIndex::generateSuffixArray() {
/* Deallocate memory. */ /* Deallocate memory. */
delete[] T; delete[] T;
delete[] SA; delete[] SA;
} else {
throw ConcordiaException("Can not generate suffix array: "
"hashed index file is empty");
}
} else {
throw ConcordiaException("Can not generate suffix array: "
"hashed index file does not exist");
}
} }
void ConcordiaIndex::addSentence(const string & sentence) { void ConcordiaIndex::addSentence(const string & sentence) {
vector<sauchar_t> hash = _hashGenerator->generateHash(sentence); vector<INDEX_CHARACTER_TYPE> hash = _hashGenerator->generateHash(sentence);
ofstream hashedIndexFile; ofstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out| hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
ios::app|ios::binary); ios::app|ios::binary);
for (vector<sauchar_t>::iterator it = hash.begin(); for (vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
it != hash.end(); ++it) { it != hash.end(); ++it) {
sauchar_t buff = *it; Utils::writeIndexCharacter(hashedIndexFile, *it);
hashedIndexFile.write(reinterpret_cast<char *>(&buff),
sizeof(sauchar_t));
} }
hashedIndexFile.close(); hashedIndexFile.close();
_serializeWordMap();
}
void ConcordiaIndex::addAllSentences(vector<std::string> & sentences) {
ofstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
ios::app|ios::binary);
for (vector<string>::iterator sent_it = sentences.begin();
sent_it != sentences.end(); ++sent_it) {
string sentence = *sent_it;
vector<INDEX_CHARACTER_TYPE> hash =
_hashGenerator->generateHash(sentence);
for (vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
it != hash.end(); ++it) {
Utils::writeIndexCharacter(hashedIndexFile, *it);
}
}
hashedIndexFile.close();
_serializeWordMap();
} }

View File

@ -30,11 +30,13 @@ public:
void addSentence(const string & sentence); void addSentence(const string & sentence);
void serializeWordMap(); void addAllSentences(vector<string> & sentences);
void generateSuffixArray(); void generateSuffixArray();
private: private:
void _serializeWordMap();
boost::shared_ptr<HashGenerator> _hashGenerator; boost::shared_ptr<HashGenerator> _hashGenerator;
string _hashedIndexFilePath; string _hashedIndexFilePath;

View File

@ -20,15 +20,16 @@ HashGenerator::HashGenerator(const string & wordMapFilePath)
HashGenerator::~HashGenerator() { HashGenerator::~HashGenerator() {
} }
vector<sauchar_t> HashGenerator::generateHash(const string & sentence) { vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
vector<sauchar_t> result; const string & sentence) {
vector<INDEX_CHARACTER_TYPE> result;
vector<string> tokenTexts; vector<string> tokenTexts;
boost::split(tokenTexts, sentence, boost::is_any_of(" ")); boost::split(tokenTexts, sentence, boost::is_any_of(" "));
for (vector<string>::iterator it = tokenTexts.begin(); for (vector<string>::iterator it = tokenTexts.begin();
it != tokenTexts.end(); ++it) { it != tokenTexts.end(); ++it) {
string token = *it; string token = *it;
sauchar_t code = _wordMap->getWordCode(token); INDEX_CHARACTER_TYPE code = _wordMap->getWordCode(token);
result.push_back(code); result.push_back(code);
} }

View File

@ -6,10 +6,9 @@
#include <vector> #include <vector>
#include <boost/shared_ptr.hpp> #include <boost/shared_ptr.hpp>
#include "concordia/word_map.hpp" #include "concordia/word_map.hpp"
#include "concordia/common/config.hpp"
#include "concordia/concordia_exception.hpp" #include "concordia/concordia_exception.hpp"
#include "build/libdivsufsort/include/divsufsort.h"
/*! /*!
Class for generating a sentence hash. Class for generating a sentence hash.
@ -27,7 +26,7 @@ public:
*/ */
virtual ~HashGenerator(); virtual ~HashGenerator();
vector<sauchar_t> generateHash(const string & sentence); vector<INDEX_CHARACTER_TYPE> generateHash(const string & sentence);
void serializeWordMap(); void serializeWordMap();

View File

@ -1,5 +1,6 @@
#include "concordia/index_searcher.hpp" #include "concordia/index_searcher.hpp"
#include "concordia/common/utils.hpp"
#include <boost/filesystem.hpp> #include <boost/filesystem.hpp>
IndexSearcher::IndexSearcher(): IndexSearcher::IndexSearcher():
@ -38,16 +39,15 @@ void IndexSearcher::loadIndex(const string & wordMapFilepath,
ifstream hashedIndexFile; ifstream hashedIndexFile;
hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::in hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::in
| ios::ate | ios::binary); | ios::ate | ios::binary);
_n = hashedIndexFile.tellg() / sizeof(sauchar_t); _n = hashedIndexFile.tellg();
_T = new sauchar_t[_n];
hashedIndexFile.seekg(0, ios::beg); hashedIndexFile.seekg(0, ios::beg);
sauchar_t sauchar_buff; _T = new sauchar_t[_n];
int pos = 0; int pos = 0;
while (!hashedIndexFile.eof()) { while (!hashedIndexFile.eof()) {
hashedIndexFile.read(reinterpret_cast<char *>(&sauchar_buff), INDEX_CHARACTER_TYPE character =
sizeof(sauchar_t)); Utils::readIndexCharacter(hashedIndexFile);
_T[pos++] = sauchar_buff; Utils::insertCharToSaucharArray(_T, character, pos);
pos+=sizeof(character);
} }
hashedIndexFile.close(); hashedIndexFile.close();
@ -59,7 +59,8 @@ void IndexSearcher::loadIndex(const string & wordMapFilepath,
saidx_t saidx_buff; saidx_t saidx_buff;
pos = 0; pos = 0;
while (!suffixArrayFile.eof() && pos < _n) { while (!suffixArrayFile.eof() && pos < _n) {
suffixArrayFile.read(reinterpret_cast<char *>(&saidx_buff), sizeof(saidx_t)); suffixArrayFile.read(reinterpret_cast<char *>(&saidx_buff),
sizeof(saidx_t));
_SA[pos++] = saidx_buff; _SA[pos++] = saidx_buff;
} }
suffixArrayFile.close(); suffixArrayFile.close();
@ -70,20 +71,22 @@ vector<saidx_t> IndexSearcher::simpleSearch(const string & pattern)
vector<saidx_t> result; vector<saidx_t> result;
int left; int left;
vector<sauchar_t> hash = _hashGenerator->generateHash(pattern); vector<INDEX_CHARACTER_TYPE> hash = _hashGenerator->generateHash(pattern);
saidx_t patternLength = hash.size(); saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
sauchar_t * patternArray = new sauchar_t[patternLength]; sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
int i = 0;
for (vector<sauchar_t>::iterator it = hash.begin();
it != hash.end(); ++it) {
patternArray[i] = *it;
i++;
}
int size = sa_search(_T, (saidx_t) _n, int size = sa_search(_T, (saidx_t) _n,
(const sauchar_t *) patternArray, patternLength, (const sauchar_t *) patternArray, patternLength,
_SA, (saidx_t) _n, &left); _SA, (saidx_t) _n, &left);
for (i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
result.push_back(_SA[left + i]); saidx_t result_pos = _SA[left + i];
if (result_pos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
// As we are looking for a pattern in an array of higher
// resolution than the hashed index file, we might
// obtain accidental results exceeding the boundaries
// of characters in hashed index. The above check
// removes these accidental results.
result.push_back(result_pos / sizeof(INDEX_CHARACTER_TYPE));
}
} }
delete[] patternArray; delete[] patternArray;

View File

@ -5,6 +5,7 @@
#include <fstream> #include <fstream>
#include <iostream> #include <iostream>
#include "concordia/common/config.hpp"
#include "build/libdivsufsort/include/divsufsort.h" #include "build/libdivsufsort/include/divsufsort.h"
#include "concordia/hash_generator.hpp" #include "concordia/hash_generator.hpp"
#include "concordia/concordia_exception.hpp" #include "concordia/concordia_exception.hpp"
@ -39,7 +40,7 @@ private:
saidx_t * _SA; saidx_t * _SA;
size_t _n; saidx_t _n;
}; };
#endif #endif

View File

@ -1,4 +1,5 @@
add_library(concordia-tests add_library(concordia-tests
test_utils.cpp
test_word_map.cpp test_word_map.cpp
test_hash_generator.cpp test_hash_generator.cpp
test_concordia_index.cpp test_concordia_index.cpp

View File

@ -54,6 +54,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
expectedResult1.push_back(7); expectedResult1.push_back(7);
expectedResult1.push_back(4); expectedResult1.push_back(4);
concordia.loadIndex();
vector<saidx_t> searchResult1 = concordia.simpleSearch("ma rysia"); vector<saidx_t> searchResult1 = concordia.simpleSearch("ma rysia");
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
@ -68,10 +69,12 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 ) BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
{ {
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
concordia.addSentence("to jest okno"); vector<string> testSentences;
concordia.addSentence("czy jest okno otwarte"); testSentences.push_back("to jest okno");
concordia.addSentence("chyba to jest tutaj"); testSentences.push_back("czy jest okno otwarte");
concordia.addSentence("to jest"); testSentences.push_back("chyba to jest tutaj");
testSentences.push_back("to jest");
concordia.addAllSentences(testSentences);
concordia.generateIndex(); concordia.generateIndex();
@ -109,6 +112,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
expectedResult2.push_back(1); expectedResult2.push_back(1);
expectedResult2.push_back(4); expectedResult2.push_back(4);
concordia.loadIndex();
vector<saidx_t> searchResult1 = concordia.simpleSearch("to jest"); vector<saidx_t> searchResult1 = concordia.simpleSearch("to jest");
vector<saidx_t> searchResult2 = concordia.simpleSearch("jest okno"); vector<saidx_t> searchResult2 = concordia.simpleSearch("jest okno");

View File

@ -58,7 +58,6 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
index.addSentence("Marysia ma rysia"); index.addSentence("Marysia ma rysia");
index.generateSuffixArray(); index.generateSuffixArray();
index.serializeWordMap();
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"))); BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_word_map.bin")));
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"))); BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin")));

View File

@ -2,6 +2,7 @@
#include "tests/unit-tests/unit_tests_globals.hpp" #include "tests/unit-tests/unit_tests_globals.hpp"
#include <string> #include <string>
#include "concordia/common/config.hpp"
#include "concordia/hash_generator.hpp" #include "concordia/hash_generator.hpp"
#define TEST_WORD_MAP_PATH "/tmp/test_word_map.bin" #define TEST_WORD_MAP_PATH "/tmp/test_word_map.bin"
@ -18,8 +19,8 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH); HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH);
vector<sauchar_t> hash = hashGenerator.generateHash("Ala ma kota"); vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala ma kota");
vector<sauchar_t> expected; vector<INDEX_CHARACTER_TYPE> expected;
expected.push_back(0); expected.push_back(0);
expected.push_back(1); expected.push_back(1);
expected.push_back(2); expected.push_back(2);
@ -34,8 +35,8 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
} }
HashGenerator hashGenerator1 = HashGenerator(TEST_WORD_MAP_PATH); HashGenerator hashGenerator1 = HashGenerator(TEST_WORD_MAP_PATH);
vector<sauchar_t> hash1 = hashGenerator1.generateHash("Ala ma kota"); vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala ma kota");
vector<sauchar_t> expected1; vector<INDEX_CHARACTER_TYPE> expected1;
expected1.push_back(0); expected1.push_back(0);
expected1.push_back(1); expected1.push_back(1);
expected1.push_back(2); expected1.push_back(2);
@ -44,8 +45,8 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
hashGenerator1.serializeWordMap(); hashGenerator1.serializeWordMap();
HashGenerator hashGenerator2 = HashGenerator(TEST_WORD_MAP_PATH); HashGenerator hashGenerator2 = HashGenerator(TEST_WORD_MAP_PATH);
vector<sauchar_t> hash2 = hashGenerator2.generateHash("Ala ma psa"); vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala ma psa");
vector<sauchar_t> expected2; vector<INDEX_CHARACTER_TYPE> expected2;
expected2.push_back(0); expected2.push_back(0);
expected2.push_back(1); expected2.push_back(1);
expected2.push_back(3); expected2.push_back(3);

View File

@ -24,7 +24,6 @@ ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_word_map
index.addSentence("Marysia ma rysia"); index.addSentence("Marysia ma rysia");
index.generateSuffixArray(); index.generateSuffixArray();
index.serializeWordMap();
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_word_map.bin"))); BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_word_map.bin")));
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"))); BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin")));

161
concordia/t/test_utils.cpp Normal file
View File

@ -0,0 +1,161 @@
#include "tests/unit-tests/unit_tests_globals.hpp"
#include "concordia/common/utils.hpp"
#include "concordia/common/config.hpp"
#include "tests/common/test_resources_manager.hpp"
#include "build/libdivsufsort/include/divsufsort.h"
#include <boost/filesystem.hpp>
#include <string>
using namespace std;
BOOST_AUTO_TEST_SUITE(utils)
// Round trip: a value written with writeIndexCharacter() must come back
// unchanged from readIndexCharacter().
BOOST_AUTO_TEST_CASE( UtilsTest1 )
{
    INDEX_CHARACTER_TYPE testCharacter = 123456789; //in hex: 75BCD15

    // Write the value to a temporary binary file.
    ofstream writer(
        TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),
        ios::out|ios::binary);
    Utils::writeIndexCharacter(writer,testCharacter);
    writer.close();

    // Read it back from the same file and compare.
    ifstream reader(
        TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),
        ios::in|ios::binary);
    INDEX_CHARACTER_TYPE retrievedCharacter = Utils::readIndexCharacter(reader);
    BOOST_CHECK_EQUAL(retrievedCharacter, testCharacter);
    reader.close();

    boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","temp_file.bin"));
}
// Writes two index characters to a file, reads them back and unpacks
// them into a sauchar_t array with insertCharToSaucharArray(), then
// checks the array byte by byte.
// The expected bytes below are the little-endian layout of a 4-byte
// INDEX_CHARACTER_TYPE, as spelled out in the "in memory" comments.
BOOST_AUTO_TEST_CASE( UtilsTest2 )
{
    ofstream testFileOutput;
    testFileOutput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),
                        ios::out|ios::binary);
    Utils::writeIndexCharacter(testFileOutput,123456789); //in hex: 75BCD15
                                                          //in memory: 15 cd 5b 07
                                                          // in DEC: 21 205 91 7

    Utils::writeIndexCharacter(testFileOutput,987654321); //in hex: 3ADE68B1
                                                          //in memory: b1 68 de 3a
                                                          // in DEC: 177 104 222 58
    testFileOutput.close();

    sauchar_t * dataArray = new sauchar_t[8];
    ifstream testFileInput;
    testFileInput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),ios::in|ios::binary);

    INDEX_CHARACTER_TYPE retrievedCharacter1 = Utils::readIndexCharacter(testFileInput);
    BOOST_CHECK_EQUAL(retrievedCharacter1, 123456789);
    Utils::insertCharToSaucharArray(dataArray, retrievedCharacter1, 0);

    INDEX_CHARACTER_TYPE retrievedCharacter2 = Utils::readIndexCharacter(testFileInput);
    BOOST_CHECK_EQUAL(retrievedCharacter2, 987654321);
    Utils::insertCharToSaucharArray(dataArray, retrievedCharacter2, 4);

    testFileInput.close();

    vector<INDEX_CHARACTER_TYPE> expected;
    expected.push_back(21);
    expected.push_back(205);
    expected.push_back(91);
    expected.push_back(7);
    expected.push_back(177);
    expected.push_back(104);
    expected.push_back(222);
    expected.push_back(58);

    vector<INDEX_CHARACTER_TYPE> result;
    for (int i=0;i<8;i++) {
        INDEX_CHARACTER_TYPE a = dataArray[i];
        result.push_back(a);
    }
    // The bytes have been copied into result; release the buffer, which
    // was previously leaked.
    delete[] dataArray;

    boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","temp_file.bin"));

    BOOST_CHECK_EQUAL_COLLECTIONS(result.begin(), result.end(), expected.begin(), expected.end());
}
// Converts a two-element vector with indexVectorToSaucharArray() and
// checks the resulting array byte by byte.
// The expected bytes are the little-endian layout of the two values
// for a 4-byte INDEX_CHARACTER_TYPE (15 cd 5b 07 and b1 68 de 3a).
BOOST_AUTO_TEST_CASE( UtilsTest3 )
{
    vector<INDEX_CHARACTER_TYPE> hash;
    hash.push_back(123456789);
    hash.push_back(987654321);

    sauchar_t * dataArray = Utils::indexVectorToSaucharArray(hash);

    vector<INDEX_CHARACTER_TYPE> result;
    for (int i=0;i<8;i++) {
        INDEX_CHARACTER_TYPE a = dataArray[i];
        result.push_back(a);
    }
    // indexVectorToSaucharArray() allocates with new[] and hands the
    // array to the caller; release it here instead of leaking it.
    delete[] dataArray;

    vector<INDEX_CHARACTER_TYPE> expected;
    expected.push_back(21);
    expected.push_back(205);
    expected.push_back(91);
    expected.push_back(7);
    expected.push_back(177);
    expected.push_back(104);
    expected.push_back(222);
    expected.push_back(58);

    BOOST_CHECK_EQUAL_COLLECTIONS(result.begin(), result.end(), expected.begin(), expected.end());
}
/*
BOOST_AUTO_TEST_CASE( UtilsTest4 )
{
ofstream testFileOutput;
testFileOutput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),
ios::out|ios::binary);
Utils::writeIndexCharacter(testFileOutput,123456789); //in hex: 75BCD15
//in memory: 15 cd 5b 07
// in DEC: 21 205 91 7
Utils::writeIndexCharacter(testFileOutput,987654321); //in hex: 3ADE68B1
//in memory: b1 68 de 3a
// in DEC: 177 104 222 58
testFileOutput.close();
sauchar_t * dataArray = Utils::readIndexFromFile(
ifstream testFileInput;
testFileInput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),ios::in|ios::binary);
INDEX_CHARACTER_TYPE retrievedCharacter1 = Utils::readIndexCharacter(testFileInput);
BOOST_CHECK_EQUAL(retrievedCharacter1, 123456789);
Utils::insertCharToSaucharArray(dataArray, retrievedCharacter1, 0);
INDEX_CHARACTER_TYPE retrievedCharacter2 = Utils::readIndexCharacter(testFileInput);
BOOST_CHECK_EQUAL(retrievedCharacter2, 987654321);
Utils::insertCharToSaucharArray(dataArray, retrievedCharacter2, 4);
testFileInput.close();
vector<INDEX_CHARACTER_TYPE> expected;
expected.push_back(21);
expected.push_back(205);
expected.push_back(91);
expected.push_back(7);
expected.push_back(177);
expected.push_back(104);
expected.push_back(222);
expected.push_back(58);
vector<INDEX_CHARACTER_TYPE> result;
for (int i=0;i<8;i++) {
INDEX_CHARACTER_TYPE a = dataArray[i];
result.push_back(a);
}
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","temp_file.bin"));
BOOST_CHECK_EQUAL_COLLECTIONS(result.begin(), result.end(), expected.begin(), expected.end());
}
*/
BOOST_AUTO_TEST_SUITE_END()

View File

@ -8,9 +8,9 @@ WordMap::WordMap() throw(ConcordiaException) {
WordMap::~WordMap() { WordMap::~WordMap() {
} }
sauchar_t WordMap::getWordCode(const string & word) { INDEX_CHARACTER_TYPE WordMap::getWordCode(const string & word) {
if (_map.find(word) == _map.end()) { if (_map.find(word) == _map.end()) {
sauchar_t newCode = _nextFree; INDEX_CHARACTER_TYPE newCode = _nextFree;
_map[word] = newCode; _map[word] = newCode;
_nextFree++; _nextFree++;
return newCode; return newCode;

View File

@ -4,14 +4,11 @@
#include <string> #include <string>
#include <map> #include <map>
#include "concordia/concordia_exception.hpp" #include "concordia/concordia_exception.hpp"
#include "concordia/common/config.hpp"
#include <boost/archive/text_oarchive.hpp> #include <boost/archive/text_oarchive.hpp>
#include <boost/archive/text_iarchive.hpp> #include <boost/archive/text_iarchive.hpp>
#include <boost/serialization/map.hpp> #include <boost/serialization/map.hpp>
#include "build/libdivsufsort/include/divsufsort.h"
/*! /*!
Class representing dictionary for word to int encoding. Class representing dictionary for word to int encoding.
@ -27,7 +24,7 @@ public:
*/ */
virtual ~WordMap(); virtual ~WordMap();
sauchar_t getWordCode(const string & word); INDEX_CHARACTER_TYPE getWordCode(const string & word);
private: private:
friend class boost::serialization::access; friend class boost::serialization::access;
@ -39,9 +36,9 @@ private:
ar & _nextFree; ar & _nextFree;
} }
map<string, sauchar_t> _map; map<string, INDEX_CHARACTER_TYPE> _map;
sauchar_t _nextFree; INDEX_CHARACTER_TYPE _nextFree;
}; };
#endif #endif

View File

@ -56,8 +56,18 @@ endif(HAVE_INLINE)
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/config.h.cmake" "${CMAKE_CURRENT_BINARY_DIR}/config.h") configure_file("${CMAKE_CURRENT_SOURCE_DIR}/config.h.cmake" "${CMAKE_CURRENT_BINARY_DIR}/config.h")
## Checks for types ## ## Checks for types ##
# sauchar_t (32bit) # sauchar_t (8bit)
set(SAUCHAR_TYPE "int") check_type_size("uint8_t" UINT8_T)
# Select the C type substituted for sauchar_t (SAUCHAR_TYPE).
# HAVE_UINT8_T is the result of the preceding check_type_size("uint8_t" UINT8_T).
if(HAVE_UINT8_T)
set(SAUCHAR_TYPE "uint8_t")
else(HAVE_UINT8_T)
# Fallback: accept "unsigned char" only when its reported size is exactly 1.
check_type_size("unsigned char" SIZEOF_UNSIGNED_CHAR)
if("${SIZEOF_UNSIGNED_CHAR}" STREQUAL "1")
set(SAUCHAR_TYPE "unsigned char")
else("${SIZEOF_UNSIGNED_CHAR}" STREQUAL "1")
# Neither candidate is usable: abort configuration.
message(FATAL_ERROR "Cannot find unsigned 8-bit integer type")
endif("${SIZEOF_UNSIGNED_CHAR}" STREQUAL "1")
endif(HAVE_UINT8_T)
# saint_t (32bit) # saint_t (32bit)
check_type_size("int32_t" INT32_T) check_type_size("int32_t" INT32_T)
if(HAVE_INT32_T) if(HAVE_INT32_T)

View File

@ -6,6 +6,21 @@
#Path to the Puddle tagset #Path to the Puddle tagset
puddle_tagset_path = "@PROD_PUDDLE_TAGSET_PATH@"; puddle_tagset_path = "@PROD_PUDDLE_TAGSET_PATH@";
#-------------------------------------------------------------------------------
#Word map, hashed index and suffix array files are in a temporary directory
#and should be deleted at the end of each test procedure.
#Word map file containing unique codes for tokens
word_map_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
#File containing the "text" for suffix array searching, i.e. sequence of codes
hashed_index_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_HASHED_INDEX@"
#Binarized suffix array
suffix_array_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_SUFFIX_ARRAY@"
#-------------------------------------------------------------------------------
### eof ### eof

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,3 @@
Ala ma kota
Ala ma rysia
Marysia ma rysia