concordia-console, new approach to suffix array - 4 sauchars per one saidx
This commit is contained in:
parent
7c1ed7fb6e
commit
47405834a3
2
.gitignore
vendored
2
.gitignore
vendored
@ -5,5 +5,5 @@ prod/resources/concordia-config/concordia.cfg
|
||||
concordia/common/config.hpp
|
||||
tests/resources/concordia-config/concordia.cfg
|
||||
tests/resources/temp
|
||||
|
||||
prod/resources/temp
|
||||
|
||||
|
@ -6,6 +6,10 @@ project(concordia C CXX)
|
||||
set (CONCORDIA_VERSION_MAJOR 0)
|
||||
set (CONCORDIA_VERSION_MINOR 1)
|
||||
|
||||
# Type of the characters in index
|
||||
|
||||
set (INDEX_CHARACTER_TYPE "unsigned int")
|
||||
|
||||
# ============================== #
|
||||
# Production paths
|
||||
# ============================== #
|
||||
@ -25,7 +29,7 @@ set (TEMP_HASHED_INDEX "temp_hashed_index.bin")
|
||||
set (TEMP_SUFFIX_ARRAY "temp_suffix_array.bin")
|
||||
|
||||
file(MAKE_DIRECTORY ${TEST_RESOURCES_DIRECTORY}/temp)
|
||||
|
||||
file(MAKE_DIRECTORY ${PROD_RESOURCES_DIRECTORY}/temp)
|
||||
|
||||
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
|
||||
|
||||
@ -112,6 +116,10 @@ configure_file (
|
||||
"${concordia_SOURCE_DIR}/tests/resources/concordia-config/concordia.cfg"
|
||||
)
|
||||
|
||||
configure_file (
|
||||
"${concordia_SOURCE_DIR}/prod/resources/concordia-config/concordia.cfg.in"
|
||||
"${concordia_SOURCE_DIR}/prod/resources/concordia-config/concordia.cfg"
|
||||
)
|
||||
|
||||
# ================================================
|
||||
# Concordia: sub-projects
|
||||
|
@ -2,10 +2,15 @@
|
||||
#include <fstream>
|
||||
#include <boost/program_options.hpp>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <boost/date_time/posix_time/posix_time.hpp>
|
||||
|
||||
#include "concordia/concordia.hpp"
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/common/utils.hpp"
|
||||
#include "build/libdivsufsort/include/divsufsort.h"
|
||||
|
||||
#define READ_BUFFER_LENGTH 1000
|
||||
|
||||
namespace po = boost::program_options;
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
@ -14,7 +19,14 @@ int main(int argc, char** argv) {
|
||||
desc.add_options()
|
||||
("help,h", "Display this message")
|
||||
("config,c", boost::program_options::value<std::string>(),
|
||||
"Concordia configuration file (required)");
|
||||
"Concordia configuration file (required)")
|
||||
("generate-index,g", "Generate suffix array based index out of "
|
||||
"added sentences")
|
||||
("load-index,l", "Load the generated index for searching")
|
||||
("simple-search,s", boost::program_options::value<std::string>(),
|
||||
"Pattern to be searched in the index")
|
||||
("read-file,r", boost::program_options::value<std::string>(),
|
||||
"File to be read and added to index");
|
||||
|
||||
po::variables_map cli;
|
||||
po::store(po::parse_command_line(argc, argv, desc), cli);
|
||||
@ -38,7 +50,90 @@ int main(int argc, char** argv) {
|
||||
try {
|
||||
Concordia concordia(configFile);
|
||||
std::cout << "Welcome to Concordia. Version = "
|
||||
<< concordia.getVersion() << endl;
|
||||
<< concordia.getVersion() << std::endl;
|
||||
if (cli.count("generate-index")) {
|
||||
std::cout << "\tGenerating index..." << std::endl;
|
||||
boost::posix_time::ptime time_start =
|
||||
boost::posix_time::microsec_clock::local_time();
|
||||
concordia.generateIndex();
|
||||
boost::posix_time::ptime time_end =
|
||||
boost::posix_time::microsec_clock::local_time();
|
||||
boost::posix_time::time_duration msdiff = time_end - time_start;
|
||||
std::cout << "\tIndex generated in: " <<
|
||||
msdiff.total_milliseconds() << "ms." << std::endl;
|
||||
} else if (cli.count("load-index")) {
|
||||
std::cout << "\tLoading index..." << std::endl;
|
||||
boost::posix_time::ptime time_start =
|
||||
boost::posix_time::microsec_clock::local_time();
|
||||
concordia.loadIndex();
|
||||
boost::posix_time::ptime time_end =
|
||||
boost::posix_time::microsec_clock::local_time();
|
||||
boost::posix_time::time_duration msdiff = time_end - time_start;
|
||||
std::cout << "\tIndex loaded in: " <<
|
||||
msdiff.total_milliseconds() << "ms." << std::endl;
|
||||
} else if (cli.count("simple-search")) {
|
||||
std::string pattern = cli["simple-search"].as<std::string>();
|
||||
std::cout << "\tSearching for pattern: \"" << pattern <<
|
||||
"\"" << std::endl;
|
||||
} else if (cli.count("read-file")) {
|
||||
std::string filePath = cli["read-file"].as<std::string>();
|
||||
std::cout << "\tReading sentences from file: " << filePath <<
|
||||
std::endl;
|
||||
ifstream text_file(filePath.c_str());
|
||||
std::string line;
|
||||
if (text_file.is_open()) {
|
||||
long lineCount = 0;
|
||||
vector<std::string> buffer;
|
||||
boost::posix_time::ptime timeStart =
|
||||
boost::posix_time::microsec_clock::local_time();
|
||||
while (getline(text_file, line)) {
|
||||
lineCount++;
|
||||
buffer.push_back(line);
|
||||
if (lineCount % READ_BUFFER_LENGTH == 0) {
|
||||
concordia.addAllSentences(buffer);
|
||||
buffer.clear();
|
||||
boost::posix_time::ptime timeEnd =
|
||||
boost::posix_time::microsec_clock::local_time();
|
||||
boost::posix_time::time_duration msdiff =
|
||||
timeEnd - timeStart;
|
||||
long timeElapsed = msdiff.total_milliseconds();
|
||||
double speed = static_cast<double>(
|
||||
1000 * lineCount / timeElapsed);
|
||||
std::cout << "\tRead and added to index " <<
|
||||
lineCount << " sentences in " << timeElapsed
|
||||
<< "ms. Current speed: " << speed <<
|
||||
" sentences per second" << std::endl;
|
||||
}
|
||||
}
|
||||
if (buffer.size() > 0) {
|
||||
concordia.addAllSentences(buffer);
|
||||
}
|
||||
text_file.close();
|
||||
boost::posix_time::ptime timeTotalEnd =
|
||||
boost::posix_time::microsec_clock::local_time();
|
||||
boost::posix_time::time_duration totalMsdiff =
|
||||
timeTotalEnd - timeStart;
|
||||
long totalTimeElapsed = totalMsdiff.total_milliseconds();
|
||||
double totalSpeed =
|
||||
static_cast<double>(1000 * lineCount / totalTimeElapsed);
|
||||
std::cout << "\tReading finished. Read and added to index "
|
||||
<< lineCount << " sentences in " << totalTimeElapsed <<
|
||||
"ms. Overall speed: " << totalSpeed <<
|
||||
" sentences per second" << std::endl;
|
||||
} else {
|
||||
std::cerr << "Unable to open file: "<< filePath;
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
std::cerr << "One of the options: generate-index, simple-search, "
|
||||
<< "read-file must be provided. See the "
|
||||
"options specification: "
|
||||
<< std::endl << desc << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::cout << "Concordia operation completed without errors."
|
||||
<< std::endl;
|
||||
} catch(ConcordiaException & e) {
|
||||
std::cerr << "ConcordiaException caught with message: "
|
||||
<< std::endl
|
||||
@ -48,7 +143,7 @@ int main(int argc, char** argv) {
|
||||
<< std::endl;
|
||||
return 1;
|
||||
} catch(exception & e) {
|
||||
std::cerr << "Exception caught with message: "
|
||||
std::cerr << "Unexpected exception caught with message: "
|
||||
<< std::endl
|
||||
<< e.what()
|
||||
<< std::endl
|
||||
|
@ -2,5 +2,9 @@
|
||||
|
||||
echo "Running Concordia"
|
||||
|
||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg
|
||||
rm prod/resources/temp/*
|
||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -r prod/resources/text-files/medium.txt
|
||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -g
|
||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -l
|
||||
#./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Ala ma chyba kota"
|
||||
|
||||
|
@ -14,6 +14,7 @@ add_library(concordia SHARED
|
||||
concordia_config.cpp
|
||||
concordia_exception.cpp
|
||||
common/logging.cpp
|
||||
common/utils.cpp
|
||||
)
|
||||
|
||||
add_subdirectory(t)
|
||||
@ -22,6 +23,18 @@ add_subdirectory(t)
|
||||
install(TARGETS concordia DESTINATION lib/)
|
||||
install(FILES concordia.hpp DESTINATION include/concordia/)
|
||||
|
||||
# ----------------------------------------------------
|
||||
# libconfig
|
||||
# ----------------------------------------------------
|
||||
find_library(LIBCONFIG_LIB NAMES config++ REQUIRED)
|
||||
find_path(LIBCONFIG_INCLUDE libconfig.h++)
|
||||
|
||||
if(EXISTS ${LIBCONFIG_LIB} AND EXISTS ${LIBCONFIG_INCLUDE})
|
||||
message(STATUS "Found Libconfig")
|
||||
include_directories(${LIBCONFIG_INCLUDE})
|
||||
link_directories(${LIBCONFIG_LIB})
|
||||
endif(EXISTS ${LIBCONFIG_LIB} AND EXISTS ${LIBCONFIG_INCLUDE})
|
||||
|
||||
target_link_libraries(concordia log4cpp)
|
||||
target_link_libraries(concordia ${LIBSTEMMER_LIB})
|
||||
target_link_libraries(concordia ${Boost_LIBRARIES})
|
||||
|
@ -15,3 +15,6 @@
|
||||
|
||||
#define LEXICON_TEXT_FIELD_SEPARATORS "\t "
|
||||
#define LEXICON_FIELD_SEPARATOR "\t"
|
||||
|
||||
typedef @INDEX_CHARACTER_TYPE@ INDEX_CHARACTER_TYPE;
|
||||
|
||||
|
41
concordia/common/utils.cpp
Normal file
41
concordia/common/utils.cpp
Normal file
@ -0,0 +1,41 @@
|
||||
#include "concordia/common/utils.hpp"
|
||||
|
||||
/*! Constructor.
  Nothing to initialise: the class only groups static helper functions.
*/
Utils::Utils() {}
|
||||
|
||||
/*! Destructor.
  No resources are held, so there is nothing to release.
*/
Utils::~Utils() {}
|
||||
|
||||
/*! Writes a single index character to a binary output stream.
  The object representation of the character is written verbatim,
  i.e. sizeof(INDEX_CHARACTER_TYPE) bytes in host byte order.
  \param file output stream the character is written to
  \param character the value to be written
*/
void Utils::writeIndexCharacter(ofstream & file,
                                INDEX_CHARACTER_TYPE character) {
    char * rawBytes = reinterpret_cast<char *>(&character);
    file.write(rawBytes, sizeof(character));
}
|
||||
|
||||
/*! Reads a single index character from a binary input stream.
  Exactly sizeof(INDEX_CHARACTER_TYPE) bytes are requested and copied
  into the object representation of the result (host byte order),
  mirroring writeIndexCharacter().
  \param file input stream to read from
  \returns the character read; if the stream could not supply any data
           (e.g. it is already at end of file) zero is returned and the
           stream's fail/eof state is set, so callers should check the
           stream state after the call.
*/
INDEX_CHARACTER_TYPE Utils::readIndexCharacter(ifstream & file) {
    // Value-initialise so that a failed read never returns an
    // indeterminate value (read() leaves the buffer untouched on failure).
    INDEX_CHARACTER_TYPE character = INDEX_CHARACTER_TYPE();
    file.read(reinterpret_cast<char *>(&character), sizeof(character));
    return character;
}
|
||||
|
||||
/*! Flattens a vector of index characters into a newly allocated
  sauchar_t array.
  Every element of the input occupies sizeof(INDEX_CHARACTER_TYPE)
  consecutive sauchar_t cells, laid out by insertCharToSaucharArray().
  \param input index characters to convert
  \returns array of input.size()*sizeof(INDEX_CHARACTER_TYPE) elements
           allocated with new[]; the caller owns it and must delete[] it.
*/
sauchar_t * Utils::indexVectorToSaucharArray(
                                vector<INDEX_CHARACTER_TYPE> & input) {
    const int kArraySize = input.size()*sizeof(INDEX_CHARACTER_TYPE);
    sauchar_t * flattened = new sauchar_t[kArraySize];

    // 'offset' is the position in 'flattened' where the next character
    // starts; it advances by one character width per element.
    int offset = 0;
    for (vector<INDEX_CHARACTER_TYPE>::size_type idx = 0;
         idx < input.size(); ++idx) {
        insertCharToSaucharArray(flattened, input[idx], offset);
        offset += sizeof(INDEX_CHARACTER_TYPE);
    }
    return flattened;
}
|
||||
|
||||
/*! Copies the bytes of one index character into a sauchar_t array.
  The sizeof(INDEX_CHARACTER_TYPE) bytes of the character's object
  representation are stored in array[pos] .. array[pos+sizeof-1].
  \param array destination array; must have room for the whole character
  \param character the value whose bytes are copied
  \param pos index in array of the first byte to be written
*/
void Utils::insertCharToSaucharArray(sauchar_t * array,
                                     INDEX_CHARACTER_TYPE character, int pos) {
    sauchar_t * bytes = reinterpret_cast<sauchar_t *>(&character);
    int i = pos;
    while (i < pos+sizeof(character)) {
        array[i] = bytes[i-pos];
        i++;
    }
}
|
||||
|
36
concordia/common/utils.hpp
Normal file
36
concordia/common/utils.hpp
Normal file
@ -0,0 +1,36 @@
|
||||
#ifndef UTILS_HDR
|
||||
#define UTILS_HDR
|
||||
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
#include "build/libdivsufsort/include/divsufsort.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
class Utils {
|
||||
public:
|
||||
explicit Utils();
|
||||
|
||||
/*! Destructor.
|
||||
*/
|
||||
virtual ~Utils();
|
||||
|
||||
static void writeIndexCharacter(ofstream & file,
|
||||
INDEX_CHARACTER_TYPE character);
|
||||
|
||||
static INDEX_CHARACTER_TYPE readIndexCharacter(ifstream & file);
|
||||
|
||||
static sauchar_t * indexVectorToSaucharArray(
|
||||
vector<INDEX_CHARACTER_TYPE> & input);
|
||||
|
||||
static void insertCharToSaucharArray(sauchar_t * array,
|
||||
INDEX_CHARACTER_TYPE character, int pos);
|
||||
private:
|
||||
};
|
||||
|
||||
#endif
|
@ -46,9 +46,16 @@ void Concordia::addSentence(const std::string & sentence)
|
||||
_index->addSentence(sentence);
|
||||
}
|
||||
|
||||
// Adds a batch of sentences to the index in one call by forwarding
// the whole vector to the underlying ConcordiaIndex.
void Concordia::addAllSentences(vector<std::string> & sentences)
                                             throw(ConcordiaException) {
    _index->addAllSentences(sentences);
}
|
||||
|
||||
void Concordia::generateIndex() throw(ConcordiaException) {
|
||||
_index->generateSuffixArray();
|
||||
_index->serializeWordMap();
|
||||
}
|
||||
|
||||
void Concordia::loadIndex() throw(ConcordiaException) {
|
||||
_searcher->loadIndex(_config->getWordMapFilePath(),
|
||||
_config->getHashedIndexFilePath(),
|
||||
_config->getSuffixArrayFilePath());
|
||||
|
@ -35,8 +35,13 @@ public:
|
||||
|
||||
void addSentence(const std::string & sentence) throw(ConcordiaException);
|
||||
|
||||
void addAllSentences(vector<std::string> & sentences)
|
||||
throw(ConcordiaException);
|
||||
|
||||
void generateIndex() throw(ConcordiaException);
|
||||
|
||||
void loadIndex() throw(ConcordiaException);
|
||||
|
||||
std::vector<saidx_t> simpleSearch(const std::string & pattern)
|
||||
throw(ConcordiaException);
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include "concordia/concordia_index.hpp"
|
||||
|
||||
#include "concordia/common/utils.hpp"
|
||||
#include <boost/filesystem.hpp>
|
||||
#include <iostream>
|
||||
|
||||
@ -27,36 +28,36 @@ ConcordiaIndex::ConcordiaIndex(const string & wordMapFilePath,
|
||||
ConcordiaIndex::~ConcordiaIndex() {
|
||||
}
|
||||
|
||||
void ConcordiaIndex::serializeWordMap() {
|
||||
void ConcordiaIndex::_serializeWordMap() {
|
||||
_hashGenerator->serializeWordMap();
|
||||
}
|
||||
|
||||
void ConcordiaIndex::generateSuffixArray() {
|
||||
if (boost::filesystem::exists(_hashedIndexFilePath.c_str())) {
|
||||
ifstream hashedIndexFile;
|
||||
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::in|
|
||||
ios::ate|ios::binary);
|
||||
|
||||
/* Get the file size. */
|
||||
long n = hashedIndexFile.tellg() / sizeof(sauchar_t);
|
||||
|
||||
saidx_t n = hashedIndexFile.tellg();
|
||||
if (n > 0) {
|
||||
sauchar_t *T;
|
||||
saidx_t *SA;
|
||||
|
||||
T = new sauchar_t[n];
|
||||
SA = new saidx_t[n];
|
||||
|
||||
/* Read n bytes of data. */
|
||||
hashedIndexFile.seekg(0, ios::beg);
|
||||
|
||||
sauchar_t buff;
|
||||
T = new sauchar_t[n];
|
||||
int pos = 0;
|
||||
while (!hashedIndexFile.eof()) {
|
||||
hashedIndexFile.read(reinterpret_cast<char *>(&buff),
|
||||
sizeof(sauchar_t));
|
||||
T[pos++] = buff;
|
||||
INDEX_CHARACTER_TYPE character =
|
||||
Utils::readIndexCharacter(hashedIndexFile);
|
||||
Utils::insertCharToSaucharArray(T, character, pos);
|
||||
pos+=sizeof(character);
|
||||
}
|
||||
hashedIndexFile.close();
|
||||
|
||||
SA = new saidx_t[n];
|
||||
|
||||
/* Construct the suffix array. */
|
||||
if (divsufsort(T, SA, (saidx_t)n) != 0) {
|
||||
throw ConcordiaException("Error creating suffix array.");
|
||||
@ -65,7 +66,8 @@ void ConcordiaIndex::generateSuffixArray() {
|
||||
/* Write the suffix array. */
|
||||
|
||||
ofstream suffixArrayFile;
|
||||
suffixArrayFile.open(_suffixArrayFilePath.c_str(), ios::out|ios::binary);
|
||||
suffixArrayFile.open(_suffixArrayFilePath.c_str(),
|
||||
ios::out|ios::binary);
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
suffixArrayFile.write(reinterpret_cast<char *>(&SA[i]),
|
||||
@ -76,19 +78,44 @@ void ConcordiaIndex::generateSuffixArray() {
|
||||
/* Deallocate memory. */
|
||||
delete[] T;
|
||||
delete[] SA;
|
||||
} else {
|
||||
throw ConcordiaException("Can not generate suffix array: "
|
||||
"hashed index file is empty");
|
||||
}
|
||||
} else {
|
||||
throw ConcordiaException("Can not generate suffix array: "
|
||||
"hashed index file does not exist");
|
||||
}
|
||||
}
|
||||
|
||||
void ConcordiaIndex::addSentence(const string & sentence) {
|
||||
vector<sauchar_t> hash = _hashGenerator->generateHash(sentence);
|
||||
vector<INDEX_CHARACTER_TYPE> hash = _hashGenerator->generateHash(sentence);
|
||||
ofstream hashedIndexFile;
|
||||
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
|
||||
ios::app|ios::binary);
|
||||
for (vector<sauchar_t>::iterator it = hash.begin();
|
||||
for (vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
|
||||
it != hash.end(); ++it) {
|
||||
sauchar_t buff = *it;
|
||||
hashedIndexFile.write(reinterpret_cast<char *>(&buff),
|
||||
sizeof(sauchar_t));
|
||||
Utils::writeIndexCharacter(hashedIndexFile, *it);
|
||||
}
|
||||
hashedIndexFile.close();
|
||||
_serializeWordMap();
|
||||
}
|
||||
|
||||
// Appends the hashes of all given sentences to the hashed index file
// and then serializes the word map once for the whole batch.
// The file is opened a single time in binary append mode, so existing
// content is preserved and the new characters follow it.
void ConcordiaIndex::addAllSentences(vector<std::string> & sentences) {
    ofstream indexOut(_hashedIndexFilePath.c_str(),
                      ios::out|ios::app|ios::binary);

    for (vector<string>::size_type s = 0; s < sentences.size(); ++s) {
        // One index character per token of the sentence.
        vector<INDEX_CHARACTER_TYPE> codes =
            _hashGenerator->generateHash(sentences[s]);
        for (vector<INDEX_CHARACTER_TYPE>::size_type c = 0;
             c < codes.size(); ++c) {
            Utils::writeIndexCharacter(indexOut, codes[c]);
        }
    }

    indexOut.close();
    _serializeWordMap();
}
|
||||
|
||||
|
@ -30,11 +30,13 @@ public:
|
||||
|
||||
void addSentence(const string & sentence);
|
||||
|
||||
void serializeWordMap();
|
||||
void addAllSentences(vector<string> & sentences);
|
||||
|
||||
void generateSuffixArray();
|
||||
|
||||
private:
|
||||
void _serializeWordMap();
|
||||
|
||||
boost::shared_ptr<HashGenerator> _hashGenerator;
|
||||
|
||||
string _hashedIndexFilePath;
|
||||
|
@ -20,15 +20,16 @@ HashGenerator::HashGenerator(const string & wordMapFilePath)
|
||||
HashGenerator::~HashGenerator() {
|
||||
}
|
||||
|
||||
vector<sauchar_t> HashGenerator::generateHash(const string & sentence) {
|
||||
vector<sauchar_t> result;
|
||||
vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
|
||||
const string & sentence) {
|
||||
vector<INDEX_CHARACTER_TYPE> result;
|
||||
vector<string> tokenTexts;
|
||||
boost::split(tokenTexts, sentence, boost::is_any_of(" "));
|
||||
|
||||
for (vector<string>::iterator it = tokenTexts.begin();
|
||||
it != tokenTexts.end(); ++it) {
|
||||
string token = *it;
|
||||
sauchar_t code = _wordMap->getWordCode(token);
|
||||
INDEX_CHARACTER_TYPE code = _wordMap->getWordCode(token);
|
||||
result.push_back(code);
|
||||
}
|
||||
|
||||
|
@ -6,10 +6,9 @@
|
||||
#include <vector>
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include "concordia/word_map.hpp"
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
|
||||
#include "build/libdivsufsort/include/divsufsort.h"
|
||||
|
||||
|
||||
/*!
|
||||
Class for generating a sentence hash.
|
||||
@ -27,7 +26,7 @@ public:
|
||||
*/
|
||||
virtual ~HashGenerator();
|
||||
|
||||
vector<sauchar_t> generateHash(const string & sentence);
|
||||
vector<INDEX_CHARACTER_TYPE> generateHash(const string & sentence);
|
||||
|
||||
void serializeWordMap();
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include "concordia/index_searcher.hpp"
|
||||
|
||||
#include "concordia/common/utils.hpp"
|
||||
#include <boost/filesystem.hpp>
|
||||
|
||||
IndexSearcher::IndexSearcher():
|
||||
@ -38,16 +39,15 @@ void IndexSearcher::loadIndex(const string & wordMapFilepath,
|
||||
ifstream hashedIndexFile;
|
||||
hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::in
|
||||
| ios::ate | ios::binary);
|
||||
_n = hashedIndexFile.tellg() / sizeof(sauchar_t);
|
||||
_T = new sauchar_t[_n];
|
||||
|
||||
_n = hashedIndexFile.tellg();
|
||||
hashedIndexFile.seekg(0, ios::beg);
|
||||
sauchar_t sauchar_buff;
|
||||
_T = new sauchar_t[_n];
|
||||
int pos = 0;
|
||||
while (!hashedIndexFile.eof()) {
|
||||
hashedIndexFile.read(reinterpret_cast<char *>(&sauchar_buff),
|
||||
sizeof(sauchar_t));
|
||||
_T[pos++] = sauchar_buff;
|
||||
INDEX_CHARACTER_TYPE character =
|
||||
Utils::readIndexCharacter(hashedIndexFile);
|
||||
Utils::insertCharToSaucharArray(_T, character, pos);
|
||||
pos+=sizeof(character);
|
||||
}
|
||||
hashedIndexFile.close();
|
||||
|
||||
@ -59,7 +59,8 @@ void IndexSearcher::loadIndex(const string & wordMapFilepath,
|
||||
saidx_t saidx_buff;
|
||||
pos = 0;
|
||||
while (!suffixArrayFile.eof() && pos < _n) {
|
||||
suffixArrayFile.read(reinterpret_cast<char *>(&saidx_buff), sizeof(saidx_t));
|
||||
suffixArrayFile.read(reinterpret_cast<char *>(&saidx_buff),
|
||||
sizeof(saidx_t));
|
||||
_SA[pos++] = saidx_buff;
|
||||
}
|
||||
suffixArrayFile.close();
|
||||
@ -70,20 +71,22 @@ vector<saidx_t> IndexSearcher::simpleSearch(const string & pattern)
|
||||
vector<saidx_t> result;
|
||||
|
||||
int left;
|
||||
vector<sauchar_t> hash = _hashGenerator->generateHash(pattern);
|
||||
saidx_t patternLength = hash.size();
|
||||
sauchar_t * patternArray = new sauchar_t[patternLength];
|
||||
int i = 0;
|
||||
for (vector<sauchar_t>::iterator it = hash.begin();
|
||||
it != hash.end(); ++it) {
|
||||
patternArray[i] = *it;
|
||||
i++;
|
||||
}
|
||||
vector<INDEX_CHARACTER_TYPE> hash = _hashGenerator->generateHash(pattern);
|
||||
saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
|
||||
sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
|
||||
int size = sa_search(_T, (saidx_t) _n,
|
||||
(const sauchar_t *) patternArray, patternLength,
|
||||
_SA, (saidx_t) _n, &left);
|
||||
for (i = 0; i < size; ++i) {
|
||||
result.push_back(_SA[left + i]);
|
||||
for (int i = 0; i < size; ++i) {
|
||||
saidx_t result_pos = _SA[left + i];
|
||||
if (result_pos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
|
||||
// As we are looking for a pattern in an array of higher
|
||||
// resolution than the hashed index file, we might
|
||||
// obtain accidental results exceeding the boundaries
|
||||
// of characters in hashed index. The above check
|
||||
// removes these accidental results.
|
||||
result.push_back(result_pos / sizeof(INDEX_CHARACTER_TYPE));
|
||||
}
|
||||
}
|
||||
|
||||
delete[] patternArray;
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "build/libdivsufsort/include/divsufsort.h"
|
||||
#include "concordia/hash_generator.hpp"
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
@ -39,7 +40,7 @@ private:
|
||||
|
||||
saidx_t * _SA;
|
||||
|
||||
size_t _n;
|
||||
saidx_t _n;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -1,4 +1,5 @@
|
||||
add_library(concordia-tests
|
||||
test_utils.cpp
|
||||
test_word_map.cpp
|
||||
test_hash_generator.cpp
|
||||
test_concordia_index.cpp
|
||||
|
@ -54,6 +54,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
|
||||
expectedResult1.push_back(7);
|
||||
expectedResult1.push_back(4);
|
||||
|
||||
concordia.loadIndex();
|
||||
vector<saidx_t> searchResult1 = concordia.simpleSearch("ma rysia");
|
||||
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
||||
@ -68,10 +69,12 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
|
||||
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
||||
{
|
||||
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||
concordia.addSentence("to jest okno");
|
||||
concordia.addSentence("czy jest okno otwarte");
|
||||
concordia.addSentence("chyba to jest tutaj");
|
||||
concordia.addSentence("to jest");
|
||||
vector<string> testSentences;
|
||||
testSentences.push_back("to jest okno");
|
||||
testSentences.push_back("czy jest okno otwarte");
|
||||
testSentences.push_back("chyba to jest tutaj");
|
||||
testSentences.push_back("to jest");
|
||||
concordia.addAllSentences(testSentences);
|
||||
|
||||
concordia.generateIndex();
|
||||
|
||||
@ -109,6 +112,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
||||
expectedResult2.push_back(1);
|
||||
expectedResult2.push_back(4);
|
||||
|
||||
concordia.loadIndex();
|
||||
vector<saidx_t> searchResult1 = concordia.simpleSearch("to jest");
|
||||
vector<saidx_t> searchResult2 = concordia.simpleSearch("jest okno");
|
||||
|
||||
|
@ -58,7 +58,6 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
|
||||
index.addSentence("Marysia ma rysia");
|
||||
|
||||
index.generateSuffixArray();
|
||||
index.serializeWordMap();
|
||||
|
||||
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_word_map.bin")));
|
||||
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin")));
|
||||
|
@ -2,6 +2,7 @@
|
||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||
#include <string>
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/hash_generator.hpp"
|
||||
|
||||
#define TEST_WORD_MAP_PATH "/tmp/test_word_map.bin"
|
||||
@ -18,8 +19,8 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
|
||||
|
||||
HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH);
|
||||
|
||||
vector<sauchar_t> hash = hashGenerator.generateHash("Ala ma kota");
|
||||
vector<sauchar_t> expected;
|
||||
vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala ma kota");
|
||||
vector<INDEX_CHARACTER_TYPE> expected;
|
||||
expected.push_back(0);
|
||||
expected.push_back(1);
|
||||
expected.push_back(2);
|
||||
@ -34,8 +35,8 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
||||
}
|
||||
|
||||
HashGenerator hashGenerator1 = HashGenerator(TEST_WORD_MAP_PATH);
|
||||
vector<sauchar_t> hash1 = hashGenerator1.generateHash("Ala ma kota");
|
||||
vector<sauchar_t> expected1;
|
||||
vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala ma kota");
|
||||
vector<INDEX_CHARACTER_TYPE> expected1;
|
||||
expected1.push_back(0);
|
||||
expected1.push_back(1);
|
||||
expected1.push_back(2);
|
||||
@ -44,8 +45,8 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
||||
hashGenerator1.serializeWordMap();
|
||||
|
||||
HashGenerator hashGenerator2 = HashGenerator(TEST_WORD_MAP_PATH);
|
||||
vector<sauchar_t> hash2 = hashGenerator2.generateHash("Ala ma psa");
|
||||
vector<sauchar_t> expected2;
|
||||
vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala ma psa");
|
||||
vector<INDEX_CHARACTER_TYPE> expected2;
|
||||
expected2.push_back(0);
|
||||
expected2.push_back(1);
|
||||
expected2.push_back(3);
|
||||
|
@ -24,7 +24,6 @@ ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_word_map
|
||||
index.addSentence("Marysia ma rysia");
|
||||
|
||||
index.generateSuffixArray();
|
||||
index.serializeWordMap();
|
||||
|
||||
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_word_map.bin")));
|
||||
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin")));
|
||||
|
161
concordia/t/test_utils.cpp
Normal file
161
concordia/t/test_utils.cpp
Normal file
@ -0,0 +1,161 @@
|
||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||
#include "concordia/common/utils.hpp"
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "tests/common/test_resources_manager.hpp"
|
||||
#include "build/libdivsufsort/include/divsufsort.h"
|
||||
#include <boost/filesystem.hpp>
|
||||
|
||||
#include <string>
|
||||
|
||||
using namespace std;
|
||||
|
||||
BOOST_AUTO_TEST_SUITE(utils)
|
||||
|
||||
// Round trip: a character written with writeIndexCharacter must be
// read back unchanged by readIndexCharacter.
BOOST_AUTO_TEST_CASE( UtilsTest1 )
{
    INDEX_CHARACTER_TYPE written = 123456789; //in hex: 75BCD15

    ofstream out(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),
                 ios::out|ios::binary);
    Utils::writeIndexCharacter(out, written);
    out.close();

    ifstream in(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),
                ios::in|ios::binary);
    INDEX_CHARACTER_TYPE readBack = Utils::readIndexCharacter(in);
    BOOST_CHECK_EQUAL(readBack, written);
    in.close();

    boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","temp_file.bin"));
}
|
||||
|
||||
// Writes two characters to a file, reads them back and checks the byte
// layout produced by insertCharToSaucharArray.
// NOTE: the expected bytes assume a 4-byte, little-endian
// INDEX_CHARACTER_TYPE (see the "in memory" comments below).
BOOST_AUTO_TEST_CASE( UtilsTest2 )
{
    ofstream testFileOutput;
    testFileOutput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),
                        ios::out|ios::binary);
    Utils::writeIndexCharacter(testFileOutput,123456789); //in hex: 75BCD15
                                                          //in memory: 15 cd 5b 07
                                                          // in DEC: 21 205 91 7

    Utils::writeIndexCharacter(testFileOutput,987654321); //in hex: 3ADE68B1
                                                          //in memory: b1 68 de 3a
                                                          // in DEC: 177 104 222 58
    testFileOutput.close();

    sauchar_t * dataArray = new sauchar_t[8];
    ifstream testFileInput;
    testFileInput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),ios::in|ios::binary);

    INDEX_CHARACTER_TYPE retrievedCharacter1 = Utils::readIndexCharacter(testFileInput);
    BOOST_CHECK_EQUAL(retrievedCharacter1, 123456789);
    Utils::insertCharToSaucharArray(dataArray, retrievedCharacter1, 0);

    INDEX_CHARACTER_TYPE retrievedCharacter2 = Utils::readIndexCharacter(testFileInput);
    BOOST_CHECK_EQUAL(retrievedCharacter2, 987654321);
    Utils::insertCharToSaucharArray(dataArray, retrievedCharacter2, 4);

    testFileInput.close();

    vector<INDEX_CHARACTER_TYPE> expected;
    expected.push_back(21);
    expected.push_back(205);
    expected.push_back(91);
    expected.push_back(7);
    expected.push_back(177);
    expected.push_back(104);
    expected.push_back(222);
    expected.push_back(58);

    vector<INDEX_CHARACTER_TYPE> result;
    for (int i=0;i<8;i++) {
        INDEX_CHARACTER_TYPE a = dataArray[i];
        result.push_back(a);
    }

    // The bytes have been copied into 'result'; release the buffer so
    // the test does not leak.
    delete[] dataArray;

    boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","temp_file.bin"));

    BOOST_CHECK_EQUAL_COLLECTIONS(result.begin(), result.end(), expected.begin(), expected.end());
}
|
||||
|
||||
// Checks the byte layout produced by indexVectorToSaucharArray for a
// two-element vector.
// NOTE: the expected bytes assume a 4-byte, little-endian
// INDEX_CHARACTER_TYPE.
BOOST_AUTO_TEST_CASE( UtilsTest3 )
{
    vector<INDEX_CHARACTER_TYPE> hash;
    hash.push_back(123456789);
    hash.push_back(987654321);

    sauchar_t * dataArray = Utils::indexVectorToSaucharArray(hash);

    vector<INDEX_CHARACTER_TYPE> result;
    for (int i=0;i<8;i++) {
        INDEX_CHARACTER_TYPE a = dataArray[i];
        result.push_back(a);
    }

    // indexVectorToSaucharArray allocates with new[]; the caller owns
    // the array and must release it.
    delete[] dataArray;

    vector<INDEX_CHARACTER_TYPE> expected;
    expected.push_back(21);
    expected.push_back(205);
    expected.push_back(91);
    expected.push_back(7);
    expected.push_back(177);
    expected.push_back(104);
    expected.push_back(222);
    expected.push_back(58);

    BOOST_CHECK_EQUAL_COLLECTIONS(result.begin(), result.end(), expected.begin(), expected.end());
}
|
||||
|
||||
/*
|
||||
BOOST_AUTO_TEST_CASE( UtilsTest4 )
|
||||
{
|
||||
ofstream testFileOutput;
|
||||
testFileOutput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),
|
||||
ios::out|ios::binary);
|
||||
Utils::writeIndexCharacter(testFileOutput,123456789); //in hex: 75BCD15
|
||||
//in memory: 15 cd 5b 07
|
||||
// in DEC: 21 205 91 7
|
||||
|
||||
Utils::writeIndexCharacter(testFileOutput,987654321); //in hex: 3ADE68B1
|
||||
//in memory: b1 68 de 3a
|
||||
// in DEC: 177 104 222 58
|
||||
testFileOutput.close();
|
||||
|
||||
sauchar_t * dataArray = Utils::readIndexFromFile(
|
||||
ifstream testFileInput;
|
||||
testFileInput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),ios::in|ios::binary);
|
||||
|
||||
INDEX_CHARACTER_TYPE retrievedCharacter1 = Utils::readIndexCharacter(testFileInput);
|
||||
BOOST_CHECK_EQUAL(retrievedCharacter1, 123456789);
|
||||
Utils::insertCharToSaucharArray(dataArray, retrievedCharacter1, 0);
|
||||
|
||||
INDEX_CHARACTER_TYPE retrievedCharacter2 = Utils::readIndexCharacter(testFileInput);
|
||||
BOOST_CHECK_EQUAL(retrievedCharacter2, 987654321);
|
||||
Utils::insertCharToSaucharArray(dataArray, retrievedCharacter2, 4);
|
||||
|
||||
testFileInput.close();
|
||||
|
||||
vector<INDEX_CHARACTER_TYPE> expected;
|
||||
expected.push_back(21);
|
||||
expected.push_back(205);
|
||||
expected.push_back(91);
|
||||
expected.push_back(7);
|
||||
expected.push_back(177);
|
||||
expected.push_back(104);
|
||||
expected.push_back(222);
|
||||
expected.push_back(58);
|
||||
|
||||
vector<INDEX_CHARACTER_TYPE> result;
|
||||
for (int i=0;i<8;i++) {
|
||||
INDEX_CHARACTER_TYPE a = dataArray[i];
|
||||
result.push_back(a);
|
||||
}
|
||||
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","temp_file.bin"));
|
||||
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(result.begin(), result.end(), expected.begin(), expected.end());
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
@ -8,9 +8,9 @@ WordMap::WordMap() throw(ConcordiaException) {
|
||||
WordMap::~WordMap() {
|
||||
}
|
||||
|
||||
sauchar_t WordMap::getWordCode(const string & word) {
|
||||
INDEX_CHARACTER_TYPE WordMap::getWordCode(const string & word) {
|
||||
if (_map.find(word) == _map.end()) {
|
||||
sauchar_t newCode = _nextFree;
|
||||
INDEX_CHARACTER_TYPE newCode = _nextFree;
|
||||
_map[word] = newCode;
|
||||
_nextFree++;
|
||||
return newCode;
|
||||
|
@ -4,14 +4,11 @@
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
#include "concordia/common/config.hpp"
|
||||
#include <boost/archive/text_oarchive.hpp>
|
||||
#include <boost/archive/text_iarchive.hpp>
|
||||
#include <boost/serialization/map.hpp>
|
||||
|
||||
#include "build/libdivsufsort/include/divsufsort.h"
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
Class representing a dictionary for word-to-int encoding.
|
||||
|
||||
@ -27,7 +24,7 @@ public:
|
||||
*/
|
||||
virtual ~WordMap();
|
||||
|
||||
sauchar_t getWordCode(const string & word);
|
||||
INDEX_CHARACTER_TYPE getWordCode(const string & word);
|
||||
|
||||
private:
|
||||
friend class boost::serialization::access;
|
||||
@ -39,9 +36,9 @@ private:
|
||||
ar & _nextFree;
|
||||
}
|
||||
|
||||
map<string, sauchar_t> _map;
|
||||
map<string, INDEX_CHARACTER_TYPE> _map;
|
||||
|
||||
sauchar_t _nextFree;
|
||||
INDEX_CHARACTER_TYPE _nextFree;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -56,8 +56,18 @@ endif(HAVE_INLINE)
|
||||
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/config.h.cmake" "${CMAKE_CURRENT_BINARY_DIR}/config.h")
|
||||
|
||||
## Checks for types ##
|
||||
# sauchar_t (32bit)
|
||||
set(SAUCHAR_TYPE "int")
|
||||
# sauchar_t (8bit)
|
||||
check_type_size("uint8_t" UINT8_T)
|
||||
if(HAVE_UINT8_T)
|
||||
set(SAUCHAR_TYPE "uint8_t")
|
||||
else(HAVE_UINT8_T)
|
||||
check_type_size("unsigned char" SIZEOF_UNSIGNED_CHAR)
|
||||
if("${SIZEOF_UNSIGNED_CHAR}" STREQUAL "1")
|
||||
set(SAUCHAR_TYPE "unsigned char")
|
||||
else("${SIZEOF_UNSIGNED_CHAR}" STREQUAL "1")
|
||||
message(FATAL_ERROR "Cannot find unsigned 8-bit integer type")
|
||||
endif("${SIZEOF_UNSIGNED_CHAR}" STREQUAL "1")
|
||||
endif(HAVE_UINT8_T)
|
||||
# saint_t (32bit)
|
||||
check_type_size("int32_t" INT32_T)
|
||||
if(HAVE_INT32_T)
|
||||
|
@ -6,6 +6,21 @@
|
||||
#Path to the Puddle tagset
|
||||
puddle_tagset_path = "@PROD_PUDDLE_TAGSET_PATH@";
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
#Word map, hashed index and suffix array files are in a temporary directory
|
||||
#and should be deleted at the end of each test procedure.
|
||||
|
||||
#Word map file containing unique codes for tokens
|
||||
|
||||
word_map_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
|
||||
|
||||
#File containing the "text" for suffix array searching, i.e. the sequence of word codes
|
||||
|
||||
hashed_index_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_HASHED_INDEX@"
|
||||
|
||||
#Binarized suffix array
|
||||
|
||||
suffix_array_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_SUFFIX_ARRAY@"
|
||||
#-------------------------------------------------------------------------------
|
||||
|
||||
### eof
|
||||
|
6227
prod/resources/text-files/medium.txt
Normal file
6227
prod/resources/text-files/medium.txt
Normal file
File diff suppressed because it is too large
Load Diff
3
prod/resources/text-files/small.txt
Normal file
3
prod/resources/text-files/small.txt
Normal file
@ -0,0 +1,3 @@
|
||||
Ala ma kota
|
||||
Ala ma rysia
|
||||
Marysia ma rysia
|
Loading…
Reference in New Issue
Block a user