concordia-console, new approach to suffix array - 4 sauchars per one saidx
This commit is contained in:
parent
7c1ed7fb6e
commit
47405834a3
2
.gitignore
vendored
2
.gitignore
vendored
@ -5,5 +5,5 @@ prod/resources/concordia-config/concordia.cfg
|
|||||||
concordia/common/config.hpp
|
concordia/common/config.hpp
|
||||||
tests/resources/concordia-config/concordia.cfg
|
tests/resources/concordia-config/concordia.cfg
|
||||||
tests/resources/temp
|
tests/resources/temp
|
||||||
|
prod/resources/temp
|
||||||
|
|
||||||
|
@ -6,6 +6,10 @@ project(concordia C CXX)
|
|||||||
set (CONCORDIA_VERSION_MAJOR 0)
|
set (CONCORDIA_VERSION_MAJOR 0)
|
||||||
set (CONCORDIA_VERSION_MINOR 1)
|
set (CONCORDIA_VERSION_MINOR 1)
|
||||||
|
|
||||||
|
# Type of the characters in index
|
||||||
|
|
||||||
|
set (INDEX_CHARACTER_TYPE "unsigned int")
|
||||||
|
|
||||||
# ============================== #
|
# ============================== #
|
||||||
# Production paths
|
# Production paths
|
||||||
# ============================== #
|
# ============================== #
|
||||||
@ -25,7 +29,7 @@ set (TEMP_HASHED_INDEX "temp_hashed_index.bin")
|
|||||||
set (TEMP_SUFFIX_ARRAY "temp_suffix_array.bin")
|
set (TEMP_SUFFIX_ARRAY "temp_suffix_array.bin")
|
||||||
|
|
||||||
file(MAKE_DIRECTORY ${TEST_RESOURCES_DIRECTORY}/temp)
|
file(MAKE_DIRECTORY ${TEST_RESOURCES_DIRECTORY}/temp)
|
||||||
|
file(MAKE_DIRECTORY ${PROD_RESOURCES_DIRECTORY}/temp)
|
||||||
|
|
||||||
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
|
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
|
||||||
|
|
||||||
@ -112,6 +116,10 @@ configure_file (
|
|||||||
"${concordia_SOURCE_DIR}/tests/resources/concordia-config/concordia.cfg"
|
"${concordia_SOURCE_DIR}/tests/resources/concordia-config/concordia.cfg"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
configure_file (
|
||||||
|
"${concordia_SOURCE_DIR}/prod/resources/concordia-config/concordia.cfg.in"
|
||||||
|
"${concordia_SOURCE_DIR}/prod/resources/concordia-config/concordia.cfg"
|
||||||
|
)
|
||||||
|
|
||||||
# ================================================
|
# ================================================
|
||||||
# Concordia: sub-projects
|
# Concordia: sub-projects
|
||||||
|
@ -2,10 +2,15 @@
|
|||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <boost/program_options.hpp>
|
#include <boost/program_options.hpp>
|
||||||
#include <boost/algorithm/string.hpp>
|
#include <boost/algorithm/string.hpp>
|
||||||
|
#include <boost/date_time/posix_time/posix_time.hpp>
|
||||||
|
|
||||||
#include "concordia/concordia.hpp"
|
#include "concordia/concordia.hpp"
|
||||||
|
#include "concordia/common/config.hpp"
|
||||||
|
#include "concordia/common/utils.hpp"
|
||||||
#include "build/libdivsufsort/include/divsufsort.h"
|
#include "build/libdivsufsort/include/divsufsort.h"
|
||||||
|
|
||||||
|
#define READ_BUFFER_LENGTH 1000
|
||||||
|
|
||||||
namespace po = boost::program_options;
|
namespace po = boost::program_options;
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
@ -14,7 +19,14 @@ int main(int argc, char** argv) {
|
|||||||
desc.add_options()
|
desc.add_options()
|
||||||
("help,h", "Display this message")
|
("help,h", "Display this message")
|
||||||
("config,c", boost::program_options::value<std::string>(),
|
("config,c", boost::program_options::value<std::string>(),
|
||||||
"Concordia configuration file (required)");
|
"Concordia configuration file (required)")
|
||||||
|
("generate-index,g", "Generate suffix array based index out of "
|
||||||
|
"added sentences")
|
||||||
|
("load-index,l", "Load the generated index for searching")
|
||||||
|
("simple-search,s", boost::program_options::value<std::string>(),
|
||||||
|
"Pattern to be searched in the index")
|
||||||
|
("read-file,r", boost::program_options::value<std::string>(),
|
||||||
|
"File to be read and added to index");
|
||||||
|
|
||||||
po::variables_map cli;
|
po::variables_map cli;
|
||||||
po::store(po::parse_command_line(argc, argv, desc), cli);
|
po::store(po::parse_command_line(argc, argv, desc), cli);
|
||||||
@ -38,7 +50,90 @@ int main(int argc, char** argv) {
|
|||||||
try {
|
try {
|
||||||
Concordia concordia(configFile);
|
Concordia concordia(configFile);
|
||||||
std::cout << "Welcome to Concordia. Version = "
|
std::cout << "Welcome to Concordia. Version = "
|
||||||
<< concordia.getVersion() << endl;
|
<< concordia.getVersion() << std::endl;
|
||||||
|
if (cli.count("generate-index")) {
|
||||||
|
std::cout << "\tGenerating index..." << std::endl;
|
||||||
|
boost::posix_time::ptime time_start =
|
||||||
|
boost::posix_time::microsec_clock::local_time();
|
||||||
|
concordia.generateIndex();
|
||||||
|
boost::posix_time::ptime time_end =
|
||||||
|
boost::posix_time::microsec_clock::local_time();
|
||||||
|
boost::posix_time::time_duration msdiff = time_end - time_start;
|
||||||
|
std::cout << "\tIndex generated in: " <<
|
||||||
|
msdiff.total_milliseconds() << "ms." << std::endl;
|
||||||
|
} else if (cli.count("load-index")) {
|
||||||
|
std::cout << "\tLoading index..." << std::endl;
|
||||||
|
boost::posix_time::ptime time_start =
|
||||||
|
boost::posix_time::microsec_clock::local_time();
|
||||||
|
concordia.loadIndex();
|
||||||
|
boost::posix_time::ptime time_end =
|
||||||
|
boost::posix_time::microsec_clock::local_time();
|
||||||
|
boost::posix_time::time_duration msdiff = time_end - time_start;
|
||||||
|
std::cout << "\tIndex loaded in: " <<
|
||||||
|
msdiff.total_milliseconds() << "ms." << std::endl;
|
||||||
|
} else if (cli.count("simple-search")) {
|
||||||
|
std::string pattern = cli["simple-search"].as<std::string>();
|
||||||
|
std::cout << "\tSearching for pattern: \"" << pattern <<
|
||||||
|
"\"" << std::endl;
|
||||||
|
} else if (cli.count("read-file")) {
|
||||||
|
std::string filePath = cli["read-file"].as<std::string>();
|
||||||
|
std::cout << "\tReading sentences from file: " << filePath <<
|
||||||
|
std::endl;
|
||||||
|
ifstream text_file(filePath.c_str());
|
||||||
|
std::string line;
|
||||||
|
if (text_file.is_open()) {
|
||||||
|
long lineCount = 0;
|
||||||
|
vector<std::string> buffer;
|
||||||
|
boost::posix_time::ptime timeStart =
|
||||||
|
boost::posix_time::microsec_clock::local_time();
|
||||||
|
while (getline(text_file, line)) {
|
||||||
|
lineCount++;
|
||||||
|
buffer.push_back(line);
|
||||||
|
if (lineCount % READ_BUFFER_LENGTH == 0) {
|
||||||
|
concordia.addAllSentences(buffer);
|
||||||
|
buffer.clear();
|
||||||
|
boost::posix_time::ptime timeEnd =
|
||||||
|
boost::posix_time::microsec_clock::local_time();
|
||||||
|
boost::posix_time::time_duration msdiff =
|
||||||
|
timeEnd - timeStart;
|
||||||
|
long timeElapsed = msdiff.total_milliseconds();
|
||||||
|
double speed = static_cast<double>(
|
||||||
|
1000 * lineCount / timeElapsed);
|
||||||
|
std::cout << "\tRead and added to index " <<
|
||||||
|
lineCount << " sentences in " << timeElapsed
|
||||||
|
<< "ms. Current speed: " << speed <<
|
||||||
|
" sentences per second" << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (buffer.size() > 0) {
|
||||||
|
concordia.addAllSentences(buffer);
|
||||||
|
}
|
||||||
|
text_file.close();
|
||||||
|
boost::posix_time::ptime timeTotalEnd =
|
||||||
|
boost::posix_time::microsec_clock::local_time();
|
||||||
|
boost::posix_time::time_duration totalMsdiff =
|
||||||
|
timeTotalEnd - timeStart;
|
||||||
|
long totalTimeElapsed = totalMsdiff.total_milliseconds();
|
||||||
|
double totalSpeed =
|
||||||
|
static_cast<double>(1000 * lineCount / totalTimeElapsed);
|
||||||
|
std::cout << "\tReading finished. Read and added to index "
|
||||||
|
<< lineCount << " sentences in " << totalTimeElapsed <<
|
||||||
|
"ms. Overall speed: " << totalSpeed <<
|
||||||
|
" sentences per second" << std::endl;
|
||||||
|
} else {
|
||||||
|
std::cerr << "Unable to open file: "<< filePath;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
std::cerr << "One of the options: generate-index, simple-search, "
|
||||||
|
<< "read-file must be provided. See the "
|
||||||
|
"options specification: "
|
||||||
|
<< std::endl << desc << std::endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "Concordia operation completed without errors."
|
||||||
|
<< std::endl;
|
||||||
} catch(ConcordiaException & e) {
|
} catch(ConcordiaException & e) {
|
||||||
std::cerr << "ConcordiaException caught with message: "
|
std::cerr << "ConcordiaException caught with message: "
|
||||||
<< std::endl
|
<< std::endl
|
||||||
@ -48,7 +143,7 @@ int main(int argc, char** argv) {
|
|||||||
<< std::endl;
|
<< std::endl;
|
||||||
return 1;
|
return 1;
|
||||||
} catch(exception & e) {
|
} catch(exception & e) {
|
||||||
std::cerr << "Exception caught with message: "
|
std::cerr << "Unexpected exception caught with message: "
|
||||||
<< std::endl
|
<< std::endl
|
||||||
<< e.what()
|
<< e.what()
|
||||||
<< std::endl
|
<< std::endl
|
||||||
|
@ -2,5 +2,9 @@
|
|||||||
|
|
||||||
echo "Running Concordia"
|
echo "Running Concordia"
|
||||||
|
|
||||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg
|
rm prod/resources/temp/*
|
||||||
|
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -r prod/resources/text-files/medium.txt
|
||||||
|
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -g
|
||||||
|
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -l
|
||||||
|
#./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Ala ma chyba kota"
|
||||||
|
|
||||||
|
@ -14,6 +14,7 @@ add_library(concordia SHARED
|
|||||||
concordia_config.cpp
|
concordia_config.cpp
|
||||||
concordia_exception.cpp
|
concordia_exception.cpp
|
||||||
common/logging.cpp
|
common/logging.cpp
|
||||||
|
common/utils.cpp
|
||||||
)
|
)
|
||||||
|
|
||||||
add_subdirectory(t)
|
add_subdirectory(t)
|
||||||
@ -22,6 +23,18 @@ add_subdirectory(t)
|
|||||||
install(TARGETS concordia DESTINATION lib/)
|
install(TARGETS concordia DESTINATION lib/)
|
||||||
install(FILES concordia.hpp DESTINATION include/concordia/)
|
install(FILES concordia.hpp DESTINATION include/concordia/)
|
||||||
|
|
||||||
|
# ----------------------------------------------------
|
||||||
|
# libconfig
|
||||||
|
# ----------------------------------------------------
|
||||||
|
find_library(LIBCONFIG_LIB NAMES config++ REQUIRED)
|
||||||
|
find_path(LIBCONFIG_INCLUDE libconfig.h++)
|
||||||
|
|
||||||
|
if(EXISTS ${LIBCONFIG_LIB} AND EXISTS ${LIBCONFIG_INCLUDE})
|
||||||
|
message(STATUS "Found Libconfig")
|
||||||
|
include_directories(${LIBCONFIG_INCLUDE})
|
||||||
|
link_directories(${LIBCONFIG_LIB})
|
||||||
|
endif(EXISTS ${LIBCONFIG_LIB} AND EXISTS ${LIBCONFIG_INCLUDE})
|
||||||
|
|
||||||
target_link_libraries(concordia log4cpp)
|
target_link_libraries(concordia log4cpp)
|
||||||
target_link_libraries(concordia ${LIBSTEMMER_LIB})
|
target_link_libraries(concordia ${LIBSTEMMER_LIB})
|
||||||
target_link_libraries(concordia ${Boost_LIBRARIES})
|
target_link_libraries(concordia ${Boost_LIBRARIES})
|
||||||
|
@ -15,3 +15,6 @@
|
|||||||
|
|
||||||
#define LEXICON_TEXT_FIELD_SEPARATORS "\t "
|
#define LEXICON_TEXT_FIELD_SEPARATORS "\t "
|
||||||
#define LEXICON_FIELD_SEPARATOR "\t"
|
#define LEXICON_FIELD_SEPARATOR "\t"
|
||||||
|
|
||||||
|
typedef @INDEX_CHARACTER_TYPE@ INDEX_CHARACTER_TYPE;
|
||||||
|
|
||||||
|
41
concordia/common/utils.cpp
Normal file
41
concordia/common/utils.cpp
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
#include "concordia/common/utils.hpp"
|
||||||
|
|
||||||
|
/// Default constructor. The body is intentionally empty.
Utils::Utils() {}
|
||||||
|
|
||||||
|
/// Destructor. The body is intentionally empty.
Utils::~Utils() {}
|
||||||
|
|
||||||
|
/// Writes a single index character to `file` as raw bytes.
///
/// Exactly sizeof(INDEX_CHARACTER_TYPE) bytes are emitted, taken directly
/// from the in-memory representation of `character`; no byte-order
/// conversion is applied.
///
/// @param file       output stream the bytes are written to
/// @param character  value to serialize
void Utils::writeIndexCharacter(ofstream & file,
                                INDEX_CHARACTER_TYPE character) {
    const char * rawBytes = reinterpret_cast<const char *>(&character);
    file.write(rawBytes, sizeof(INDEX_CHARACTER_TYPE));
}
|
||||||
|
|
||||||
|
/// Reads a single index character from `file` as raw bytes.
///
/// Attempts to read sizeof(INDEX_CHARACTER_TYPE) bytes straight into the
/// in-memory representation of the result; no byte-order conversion is
/// applied.
///
/// The result is value-initialized before the read, so when the stream
/// delivers no bytes (for example at end of file) the function returns a
/// zero value instead of an indeterminate one. Callers that need to tell a
/// genuine zero from a failed read should inspect the stream state after
/// the call.
///
/// @param file  input stream to read from
/// @return the character read, or a zero value if nothing could be read
INDEX_CHARACTER_TYPE Utils::readIndexCharacter(ifstream & file) {
    INDEX_CHARACTER_TYPE character = INDEX_CHARACTER_TYPE();
    file.read(reinterpret_cast<char *>(&character), sizeof(character));
    return character;
}
|
||||||
|
|
||||||
|
/// Flattens a vector of index characters into a newly allocated sauchar_t
/// array.
///
/// Each element of `input` occupies sizeof(INDEX_CHARACTER_TYPE) consecutive
/// slots of the result, filled by insertCharToSaucharArray in the order the
/// elements appear in `input`.
///
/// @param input  index characters to flatten
/// @return array of input.size()*sizeof(INDEX_CHARACTER_TYPE) elements,
///         allocated with new[]; the caller is responsible for delete[]
sauchar_t * Utils::indexVectorToSaucharArray(
                                  vector<INDEX_CHARACTER_TYPE> & input) {
    const int kArraySize = input.size()*sizeof(INDEX_CHARACTER_TYPE);
    sauchar_t * flattened = new sauchar_t[kArraySize];

    for (size_t idx = 0; idx < input.size(); ++idx) {
        // Slot where the idx-th character starts in the flattened array.
        const int offset =
            static_cast<int>(idx * sizeof(INDEX_CHARACTER_TYPE));
        insertCharToSaucharArray(flattened, input[idx], offset);
    }
    return flattened;
}
|
||||||
|
|
||||||
|
/// Copies the in-memory representation of `character` into `array`,
/// starting at index `pos`.
///
/// sizeof(character) consecutive elements are written, i.e.
/// array[pos] .. array[pos + sizeof(character) - 1]. The caller must make
/// sure that this range lies inside the array; `pos` is expected to be
/// non-negative.
///
/// NOTE(review): this assumes sauchar_t is a one-byte type, so that one
/// array element holds one byte of the character - confirm against
/// divsufsort.h.
///
/// @param array      destination buffer
/// @param character  value whose bytes are copied
/// @param pos        index of the first destination element
void Utils::insertCharToSaucharArray(sauchar_t * array,
                         INDEX_CHARACTER_TYPE character, int pos) {
    const sauchar_t * characterBytes =
        reinterpret_cast<const sauchar_t *>(&character);
    sauchar_t * destination = array + pos;

    // An unsigned offset avoids comparing a signed index with size_t.
    for (size_t offset = 0; offset < sizeof(character); ++offset) {
        destination[offset] = characterBytes[offset];
    }
}
|
||||||
|
|
36
concordia/common/utils.hpp
Normal file
36
concordia/common/utils.hpp
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
#ifndef UTILS_HDR
|
||||||
|
#define UTILS_HDR
|
||||||
|
|
||||||
|
#include <boost/shared_ptr.hpp>
|
||||||
|
#include <fstream>
|
||||||
|
#include <iostream>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "concordia/common/config.hpp"
|
||||||
|
#include "concordia/concordia_exception.hpp"
|
||||||
|
#include "build/libdivsufsort/include/divsufsort.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
class Utils {
|
||||||
|
public:
|
||||||
|
explicit Utils();
|
||||||
|
|
||||||
|
/*! Destructor.
|
||||||
|
*/
|
||||||
|
virtual ~Utils();
|
||||||
|
|
||||||
|
static void writeIndexCharacter(ofstream & file,
|
||||||
|
INDEX_CHARACTER_TYPE character);
|
||||||
|
|
||||||
|
static INDEX_CHARACTER_TYPE readIndexCharacter(ifstream & file);
|
||||||
|
|
||||||
|
static sauchar_t * indexVectorToSaucharArray(
|
||||||
|
vector<INDEX_CHARACTER_TYPE> & input);
|
||||||
|
|
||||||
|
static void insertCharToSaucharArray(sauchar_t * array,
|
||||||
|
INDEX_CHARACTER_TYPE character, int pos);
|
||||||
|
private:
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
@ -46,9 +46,16 @@ void Concordia::addSentence(const std::string & sentence)
|
|||||||
_index->addSentence(sentence);
|
_index->addSentence(sentence);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Adds a whole batch of sentences in one call by forwarding the vector
// to the underlying index object.
void Concordia::addAllSentences(vector<std::string> & sentences)
                                     throw(ConcordiaException) {
    _index->addAllSentences(sentences);
}
|
||||||
|
|
||||||
void Concordia::generateIndex() throw(ConcordiaException) {
|
void Concordia::generateIndex() throw(ConcordiaException) {
|
||||||
_index->generateSuffixArray();
|
_index->generateSuffixArray();
|
||||||
_index->serializeWordMap();
|
}
|
||||||
|
|
||||||
|
void Concordia::loadIndex() throw(ConcordiaException) {
|
||||||
_searcher->loadIndex(_config->getWordMapFilePath(),
|
_searcher->loadIndex(_config->getWordMapFilePath(),
|
||||||
_config->getHashedIndexFilePath(),
|
_config->getHashedIndexFilePath(),
|
||||||
_config->getSuffixArrayFilePath());
|
_config->getSuffixArrayFilePath());
|
||||||
|
@ -35,8 +35,13 @@ public:
|
|||||||
|
|
||||||
void addSentence(const std::string & sentence) throw(ConcordiaException);
|
void addSentence(const std::string & sentence) throw(ConcordiaException);
|
||||||
|
|
||||||
|
void addAllSentences(vector<std::string> & sentences)
|
||||||
|
throw(ConcordiaException);
|
||||||
|
|
||||||
void generateIndex() throw(ConcordiaException);
|
void generateIndex() throw(ConcordiaException);
|
||||||
|
|
||||||
|
void loadIndex() throw(ConcordiaException);
|
||||||
|
|
||||||
std::vector<saidx_t> simpleSearch(const std::string & pattern)
|
std::vector<saidx_t> simpleSearch(const std::string & pattern)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
#include "concordia/concordia_index.hpp"
|
#include "concordia/concordia_index.hpp"
|
||||||
|
|
||||||
|
#include "concordia/common/utils.hpp"
|
||||||
#include <boost/filesystem.hpp>
|
#include <boost/filesystem.hpp>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
@ -27,36 +28,36 @@ ConcordiaIndex::ConcordiaIndex(const string & wordMapFilePath,
|
|||||||
ConcordiaIndex::~ConcordiaIndex() {
|
ConcordiaIndex::~ConcordiaIndex() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void ConcordiaIndex::serializeWordMap() {
|
void ConcordiaIndex::_serializeWordMap() {
|
||||||
_hashGenerator->serializeWordMap();
|
_hashGenerator->serializeWordMap();
|
||||||
}
|
}
|
||||||
|
|
||||||
void ConcordiaIndex::generateSuffixArray() {
|
void ConcordiaIndex::generateSuffixArray() {
|
||||||
|
if (boost::filesystem::exists(_hashedIndexFilePath.c_str())) {
|
||||||
ifstream hashedIndexFile;
|
ifstream hashedIndexFile;
|
||||||
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::in|
|
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::in|
|
||||||
ios::ate|ios::binary);
|
ios::ate|ios::binary);
|
||||||
|
|
||||||
/* Get the file size. */
|
/* Get the file size. */
|
||||||
long n = hashedIndexFile.tellg() / sizeof(sauchar_t);
|
saidx_t n = hashedIndexFile.tellg();
|
||||||
|
if (n > 0) {
|
||||||
sauchar_t *T;
|
sauchar_t *T;
|
||||||
saidx_t *SA;
|
saidx_t *SA;
|
||||||
|
|
||||||
T = new sauchar_t[n];
|
|
||||||
SA = new saidx_t[n];
|
|
||||||
|
|
||||||
/* Read n bytes of data. */
|
/* Read n bytes of data. */
|
||||||
hashedIndexFile.seekg(0, ios::beg);
|
hashedIndexFile.seekg(0, ios::beg);
|
||||||
|
T = new sauchar_t[n];
|
||||||
sauchar_t buff;
|
|
||||||
int pos = 0;
|
int pos = 0;
|
||||||
while (!hashedIndexFile.eof()) {
|
while (!hashedIndexFile.eof()) {
|
||||||
hashedIndexFile.read(reinterpret_cast<char *>(&buff),
|
INDEX_CHARACTER_TYPE character =
|
||||||
sizeof(sauchar_t));
|
Utils::readIndexCharacter(hashedIndexFile);
|
||||||
T[pos++] = buff;
|
Utils::insertCharToSaucharArray(T, character, pos);
|
||||||
|
pos+=sizeof(character);
|
||||||
}
|
}
|
||||||
hashedIndexFile.close();
|
hashedIndexFile.close();
|
||||||
|
|
||||||
|
SA = new saidx_t[n];
|
||||||
|
|
||||||
/* Construct the suffix array. */
|
/* Construct the suffix array. */
|
||||||
if (divsufsort(T, SA, (saidx_t)n) != 0) {
|
if (divsufsort(T, SA, (saidx_t)n) != 0) {
|
||||||
throw ConcordiaException("Error creating suffix array.");
|
throw ConcordiaException("Error creating suffix array.");
|
||||||
@ -65,7 +66,8 @@ void ConcordiaIndex::generateSuffixArray() {
|
|||||||
/* Write the suffix array. */
|
/* Write the suffix array. */
|
||||||
|
|
||||||
ofstream suffixArrayFile;
|
ofstream suffixArrayFile;
|
||||||
suffixArrayFile.open(_suffixArrayFilePath.c_str(), ios::out|ios::binary);
|
suffixArrayFile.open(_suffixArrayFilePath.c_str(),
|
||||||
|
ios::out|ios::binary);
|
||||||
|
|
||||||
for (int i = 0; i < n; i++) {
|
for (int i = 0; i < n; i++) {
|
||||||
suffixArrayFile.write(reinterpret_cast<char *>(&SA[i]),
|
suffixArrayFile.write(reinterpret_cast<char *>(&SA[i]),
|
||||||
@ -76,19 +78,44 @@ void ConcordiaIndex::generateSuffixArray() {
|
|||||||
/* Deallocate memory. */
|
/* Deallocate memory. */
|
||||||
delete[] T;
|
delete[] T;
|
||||||
delete[] SA;
|
delete[] SA;
|
||||||
|
} else {
|
||||||
|
throw ConcordiaException("Can not generate suffix array: "
|
||||||
|
"hashed index file is empty");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
throw ConcordiaException("Can not generate suffix array: "
|
||||||
|
"hashed index file does not exist");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void ConcordiaIndex::addSentence(const string & sentence) {
|
void ConcordiaIndex::addSentence(const string & sentence) {
|
||||||
vector<sauchar_t> hash = _hashGenerator->generateHash(sentence);
|
vector<INDEX_CHARACTER_TYPE> hash = _hashGenerator->generateHash(sentence);
|
||||||
ofstream hashedIndexFile;
|
ofstream hashedIndexFile;
|
||||||
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
|
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
|
||||||
ios::app|ios::binary);
|
ios::app|ios::binary);
|
||||||
for (vector<sauchar_t>::iterator it = hash.begin();
|
for (vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
|
||||||
it != hash.end(); ++it) {
|
it != hash.end(); ++it) {
|
||||||
sauchar_t buff = *it;
|
Utils::writeIndexCharacter(hashedIndexFile, *it);
|
||||||
hashedIndexFile.write(reinterpret_cast<char *>(&buff),
|
|
||||||
sizeof(sauchar_t));
|
|
||||||
}
|
}
|
||||||
hashedIndexFile.close();
|
hashedIndexFile.close();
|
||||||
|
_serializeWordMap();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Hashes every sentence of the batch and appends the resulting index
// characters to the hashed index file, then serializes the word map once
// for the whole batch.
void ConcordiaIndex::addAllSentences(vector<std::string> & sentences) {
    // Opened in append mode so earlier contents of the file are kept.
    ofstream hashedIndexFile(_hashedIndexFilePath.c_str(),
                             ios::out|ios::app|ios::binary);

    for (size_t sentenceNo = 0; sentenceNo < sentences.size();
                                                     ++sentenceNo) {
        vector<INDEX_CHARACTER_TYPE> hash =
            _hashGenerator->generateHash(sentences[sentenceNo]);
        for (size_t charNo = 0; charNo < hash.size(); ++charNo) {
            Utils::writeIndexCharacter(hashedIndexFile, hash[charNo]);
        }
    }

    hashedIndexFile.close();
    _serializeWordMap();
}
|
||||||
|
|
||||||
|
@ -30,11 +30,13 @@ public:
|
|||||||
|
|
||||||
void addSentence(const string & sentence);
|
void addSentence(const string & sentence);
|
||||||
|
|
||||||
void serializeWordMap();
|
void addAllSentences(vector<string> & sentences);
|
||||||
|
|
||||||
void generateSuffixArray();
|
void generateSuffixArray();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
void _serializeWordMap();
|
||||||
|
|
||||||
boost::shared_ptr<HashGenerator> _hashGenerator;
|
boost::shared_ptr<HashGenerator> _hashGenerator;
|
||||||
|
|
||||||
string _hashedIndexFilePath;
|
string _hashedIndexFilePath;
|
||||||
|
@ -20,15 +20,16 @@ HashGenerator::HashGenerator(const string & wordMapFilePath)
|
|||||||
HashGenerator::~HashGenerator() {
|
HashGenerator::~HashGenerator() {
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<sauchar_t> HashGenerator::generateHash(const string & sentence) {
|
vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
|
||||||
vector<sauchar_t> result;
|
const string & sentence) {
|
||||||
|
vector<INDEX_CHARACTER_TYPE> result;
|
||||||
vector<string> tokenTexts;
|
vector<string> tokenTexts;
|
||||||
boost::split(tokenTexts, sentence, boost::is_any_of(" "));
|
boost::split(tokenTexts, sentence, boost::is_any_of(" "));
|
||||||
|
|
||||||
for (vector<string>::iterator it = tokenTexts.begin();
|
for (vector<string>::iterator it = tokenTexts.begin();
|
||||||
it != tokenTexts.end(); ++it) {
|
it != tokenTexts.end(); ++it) {
|
||||||
string token = *it;
|
string token = *it;
|
||||||
sauchar_t code = _wordMap->getWordCode(token);
|
INDEX_CHARACTER_TYPE code = _wordMap->getWordCode(token);
|
||||||
result.push_back(code);
|
result.push_back(code);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6,10 +6,9 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
#include <boost/shared_ptr.hpp>
|
#include <boost/shared_ptr.hpp>
|
||||||
#include "concordia/word_map.hpp"
|
#include "concordia/word_map.hpp"
|
||||||
|
#include "concordia/common/config.hpp"
|
||||||
#include "concordia/concordia_exception.hpp"
|
#include "concordia/concordia_exception.hpp"
|
||||||
|
|
||||||
#include "build/libdivsufsort/include/divsufsort.h"
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class for generating a sentence hash.
|
Class for generating a sentence hash.
|
||||||
@ -27,7 +26,7 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~HashGenerator();
|
virtual ~HashGenerator();
|
||||||
|
|
||||||
vector<sauchar_t> generateHash(const string & sentence);
|
vector<INDEX_CHARACTER_TYPE> generateHash(const string & sentence);
|
||||||
|
|
||||||
void serializeWordMap();
|
void serializeWordMap();
|
||||||
|
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
#include "concordia/index_searcher.hpp"
|
#include "concordia/index_searcher.hpp"
|
||||||
|
|
||||||
|
#include "concordia/common/utils.hpp"
|
||||||
#include <boost/filesystem.hpp>
|
#include <boost/filesystem.hpp>
|
||||||
|
|
||||||
IndexSearcher::IndexSearcher():
|
IndexSearcher::IndexSearcher():
|
||||||
@ -38,16 +39,15 @@ void IndexSearcher::loadIndex(const string & wordMapFilepath,
|
|||||||
ifstream hashedIndexFile;
|
ifstream hashedIndexFile;
|
||||||
hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::in
|
hashedIndexFile.open(hashedIndexFilepath.c_str(), ios::in
|
||||||
| ios::ate | ios::binary);
|
| ios::ate | ios::binary);
|
||||||
_n = hashedIndexFile.tellg() / sizeof(sauchar_t);
|
_n = hashedIndexFile.tellg();
|
||||||
_T = new sauchar_t[_n];
|
|
||||||
|
|
||||||
hashedIndexFile.seekg(0, ios::beg);
|
hashedIndexFile.seekg(0, ios::beg);
|
||||||
sauchar_t sauchar_buff;
|
_T = new sauchar_t[_n];
|
||||||
int pos = 0;
|
int pos = 0;
|
||||||
while (!hashedIndexFile.eof()) {
|
while (!hashedIndexFile.eof()) {
|
||||||
hashedIndexFile.read(reinterpret_cast<char *>(&sauchar_buff),
|
INDEX_CHARACTER_TYPE character =
|
||||||
sizeof(sauchar_t));
|
Utils::readIndexCharacter(hashedIndexFile);
|
||||||
_T[pos++] = sauchar_buff;
|
Utils::insertCharToSaucharArray(_T, character, pos);
|
||||||
|
pos+=sizeof(character);
|
||||||
}
|
}
|
||||||
hashedIndexFile.close();
|
hashedIndexFile.close();
|
||||||
|
|
||||||
@ -59,7 +59,8 @@ void IndexSearcher::loadIndex(const string & wordMapFilepath,
|
|||||||
saidx_t saidx_buff;
|
saidx_t saidx_buff;
|
||||||
pos = 0;
|
pos = 0;
|
||||||
while (!suffixArrayFile.eof() && pos < _n) {
|
while (!suffixArrayFile.eof() && pos < _n) {
|
||||||
suffixArrayFile.read(reinterpret_cast<char *>(&saidx_buff), sizeof(saidx_t));
|
suffixArrayFile.read(reinterpret_cast<char *>(&saidx_buff),
|
||||||
|
sizeof(saidx_t));
|
||||||
_SA[pos++] = saidx_buff;
|
_SA[pos++] = saidx_buff;
|
||||||
}
|
}
|
||||||
suffixArrayFile.close();
|
suffixArrayFile.close();
|
||||||
@ -70,20 +71,22 @@ vector<saidx_t> IndexSearcher::simpleSearch(const string & pattern)
|
|||||||
vector<saidx_t> result;
|
vector<saidx_t> result;
|
||||||
|
|
||||||
int left;
|
int left;
|
||||||
vector<sauchar_t> hash = _hashGenerator->generateHash(pattern);
|
vector<INDEX_CHARACTER_TYPE> hash = _hashGenerator->generateHash(pattern);
|
||||||
saidx_t patternLength = hash.size();
|
saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
|
||||||
sauchar_t * patternArray = new sauchar_t[patternLength];
|
sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
|
||||||
int i = 0;
|
|
||||||
for (vector<sauchar_t>::iterator it = hash.begin();
|
|
||||||
it != hash.end(); ++it) {
|
|
||||||
patternArray[i] = *it;
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
int size = sa_search(_T, (saidx_t) _n,
|
int size = sa_search(_T, (saidx_t) _n,
|
||||||
(const sauchar_t *) patternArray, patternLength,
|
(const sauchar_t *) patternArray, patternLength,
|
||||||
_SA, (saidx_t) _n, &left);
|
_SA, (saidx_t) _n, &left);
|
||||||
for (i = 0; i < size; ++i) {
|
for (int i = 0; i < size; ++i) {
|
||||||
result.push_back(_SA[left + i]);
|
saidx_t result_pos = _SA[left + i];
|
||||||
|
if (result_pos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
|
||||||
|
// As we are looking for a pattern in an array of higher
|
||||||
|
// resolution than the hashed index file, we might
|
||||||
|
// obtain accidental results exceeding the boundaries
|
||||||
|
// of characters in hashed index. The above check
|
||||||
|
// removes these accidental results.
|
||||||
|
result.push_back(result_pos / sizeof(INDEX_CHARACTER_TYPE));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
delete[] patternArray;
|
delete[] patternArray;
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
|
#include "concordia/common/config.hpp"
|
||||||
#include "build/libdivsufsort/include/divsufsort.h"
|
#include "build/libdivsufsort/include/divsufsort.h"
|
||||||
#include "concordia/hash_generator.hpp"
|
#include "concordia/hash_generator.hpp"
|
||||||
#include "concordia/concordia_exception.hpp"
|
#include "concordia/concordia_exception.hpp"
|
||||||
@ -39,7 +40,7 @@ private:
|
|||||||
|
|
||||||
saidx_t * _SA;
|
saidx_t * _SA;
|
||||||
|
|
||||||
size_t _n;
|
saidx_t _n;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
add_library(concordia-tests
|
add_library(concordia-tests
|
||||||
|
test_utils.cpp
|
||||||
test_word_map.cpp
|
test_word_map.cpp
|
||||||
test_hash_generator.cpp
|
test_hash_generator.cpp
|
||||||
test_concordia_index.cpp
|
test_concordia_index.cpp
|
||||||
|
@ -54,6 +54,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
|
|||||||
expectedResult1.push_back(7);
|
expectedResult1.push_back(7);
|
||||||
expectedResult1.push_back(4);
|
expectedResult1.push_back(4);
|
||||||
|
|
||||||
|
concordia.loadIndex();
|
||||||
vector<saidx_t> searchResult1 = concordia.simpleSearch("ma rysia");
|
vector<saidx_t> searchResult1 = concordia.simpleSearch("ma rysia");
|
||||||
|
|
||||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
||||||
@ -68,10 +69,12 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
|
|||||||
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
||||||
{
|
{
|
||||||
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||||
concordia.addSentence("to jest okno");
|
vector<string> testSentences;
|
||||||
concordia.addSentence("czy jest okno otwarte");
|
testSentences.push_back("to jest okno");
|
||||||
concordia.addSentence("chyba to jest tutaj");
|
testSentences.push_back("czy jest okno otwarte");
|
||||||
concordia.addSentence("to jest");
|
testSentences.push_back("chyba to jest tutaj");
|
||||||
|
testSentences.push_back("to jest");
|
||||||
|
concordia.addAllSentences(testSentences);
|
||||||
|
|
||||||
concordia.generateIndex();
|
concordia.generateIndex();
|
||||||
|
|
||||||
@ -109,6 +112,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
|||||||
expectedResult2.push_back(1);
|
expectedResult2.push_back(1);
|
||||||
expectedResult2.push_back(4);
|
expectedResult2.push_back(4);
|
||||||
|
|
||||||
|
concordia.loadIndex();
|
||||||
vector<saidx_t> searchResult1 = concordia.simpleSearch("to jest");
|
vector<saidx_t> searchResult1 = concordia.simpleSearch("to jest");
|
||||||
vector<saidx_t> searchResult2 = concordia.simpleSearch("jest okno");
|
vector<saidx_t> searchResult2 = concordia.simpleSearch("jest okno");
|
||||||
|
|
||||||
|
@ -58,7 +58,6 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
|
|||||||
index.addSentence("Marysia ma rysia");
|
index.addSentence("Marysia ma rysia");
|
||||||
|
|
||||||
index.generateSuffixArray();
|
index.generateSuffixArray();
|
||||||
index.serializeWordMap();
|
|
||||||
|
|
||||||
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_word_map.bin")));
|
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_word_map.bin")));
|
||||||
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin")));
|
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin")));
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
|
#include "concordia/common/config.hpp"
|
||||||
#include "concordia/hash_generator.hpp"
|
#include "concordia/hash_generator.hpp"
|
||||||
|
|
||||||
#define TEST_WORD_MAP_PATH "/tmp/test_word_map.bin"
|
#define TEST_WORD_MAP_PATH "/tmp/test_word_map.bin"
|
||||||
@ -18,8 +19,8 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
|
|||||||
|
|
||||||
HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH);
|
HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH);
|
||||||
|
|
||||||
vector<sauchar_t> hash = hashGenerator.generateHash("Ala ma kota");
|
vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala ma kota");
|
||||||
vector<sauchar_t> expected;
|
vector<INDEX_CHARACTER_TYPE> expected;
|
||||||
expected.push_back(0);
|
expected.push_back(0);
|
||||||
expected.push_back(1);
|
expected.push_back(1);
|
||||||
expected.push_back(2);
|
expected.push_back(2);
|
||||||
@ -34,8 +35,8 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
|||||||
}
|
}
|
||||||
|
|
||||||
HashGenerator hashGenerator1 = HashGenerator(TEST_WORD_MAP_PATH);
|
HashGenerator hashGenerator1 = HashGenerator(TEST_WORD_MAP_PATH);
|
||||||
vector<sauchar_t> hash1 = hashGenerator1.generateHash("Ala ma kota");
|
vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala ma kota");
|
||||||
vector<sauchar_t> expected1;
|
vector<INDEX_CHARACTER_TYPE> expected1;
|
||||||
expected1.push_back(0);
|
expected1.push_back(0);
|
||||||
expected1.push_back(1);
|
expected1.push_back(1);
|
||||||
expected1.push_back(2);
|
expected1.push_back(2);
|
||||||
@ -44,8 +45,8 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
|||||||
hashGenerator1.serializeWordMap();
|
hashGenerator1.serializeWordMap();
|
||||||
|
|
||||||
HashGenerator hashGenerator2 = HashGenerator(TEST_WORD_MAP_PATH);
|
HashGenerator hashGenerator2 = HashGenerator(TEST_WORD_MAP_PATH);
|
||||||
vector<sauchar_t> hash2 = hashGenerator2.generateHash("Ala ma psa");
|
vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala ma psa");
|
||||||
vector<sauchar_t> expected2;
|
vector<INDEX_CHARACTER_TYPE> expected2;
|
||||||
expected2.push_back(0);
|
expected2.push_back(0);
|
||||||
expected2.push_back(1);
|
expected2.push_back(1);
|
||||||
expected2.push_back(3);
|
expected2.push_back(3);
|
||||||
|
@ -24,7 +24,6 @@ ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_word_map
|
|||||||
index.addSentence("Marysia ma rysia");
|
index.addSentence("Marysia ma rysia");
|
||||||
|
|
||||||
index.generateSuffixArray();
|
index.generateSuffixArray();
|
||||||
index.serializeWordMap();
|
|
||||||
|
|
||||||
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_word_map.bin")));
|
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_word_map.bin")));
|
||||||
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin")));
|
BOOST_CHECK(boost::filesystem::exists(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin")));
|
||||||
|
161
concordia/t/test_utils.cpp
Normal file
161
concordia/t/test_utils.cpp
Normal file
@ -0,0 +1,161 @@
|
|||||||
|
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||||
|
#include "concordia/common/utils.hpp"
|
||||||
|
#include "concordia/common/config.hpp"
|
||||||
|
#include "tests/common/test_resources_manager.hpp"
|
||||||
|
#include "build/libdivsufsort/include/divsufsort.h"
|
||||||
|
#include <boost/filesystem.hpp>
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_SUITE(utils)
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( UtilsTest1 )
|
||||||
|
{
|
||||||
|
ofstream testFileOutput;
|
||||||
|
testFileOutput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),
|
||||||
|
ios::out|ios::binary);
|
||||||
|
INDEX_CHARACTER_TYPE testCharacter = 123456789; //in hex: 75BCD15
|
||||||
|
Utils::writeIndexCharacter(testFileOutput,testCharacter);
|
||||||
|
testFileOutput.close();
|
||||||
|
|
||||||
|
ifstream testFileInput;
|
||||||
|
testFileInput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),ios::in|ios::binary);
|
||||||
|
INDEX_CHARACTER_TYPE retrievedCharacter = Utils::readIndexCharacter(testFileInput);
|
||||||
|
BOOST_CHECK_EQUAL(retrievedCharacter, testCharacter);
|
||||||
|
testFileInput.close();
|
||||||
|
|
||||||
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","temp_file.bin"));
|
||||||
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( UtilsTest2 )
|
||||||
|
{
|
||||||
|
ofstream testFileOutput;
|
||||||
|
testFileOutput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),
|
||||||
|
ios::out|ios::binary);
|
||||||
|
Utils::writeIndexCharacter(testFileOutput,123456789); //in hex: 75BCD15
|
||||||
|
//in memory: 15 cd 5b 07
|
||||||
|
// in DEC: 21 205 91 7
|
||||||
|
|
||||||
|
Utils::writeIndexCharacter(testFileOutput,987654321); //in hex: 3ADE68B1
|
||||||
|
//in memory: b1 68 de 3a
|
||||||
|
// in DEC: 177 104 222 58
|
||||||
|
testFileOutput.close();
|
||||||
|
|
||||||
|
sauchar_t * dataArray = new sauchar_t[8];
|
||||||
|
ifstream testFileInput;
|
||||||
|
testFileInput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),ios::in|ios::binary);
|
||||||
|
|
||||||
|
INDEX_CHARACTER_TYPE retrievedCharacter1 = Utils::readIndexCharacter(testFileInput);
|
||||||
|
BOOST_CHECK_EQUAL(retrievedCharacter1, 123456789);
|
||||||
|
Utils::insertCharToSaucharArray(dataArray, retrievedCharacter1, 0);
|
||||||
|
|
||||||
|
INDEX_CHARACTER_TYPE retrievedCharacter2 = Utils::readIndexCharacter(testFileInput);
|
||||||
|
BOOST_CHECK_EQUAL(retrievedCharacter2, 987654321);
|
||||||
|
Utils::insertCharToSaucharArray(dataArray, retrievedCharacter2, 4);
|
||||||
|
|
||||||
|
testFileInput.close();
|
||||||
|
|
||||||
|
vector<INDEX_CHARACTER_TYPE> expected;
|
||||||
|
expected.push_back(21);
|
||||||
|
expected.push_back(205);
|
||||||
|
expected.push_back(91);
|
||||||
|
expected.push_back(7);
|
||||||
|
expected.push_back(177);
|
||||||
|
expected.push_back(104);
|
||||||
|
expected.push_back(222);
|
||||||
|
expected.push_back(58);
|
||||||
|
|
||||||
|
vector<INDEX_CHARACTER_TYPE> result;
|
||||||
|
for (int i=0;i<8;i++) {
|
||||||
|
INDEX_CHARACTER_TYPE a = dataArray[i];
|
||||||
|
result.push_back(a);
|
||||||
|
}
|
||||||
|
|
||||||
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","temp_file.bin"));
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL_COLLECTIONS(result.begin(), result.end(), expected.begin(), expected.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( UtilsTest3 )
|
||||||
|
{
|
||||||
|
vector<INDEX_CHARACTER_TYPE> hash;
|
||||||
|
hash.push_back(123456789);
|
||||||
|
hash.push_back(987654321);
|
||||||
|
|
||||||
|
sauchar_t * dataArray = Utils::indexVectorToSaucharArray(hash);
|
||||||
|
|
||||||
|
vector<INDEX_CHARACTER_TYPE> result;
|
||||||
|
for (int i=0;i<8;i++) {
|
||||||
|
INDEX_CHARACTER_TYPE a = dataArray[i];
|
||||||
|
result.push_back(a);
|
||||||
|
}
|
||||||
|
|
||||||
|
vector<INDEX_CHARACTER_TYPE> expected;
|
||||||
|
expected.push_back(21);
|
||||||
|
expected.push_back(205);
|
||||||
|
expected.push_back(91);
|
||||||
|
expected.push_back(7);
|
||||||
|
expected.push_back(177);
|
||||||
|
expected.push_back(104);
|
||||||
|
expected.push_back(222);
|
||||||
|
expected.push_back(58);
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL_COLLECTIONS(result.begin(), result.end(), expected.begin(), expected.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
BOOST_AUTO_TEST_CASE( UtilsTest4 )
|
||||||
|
{
|
||||||
|
ofstream testFileOutput;
|
||||||
|
testFileOutput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),
|
||||||
|
ios::out|ios::binary);
|
||||||
|
Utils::writeIndexCharacter(testFileOutput,123456789); //in hex: 75BCD15
|
||||||
|
//in memory: 15 cd 5b 07
|
||||||
|
// in DEC: 21 205 91 7
|
||||||
|
|
||||||
|
Utils::writeIndexCharacter(testFileOutput,987654321); //in hex: 3ADE68B1
|
||||||
|
//in memory: b1 68 de 3a
|
||||||
|
// in DEC: 177 104 222 58
|
||||||
|
testFileOutput.close();
|
||||||
|
|
||||||
|
sauchar_t * dataArray = Utils::readIndexFromFile(
|
||||||
|
ifstream testFileInput;
|
||||||
|
testFileInput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),ios::in|ios::binary);
|
||||||
|
|
||||||
|
INDEX_CHARACTER_TYPE retrievedCharacter1 = Utils::readIndexCharacter(testFileInput);
|
||||||
|
BOOST_CHECK_EQUAL(retrievedCharacter1, 123456789);
|
||||||
|
Utils::insertCharToSaucharArray(dataArray, retrievedCharacter1, 0);
|
||||||
|
|
||||||
|
INDEX_CHARACTER_TYPE retrievedCharacter2 = Utils::readIndexCharacter(testFileInput);
|
||||||
|
BOOST_CHECK_EQUAL(retrievedCharacter2, 987654321);
|
||||||
|
Utils::insertCharToSaucharArray(dataArray, retrievedCharacter2, 4);
|
||||||
|
|
||||||
|
testFileInput.close();
|
||||||
|
|
||||||
|
vector<INDEX_CHARACTER_TYPE> expected;
|
||||||
|
expected.push_back(21);
|
||||||
|
expected.push_back(205);
|
||||||
|
expected.push_back(91);
|
||||||
|
expected.push_back(7);
|
||||||
|
expected.push_back(177);
|
||||||
|
expected.push_back(104);
|
||||||
|
expected.push_back(222);
|
||||||
|
expected.push_back(58);
|
||||||
|
|
||||||
|
vector<INDEX_CHARACTER_TYPE> result;
|
||||||
|
for (int i=0;i<8;i++) {
|
||||||
|
INDEX_CHARACTER_TYPE a = dataArray[i];
|
||||||
|
result.push_back(a);
|
||||||
|
}
|
||||||
|
|
||||||
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp","temp_file.bin"));
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL_COLLECTIONS(result.begin(), result.end(), expected.begin(), expected.end());
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_SUITE_END()
|
@ -8,9 +8,9 @@ WordMap::WordMap() throw(ConcordiaException) {
|
|||||||
WordMap::~WordMap() {
|
WordMap::~WordMap() {
|
||||||
}
|
}
|
||||||
|
|
||||||
sauchar_t WordMap::getWordCode(const string & word) {
|
INDEX_CHARACTER_TYPE WordMap::getWordCode(const string & word) {
|
||||||
if (_map.find(word) == _map.end()) {
|
if (_map.find(word) == _map.end()) {
|
||||||
sauchar_t newCode = _nextFree;
|
INDEX_CHARACTER_TYPE newCode = _nextFree;
|
||||||
_map[word] = newCode;
|
_map[word] = newCode;
|
||||||
_nextFree++;
|
_nextFree++;
|
||||||
return newCode;
|
return newCode;
|
||||||
|
@ -4,14 +4,11 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <map>
|
#include <map>
|
||||||
#include "concordia/concordia_exception.hpp"
|
#include "concordia/concordia_exception.hpp"
|
||||||
|
#include "concordia/common/config.hpp"
|
||||||
#include <boost/archive/text_oarchive.hpp>
|
#include <boost/archive/text_oarchive.hpp>
|
||||||
#include <boost/archive/text_iarchive.hpp>
|
#include <boost/archive/text_iarchive.hpp>
|
||||||
#include <boost/serialization/map.hpp>
|
#include <boost/serialization/map.hpp>
|
||||||
|
|
||||||
#include "build/libdivsufsort/include/divsufsort.h"
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class representing dictionary for word to int encoding.
|
Class representing dictionary for word to int encoding.
|
||||||
|
|
||||||
@ -27,7 +24,7 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~WordMap();
|
virtual ~WordMap();
|
||||||
|
|
||||||
sauchar_t getWordCode(const string & word);
|
INDEX_CHARACTER_TYPE getWordCode(const string & word);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
friend class boost::serialization::access;
|
friend class boost::serialization::access;
|
||||||
@ -39,9 +36,9 @@ private:
|
|||||||
ar & _nextFree;
|
ar & _nextFree;
|
||||||
}
|
}
|
||||||
|
|
||||||
map<string, sauchar_t> _map;
|
map<string, INDEX_CHARACTER_TYPE> _map;
|
||||||
|
|
||||||
sauchar_t _nextFree;
|
INDEX_CHARACTER_TYPE _nextFree;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -56,8 +56,18 @@ endif(HAVE_INLINE)
|
|||||||
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/config.h.cmake" "${CMAKE_CURRENT_BINARY_DIR}/config.h")
|
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/config.h.cmake" "${CMAKE_CURRENT_BINARY_DIR}/config.h")
|
||||||
|
|
||||||
## Checks for types ##
|
## Checks for types ##
|
||||||
# sauchar_t (32bit)
|
# sauchar_t (8bit)
|
||||||
set(SAUCHAR_TYPE "int")
|
check_type_size("uint8_t" UINT8_T)
|
||||||
|
if(HAVE_UINT8_T)
|
||||||
|
set(SAUCHAR_TYPE "uint8_t")
|
||||||
|
else(HAVE_UINT8_T)
|
||||||
|
check_type_size("unsigned char" SIZEOF_UNSIGNED_CHAR)
|
||||||
|
if("${SIZEOF_UNSIGNED_CHAR}" STREQUAL "1")
|
||||||
|
set(SAUCHAR_TYPE "unsigned char")
|
||||||
|
else("${SIZEOF_UNSIGNED_CHAR}" STREQUAL "1")
|
||||||
|
message(FATAL_ERROR "Cannot find unsigned 8-bit integer type")
|
||||||
|
endif("${SIZEOF_UNSIGNED_CHAR}" STREQUAL "1")
|
||||||
|
endif(HAVE_UINT8_T)
|
||||||
# saint_t (32bit)
|
# saint_t (32bit)
|
||||||
check_type_size("int32_t" INT32_T)
|
check_type_size("int32_t" INT32_T)
|
||||||
if(HAVE_INT32_T)
|
if(HAVE_INT32_T)
|
||||||
|
@ -6,6 +6,21 @@
|
|||||||
#Path to the Puddle tagset
|
#Path to the Puddle tagset
|
||||||
puddle_tagset_path = "@PROD_PUDDLE_TAGSET_PATH@";
|
puddle_tagset_path = "@PROD_PUDDLE_TAGSET_PATH@";
|
||||||
|
|
||||||
|
#-------------------------------------------------------------------------------
|
||||||
|
#Word map, hashed index and suffix array files are in a temporary directory
|
||||||
|
#and should be deleted at the end of each test procedure.
|
||||||
|
|
||||||
|
#Word map file containing unique codes for tokens
|
||||||
|
|
||||||
|
word_map_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
|
||||||
|
|
||||||
|
#File containing the "text" for suffix array searching, i.e. sequence of codes
|
||||||
|
|
||||||
|
hashed_index_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_HASHED_INDEX@"
|
||||||
|
|
||||||
|
#Binarized suffix array
|
||||||
|
|
||||||
|
suffix_array_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_SUFFIX_ARRAY@"
|
||||||
|
#-------------------------------------------------------------------------------
|
||||||
|
|
||||||
### eof
|
### eof
|
||||||
|
6227
prod/resources/text-files/medium.txt
Normal file
6227
prod/resources/text-files/medium.txt
Normal file
File diff suppressed because it is too large
Load Diff
3
prod/resources/text-files/small.txt
Normal file
3
prod/resources/text-files/small.txt
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
Ala ma kota
|
||||||
|
Ala ma rysia
|
||||||
|
Marysia ma rysia
|
Loading…
Reference in New Issue
Block a user