concordia-library/concordia/concordia_index.cpp

190 lines
7.9 KiB
C++
Raw Normal View History

2013-11-14 20:36:34 +01:00
#include "concordia/concordia_index.hpp"
#include "concordia/common/utils.hpp"
#include "concordia/common/config.hpp"
2013-11-14 20:36:34 +01:00
#include <boost/filesystem.hpp>
#include <boost/foreach.hpp>
2015-08-19 20:49:26 +02:00
#include <boost/make_shared.hpp>
2013-11-28 16:47:57 +01:00
#include <iostream>
#include <climits>
2013-11-14 20:36:34 +01:00
// Remembers the locations of the two index files (hashed index and
// markers). The constructor only stores the paths; its body is empty
// and no file is opened here.
ConcordiaIndex::ConcordiaIndex(
        const std::string & hashedIndexFilePath,
        const std::string & markersFilePath) throw(ConcordiaException)
    : _hashedIndexFilePath(hashedIndexFilePath),
      _markersFilePath(markersFilePath) {
}
// Nothing to release explicitly: the destructor body is intentionally empty.
ConcordiaIndex::~ConcordiaIndex() {}
// Builds the suffix array of the text T with divsufsort.
//
// T       - the text to index; must not be a null pointer.
// returns - a freshly allocated vector with exactly T->size() entries,
//           filled in by divsufsort. For an empty T an empty vector is
//           returned without calling divsufsort at all.
// throws  - ConcordiaException when divsufsort returns a non-zero status.
boost::shared_ptr<std::vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
        boost::shared_ptr<std::vector<sauchar_t> > T) {
    // The result vector owns the output buffer, so nothing leaks if we
    // throw below and no element-by-element copy is needed afterwards.
    boost::shared_ptr<std::vector<saidx_t> > result(
        new std::vector<saidx_t>(T->size()));

    // data() of an empty vector may be a null pointer, which would make
    // the outcome depend on the vector's capacity; handle it explicitly.
    if (T->empty()) {
        return result;
    }

    if (divsufsort(T->data(), result->data(),
                   static_cast<saidx_t>(T->size())) != 0) {
        throw ConcordiaException("Error creating suffix array.");
    }
    return result;
}
2015-06-26 22:50:53 +02:00
std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
2015-06-26 22:50:53 +02:00
const std::vector<Example> & examples) {
std::ofstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
std::ofstream markersFile;
markersFile.open(_markersFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
2015-06-26 22:50:53 +02:00
std::vector<TokenizedSentence> hashedPatterns;
BOOST_FOREACH(Example example, examples) {
2015-08-19 20:49:26 +02:00
TokenizedSentence hashedPattern =
2015-06-27 12:40:24 +02:00
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, example);
2015-08-19 20:49:26 +02:00
hashedPatterns.push_back(hashedPattern);
2015-06-26 22:50:53 +02:00
}
hashedIndexFile.close();
markersFile.close();
hashGenerator->serializeWordMap();
2015-06-27 12:40:24 +02:00
2015-06-26 22:50:53 +02:00
return hashedPatterns;
}
2015-08-19 20:49:26 +02:00
// Adds one example to the index.
//
// Opens both index files in binary append mode, delegates the work to
// _addSingleExample, closes the files and then asks the hash generator
// to serialize its word map.
//
// returns - the tokenized sentence obtained from _addSingleExample
TokenizedSentence ConcordiaIndex::addExample(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        const Example & example) {
    const std::ios_base::openmode appendBinary =
        std::ios::out | std::ios::app | std::ios::binary;

    std::ofstream hashedIndexFile(_hashedIndexFilePath.c_str(), appendBinary);
    std::ofstream markersFile(_markersFilePath.c_str(), appendBinary);

    TokenizedSentence tokenized = _addSingleExample(
        hashedIndexFile, markersFile, hashGenerator, T, markers, example);

    hashedIndexFile.close();
    markersFile.close();
    hashGenerator->serializeWordMap();

    return tokenized;
}
// Adds one already-tokenized sentence to the index under the given id.
//
// Opens both index files in binary append mode, delegates to
// _addSingleTokenizedExample and closes the files again. The word map
// of the hash generator is not serialized by this method.
void ConcordiaIndex::addTokenizedExample(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        const TokenizedSentence & tokenizedSentence,
        const SUFFIX_MARKER_TYPE id) {
    const std::ios_base::openmode appendBinary =
        std::ios::out | std::ios::app | std::ios::binary;

    std::ofstream hashedIndexFile(_hashedIndexFilePath.c_str(), appendBinary);
    std::ofstream markersFile(_markersFilePath.c_str(), appendBinary);

    _addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
                               T, markers, tokenizedSentence, id);

    hashedIndexFile.close();
    markersFile.close();
}
2015-08-19 20:49:26 +02:00
// Adds a batch of already-tokenized sentences to the index.
//
// tokenizedSentences[i] is stored under ids[i]. Entries of `ids` beyond
// the number of sentences are ignored.
//
// throws - std::out_of_range (raised by std::vector::at) when `ids` has
//          fewer elements than `tokenizedSentences`. The check happens
//          before any file is opened, so a mismatched call leaves both
//          the files and the in-memory structures untouched.
//
// NOTE(review): the result of opening the two files is not checked.
void ConcordiaIndex::addAllTokenizedExamples(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        const std::vector<TokenizedSentence> & tokenizedSentences,
        const std::vector<SUFFIX_MARKER_TYPE> & ids) {
    // Probe the last id that will be needed. If `ids` is too short this
    // throws the same std::out_of_range the loop would have hit, but
    // before a partial batch has been appended.
    if (!tokenizedSentences.empty()) {
        static_cast<void>(ids.at(tokenizedSentences.size() - 1));
    }

    std::ofstream hashedIndexFile;
    hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
                                      std::ios::app|std::ios::binary);
    std::ofstream markersFile;
    markersFile.open(_markersFilePath.c_str(), std::ios::out|
                                      std::ios::app|std::ios::binary);

    // Index loop instead of BOOST_FOREACH-by-value: no per-sentence copy
    // and no signed/unsigned index mix-up.
    typedef std::vector<TokenizedSentence>::size_type SizeType;
    for (SizeType i = 0; i < tokenizedSentences.size(); ++i) {
        _addSingleTokenizedExample(hashedIndexFile, markersFile,
                                   hashGenerator, T, markers,
                                   tokenizedSentences[i], ids[i]);
    }

    hashedIndexFile.close();
    markersFile.close();
}
void ConcordiaIndex::_addSingleTokenizedExample(
std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
2015-08-19 20:49:26 +02:00
const TokenizedSentence & tokenizedSentence,
const SUFFIX_MARKER_TYPE id) {
2017-10-10 15:39:47 +02:00
// prepend sentence boundary marker
INDEX_CHARACTER_TYPE sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE;
Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI);
Utils::appendCharToSaucharVector(T, sentenceBoundaryHI);
SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
Utils::writeMarker(markersFile, sentenceBoundaryMA);
markers->push_back(sentenceBoundaryMA);
2015-08-19 20:49:26 +02:00
std::vector<INDEX_CHARACTER_TYPE> hash = tokenizedSentence.getCodes();
2015-06-27 12:40:24 +02:00
int offset = 0;
for (std::vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
it != hash.end(); ++it) {
INDEX_CHARACTER_TYPE character = *it;
Utils::writeIndexCharacter(hashedIndexFile, character);
Utils::appendCharToSaucharVector(T, character);
// append to markersFile
SUFFIX_MARKER_TYPE marker = Utils::createMarker(
id,
offset,
hash.size());
Utils::writeMarker(markersFile, marker);
markers->push_back(marker);
offset++;
}
// append sentence boundary marker
2017-10-10 15:39:47 +02:00
sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE;
Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI);
Utils::appendCharToSaucharVector(T, sentenceBoundaryHI);
2017-10-10 15:39:47 +02:00
sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
Utils::writeMarker(markersFile, sentenceBoundaryMA);
markers->push_back(sentenceBoundaryMA);
}
2015-06-27 12:40:24 +02:00
2015-08-19 20:49:26 +02:00
// Hashes the sentence of `example` with the hash generator and appends
// the resulting tokenized sentence, under the example's id, via
// _addSingleTokenizedExample.
//
// returns - the tokenized sentence produced by hashGenerator->generateHash
TokenizedSentence ConcordiaIndex::_addSingleExample(
        std::ofstream & hashedIndexFile,
        std::ofstream & markersFile,
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        const Example & example) {
    TokenizedSentence tokenized =
        hashGenerator->generateHash(example.getSentence());

    _addSingleTokenizedExample(
        hashedIndexFile, markersFile, hashGenerator,
        T, markers, tokenized, example.getId());

    return tokenized;
}