2013-11-14 20:36:34 +01:00
|
|
|
#include "concordia/concordia_index.hpp"
|
|
|
|
|
2013-12-06 22:29:25 +01:00
|
|
|
#include "concordia/common/utils.hpp"
|
2014-03-14 11:30:17 +01:00
|
|
|
#include "concordia/common/config.hpp"
|
2013-11-14 20:36:34 +01:00
|
|
|
#include <boost/filesystem.hpp>
|
2014-02-20 10:49:17 +01:00
|
|
|
#include <boost/foreach.hpp>
|
2015-08-19 20:49:26 +02:00
|
|
|
#include <boost/make_shared.hpp>
|
|
|
|
|
2013-11-28 16:47:57 +01:00
|
|
|
#include <iostream>
|
2015-04-09 22:17:19 +02:00
|
|
|
#include <climits>
|
2013-11-14 20:36:34 +01:00
|
|
|
|
2015-04-15 14:14:10 +02:00
|
|
|
// Creates an index object bound to two file paths. The paths are only
// stored here; the files themselves are opened (in append mode) by the
// add* methods of this class.
//
// hashedIndexFilePath - path of the file receiving index characters
// markersFilePath     - path of the file receiving suffix markers
ConcordiaIndex::ConcordiaIndex(const std::string & hashedIndexFilePath,
                               const std::string & markersFilePath)
    : _hashedIndexFilePath(hashedIndexFilePath),
      _markersFilePath(markersFilePath) {
}
|
|
|
|
|
|
|
|
// Destructor. Intentionally empty: the methods of this class open and close
// their file streams locally, so there is nothing to release here.
ConcordiaIndex::~ConcordiaIndex() {}
|
|
|
|
|
2019-02-28 20:56:22 +01:00
|
|
|
// Builds the suffix array of T into SA by calling divsufsort.
//
// T  - input text; its size determines the size of SA
// SA - output vector; any previous content is discarded and replaced
//      by T->size() entries
//
// Throws ConcordiaException when divsufsort returns a non-zero status.
void ConcordiaIndex::generateSuffixArray(
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<saidx_t> > SA) {
    // Size the output buffer in one step. This replaces a loop that counted
    // with a signed int against the unsigned T->size(), which would overflow
    // for very large inputs.
    SA->assign(T->size(), 0);

    // NOTE(review): T->size() is narrowed to saidx_t here; inputs larger than
    // saidx_t can represent are not guarded against - confirm the expected
    // maximum index size.
    if (divsufsort(T->data(), SA->data(),
                   static_cast<saidx_t>(T->size())) != 0) {
        throw ConcordiaException("Error creating suffix array.");
    }
}
|
|
|
|
|
2015-06-26 22:50:53 +02:00
|
|
|
std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
|
2014-02-20 10:49:17 +01:00
|
|
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
2015-04-15 14:14:10 +02:00
|
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
2015-06-26 22:50:53 +02:00
|
|
|
const std::vector<Example> & examples) {
|
2015-04-15 14:14:10 +02:00
|
|
|
std::ofstream hashedIndexFile;
|
|
|
|
hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
|
|
|
|
std::ios::app|std::ios::binary);
|
|
|
|
std::ofstream markersFile;
|
|
|
|
markersFile.open(_markersFilePath.c_str(), std::ios::out|
|
|
|
|
std::ios::app|std::ios::binary);
|
2015-06-26 22:50:53 +02:00
|
|
|
|
|
|
|
std::vector<TokenizedSentence> hashedPatterns;
|
|
|
|
BOOST_FOREACH(Example example, examples) {
|
2015-08-19 20:49:26 +02:00
|
|
|
TokenizedSentence hashedPattern =
|
2015-06-27 12:40:24 +02:00
|
|
|
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
|
2014-02-20 10:49:17 +01:00
|
|
|
T, markers, example);
|
2015-08-19 20:49:26 +02:00
|
|
|
hashedPatterns.push_back(hashedPattern);
|
2015-06-26 22:50:53 +02:00
|
|
|
}
|
|
|
|
|
2013-12-06 22:29:25 +01:00
|
|
|
hashedIndexFile.close();
|
2014-02-20 10:49:17 +01:00
|
|
|
markersFile.close();
|
2013-12-14 15:23:17 +01:00
|
|
|
hashGenerator->serializeWordMap();
|
2015-06-27 12:40:24 +02:00
|
|
|
|
2015-06-26 22:50:53 +02:00
|
|
|
return hashedPatterns;
|
2013-12-06 22:29:25 +01:00
|
|
|
}
|
|
|
|
|
2015-08-19 20:49:26 +02:00
|
|
|
// Adds a single example to the index.
//
// Opens both index files in binary append mode, delegates to
// _addSingleExample (which also extends T and markers), closes the files
// and then calls hashGenerator->serializeWordMap().
//
// hashGenerator - generator used to hash the example's sentence
// T             - in-memory text, extended by _addSingleExample
// markers       - in-memory markers, extended by _addSingleExample
// example       - the example to add
//
// Returns the TokenizedSentence produced by _addSingleExample.
TokenizedSentence ConcordiaIndex::addExample(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        const Example & example) {
    const std::ios_base::openmode appendBinary =
        std::ios::out | std::ios::app | std::ios::binary;

    std::ofstream hashedIndexFile(_hashedIndexFilePath.c_str(), appendBinary);
    std::ofstream markersFile(_markersFilePath.c_str(), appendBinary);

    TokenizedSentence result = _addSingleExample(
        hashedIndexFile, markersFile, hashGenerator, T, markers, example);

    // Close both streams before the word map is serialized.
    hashedIndexFile.close();
    markersFile.close();
    hashGenerator->serializeWordMap();

    return result;
}
|
|
|
|
|
2015-08-01 17:03:39 +02:00
|
|
|
// Adds one already tokenized sentence to the index under the given id.
//
// Opens both index files in binary append mode, delegates to
// _addSingleTokenizedExample (which also extends T and markers) and closes
// the files. Unlike addExample, serializeWordMap() is not called here.
//
// hashGenerator     - forwarded unchanged to _addSingleTokenizedExample
// T                 - in-memory text, extended by the helper
// markers           - in-memory markers, extended by the helper
// tokenizedSentence - sentence whose codes are appended
// id                - id stored in the markers created for this sentence
void ConcordiaIndex::addTokenizedExample(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        const TokenizedSentence & tokenizedSentence,
        const SUFFIX_MARKER_TYPE id) {
    const std::ios_base::openmode appendBinary =
        std::ios::out | std::ios::app | std::ios::binary;

    std::ofstream hashedIndexFile(_hashedIndexFilePath.c_str(), appendBinary);
    std::ofstream markersFile(_markersFilePath.c_str(), appendBinary);

    _addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
                               T, markers, tokenizedSentence, id);

    hashedIndexFile.close();
    markersFile.close();
}
|
|
|
|
|
2015-08-19 20:49:26 +02:00
|
|
|
// Adds a batch of already tokenized sentences to the index.
//
// Both index files are opened once in binary append mode; the i-th sentence
// is added under ids.at(i) through _addSingleTokenizedExample (which also
// extends T and markers); the files are then closed.
//
// hashGenerator      - forwarded unchanged to _addSingleTokenizedExample
// T                  - in-memory text, extended by the helper
// markers            - in-memory markers, extended by the helper
// tokenizedSentences - sentences to add, processed in order
// ids                - id for each sentence, matched by position
//
// Throws std::out_of_range (from std::vector::at) when ids holds fewer
// elements than tokenizedSentences; sentences processed before that point
// have already been written.
void ConcordiaIndex::addAllTokenizedExamples(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        const std::vector<TokenizedSentence> & tokenizedSentences,
        const std::vector<SUFFIX_MARKER_TYPE> & ids) {
    std::ofstream hashedIndexFile;
    hashedIndexFile.open(_hashedIndexFilePath.c_str(),
                         std::ios::out|std::ios::app|std::ios::binary);
    std::ofstream markersFile;
    markersFile.open(_markersFilePath.c_str(),
                     std::ios::out|std::ios::app|std::ios::binary);

    // Unsigned index matching the container's own size type, instead of a
    // signed int converted on every at() call.
    std::vector<SUFFIX_MARKER_TYPE>::size_type index = 0;

    // Iterate by const reference: the helper takes a const
    // TokenizedSentence &, so copying each sentence is unnecessary.
    BOOST_FOREACH(const TokenizedSentence & tokenizedSentence,
                  tokenizedSentences) {
        _addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
                                   T, markers, tokenizedSentence,
                                   ids.at(index));
        ++index;
    }

    hashedIndexFile.close();
    markersFile.close();
}
|
|
|
|
|
2015-08-01 17:03:39 +02:00
|
|
|
void ConcordiaIndex::_addSingleTokenizedExample(
|
2015-04-15 14:14:10 +02:00
|
|
|
std::ofstream & hashedIndexFile,
|
|
|
|
std::ofstream & markersFile,
|
|
|
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
|
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
2015-08-19 20:49:26 +02:00
|
|
|
const TokenizedSentence & tokenizedSentence,
|
|
|
|
const SUFFIX_MARKER_TYPE id) {
|
2017-10-10 15:39:47 +02:00
|
|
|
|
|
|
|
// prepend sentence boundary marker
|
|
|
|
INDEX_CHARACTER_TYPE sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE;
|
|
|
|
Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI);
|
|
|
|
Utils::appendCharToSaucharVector(T, sentenceBoundaryHI);
|
|
|
|
SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
|
|
|
|
Utils::writeMarker(markersFile, sentenceBoundaryMA);
|
|
|
|
markers->push_back(sentenceBoundaryMA);
|
|
|
|
|
|
|
|
|
2015-08-19 20:49:26 +02:00
|
|
|
std::vector<INDEX_CHARACTER_TYPE> hash = tokenizedSentence.getCodes();
|
2015-06-27 12:40:24 +02:00
|
|
|
|
2014-02-20 10:49:17 +01:00
|
|
|
int offset = 0;
|
2015-04-15 14:14:10 +02:00
|
|
|
for (std::vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
|
2015-04-15 10:55:26 +02:00
|
|
|
it != hash.end(); ++it) {
|
2013-12-14 15:23:17 +01:00
|
|
|
INDEX_CHARACTER_TYPE character = *it;
|
|
|
|
Utils::writeIndexCharacter(hashedIndexFile, character);
|
|
|
|
Utils::appendCharToSaucharVector(T, character);
|
2014-02-20 10:49:17 +01:00
|
|
|
|
|
|
|
// append to markersFile
|
2015-04-09 22:17:19 +02:00
|
|
|
SUFFIX_MARKER_TYPE marker = Utils::createMarker(
|
2015-08-01 17:03:39 +02:00
|
|
|
id,
|
2015-04-09 22:17:19 +02:00
|
|
|
offset,
|
2015-04-15 10:55:26 +02:00
|
|
|
hash.size());
|
2014-02-20 10:49:17 +01:00
|
|
|
|
|
|
|
Utils::writeMarker(markersFile, marker);
|
|
|
|
markers->push_back(marker);
|
|
|
|
|
|
|
|
offset++;
|
2013-12-14 15:23:17 +01:00
|
|
|
}
|
2014-02-20 10:49:17 +01:00
|
|
|
|
|
|
|
// append sentence boundary marker
|
2017-10-10 15:39:47 +02:00
|
|
|
sentenceBoundaryHI = INDEX_CHARACTER_TYPE_MAX_VALUE;
|
2014-02-20 10:49:17 +01:00
|
|
|
Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI);
|
|
|
|
Utils::appendCharToSaucharVector(T, sentenceBoundaryHI);
|
2017-10-10 15:39:47 +02:00
|
|
|
sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
|
2014-02-20 10:49:17 +01:00
|
|
|
Utils::writeMarker(markersFile, sentenceBoundaryMA);
|
|
|
|
markers->push_back(sentenceBoundaryMA);
|
2015-08-01 17:03:39 +02:00
|
|
|
}
|
2015-06-27 12:40:24 +02:00
|
|
|
|
2015-08-19 20:49:26 +02:00
|
|
|
// Hashes the sentence of `example` with hashGenerator->generateHash() and
// appends the result to the index under the example's id by delegating to
// _addSingleTokenizedExample.
//
// hashedIndexFile - open stream receiving index characters
// markersFile     - open stream receiving markers
// hashGenerator   - generator producing the TokenizedSentence
// T               - in-memory text, extended by the delegate
// markers         - in-memory markers, extended by the delegate
// example         - supplies the sentence (getSentence) and id (getId)
//
// Returns the TokenizedSentence produced by the hash generator.
TokenizedSentence ConcordiaIndex::_addSingleExample(
        std::ofstream & hashedIndexFile,
        std::ofstream & markersFile,
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        const Example & example) {
    TokenizedSentence tokenized =
        hashGenerator->generateHash(example.getSentence());

    _addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
                               T, markers, tokenized, example.getId());

    return tokenized;
}
|