2013-11-14 20:36:34 +01:00
|
|
|
#include "concordia/concordia_index.hpp"
|
|
|
|
|
2013-12-06 22:29:25 +01:00
|
|
|
#include "concordia/common/utils.hpp"
|
2013-11-14 20:36:34 +01:00
|
|
|
#include <boost/filesystem.hpp>
|
2013-11-28 16:47:57 +01:00
|
|
|
#include <iostream>
|
2013-11-14 20:36:34 +01:00
|
|
|
|
2013-12-14 15:23:17 +01:00
|
|
|
// Creates an index object bound to the given hashed index file path.
// The constructor only stores the path; it does not touch the file.
ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath)
                                         throw(ConcordiaException)
    : _hashedIndexFilePath(hashedIndexFilePath)
{
}
|
|
|
|
|
|
|
|
// Nothing to release explicitly: the destructor body is intentionally empty.
ConcordiaIndex::~ConcordiaIndex()
{
}
|
|
|
|
|
2013-12-14 15:23:17 +01:00
|
|
|
// Builds the suffix array of the text T with divsufsort.
//
// hashGenerator - not used by this function; kept so the signature stays
//                 unchanged for existing callers.
// T             - the text to sort; it is only read here.
//
// Returns a newly allocated vector with T->size() suffix-array entries.
// Throws ConcordiaException when divsufsort reports a non-zero status.
boost::shared_ptr<vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
                boost::shared_ptr<HashGenerator> hashGenerator,
                boost::shared_ptr<vector<sauchar_t> > T) {
    // The result vector owns the output buffer from the start, so nothing
    // leaks when the exception below is thrown, and no element-by-element
    // copy out of a temporary array is needed afterwards.
    boost::shared_ptr<vector<saidx_t> > result(
                                    new vector<saidx_t>(T->size()));

    // No special case is made for an empty T: the buffers are handed to
    // divsufsort as they are and its status decides the outcome.
    if (divsufsort(T->data(), result->data(),
                   static_cast<saidx_t>(T->size())) != 0) {
        throw ConcordiaException("Error creating suffix array.");
    }

    return result;
}
|
|
|
|
|
2013-12-14 15:23:17 +01:00
|
|
|
// Appends a single sentence to the index.
//
// The hashed index file is opened in binary append mode, the sentence is
// processed by _addSingleSentence (which writes to the file and extends T),
// the file is closed, and finally the generator's word map is serialized.
//
// Throws ConcordiaException if the hashed index file cannot be opened. The
// check happens before anything is modified, so a failed open no longer
// leaves T and the word map updated while the file on disk is not.
void ConcordiaIndex::addSentence(boost::shared_ptr<HashGenerator> hashGenerator,
                                 boost::shared_ptr<vector<sauchar_t> > T,
                                 const string & sentence) {
    ofstream hashedIndexFile;
    hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
                                                       ios::app|ios::binary);
    if (!hashedIndexFile.is_open()) {
        throw ConcordiaException(
                       "Error opening hashed index file for writing.");
    }

    _addSingleSentence(hashedIndexFile, hashGenerator, T, sentence);

    // Close before serializing the word map, as the original code did.
    hashedIndexFile.close();
    hashGenerator->serializeWordMap();
}
|
|
|
|
|
2013-12-14 15:23:17 +01:00
|
|
|
// Appends every sentence from `sentences` to the index, in order.
//
// The hashed index file is opened once in binary append mode, each sentence
// is processed by _addSingleSentence (which writes to the file and extends
// T), the file is closed, and the generator's word map is serialized once
// at the end.
//
// Throws ConcordiaException if the hashed index file cannot be opened. The
// check happens before anything is modified, so a failed open no longer
// leaves T and the word map updated while the file on disk is not.
void ConcordiaIndex::addAllSentences(
                boost::shared_ptr<HashGenerator> hashGenerator,
                boost::shared_ptr<vector<sauchar_t> > T,
                boost::shared_ptr<vector<string> > sentences) {
    ofstream hashedIndexFile;
    hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
                                                       ios::app|ios::binary);
    if (!hashedIndexFile.is_open()) {
        throw ConcordiaException(
                       "Error opening hashed index file for writing.");
    }

    for (vector<string>::iterator sent_it = sentences->begin();
                                  sent_it != sentences->end(); ++sent_it) {
        // The helper takes the sentence by const reference, so it is passed
        // directly instead of being copied into a temporary string first.
        _addSingleSentence(hashedIndexFile, hashGenerator, T, *sent_it);
    }

    // Close before serializing the word map, as the original code did.
    hashedIndexFile.close();
    hashGenerator->serializeWordMap();
}
|
|
|
|
|
|
|
|
// Hashes one sentence and records the resulting characters.
//
// The sentence is turned into a sequence of index characters by the hash
// generator. Each character, in order, is written to hashedIndexFile via
// Utils::writeIndexCharacter and then appended to T via
// Utils::appendCharToSaucharVector.
void ConcordiaIndex::_addSingleSentence(
                ofstream & hashedIndexFile,
                boost::shared_ptr<HashGenerator> hashGenerator,
                boost::shared_ptr<std::vector<sauchar_t> > T,
                const string & sentence) {
    typedef vector<INDEX_CHARACTER_TYPE> HashedSentence;

    boost::shared_ptr<HashedSentence> hashedSentence =
                                   hashGenerator->generateHash(sentence);

    for (HashedSentence::size_type pos = 0;
                         pos < hashedSentence->size(); ++pos) {
        // Local copy, so both helpers receive the same kind of argument
        // (a plain, non-const variable) as before.
        INDEX_CHARACTER_TYPE code = (*hashedSentence)[pos];
        Utils::writeIndexCharacter(hashedIndexFile, code);
        Utils::appendCharToSaucharVector(T, code);
    }
}
|
|
|
|
|