// concordia-library/concordia/concordia_index.cpp
#include "concordia/concordia_index.hpp"

#include <fstream>
#include <iostream>
#include <vector>

#include <boost/filesystem.hpp>

/*
 * Creates an index bound to the given word map, hashed index and suffix
 * array files.
 *
 * The word map file and the hashed index file must either both exist or
 * both be absent; a lone word map or a lone hashed index is rejected.
 * The suffix array path is only stored here, it is not checked.
 *
 * wordMapFilePath      path handed over to the HashGenerator
 * hashedIndexFilePath  path of the hashed index file
 * suffixArrayFilePath  path of the suffix array file
 *
 * Throws ConcordiaException (E01) when the word map exists without the
 * hashed index, and (E02) when the hashed index exists without the word map.
 */
ConcordiaIndex::ConcordiaIndex(const string & wordMapFilePath,
                               const string & hashedIndexFilePath,
                               const string & suffixArrayFilePath)
                                         throw(ConcordiaException) :
                         _hashedIndexFilePath(hashedIndexFilePath),
                         _suffixArrayFilePath(suffixArrayFilePath) {
    const bool wordMapPresent = boost::filesystem::exists(wordMapFilePath);
    const bool hashedIndexPresent =
                         boost::filesystem::exists(hashedIndexFilePath);

    if (wordMapPresent && !hashedIndexPresent) {
        throw ConcordiaException("E01: Word map file exists "
                                 "but hashed index file absent.");
    }
    if (!wordMapPresent && hashedIndexPresent) {
        throw ConcordiaException("E02: Hashed index file exists "
                                 "but word map file absent.");
    }

    /* The generator is created only after the consistency checks passed. */
    _hashGenerator = boost::shared_ptr<HashGenerator>(
                                    new HashGenerator(wordMapFilePath));
}
/* Nothing to release by hand: the destructor body is intentionally empty. */
ConcordiaIndex::~ConcordiaIndex() {}
void ConcordiaIndex::serializeWordMap() {
_hashGenerator->serializeWordMap();
}
void ConcordiaIndex::generateSuffixArray() {
2013-11-28 16:47:57 +01:00
ifstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::in|
ios::ate|ios::binary);
2013-11-20 17:43:29 +01:00
/* Get the file size. */
2013-12-01 23:34:46 +01:00
long n = hashedIndexFile.tellg() / sizeof(sauchar_t);
2013-11-20 17:43:29 +01:00
sauchar_t *T;
saidx_t *SA;
2013-11-28 16:47:57 +01:00
T = new sauchar_t[n];
SA = new saidx_t[n];
2013-11-20 17:43:29 +01:00
/* Read n bytes of data. */
2013-11-28 16:47:57 +01:00
hashedIndexFile.seekg(0, ios::beg);
2013-11-20 17:43:29 +01:00
2013-11-28 16:47:57 +01:00
sauchar_t buff;
int pos = 0;
while (!hashedIndexFile.eof()) {
hashedIndexFile.read(reinterpret_cast<char *>(&buff),
sizeof(sauchar_t));
T[pos++] = buff;
}
hashedIndexFile.close();
2013-11-20 17:43:29 +01:00
/* Construct the suffix array. */
if (divsufsort(T, SA, (saidx_t)n) != 0) {
throw ConcordiaException("Error creating suffix array.");
}
/* Write the suffix array. */
2013-11-28 16:47:57 +01:00
ofstream suffixArrayFile;
suffixArrayFile.open(_suffixArrayFilePath.c_str(), ios::out|ios::binary);
for (int i = 0; i < n; i++) {
suffixArrayFile.write(reinterpret_cast<char *>(&SA[i]),
sizeof(saidx_t));
}
suffixArrayFile.close();
2013-11-20 17:43:29 +01:00
/* Deallocate memory. */
2013-11-28 16:47:57 +01:00
delete[] T;
delete[] SA;
2013-11-14 20:36:34 +01:00
}
void ConcordiaIndex::addSentence(const string & sentence) {
2013-11-20 17:43:29 +01:00
vector<sauchar_t> hash = _hashGenerator->generateHash(sentence);
2013-11-28 16:47:57 +01:00
ofstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
ios::app|ios::binary);
2013-11-20 17:43:29 +01:00
for (vector<sauchar_t>::iterator it = hash.begin();
it != hash.end(); ++it) {
2013-11-28 16:47:57 +01:00
sauchar_t buff = *it;
hashedIndexFile.write(reinterpret_cast<char *>(&buff),
sizeof(sauchar_t));
2013-11-20 17:43:29 +01:00
}
2013-11-28 16:47:57 +01:00
hashedIndexFile.close();
2013-11-14 20:36:34 +01:00
}