#include "concordia/concordia_index.hpp"

#include "concordia/common/utils.hpp"

// NOTE(review): the four angle-bracket header names were missing from the
// source as received. The list below is reconstructed from the names this
// file uses (BOOST_FOREACH, ULONG_MAX, ofstream) -- confirm against version
// control before merging.
#include <boost/foreach.hpp>
#include <climits>
#include <fstream>
#include <iostream>

// NOTE(review): template arguments throughout this file were also missing
// from the source as received. They are reconstructed from how each object
// is used; `HashGenerator` is inferred from the parameter name only --
// confirm against concordia_index.hpp.

namespace {

/**
 * Opens \p stream on \p path in binary append mode.
 *
 * \param stream        stream to open
 * \param path          path of the file to append to
 * \param errorMessage  message of the exception thrown on failure
 * \throws ConcordiaException if the file could not be opened
 */
void openForBinaryAppend(ofstream & stream,
                         const string & path,
                         const char * errorMessage) {
    stream.open(path.c_str(), ios::out | ios::app | ios::binary);
    if (!stream.is_open()) {
        throw ConcordiaException(errorMessage);
    }
}

}  // namespace

/**
 * Stores the paths of the two files this index appends to.
 * No file is opened here.
 *
 * \param hashedIndexFilePath path of the hashed index file
 * \param markersFilePath     path of the markers file
 */
ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath,
                               const string & markersFilePath)
                                         throw(ConcordiaException) :
                           _hashedIndexFilePath(hashedIndexFilePath),
                           _markersFilePath(markersFilePath) {
}

ConcordiaIndex::~ConcordiaIndex() {
}

/**
 * Builds the suffix array of \p T using divsufsort.
 *
 * \param hashGenerator not used by this method
 * \param T             the text to build the suffix array for
 * \returns a newly allocated vector with one entry per element of \p T
 * \throws ConcordiaException if divsufsort reports an error
 */
boost::shared_ptr<vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
                boost::shared_ptr<HashGenerator> hashGenerator,
                boost::shared_ptr<vector<sauchar_t> > T) {
    // divsufsort writes straight into the result vector, so nothing is
    // leaked when the exception below is thrown and no copy is needed.
    boost::shared_ptr<vector<saidx_t> > result(
                                    new vector<saidx_t>(T->size()));

    // An empty vector has no element to take the address of, and
    // divsufsort treats a null output pointer as an error. Hand it a
    // placeholder so that the outcome for an empty text depends on T alone.
    saidx_t placeholder = 0;
    saidx_t * suffixes = result->empty() ? &placeholder : &(*result)[0];

    if (divsufsort(T->data(), suffixes,
                   static_cast<saidx_t>(T->size())) != 0) {
        throw ConcordiaException("Error creating suffix array.");
    }

    return result;
}

/**
 * Appends a single example to the index files and to the in-memory
 * structures \p T and \p markers, then serializes the word map.
 *
 * \param hashGenerator generator used to hash the example's sentence
 * \param T             in-memory hashed text, extended by this call
 * \param markers       in-memory markers, extended by this call
 * \param example       the example to add
 * \throws ConcordiaException if either index file cannot be opened; in that
 *         case \p T and \p markers are left untouched
 */
void ConcordiaIndex::addExample(
                boost::shared_ptr<HashGenerator> hashGenerator,
                boost::shared_ptr<vector<sauchar_t> > T,
                boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
                const Example & example) {
    // Open both files before touching any state, so that a failure cannot
    // leave the in-memory index ahead of the files on disk.
    ofstream hashedIndexFile;
    openForBinaryAppend(hashedIndexFile, _hashedIndexFilePath,
                        "Error opening hashed index file for appending.");
    ofstream markersFile;
    openForBinaryAppend(markersFile, _markersFilePath,
                        "Error opening markers file for appending.");

    _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
                      T, markers, example);

    hashedIndexFile.close();
    markersFile.close();
    hashGenerator->serializeWordMap();
}

/**
 * Appends every example of \p examples, in order, to the index files and
 * to the in-memory structures \p T and \p markers, then serializes the
 * word map once.
 *
 * \param hashGenerator generator used to hash the examples' sentences
 * \param T             in-memory hashed text, extended by this call
 * \param markers       in-memory markers, extended by this call
 * \param examples      the examples to add
 * \throws ConcordiaException if either index file cannot be opened; in that
 *         case \p T and \p markers are left untouched
 */
void ConcordiaIndex::addAllExamples(
                boost::shared_ptr<HashGenerator> hashGenerator,
                boost::shared_ptr<vector<sauchar_t> > T,
                boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
                const boost::ptr_vector<Example> & examples) {
    ofstream hashedIndexFile;
    openForBinaryAppend(hashedIndexFile, _hashedIndexFilePath,
                        "Error opening hashed index file for appending.");
    ofstream markersFile;
    openForBinaryAppend(markersFile, _markersFilePath,
                        "Error opening markers file for appending.");

    // Iterate by const reference: the examples are only read.
    BOOST_FOREACH(const Example & example, examples) {
        _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
                          T, markers, example);
    }

    hashedIndexFile.close();
    markersFile.close();
    hashGenerator->serializeWordMap();
}

/**
 * Hashes the sentence of \p example and appends the result, followed by a
 * sentence boundary, to both files and to both in-memory vectors.
 *
 * For the hashed character at position `offset` within the sentence, the
 * marker written is `offset + example.getId() * SUFFIX_MARKER_DIVISOR`.
 *
 * \param hashedIndexFile open stream receiving the hashed characters
 * \param markersFile     open stream receiving the markers
 * \param hashGenerator   generator used to hash the sentence
 * \param T               in-memory hashed text, extended by this call
 * \param markers         in-memory markers, extended by this call
 * \param example         the example to add
 */
void ConcordiaIndex::_addSingleExample(
                ofstream & hashedIndexFile,
                ofstream & markersFile,
                boost::shared_ptr<HashGenerator> hashGenerator,
                boost::shared_ptr<vector<sauchar_t> > T,
                boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
                const Example & example) {
    boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash
        = hashGenerator->generateHash(example.getSentence());

    int offset = 0;
    for (vector<INDEX_CHARACTER_TYPE>::iterator it = hash->begin();
         it != hash->end(); ++it) {
        // append to hashedIndexFile and to the in-memory text
        INDEX_CHARACTER_TYPE character = *it;
        Utils::writeIndexCharacter(hashedIndexFile, character);
        Utils::appendCharToSaucharVector(T, character);

        // append to markersFile and to the in-memory markers
        SUFFIX_MARKER_TYPE marker = offset;
        marker += example.getId() * SUFFIX_MARKER_DIVISOR;
        Utils::writeMarker(markersFile, marker);
        markers->push_back(marker);

        offset++;
    }

    // append sentence boundary marker; ULONG_MAX is the boundary value in
    // both the hashed text and the markers
    INDEX_CHARACTER_TYPE sentenceBoundaryHI = ULONG_MAX;
    Utils::writeIndexCharacter(hashedIndexFile, sentenceBoundaryHI);
    Utils::appendCharToSaucharVector(T, sentenceBoundaryHI);

    SUFFIX_MARKER_TYPE sentenceBoundaryMA = ULONG_MAX;
    Utils::writeMarker(markersFile, sentenceBoundaryMA);
    markers->push_back(sentenceBoundaryMA);
}