#include "concordia/concordia_index.hpp"

#include "concordia/common/utils.hpp"
#include "concordia/common/config.hpp"

// NOTE(review): the angle-bracket include targets were unreadable in the
// reviewed copy of this file. The list below covers every name used in this
// translation unit; <climits> and <iostream> are kept defensively in case the
// project headers rely on them. Restore any further header the original
// listed.
#include <boost/foreach.hpp>
#include <climits>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

#include <divsufsort.h>

namespace {

// Opens `file` on `path` for binary appending.
// Throws ConcordiaException when the stream cannot be opened, so that the
// in-memory structures are never extended without a matching write to disk.
void openForAppend(std::ofstream & file, const std::string & path) {
    file.open(path.c_str(),
              std::ios::out | std::ios::app | std::ios::binary);
    if (!file.is_open()) {
        const std::string message =
            "Cannot open index file for appending: " + path;
        throw ConcordiaException(message.c_str());
    }
}

// Writes one sentence boundary: the maximum index character goes to the
// hashed index file and to T, the maximum marker value goes to the markers
// file and to `markers`.
void writeSentenceBoundary(
        std::ofstream & hashedIndexFile,
        std::ofstream & markersFile,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers) {
    INDEX_CHARACTER_TYPE boundaryCharacter = INDEX_CHARACTER_TYPE_MAX_VALUE;
    Utils::writeIndexCharacter(hashedIndexFile, boundaryCharacter);
    Utils::appendCharToSaucharVector(T, boundaryCharacter);

    SUFFIX_MARKER_TYPE boundaryMarker = SUFFIX_MARKER_TYPE_MAX_VALUE;
    Utils::writeMarker(markersFile, boundaryMarker);
    markers->push_back(boundaryMarker);
}

}  // namespace

// Remembers the paths of the two files that every add* method appends to.
ConcordiaIndex::ConcordiaIndex(const std::string & hashedIndexFilePath,
                               const std::string & markersFilePath):
                               _hashedIndexFilePath(hashedIndexFilePath),
                               _markersFilePath(markersFilePath) {
}

ConcordiaIndex::~ConcordiaIndex() {
}

// Builds the suffix array of T with divsufsort.
//
// T      - the text to index.
// Return - a newly allocated vector holding one suffix-array entry per
//          element of T; empty when T is empty.
// Throws - ConcordiaException when divsufsort reports a failure.
boost::shared_ptr<std::vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
        boost::shared_ptr<std::vector<sauchar_t> > T) {
    // divsufsort writes straight into the result vector. The vector owns the
    // memory, so nothing leaks when the exception below is thrown.
    boost::shared_ptr<std::vector<saidx_t> > result(
        new std::vector<saidx_t>(T->size()));

    // An empty vector may hand out a null data() pointer, which divsufsort
    // would presumably reject as an error (see the libdivsufsort API notes);
    // the suffix array of an empty text is simply empty.
    if (T->empty()) {
        return result;
    }

    if (divsufsort(T->data(), result->data(),
                   static_cast<saidx_t>(T->size())) != 0) {
        throw ConcordiaException("Error creating suffix array.");
    }
    return result;
}

// Hashes every example and appends it to both index files, to T and to
// `markers`, then serializes the hash generator's word map.
//
// Return - the tokenized (hashed) form of each example, in input order.
// Throws - ConcordiaException when either index file cannot be opened.
std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        const std::vector<Example> & examples) {
    std::ofstream hashedIndexFile;
    openForAppend(hashedIndexFile, _hashedIndexFilePath);
    std::ofstream markersFile;
    openForAppend(markersFile, _markersFilePath);

    std::vector<TokenizedSentence> hashedPatterns;
    hashedPatterns.reserve(examples.size());
    // Iterate by const reference: the examples are only read.
    BOOST_FOREACH(const Example & example, examples) {
        hashedPatterns.push_back(
            _addSingleExample(hashedIndexFile, markersFile,
                              hashGenerator, T, markers, example));
    }

    // Both files are closed before the word map is serialized.
    hashedIndexFile.close();
    markersFile.close();
    hashGenerator->serializeWordMap();

    return hashedPatterns;
}

// Hashes a single example and appends it to both index files, to T and to
// `markers`, then serializes the hash generator's word map.
//
// Return - the tokenized (hashed) form of the example.
// Throws - ConcordiaException when either index file cannot be opened.
TokenizedSentence ConcordiaIndex::addExample(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        const Example & example) {
    std::ofstream hashedIndexFile;
    openForAppend(hashedIndexFile, _hashedIndexFilePath);
    std::ofstream markersFile;
    openForAppend(markersFile, _markersFilePath);

    TokenizedSentence hashedPattern =
        _addSingleExample(hashedIndexFile, markersFile,
                          hashGenerator, T, markers, example);

    hashedIndexFile.close();
    markersFile.close();
    hashGenerator->serializeWordMap();

    return hashedPattern;
}

// Appends an already tokenized sentence, stored under `id`, to both index
// files, to T and to `markers`. The word map is not serialized here.
//
// Throws - ConcordiaException when either index file cannot be opened.
void ConcordiaIndex::addTokenizedExample(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        const TokenizedSentence & tokenizedSentence,
        const SUFFIX_MARKER_TYPE id) {
    std::ofstream hashedIndexFile;
    openForAppend(hashedIndexFile, _hashedIndexFilePath);
    std::ofstream markersFile;
    openForAppend(markersFile, _markersFilePath);

    _addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
                               T, markers, tokenizedSentence, id);

    hashedIndexFile.close();
    markersFile.close();
}

// Appends already tokenized sentences to both index files, to T and to
// `markers`; tokenizedSentences[i] is stored under ids[i]. Surplus ids are
// ignored. The word map is not serialized here.
//
// Throws - std::out_of_range when there are fewer ids than sentences. The
//          check runs before anything is written, so a mismatch can no
//          longer leave the index partially appended.
//        - ConcordiaException when either index file cannot be opened.
void ConcordiaIndex::addAllTokenizedExamples(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        const std::vector<TokenizedSentence> & tokenizedSentences,
        const std::vector<SUFFIX_MARKER_TYPE> & ids) {
    if (ids.size() < tokenizedSentences.size()) {
        throw std::out_of_range(
            "ConcordiaIndex::addAllTokenizedExamples: "
            "fewer ids than tokenized sentences");
    }

    std::ofstream hashedIndexFile;
    openForAppend(hashedIndexFile, _hashedIndexFilePath);
    std::ofstream markersFile;
    openForAppend(markersFile, _markersFilePath);

    for (std::size_t i = 0; i < tokenizedSentences.size(); ++i) {
        _addSingleTokenizedExample(hashedIndexFile, markersFile,
                                   hashGenerator, T, markers,
                                   tokenizedSentences[i], ids[i]);
    }

    hashedIndexFile.close();
    markersFile.close();
}

// Appends one tokenized sentence to the open index files and to the
// in-memory structures. The layout written to each of them is:
//   boundary, one entry per code of the sentence, boundary.
// For every code, the character goes to the hashed index file and to T, and
// a marker created from (id, offset of the code, number of codes) goes to
// the markers file and to `markers`.
// hashGenerator is not used here; it is part of the declared interface.
void ConcordiaIndex::_addSingleTokenizedExample(
        std::ofstream & hashedIndexFile,
        std::ofstream & markersFile,
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        const TokenizedSentence & tokenizedSentence,
        const SUFFIX_MARKER_TYPE id) {
    // prepend sentence boundary marker
    writeSentenceBoundary(hashedIndexFile, markersFile, T, markers);

    // A const reference is valid whether getCodes() returns a reference or a
    // temporary (whose lifetime is then extended), and avoids an extra copy.
    const std::vector<INDEX_CHARACTER_TYPE> & codes =
        tokenizedSentence.getCodes();

    int offset = 0;
    for (std::vector<INDEX_CHARACTER_TYPE>::const_iterator it = codes.begin();
         it != codes.end(); ++it) {
        INDEX_CHARACTER_TYPE character = *it;
        Utils::writeIndexCharacter(hashedIndexFile, character);
        Utils::appendCharToSaucharVector(T, character);

        // append to markersFile
        SUFFIX_MARKER_TYPE marker = Utils::createMarker(
            id, offset, codes.size());
        Utils::writeMarker(markersFile, marker);
        markers->push_back(marker);

        ++offset;
    }

    // append sentence boundary marker
    writeSentenceBoundary(hashedIndexFile, markersFile, T, markers);
}

// Hashes the example's sentence with hashGenerator and appends the result,
// stored under the example's id, via _addSingleTokenizedExample.
//
// Return - the tokenized (hashed) form of the example.
TokenizedSentence ConcordiaIndex::_addSingleExample(
        std::ofstream & hashedIndexFile,
        std::ofstream & markersFile,
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        const Example & example) {
    TokenizedSentence hashedPattern =
        hashGenerator->generateHash(example.getSentence());
    _addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
                               T, markers, hashedPattern, example.getId());

    return hashedPattern;
}