#ifndef CONCORDIA_INDEX_HDR
#define CONCORDIA_INDEX_HDR

// NOTE(review): in this copy of the file the angle-bracketed text appears to
// have been stripped: the system/third-party include targets below are empty
// and the template argument lists (e.g. of boost::shared_ptr and std::vector)
// are missing throughout the class. Restore them from version control before
// building; they are intentionally not guessed at here.
#include
#include
#include
#include
#include
#include

#include "concordia/common/config.hpp"
#include "concordia/example.hpp"
#include "concordia/hash_generator.hpp"
#include "concordia/concordia_exception.hpp"
#include "concordia/tokenized_sentence.hpp"

#include

/*!
  Class for creating and maintaining the index. This class
  does not hold the index data structures itself; it only operates
  on them when they are passed to ConcordiaIndex methods through
  smart pointers. The only state the class keeps is the paths
  to two files: the hashed index and the markers array, which are
  backups of the respective data structures on HDD.
*/
class ConcordiaIndex {
public:
    /*! Constructor.
      \param hashedIndexFilePath path to the hashed index file
      \param markersFilePath path to the markers array file
      \throws ConcordiaException
    */
    // NOTE(review): throw(ConcordiaException) is a dynamic exception
    // specification, deprecated in C++11 and removed in C++17. Dropping it
    // requires changing the out-of-line definition at the same time, so it
    // is left untouched here.
    explicit ConcordiaIndex(const std::string & hashedIndexFilePath,
                            const std::string & markersFilePath)
                                         throw(ConcordiaException);

    /*! Destructor.
    */
    virtual ~ConcordiaIndex();

    /*! Adds an Example to the index. The example is first hashed using
        the hash generator passed to this method. Then, the hashed index
        and the markers array (also passed to this method) are appended
        with the hashed example. At the same time, the HDD versions of
        these two data structures are appended with the same example.
        The method returns a tokenized version of the example.
      \param hashGenerator hash generator to be used to prepare the hash
                           of the example
      \param T RAM-based hash index to be appended to
      \param markers RAM-based markers array to be appended to
      \param example example to be added to index
      \returns tokenized example
      \throws ConcordiaException
    */
    boost::shared_ptr addExample(
                boost::shared_ptr hashGenerator,
                boost::shared_ptr > T,
                boost::shared_ptr > markers,
                const Example & example);

    /*! Adds an already tokenized example to the index. The hashed index
        and the markers array are appended with the example. At the same
        time, the HDD versions of these two data structures are appended
        with the same example. Unlike addExample, this method returns
        nothing.
      \param hashGenerator hash generator to be used to prepare the hash
                           of the example
      \param T RAM-based hash index to be appended to
      \param markers RAM-based markers array to be appended to
      \param tokenizedSentence tokenized sentence to be added
      \param id id of the sentence to be added
      \throws ConcordiaException
    */
    void addTokenizedExample(
                boost::shared_ptr hashGenerator,
                boost::shared_ptr > T,
                boost::shared_ptr > markers,
                boost::shared_ptr tokenizedSentence,
                SUFFIX_MARKER_TYPE id);

    /*! Adds multiple examples to the index. The examples are first hashed
        using the hash generator passed to this method. Then, the hashed
        index and the markers array (also passed to this method) are
        appended with the hashed examples. At the same time, the HDD
        versions of these two data structures are appended with the same
        examples. The method returns a vector of tokenized examples.
      \param hashGenerator hash generator to be used to prepare the hash
                           of the examples
      \param T RAM-based hash index to be appended to
      \param markers RAM-based markers array to be appended to
      \param examples vector of examples to be added to index
      \returns vector of tokenized examples
      \throws ConcordiaException
    */
    std::vector addAllExamples(
                boost::shared_ptr hashGenerator,
                boost::shared_ptr > T,
                boost::shared_ptr > markers,
                const std::vector & examples);

    /*! Generates a suffix array based on the passed hashed index.
      \param T hashed index from which the suffix array is generated
      \returns the generated suffix array
      \throws ConcordiaException
    */
    boost::shared_ptr > generateSuffixArray(
                boost::shared_ptr > T);

private:
    // Helper that takes explicit output streams for the hashed index file
    // and the markers file in addition to the arguments of
    // addTokenizedExample. Presumably the shared implementation used by the
    // public add* methods -- confirm against the .cpp file.
    void _addSingleTokenizedExample(
                std::ofstream & hashedIndexFile,
                std::ofstream & markersFile,
                boost::shared_ptr hashGenerator,
                boost::shared_ptr > T,
                boost::shared_ptr > markers,
                boost::shared_ptr tokenizedSentence,
                SUFFIX_MARKER_TYPE id);

    // Helper that takes explicit output streams for the hashed index file
    // and the markers file in addition to the arguments of addExample, and
    // has the same return type as addExample. Presumably the per-example
    // step behind addExample/addAllExamples -- confirm against the .cpp file.
    boost::shared_ptr _addSingleExample(
                std::ofstream & hashedIndexFile,
                std::ofstream & markersFile,
                boost::shared_ptr hashGenerator,
                boost::shared_ptr > T,
                boost::shared_ptr > markers,
                const Example & example);

    // Path to the hashed index file (HDD backup of the hashed index).
    std::string _hashedIndexFilePath;

    // Path to the markers array file (HDD backup of the markers array).
    std::string _markersFilePath;
};

#endif