concordia-library/concordia/concordia_index.hpp

134 lines
5.5 KiB
C++

#ifndef CONCORDIA_INDEX_HDR
#define CONCORDIA_INDEX_HDR
#include <boost/shared_ptr.hpp>
#include <fstream>
#include <iostream>
#include <sstream>
#include <vector>
#include "concordia/common/config.hpp"
#include "concordia/example.hpp"
#include "concordia/hash_generator.hpp"
#include "concordia/concordia_exception.hpp"
#include "concordia/tokenized_sentence.hpp"
#include <divsufsort.h>
/*!
Class for creating and maintaining the index. This class
does not hold the index data structures but only operates on
them when they are passed to ConcordiaIndex methods by
smart pointers. This class only remembers paths to two
files: hashed index and markers array, which are backups
of the respective data structures on HDD.
*/
// NOTE(review): this header uses std::string but does not include
// <string> directly; it currently relies on another header pulling it in.
class ConcordiaIndex {
public:
/*! Constructor.
\param hashedIndexFilePath path to the hashed index file
\param markersFilePath path to the markers array file
\throws ConcordiaException
*/
// NOTE(review): the dynamic exception specification on this declaration
// was deprecated in C++11 and removed in C++17, so it is rejected by
// C++17 compilers. It cannot be dropped from the header alone: any
// exception specification on the out-of-line definition has to be
// changed together with it.
explicit ConcordiaIndex(const std::string & hashedIndexFilePath,
const std::string & markersFilePath)
throw(ConcordiaException);
/*! Destructor. Declared virtual, so objects of derived classes
can be destroyed through a ConcordiaIndex pointer.
*/
virtual ~ConcordiaIndex();
/*! Adds an Example to the index. Example is first hashed using
the hash generator passed to this method. Then, hashed index
and markers array (also passed to this method) are appended
with the hashed example. At the same time, HDD versions of these
two data structures are also appended with the same example.
The method returns a tokenized version of the example.
\param hashGenerator hash generator to be used to prepare the hash
of the example
\param T RAM-based hash index to be appended to
\param markers RAM-based markers array to be appended to
\param example example to be added to index
\returns tokenized example
\throws ConcordiaException
*/
boost::shared_ptr<TokenizedSentence> addExample(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example);
/*! Adds an already tokenized example to the index. Hashed index
and markers array are appended with the example.
At the same time, HDD versions of these
two data structures are also appended with the same example.
Unlike addExample, this method returns nothing.
\param hashGenerator hash generator to be used to prepare the hash
of the example
\param T RAM-based hash index to be appended to
\param markers RAM-based markers array to be appended to
\param tokenizedSentence tokenized sentence to be added
\param id id of the sentence to be added
\throws ConcordiaException
*/
void addTokenizedExample(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id);
/*! Adds multiple examples to the index. Examples are first hashed using
the hash generator passed to this method. Then, hashed index
and markers array (also passed to this method) are appended
with the hashed examples. At the same time, HDD versions of these
two data structures are also appended with the same examples.
The method returns a vector of tokenized examples (by value,
not wrapped in smart pointers as in addExample).
\param hashGenerator hash generator to be used to prepare the hashes
of the examples
\param T RAM-based hash index to be appended to
\param markers RAM-based markers array to be appended to
\param examples vector of examples to be added to index
\returns vector of tokenized examples
\throws ConcordiaException
*/
std::vector<TokenizedSentence> addAllExamples(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const std::vector<Example> & examples);
/*! Generates suffix array based on the passed hashed index.
\param T hashed index from which the suffix array is generated
\returns the generated suffix array
\throws ConcordiaException
*/
boost::shared_ptr<std::vector<saidx_t> > generateSuffixArray(
boost::shared_ptr<std::vector<sauchar_t> > T);
private:
/*! Private helper with the same arguments as addTokenizedExample,
preceded by two output file streams passed by reference.
\param hashedIndexFile output stream, by its name the one for the
hashed index file
\param markersFile output stream, by its name the one for the
markers file
\param hashGenerator see addTokenizedExample
\param T see addTokenizedExample
\param markers see addTokenizedExample
\param tokenizedSentence see addTokenizedExample
\param id see addTokenizedExample
*/
// NOTE(review): defined outside this header; presumably performs the
// append of a single tokenized example to both the RAM structures and
// the given streams - confirm against the definition.
void _addSingleTokenizedExample(
std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id);
/*! Private helper with the same arguments and return type as
addExample, preceded by two output file streams passed by reference.
\param hashedIndexFile output stream, by its name the one for the
hashed index file
\param markersFile output stream, by its name the one for the
markers file
\param hashGenerator see addExample
\param T see addExample
\param markers see addExample
\param example see addExample
\returns pointer to a TokenizedSentence, as in addExample
*/
// NOTE(review): defined outside this header; presumably the per-example
// worker behind addExample and addAllExamples - confirm against the
// definition.
boost::shared_ptr<TokenizedSentence> _addSingleExample(
std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example);
/*! Path to the hashed index file (the HDD backup of the hashed index). */
std::string _hashedIndexFilePath;
/*! Path to the markers array file (the HDD backup of the markers array). */
std::string _markersFilePath;
};
#endif