134 lines
5.5 KiB
C++
134 lines
5.5 KiB
C++
#ifndef CONCORDIA_INDEX_HDR
|
|
#define CONCORDIA_INDEX_HDR
|
|
|
|
#include <boost/shared_ptr.hpp>
|
|
#include <fstream>
|
|
#include <iostream>
|
|
#include <sstream>
|
|
#include <vector>
|
|
|
|
#include "concordia/common/config.hpp"
|
|
#include "concordia/example.hpp"
|
|
#include "concordia/hash_generator.hpp"
|
|
#include "concordia/concordia_exception.hpp"
|
|
#include "concordia/tokenized_sentence.hpp"
|
|
#include <divsufsort.h>
|
|
|
|
/*!
|
|
Class for creating and maintaining the index. This class
|
|
does not hold the index data structures but only operates on
|
|
them when they are passed to ConcordiaIndex methods by
|
|
smart pointers. This class only remembers paths to two
|
|
files: hashed index and markers array, which are backups
|
|
of the respective data structures on HDD.
|
|
|
|
*/
|
|
|
|
class ConcordiaIndex {
|
|
public:
|
|
/*! Constructor.
|
|
\param hashedIndexFilePath path to the hashed index file
|
|
\param markersFilePath path to the markers array
|
|
\throws ConcordiaException
|
|
*/
|
|
explicit ConcordiaIndex(const std::string & hashedIndexFilePath,
|
|
const std::string & markersFilePath)
|
|
throw(ConcordiaException);
|
|
|
|
/*! Destructor.
|
|
*/
|
|
virtual ~ConcordiaIndex();
|
|
|
|
/*! Adds an Example to the index. Example is first hashed using
|
|
the hash generator passed to this method. Then, hashed index
|
|
and markers array (also passed to this method) are appended
|
|
with the hashed example. At the same time, HDD versions of these
|
|
two data structures are also appended with the same example.
|
|
The method returns a tokenized version of the example.
|
|
\param hashGenerator hash generator to be used to prepare the hash
|
|
of the example
|
|
\param T RAM-based hash index to be appended to
|
|
\param markers RAM-based markers array to be appended to
|
|
\param example example to be added to index
|
|
\returns tokenized example
|
|
\throws ConcordiaException
|
|
*/
|
|
boost::shared_ptr<TokenizedSentence> addExample(
|
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
const Example & example);
|
|
|
|
/*! Adds a tokenized example to the index. Hashed index
|
|
and markers array are appended with the example.
|
|
At the same time, HDD versions of these
|
|
two data structures are also appended with the same example.
|
|
The method returns a tokenized version of the example.
|
|
\param hashGenerator hash generator to be used to prepare the hash
|
|
of the example
|
|
\param T RAM-based hash index to be appended to
|
|
\param markers RAM-based markers array to be appended to
|
|
\param example example to be added to index
|
|
\param tokenizedSentence tokenized sentence to be added
|
|
\param id of the sentence to be added
|
|
\throws ConcordiaException
|
|
*/
|
|
void addTokenizedExample(
|
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
|
|
SUFFIX_MARKER_TYPE id);
|
|
|
|
/*! Adds multiple examples to the index. Examples are first hashed using
|
|
the hash generator passed to this method. Then, hashed index
|
|
and markers array (also passed to this method) are appended
|
|
with the hashed examples. At the same time, HDD versions of these
|
|
two data structures are also appended with the same examples.
|
|
The method returns a vector of tokenized examples.
|
|
\param hashGenerator hash generator to be used to prepare the hash
|
|
of the example
|
|
\param T RAM-based hash index to be appended to
|
|
\param markers RAM-based markers array to be appended to
|
|
\param examples vector of examples to be added to index
|
|
\returns vector of tokenized examples
|
|
\throws ConcordiaException
|
|
*/
|
|
std::vector<TokenizedSentence> addAllExamples(
|
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
const std::vector<Example> & examples);
|
|
|
|
/*! Generates suffix array based on the passed hashed index.
|
|
\returns the generated suffix array
|
|
\throws ConcordiaException
|
|
*/
|
|
boost::shared_ptr<std::vector<saidx_t> > generateSuffixArray(
|
|
boost::shared_ptr<std::vector<sauchar_t> > T);
|
|
|
|
private:
|
|
void _addSingleTokenizedExample(
|
|
std::ofstream & hashedIndexFile,
|
|
std::ofstream & markersFile,
|
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
|
|
SUFFIX_MARKER_TYPE id);
|
|
|
|
boost::shared_ptr<TokenizedSentence> _addSingleExample(
|
|
std::ofstream & hashedIndexFile,
|
|
std::ofstream & markersFile,
|
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
const Example & example);
|
|
|
|
std::string _hashedIndexFilePath;
|
|
|
|
std::string _markersFilePath;
|
|
};
|
|
|
|
#endif
|