// File listing metadata: 99 lines, 3.8 KiB, C++.
#ifndef CONCORDIA_INDEX_HDR
|
|
#define CONCORDIA_INDEX_HDR
|
|
|
|
#include <boost/shared_ptr.hpp>
|
|
#include <fstream>
|
|
#include <iostream>
|
|
#include <sstream>
|
|
#include <vector>
|
|
|
|
#include "concordia/common/config.hpp"
|
|
#include "concordia/example.hpp"
|
|
#include "concordia/hash_generator.hpp"
|
|
#include "concordia/concordia_exception.hpp"
|
|
#include "concordia/tokenized_sentence.hpp"
|
|
#include <divsufsort.h>
|
|
|
|
/*!
|
|
Class for creating and maintaining the index. This class
|
|
does not hold the index data structures but only operates on
|
|
them when they are passed to ConcordiaIndex methods by
|
|
smart pointers. This class only remembers paths to two
|
|
files: hashed index and markers array, which are backups
|
|
of the respective data structures on HDD.
|
|
|
|
*/
|
|
|
|
class ConcordiaIndex {
|
|
public:
|
|
/*! Constructor.
|
|
\param hashedIndexFilePath path to the hashed index file
|
|
\param markersFilePath path to the markers array
|
|
\throws ConcordiaException
|
|
*/
|
|
explicit ConcordiaIndex(const std::string & hashedIndexFilePath,
|
|
const std::string & markersFilePath)
|
|
throw(ConcordiaException);
|
|
|
|
/*! Destructor.
|
|
*/
|
|
virtual ~ConcordiaIndex();
|
|
|
|
/*! Adds an Example to the index. Example is first hashed using
|
|
the hash generator passed to this method. Then, hashed index
|
|
and markers array (also passed to this method) are appended
|
|
with the hashed example. At the same time, HDD versions of these
|
|
two data structures are also appended with the same example.
|
|
\param hashGenerator hash generator to be used to prepare the hash
|
|
of the example
|
|
\param T RAM-based hash index to be appended to
|
|
\param markers RAM-based markers array to be appended to
|
|
\param example example to be added to index
|
|
\throws ConcordiaException
|
|
*/
|
|
boost::shared_ptr<TokenizedSentence> addExample(
|
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
const Example & example);
|
|
|
|
/*! Adds multiple examples to the index. Examples are first hashed using
|
|
the hash generator passed to this method. Then, hashed index
|
|
and markers array (also passed to this method) are appended
|
|
with the hashed examples. At the same time, HDD versions of these
|
|
two data structures are also appended with the same examples.
|
|
\param hashGenerator hash generator to be used to prepare the hash
|
|
of the example
|
|
\param T RAM-based hash index to be appended to
|
|
\param markers RAM-based markers array to be appended to
|
|
\param examples vector of examples to be added to index
|
|
\throws ConcordiaException
|
|
*/
|
|
std::vector<TokenizedSentence> addAllExamples(
|
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
const std::vector<Example> & examples);
|
|
|
|
/*! Generates suffix array based on the passed hashed index.
|
|
\returns the generated suffix array
|
|
\throws ConcordiaException
|
|
*/
|
|
boost::shared_ptr<std::vector<saidx_t> > generateSuffixArray(
|
|
boost::shared_ptr<std::vector<sauchar_t> > T);
|
|
|
|
private:
|
|
boost::shared_ptr<TokenizedSentence> _addSingleExample(std::ofstream & hashedIndexFile,
|
|
std::ofstream & markersFile,
|
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
const Example & example);
|
|
|
|
std::string _hashedIndexFilePath;
|
|
|
|
std::string _markersFilePath;
|
|
};
|
|
|
|
#endif
|