concordia-library/concordia/concordia_index.hpp

146 lines
6.2 KiB
C++
Raw Normal View History

2013-11-14 20:36:34 +01:00
#ifndef CONCORDIA_INDEX_HDR
#define CONCORDIA_INDEX_HDR
#include <boost/shared_ptr.hpp>
2013-11-20 17:43:29 +01:00
#include <fstream>
#include <iostream>
#include <sstream>
#include <vector>
2013-11-20 17:43:29 +01:00
#include "concordia/common/config.hpp"
#include "concordia/example.hpp"
2013-11-14 20:36:34 +01:00
#include "concordia/hash_generator.hpp"
#include "concordia/concordia_exception.hpp"
2015-06-26 22:50:53 +02:00
#include "concordia/tokenized_sentence.hpp"
#include <divsufsort.h>
2013-11-14 20:36:34 +01:00
/*!
2015-05-01 14:52:53 +02:00
Class for creating and maintaining the index. This class
does not hold the index data structures but only operates on
them when they are passed to ConcordiaIndex methods by
smart pointers. This class only remembers paths to two
files: hashed index and markers array, which are backups
of the respective data structures on HDD.
2013-11-14 20:36:34 +01:00
*/
class ConcordiaIndex {
public:
2015-05-01 14:52:53 +02:00
/*! Constructor.
\param hashedIndexFilePath path to the hashed index file
\param markersFilePath path to the markers array
*/
explicit ConcordiaIndex(const std::string & hashedIndexFilePath,
2019-01-18 13:30:51 +01:00
const std::string & markersFilePath);
2013-11-14 20:36:34 +01:00
/*! Destructor.
*/
virtual ~ConcordiaIndex();
2015-05-01 14:52:53 +02:00
/*! Adds an Example to the index. Example is first hashed using
the hash generator passed to this method. Then, hashed index
and markers array (also passed to this method) are appended
with the hashed example. At the same time, HDD versions of these
two data structures are also appended with the same example.
2015-06-27 12:40:24 +02:00
The method returns a tokenized version of the example.
2015-05-01 14:52:53 +02:00
\param hashGenerator hash generator to be used to prepare the hash
of the example
\param T RAM-based hash index to be appended to
\param markers RAM-based markers array to be appended to
\param example example to be added to index
2015-06-27 12:40:24 +02:00
\returns tokenized example
2015-05-01 14:52:53 +02:00
*/
2015-08-19 20:49:26 +02:00
TokenizedSentence addExample(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example);
2013-11-14 20:36:34 +01:00
/*! Adds a tokenized example to the index. Hashed index
and markers array are appended with the example.
At the same time, HDD versions of these
two data structures are also appended with the same example.
\param hashGenerator hash generator to be used to prepare the hash
of the example
\param T RAM-based hash index to be appended to
\param markers RAM-based markers array to be appended to
\param example example to be added to index
\param tokenizedSentence tokenized sentence to be added
\param id of the sentence to be added
*/
void addTokenizedExample(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
2015-08-19 20:49:26 +02:00
const TokenizedSentence & tokenizedSentence,
const SUFFIX_MARKER_TYPE id);
/*! Adds multiple tokenized examples to the index. Hashed index
and markers array are appended with the examples.
At the same time, HDD versions of these
two data structures are also appended with the same examples.
\param hashGenerator hash generator to be used to prepare the hash
of the example
\param T RAM-based hash index to be appended to
\param markers RAM-based markers array to be appended to
\param example example to be added to index
\param tokenizedSentences vector of tokenized sentences to be added
\param ids vector of ids of the sentences to be added
*/
void addAllTokenizedExamples(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const std::vector<TokenizedSentence> & tokenizedSentences,
const std::vector<SUFFIX_MARKER_TYPE> & ids);
2015-05-01 14:52:53 +02:00
/*! Adds multiple examples to the index. Examples are first hashed using
the hash generator passed to this method. Then, hashed index
and markers array (also passed to this method) are appended
with the hashed examples. At the same time, HDD versions of these
two data structures are also appended with the same examples.
2015-06-27 12:40:24 +02:00
The method returns a vector of tokenized examples.
2015-05-01 14:52:53 +02:00
\param hashGenerator hash generator to be used to prepare the hash
of the example
\param T RAM-based hash index to be appended to
\param markers RAM-based markers array to be appended to
\param examples vector of examples to be added to index
2015-06-27 12:40:24 +02:00
\returns vector of tokenized examples
2015-05-01 14:52:53 +02:00
*/
2015-06-26 22:50:53 +02:00
std::vector<TokenizedSentence> addAllExamples(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const std::vector<Example> & examples);
2013-11-14 20:36:34 +01:00
2015-05-01 14:52:53 +02:00
/*! Generates suffix array based on the passed hashed index.
\returns the generated suffix array
*/
boost::shared_ptr<std::vector<saidx_t> > generateSuffixArray(
boost::shared_ptr<std::vector<sauchar_t> > T);
2013-11-14 20:36:34 +01:00
private:
void _addSingleTokenizedExample(
std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
2015-08-19 20:49:26 +02:00
const TokenizedSentence & tokenizedSentence,
const SUFFIX_MARKER_TYPE id);
2015-08-19 20:49:26 +02:00
TokenizedSentence _addSingleExample(
2015-06-27 12:40:24 +02:00
std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example);
2013-11-20 17:43:29 +01:00
std::string _hashedIndexFilePath;
std::string _markersFilePath;
2013-11-14 20:36:34 +01:00
};
#endif