concordia-library/concordia/concordia.hpp

164 lines
5.7 KiB
C++

#ifndef CONCORDIA_HDR
#define CONCORDIA_HDR
#include <string>
#include <vector>
#include <boost/shared_ptr.hpp>
#include <boost/filesystem.hpp>
#include "concordia/common/config.hpp"
#include "concordia/example.hpp"
#include "concordia/matched_pattern_fragment.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_index.hpp"
#include "concordia/index_searcher.hpp"
#include "concordia/concordia_search_result.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "concordia/anubis_search_result.hpp"
#include <divsufsort.h>
/*!
The Concordia class is the main access point to the library.
It holds references to three of the four main data structures
used by Concordia: the hashed index, the markers array and
the suffix array. The word map is maintained by the
HashGenerator class. Concordia holds references to:
- the hash generator (HashGenerator)
- the concordia index (ConcordiaIndex)
- the index searcher (IndexSearcher)
- the configuration (ConcordiaConfig)
Whenever necessary, the data structures and tools held by
Concordia are passed by smart pointers to the methods which
carry out specific functionalities.
*/
class Concordia {
public:
// NOTE(review): every method below carries a dynamic exception
// specification, throw(ConcordiaException). Dynamic exception
// specifications were deprecated in C++11 and removed in C++17, so
// this header needs a pre-C++17 language mode on conforming compilers.
// The specifications on the out-of-line definitions have to match the
// ones declared here, so both must be changed together.

/*! Constructor.
\param configFilePath path to the Concordia configuration file
\throws ConcordiaException
*/
explicit Concordia(const std::string & configFilePath)
throw(ConcordiaException);
/*! Destructor. Declared virtual, so objects of derived classes
can be destroyed safely through a pointer to Concordia.
*/
virtual ~Concordia();
/*! Getter for version.
\returns version of the Concordia library. Note that this is
a non-const reference, so the caller is able to modify
the referenced string. Presumably it refers to the static
member _libraryVersion - TODO confirm in the implementation.
*/
std::string & getVersion();
/*! Tokenizes the given sentence.
\param sentence sentence to be tokenized
\returns shared pointer to the tokenized sentence object,
containing information about original word positions
\throws ConcordiaException
*/
boost::shared_ptr<TokenizedSentence> tokenize(const std::string & sentence)
throw(ConcordiaException);
/*! Adds an Example to the index.
\param example example to be added
\returns shared pointer to the tokenized sentence object,
containing information about original word positions
\throws ConcordiaException
*/
boost::shared_ptr<TokenizedSentence> addExample(const Example & example)
throw(ConcordiaException);
/*! Adds a tokenized example to the index.
\param tokenizedSentence tokenized sentence to be added
\param id id of the sentence to be added
\throws ConcordiaException
*/
void addTokenizedExample(
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id)
throw(ConcordiaException);
/*! Adds multiple examples to the index.
\param examples vector of examples to be added
\returns vector of tokenized sentence objects,
containing information about original word positions.
Unlike addExample, the objects are returned by value
and not through shared pointers.
\throws ConcordiaException
*/
std::vector<TokenizedSentence> addAllExamples(
const std::vector<Example> & examples)
throw(ConcordiaException);
/*! Performs a simple substring lookup on the index.
For more info see \ref tutorial1_2.
\param pattern pattern to be searched in the index
\returns vector of matched results
\throws ConcordiaException
*/
std::vector<MatchedPatternFragment> simpleSearch(
const std::string & pattern)
throw(ConcordiaException);
/*! Finds the examples from the index whose resemblance to the
pattern is maximal.
\deprecated This method may be very slow,
try using concordiaSearch instead.
\param pattern pattern to be searched in the index
\returns vector of anubis results
\throws ConcordiaException
*/
std::vector<AnubisSearchResult> anubisSearch(const std::string & pattern)
throw(ConcordiaException);
/*! Performs concordia lookup on the index. This is a unique library
functionality, designed to facilitate Computer-Aided Translation.
For more info see \ref tutorial1_3.
\param pattern pattern to be searched in the index
\returns shared pointer to the concordia result
\throws ConcordiaException
*/
boost::shared_ptr<ConcordiaSearchResult> concordiaSearch(
const std::string & pattern)
throw(ConcordiaException);
/*! Loads the index files stored on disk into RAM and generates
the suffix array based on the data structures stored in RAM.
For more info see \ref tutorial2.
\throws ConcordiaException
*/
void loadRAMIndexFromDisk() throw(ConcordiaException);
/*! Generates the suffix array based on the data structures
stored in RAM.
For more info see \ref tutorial2.
\throws ConcordiaException
*/
void refreshSAfromRAM() throw(ConcordiaException);
/*! Clears all the examples from the index.
\throws ConcordiaException
*/
void clearIndex() throw(ConcordiaException);
private:
/*! Private helper which initializes the index. Its exact steps
are defined in the implementation file, not in this header.
\throws ConcordiaException
*/
void _initializeIndex() throw(ConcordiaException);
/*! Library version string, shared by all Concordia objects. */
static std::string _libraryVersion;
/*! Configuration (ConcordiaConfig). */
boost::shared_ptr<ConcordiaConfig> _config;
/*! Concordia index (ConcordiaIndex). */
boost::shared_ptr<ConcordiaIndex> _index;
/*! Index searcher (IndexSearcher). */
boost::shared_ptr<IndexSearcher> _searcher;
/*! Hash generator (HashGenerator). */
// NOTE(review): HashGenerator is not included directly by this header;
// presumably it arrives through one of the concordia includes - confirm.
boost::shared_ptr<HashGenerator> _hashGenerator;
/*! Vector of sauchar_t elements; presumably the hashed index
(the input text for suffix sorting) - TODO confirm.
*/
boost::shared_ptr<std::vector<sauchar_t> > _T;
/*! Vector of saidx_t elements; presumably the suffix array
built over _T - TODO confirm.
*/
boost::shared_ptr<std::vector<saidx_t> > _SA;
/*! Vector of SUFFIX_MARKER_TYPE elements; presumably the
markers array - TODO confirm.
*/
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > _markers;
};
#endif