210 lines
7.6 KiB
C++
210 lines
7.6 KiB
C++
#ifndef CONCORDIA_HDR
|
|
#define CONCORDIA_HDR
|
|
|
|
#include <string>
|
|
#include <vector>
|
|
#include <boost/shared_ptr.hpp>
|
|
#include <boost/filesystem.hpp>
|
|
|
|
#include "concordia/common/config.hpp"
|
|
#include "concordia/example.hpp"
|
|
#include "concordia/matched_pattern_fragment.hpp"
|
|
#include "concordia/concordia_config.hpp"
|
|
#include "concordia/concordia_index.hpp"
|
|
#include "concordia/index_searcher.hpp"
|
|
#include "concordia/concordia_search_result.hpp"
|
|
#include "concordia/tokenized_sentence.hpp"
|
|
#include "concordia/anubis_search_result.hpp"
|
|
#include <divsufsort.h>
|
|
|
|
|
|
/*!
|
|
The Concordia class is the main access point to the library.
|
|
This class holds references to three out of four main data
|
|
structures used by Concordia: hashed index, markers array
|
|
and suffix array. Word map is maintained by the class
|
|
HashGenerator. Concordia has references to:
|
|
- the hash generator (HashGenerator)
|
|
- concordia index (ConcordiaIndex)
|
|
- concordia searcher (ConcordiaSearcher)
|
|
- configuration (ConcordiaConfig)
|
|
|
|
Whenever it is necessary, the data structures and tools
|
|
held by Concordia are passed by smart pointers to methods which
|
|
carry out specific functionalities.
|
|
|
|
*/
|
|
|
|
class Concordia {
|
|
public:
|
|
/*! Parameterless constructor
|
|
*/
|
|
Concordia();
|
|
|
|
/*! Constructor.
|
|
\param indexPath path to the index directory
|
|
\param configFilePath path to the Concordia configuration file
|
|
\throws ConcordiaException
|
|
*/
|
|
explicit Concordia(const std::string & indexPath,
|
|
const std::string & configFilePath)
|
|
throw(ConcordiaException);
|
|
/*! Destructor.
|
|
*/
|
|
virtual ~Concordia();
|
|
|
|
/*! Getter for version.
|
|
\returns version of the Concordia library.
|
|
*/
|
|
std::string & getVersion();
|
|
|
|
/*! Tokenizes the given sentence.
|
|
\param sentence sentence to be tokenized
|
|
\param byWhitespace whether to tokenize the sentence by whitespace
|
|
\param generateCodes whether to generate codes for tokens using WordMap
|
|
\returns tokenized sentence object,
|
|
containing information about original word positions
|
|
\throws ConcordiaException
|
|
*/
|
|
TokenizedSentence tokenize(const std::string & sentence,
|
|
bool byWhitespace = false,
|
|
bool generateCodes = true)
|
|
throw(ConcordiaException);
|
|
|
|
/*! Tokenizes all the given sentences.
|
|
\param sentences vector of sentences to be tokenized
|
|
\param byWhitespace whether to tokenize the sentence by whitespace
|
|
\param generateCodes whether to generate codes for tokens using WordMap
|
|
\returns vector of tokenized sentence objects
|
|
\throws ConcordiaException
|
|
*/
|
|
std::vector<TokenizedSentence> tokenizeAll(
|
|
const std::vector<std::string> & sentences,
|
|
bool byWhitespace = false,
|
|
bool generateCodes = true)
|
|
throw(ConcordiaException);
|
|
|
|
/*! Adds an Example to the index.
|
|
\param example example to be added
|
|
\returns tokenized sentence object,
|
|
containing information about original word positions
|
|
\throws ConcordiaException
|
|
*/
|
|
TokenizedSentence addExample(const Example & example)
|
|
throw(ConcordiaException);
|
|
|
|
/*! Adds a tokenized example to the index.
|
|
\param tokenizedSentence tokenized sentence to be added
|
|
\param id id of the sentence to be added
|
|
\throws ConcordiaException
|
|
*/
|
|
void addTokenizedExample(
|
|
const TokenizedSentence & tokenizedSentence,
|
|
const SUFFIX_MARKER_TYPE id)
|
|
throw(ConcordiaException);
|
|
|
|
/*! Adds multiple tokenized examples to the index.
|
|
\param examples vector of examples to be added
|
|
\param ids vector of ids of the sentences to be added
|
|
\throws ConcordiaException
|
|
*/
|
|
void addAllTokenizedExamples(
|
|
const std::vector<TokenizedSentence> & tokenizedSentences,
|
|
const std::vector<SUFFIX_MARKER_TYPE> & ids)
|
|
throw(ConcordiaException);
|
|
|
|
/*! Adds multiple examples to the index.
|
|
\param examples vector of examples to be added
|
|
\returns vector of tokenized sentence objects,
|
|
containing information about original word positions
|
|
\throws ConcordiaException
|
|
*/
|
|
std::vector<TokenizedSentence> addAllExamples(
|
|
const std::vector<Example> & examples)
|
|
throw(ConcordiaException);
|
|
|
|
/*! Performs a simple substring lookup on the index.
|
|
For more info see \ref tutorial1_2.
|
|
\param pattern pattern to be searched in the index
|
|
\param byWhitespace whether to tokenize the pattern by white space
|
|
\returns matched pattern fragment containing vector of occurences
|
|
\throws ConcordiaException
|
|
*/
|
|
MatchedPatternFragment simpleSearch(const std::string & pattern,
|
|
bool byWhitespace = false)
|
|
throw(ConcordiaException);
|
|
|
|
SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern)
|
|
throw(ConcordiaException);
|
|
|
|
/*! \deprecated
|
|
Finds the examples from the index, whose resemblance to the
|
|
pattern is maximal. This method may perform very slow,
|
|
try using concordiaSearch instead.
|
|
\param pattern pattern to be searched in the index
|
|
\returns vector of anubis results
|
|
\throws ConcordiaException
|
|
*/
|
|
std::vector<AnubisSearchResult> anubisSearch(const std::string & pattern)
|
|
throw(ConcordiaException);
|
|
|
|
/*! Performs concordia lookup on the index. This is a unique library
|
|
functionality, designed to facilitate Computer-Aided Translation.
|
|
For more info see \ref tutorial1_3.
|
|
\param pattern pattern to be searched in the index
|
|
\returns concordia result
|
|
\throws ConcordiaException
|
|
*/
|
|
boost::shared_ptr<ConcordiaSearchResult> concordiaSearch(
|
|
const std::string & pattern,
|
|
bool byWhitespace = false)
|
|
throw(ConcordiaException);
|
|
|
|
/*! Loads HDD stored index files to RAM and generates
|
|
suffix array based on RAM stored data structures.
|
|
For more info see \ref tutorial2.
|
|
\throws ConcordiaException
|
|
*/
|
|
void loadRAMIndexFromDisk() throw(ConcordiaException);
|
|
|
|
/*! Generates suffix array based on RAM stored data structures.
|
|
For more info see \ref tutorial2.
|
|
\throws ConcordiaException
|
|
*/
|
|
void refreshSAfromRAM() throw(ConcordiaException);
|
|
|
|
/*! Clears all the examples from the index
|
|
\throws ConcordiaException
|
|
*/
|
|
void clearIndex() throw(ConcordiaException);
|
|
|
|
private:
|
|
std::string _getWordMapFilePath();
|
|
|
|
std::string _getHashedIndexFilePath();
|
|
|
|
std::string _getMarkersFilePath();
|
|
|
|
void _initializeIndex() throw(ConcordiaException);
|
|
|
|
static std::string _libraryVersion;
|
|
|
|
std::string _indexPath;
|
|
|
|
boost::shared_ptr<ConcordiaConfig> _config;
|
|
|
|
boost::shared_ptr<ConcordiaIndex> _index;
|
|
|
|
boost::shared_ptr<IndexSearcher> _searcher;
|
|
|
|
boost::shared_ptr<HashGenerator> _hashGenerator;
|
|
|
|
boost::shared_ptr<std::vector<sauchar_t> > _T;
|
|
|
|
boost::shared_ptr<std::vector<saidx_t> > _SA;
|
|
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > _markers;
|
|
};
|
|
|
|
#endif
|