230 lines
8.3 KiB
C++
230 lines
8.3 KiB
C++
#ifndef CONCORDIA_HDR
|
|
#define CONCORDIA_HDR
|
|
|
|
#include <string>
|
|
#include <vector>
|
|
#include <boost/shared_ptr.hpp>
|
|
#include <boost/filesystem.hpp>
|
|
|
|
#include "concordia/common/config.hpp"
|
|
#include "concordia/example.hpp"
|
|
#include "concordia/matched_pattern_fragment.hpp"
|
|
#include "concordia/occurrences_list.hpp"
|
|
#include "concordia/concordia_config.hpp"
|
|
#include "concordia/concordia_index.hpp"
|
|
#include "concordia/index_searcher.hpp"
|
|
#include "concordia/concordia_search_result.hpp"
|
|
#include "concordia/tokenized_sentence.hpp"
|
|
#include "concordia/anubis_search_result.hpp"
|
|
#include <divsufsort.h>
|
|
|
|
|
|
/*!
  The Concordia class is the main access point to the library.
  This class holds references to three out of the four main data
  structures used by Concordia: the hashed index, the markers array
  and the suffix array. The fourth one, the word map, is maintained
  by the class HashGenerator. Concordia has references to:
  - the hash generator (HashGenerator)
  - the concordia index (ConcordiaIndex)
  - the index searcher (IndexSearcher)
  - the configuration (ConcordiaConfig)

  Whenever it is necessary, the data structures and tools
  held by Concordia are passed by smart pointers to the methods which
  carry out specific functionalities.

*/
class Concordia {
|
|
public:
|
|
/*! Parameterless constructor
|
|
*/
|
|
Concordia();
|
|
|
|
/*! Constructor.
|
|
\param indexPath path to the index directory
|
|
\param configFilePath path to the Concordia configuration file
|
|
\throws ConcordiaException
|
|
*/
|
|
explicit Concordia(const std::string & indexPath,
|
|
const std::string & configFilePath);
|
|
/*! Destructor.
|
|
*/
|
|
virtual ~Concordia();
|
|
|
|
/*! Getter for version.
|
|
\returns version of the Concordia library.
|
|
*/
|
|
std::string & getVersion();
|
|
|
|
/*! Tokenizes the given sentence.
|
|
\param sentence sentence to be tokenized
|
|
\param byWhitespace whether to tokenize the sentence by whitespace
|
|
\param generateCodes whether to generate codes for tokens using WordMap
|
|
\returns tokenized sentence object,
|
|
containing information about original word positions
|
|
\throws ConcordiaException
|
|
*/
|
|
TokenizedSentence tokenize(const std::string & sentence,
|
|
bool byWhitespace = false,
|
|
bool generateCodes = true);
|
|
|
|
/*! Tokenizes all the given sentences.
|
|
\param sentences vector of sentences to be tokenized
|
|
\param byWhitespace whether to tokenize the sentence by whitespace
|
|
\param generateCodes whether to generate codes for tokens using WordMap
|
|
\returns vector of tokenized sentence objects
|
|
\throws ConcordiaException
|
|
*/
|
|
std::vector<TokenizedSentence> tokenizeAll(
|
|
const std::vector<std::string> & sentences,
|
|
bool byWhitespace = false,
|
|
bool generateCodes = true);
|
|
|
|
/*! Adds an Example to the index.
|
|
\param example example to be added
|
|
\returns tokenized sentence object,
|
|
containing information about original word positions
|
|
\throws ConcordiaException
|
|
*/
|
|
TokenizedSentence addExample(const Example & example);
|
|
|
|
/*! Adds a tokenized example to the index.
|
|
\param tokenizedSentence tokenized sentence to be added
|
|
\param id id of the sentence to be added
|
|
\throws ConcordiaException
|
|
*/
|
|
void addTokenizedExample(
|
|
const TokenizedSentence & tokenizedSentence,
|
|
const SUFFIX_MARKER_TYPE id);
|
|
|
|
/*! Adds multiple tokenized examples to the index.
|
|
\param examples vector of examples to be added
|
|
\param ids vector of ids of the sentences to be added
|
|
\throws ConcordiaException
|
|
*/
|
|
void addAllTokenizedExamples(
|
|
const std::vector<TokenizedSentence> & tokenizedSentences,
|
|
const std::vector<SUFFIX_MARKER_TYPE> & ids);
|
|
|
|
/*! Adds multiple examples to the index.
|
|
\param examples vector of examples to be added
|
|
\returns vector of tokenized sentence objects,
|
|
containing information about original word positions
|
|
\throws ConcordiaException
|
|
*/
|
|
std::vector<TokenizedSentence> addAllExamples(
|
|
const std::vector<Example> & examples);
|
|
|
|
/*! Performs a simple substring lookup on the index.
|
|
For more info see \ref tutorial1_2.
|
|
\param pattern pattern to be searched in the index
|
|
\param byWhitespace whether to tokenize the pattern by white space
|
|
\returns matched pattern fragment containing vector of occurrences
|
|
\throws ConcordiaException
|
|
*/
|
|
MatchedPatternFragment simpleSearch(const std::string & pattern,
|
|
bool byWhitespace = false);
|
|
|
|
/*! Performs a substring lookup in RAM-based index, returning all occurrences.
|
|
The result contains no more than "limit" occurrences, starting at "offset".
|
|
\param hashGenerator hash generator to be used to convert
|
|
input sentence to a hash
|
|
\param pattern string pattern to be searched in the index.
|
|
\param limit maximum number of occurrences to return
|
|
\param offset starting occurrence
|
|
\param byWhitespace should the pattern by tokenized by white space
|
|
\returns list of occurrences of the pattern in the index
|
|
\throws ConcordiaException
|
|
*/
|
|
OccurrencesList fullSearch(
|
|
const std::string & pattern,
|
|
int limit,
|
|
int offset,
|
|
bool byWhitespace = false);
|
|
|
|
/*! Performs a search useful for lexicons in the following scenario:
|
|
Concordia gets fed by a lexicon (glossary) instead of a TM.
|
|
The lexicon search performs as simple search - it requires
|
|
the match to cover the whole pattern, but additionally
|
|
the lexicon search requires that the match is the whole example source.
|
|
\param pattern pattern to be searched in the index
|
|
\param byWhitespace whether to tokenize the pattern by white space
|
|
\returns matched pattern fragment containing vector of occurrences
|
|
\throws ConcordiaException
|
|
*/
|
|
MatchedPatternFragment lexiconSearch(const std::string & pattern,
|
|
bool byWhitespace = false);
|
|
|
|
SUFFIX_MARKER_TYPE countOccurrences(const std::string & pattern);
|
|
|
|
/*! \deprecated
|
|
Finds the examples from the index, whose resemblance to the
|
|
pattern is maximal. This method may perform very slow,
|
|
try using concordiaSearch instead.
|
|
\param pattern pattern to be searched in the index
|
|
\returns vector of anubis results
|
|
\throws ConcordiaException
|
|
*/
|
|
std::vector<AnubisSearchResult> anubisSearch(const std::string & pattern);
|
|
|
|
/*! Performs concordia lookup on the index. This is a unique library
|
|
functionality, designed to facilitate Computer-Aided Translation.
|
|
For more info see \ref tutorial1_3.
|
|
\param pattern pattern to be searched in the index
|
|
\returns concordia result
|
|
\throws ConcordiaException
|
|
*/
|
|
boost::shared_ptr<ConcordiaSearchResult> concordiaSearch(
|
|
const std::string & pattern,
|
|
bool byWhitespace = false);
|
|
|
|
/*! Loads HDD stored index files to RAM and generates
|
|
suffix array based on RAM stored data structures.
|
|
For more info see \ref tutorial2.
|
|
\throws ConcordiaException
|
|
*/
|
|
void loadRAMIndexFromDisk();
|
|
|
|
/*! Generates suffix array based on RAM stored data structures.
|
|
For more info see \ref tutorial2.
|
|
\throws ConcordiaException
|
|
*/
|
|
void refreshSAfromRAM();
|
|
|
|
/*! Clears all the examples from the index
|
|
\throws ConcordiaException
|
|
*/
|
|
void clearIndex();
|
|
|
|
private:
|
|
std::string _getWordMapFilePath();
|
|
|
|
std::string _getHashedIndexFilePath();
|
|
|
|
std::string _getMarkersFilePath();
|
|
|
|
void _initializeIndex();
|
|
|
|
static std::string _libraryVersion;
|
|
|
|
std::string _indexPath;
|
|
|
|
boost::shared_ptr<ConcordiaConfig> _config;
|
|
|
|
boost::shared_ptr<ConcordiaIndex> _index;
|
|
|
|
boost::shared_ptr<IndexSearcher> _searcher;
|
|
|
|
boost::shared_ptr<HashGenerator> _hashGenerator;
|
|
|
|
boost::shared_ptr<std::vector<sauchar_t> > _T;
|
|
|
|
boost::shared_ptr<std::vector<saidx_t> > _SA;
|
|
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > _markers;
|
|
};
|
|
|
|
#endif
|