#ifndef CONCORDIA_HDR
#define CONCORDIA_HDR

#include <string>
#include <vector>
#include <boost/shared_ptr.hpp>
#include <boost/filesystem.hpp>

#include "concordia/common/config.hpp"
#include "concordia/example.hpp"
#include "concordia/matched_pattern_fragment.hpp"
#include "concordia/occurences_list.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_index.hpp"
#include "concordia/index_searcher.hpp"
#include "concordia/concordia_search_result.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "concordia/anubis_search_result.hpp"

#include <divsufsort.h>

/*!
  The Concordia class is the main access point to the library.

  This class holds references to three out of the four main data
  structures used by Concordia: the hashed index, the markers array
  and the suffix array. The word map is maintained by the class
  HashGenerator. Concordia has references to:
  - the hash generator (HashGenerator)
  - the Concordia index (ConcordiaIndex)
  - the Concordia searcher (ConcordiaSearcher)
  - the configuration (ConcordiaConfig)

  Whenever necessary, the data structures and tools held by
  Concordia are passed by smart pointers to the methods which
  carry out specific functionalities.
*/

class Concordia {
public:
    /*! Parameterless constructor.
    */
    Concordia();

    /*! Constructor.
      \param indexPath path to the index directory
      \param configFilePath path to the Concordia configuration file
      \throws ConcordiaException
    */
    explicit Concordia(const std::string & indexPath,
                       const std::string & configFilePath)
                                          throw(ConcordiaException);
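
    // Illustrative only: a minimal construction sketch. The index
    // directory and configuration file paths are hypothetical
    // placeholders, not values required by the library.
    //
    //     Concordia concordia("/path/to/index",
    //                         "/path/to/concordia.cfg");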

    /*! Destructor.
    */
    virtual ~Concordia();

    /*! Getter for version.
      \returns version of the Concordia library.
    */
    std::string & getVersion();

    /*! Tokenizes the given sentence.
      \param sentence sentence to be tokenized
      \param byWhitespace whether to tokenize the sentence by whitespace
      \param generateCodes whether to generate codes for tokens using WordMap
      \returns tokenized sentence object,
               containing information about original word positions
      \throws ConcordiaException
    */
    TokenizedSentence tokenize(const std::string & sentence,
                               bool byWhitespace = false,
                               bool generateCodes = true)
                                          throw(ConcordiaException);
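
    // Illustrative only: tokenizing a sentence with the default
    // settings (byWhitespace = false, generateCodes = true),
    // assuming a Concordia instance named "concordia" constructed
    // as sketched above. The sentence text is arbitrary.
    //
    //     TokenizedSentence ts = concordia.tokenize("Alice has a cat");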

    /*! Tokenizes all the given sentences.
      \param sentences vector of sentences to be tokenized
      \param byWhitespace whether to tokenize the sentences by whitespace
      \param generateCodes whether to generate codes for tokens using WordMap
      \returns vector of tokenized sentence objects
      \throws ConcordiaException
    */
    std::vector<TokenizedSentence> tokenizeAll(
                     const std::vector<std::string> & sentences,
                     bool byWhitespace = false,
                     bool generateCodes = true)
                                          throw(ConcordiaException);

    /*! Adds an Example to the index.
      \param example example to be added
      \returns tokenized sentence object,
               containing information about original word positions
      \throws ConcordiaException
    */
    TokenizedSentence addExample(const Example & example)
                                          throw(ConcordiaException);
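
    // Illustrative only: adding a single example to the index,
    // assuming a Concordia instance named "concordia". The Example
    // constructor used here (sentence text plus a numeric id) is an
    // assumption; see example.hpp for the actual interface.
    //
    //     TokenizedSentence added =
    //         concordia.addExample(Example("Alice has a cat", 14));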

    /*! Adds a tokenized example to the index.
      \param tokenizedSentence tokenized sentence to be added
      \param id id of the sentence to be added
      \throws ConcordiaException
    */
    void addTokenizedExample(
                     const TokenizedSentence & tokenizedSentence,
                     const SUFFIX_MARKER_TYPE id)
                                          throw(ConcordiaException);

    /*! Adds multiple tokenized examples to the index.
      \param tokenizedSentences vector of tokenized sentences to be added
      \param ids vector of ids of the sentences to be added
      \throws ConcordiaException
    */
    void addAllTokenizedExamples(
                     const std::vector<TokenizedSentence> & tokenizedSentences,
                     const std::vector<SUFFIX_MARKER_TYPE> & ids)
                                          throw(ConcordiaException);

    /*! Adds multiple examples to the index.
      \param examples vector of examples to be added
      \returns vector of tokenized sentence objects,
               containing information about original word positions
      \throws ConcordiaException
    */
    std::vector<TokenizedSentence> addAllExamples(
                     const std::vector<Example> & examples)
                                          throw(ConcordiaException);
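
    // Illustrative only: adding a batch of examples and then
    // regenerating the suffix array so that the newly added data
    // becomes searchable. As above, the Example constructor is an
    // assumption; refreshSAfromRAM() is declared further below.
    //
    //     std::vector<Example> examples;
    //     examples.push_back(Example("Alice has a cat", 14));
    //     examples.push_back(Example("A cat has Alice", 15));
    //     concordia.addAllExamples(examples);
    //     concordia.refreshSAfromRAM();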

    /*! Performs a simple substring lookup on the index.
      For more info see \ref tutorial1_2.
      \param pattern pattern to be searched in the index
      \param byWhitespace whether to tokenize the pattern by whitespace
      \returns matched pattern fragment containing the vector of occurrences
      \throws ConcordiaException
    */
    MatchedPatternFragment simpleSearch(const std::string & pattern,
                                        bool byWhitespace = false)
                                          throw(ConcordiaException);
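
    // Illustrative only: a simple substring lookup, assuming a
    // Concordia instance named "concordia" with a loaded index. The
    // accessors of MatchedPatternFragment are defined in
    // matched_pattern_fragment.hpp and are not shown here.
    //
    //     MatchedPatternFragment fragment = concordia.simpleSearch("has a");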

    /*! Performs a substring lookup in the RAM-based index,
      returning all occurrences. The result contains no more than
      "limit" occurrences, starting at "offset".
      \param pattern string pattern to be searched in the index
      \param limit maximum number of occurrences to return
      \param offset starting occurrence
      \param byWhitespace whether to tokenize the pattern by whitespace
      \returns list of occurrences of the pattern in the index
      \throws ConcordiaException
    */
    OccurencesList fullSearch(
                     const std::string & pattern,
                     SUFFIX_MARKER_TYPE limit,
                     SUFFIX_MARKER_TYPE offset,
                     bool byWhitespace = false) throw(ConcordiaException);
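
    // Illustrative only: retrieving at most 20 occurrences of the
    // pattern, starting from the 40th occurrence (e.g. the third
    // page of results when paging by 20).
    //
    //     OccurencesList occurrences =
    //         concordia.fullSearch("has a", 20, 40);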

    /*! Performs a search useful for lexicons in the following scenario:
      Concordia gets fed by a lexicon (glossary) instead of a TM.
      The lexicon search works like the simple search - it requires
      the match to cover the whole pattern, but additionally it
      requires that the match covers the whole example source.
      \param pattern pattern to be searched in the index
      \param byWhitespace whether to tokenize the pattern by whitespace
      \returns matched pattern fragment containing the vector of occurrences
      \throws ConcordiaException
    */
    MatchedPatternFragment lexiconSearch(const std::string & pattern,
                                         bool byWhitespace = false)
                                          throw(ConcordiaException);
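
    /*! Counts the occurrences of the pattern in the index.
      \param pattern pattern to be searched in the index
      \returns number of occurrences of the pattern
      \throws ConcordiaException
    */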
    SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern)
                                          throw(ConcordiaException);

    /*! \deprecated
      Finds the examples from the index whose resemblance to the
      pattern is maximal. This method may perform very slowly;
      try using concordiaSearch instead.
      \param pattern pattern to be searched in the index
      \returns vector of anubis results
      \throws ConcordiaException
    */
    std::vector<AnubisSearchResult> anubisSearch(const std::string & pattern)
                                          throw(ConcordiaException);

    /*! Performs the Concordia lookup on the index. This is a unique
      library functionality, designed to facilitate Computer-Aided
      Translation. For more info see \ref tutorial1_3.
      \param pattern pattern to be searched in the index
      \param byWhitespace whether to tokenize the pattern by whitespace
      \returns concordia result
      \throws ConcordiaException
    */
    boost::shared_ptr<ConcordiaSearchResult> concordiaSearch(
                     const std::string & pattern,
                     bool byWhitespace = false)
                                          throw(ConcordiaException);
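
    // Illustrative only: the Concordia search returns a shared
    // pointer to a ConcordiaSearchResult (see
    // concordia_search_result.hpp for its interface).
    //
    //     boost::shared_ptr<ConcordiaSearchResult> searchResult =
    //         concordia.concordiaSearch("Alice has a cat");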

    /*! Loads HDD-stored index files to RAM and generates the
      suffix array based on the RAM-stored data structures.
      For more info see \ref tutorial2.
      \throws ConcordiaException
    */
    void loadRAMIndexFromDisk() throw(ConcordiaException);
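
    // Illustrative only: reopening a previously built index from
    // disk. The paths are hypothetical placeholders; after the call,
    // the RAM-based structures and the suffix array are rebuilt from
    // the files stored in the index directory.
    //
    //     Concordia concordia("/path/to/index",
    //                         "/path/to/concordia.cfg");
    //     concordia.loadRAMIndexFromDisk();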

    /*! Generates the suffix array based on the RAM-stored data structures.
      For more info see \ref tutorial2.
      \throws ConcordiaException
    */
    void refreshSAfromRAM() throw(ConcordiaException);

    /*! Clears all the examples from the index.
      \throws ConcordiaException
    */
    void clearIndex() throw(ConcordiaException);

private:
    std::string _getWordMapFilePath();

    std::string _getHashedIndexFilePath();

    std::string _getMarkersFilePath();

    void _initializeIndex() throw(ConcordiaException);

    static std::string _libraryVersion;

    std::string _indexPath;

    boost::shared_ptr<ConcordiaConfig> _config;

    boost::shared_ptr<ConcordiaIndex> _index;

    boost::shared_ptr<IndexSearcher> _searcher;

    boost::shared_ptr<HashGenerator> _hashGenerator;

    boost::shared_ptr<std::vector<sauchar_t> > _T;

    boost::shared_ptr<std::vector<saidx_t> > _SA;

    boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > _markers;
};

#endif