#ifndef CONCORDIA_HDR
#define CONCORDIA_HDR

#include <string>
#include <vector>

#include <boost/shared_ptr.hpp>

#include "concordia/common/config.hpp"
#include "concordia/example.hpp"
#include "concordia/matched_pattern_fragment.hpp"
#include "concordia/occurences_list.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_index.hpp"
#include "concordia/index_searcher.hpp"
#include "concordia/concordia_search_result.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "concordia/anubis_search_result.hpp"

#include <divsufsort.h>

/*! The Concordia class is the main access point to the library.
    This class holds references to three out of four main data structures
    used by Concordia: hashed index, markers array and suffix array.
    The word map is maintained by the class HashGenerator.

    Concordia has references to:
    - the hash generator (HashGenerator)
    - concordia index (ConcordiaIndex)
    - concordia searcher (ConcordiaSearcher)
    - configuration (ConcordiaConfig)

    Whenever necessary, the data structures and tools held by Concordia
    are passed by smart pointers to the methods which carry out
    specific functionalities.
*/
class Concordia {
public:
    /*! Parameterless constructor.
    */
    Concordia();

    /*! Constructor.
      \param indexPath path to the index directory
      \param configFilePath path to the Concordia configuration file
      \throws ConcordiaException
    */
    explicit Concordia(const std::string & indexPath,
                       const std::string & configFilePath)
                                        throw(ConcordiaException);

    /*! Destructor.
    */
    virtual ~Concordia();

    /*! Getter for version.
      \returns version of the Concordia library
    */
    std::string & getVersion();

    /*! Tokenizes the given sentence.
      \param sentence sentence to be tokenized
      \param byWhitespace whether to tokenize the sentence by whitespace
      \param generateCodes whether to generate codes for tokens using WordMap
      \returns tokenized sentence object, containing information
               about original word positions
      \throws ConcordiaException
    */
    TokenizedSentence tokenize(const std::string & sentence,
                               bool byWhitespace = false,
                               bool generateCodes = true)
                                        throw(ConcordiaException);

    /*! Tokenizes all the given sentences.
      \param sentences vector of sentences to be tokenized
      \param byWhitespace whether to tokenize the sentences by whitespace
      \param generateCodes whether to generate codes for tokens using WordMap
      \returns vector of tokenized sentence objects
      \throws ConcordiaException
    */
    std::vector<TokenizedSentence> tokenizeAll(
                               const std::vector<std::string> & sentences,
                               bool byWhitespace = false,
                               bool generateCodes = true)
                                        throw(ConcordiaException);

    /*! Adds an Example to the index.
      \param example example to be added
      \returns tokenized sentence object, containing information
               about original word positions
      \throws ConcordiaException
    */
    TokenizedSentence addExample(const Example & example)
                                        throw(ConcordiaException);

    /*! Adds a tokenized example to the index.
      \param tokenizedSentence tokenized sentence to be added
      \param id id of the sentence to be added
      \throws ConcordiaException
    */
    void addTokenizedExample(
                      const TokenizedSentence & tokenizedSentence,
                      const SUFFIX_MARKER_TYPE id)
                                        throw(ConcordiaException);

    /*! Adds multiple tokenized examples to the index.
      \param tokenizedSentences vector of tokenized sentences to be added
      \param ids vector of ids of the sentences to be added
      \throws ConcordiaException
    */
    void addAllTokenizedExamples(
                      const std::vector<TokenizedSentence> & tokenizedSentences,
                      const std::vector<SUFFIX_MARKER_TYPE> & ids)
                                        throw(ConcordiaException);
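    /*
      Usage sketch (illustrative, not part of the API): populating the index.
      The index directory, configuration file name, sentences, ids and the
      Example(sentence, id) constructor are assumptions made for this sketch;
      only the methods declared in this header are taken as given.

        Concordia concordia("/tmp/concordia_index", "concordia.cfg");

        // Add one example; the returned TokenizedSentence carries the
        // original word positions.
        TokenizedSentence added = concordia.addExample(
                                      Example("Alice has a cat", 14));

        // Or tokenize first and add the tokenized sentence under an id.
        TokenizedSentence tokenized = concordia.tokenize("Alice has a dog");
        concordia.addTokenizedExample(tokenized, 15);

        // Rebuild the suffix array after adding examples
        // (see refreshSAfromRAM() below).
        concordia.refreshSAfromRAM();
    */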
    /*! Adds multiple examples to the index.
      \param examples vector of examples to be added
      \returns vector of tokenized sentence objects, containing information
               about original word positions
      \throws ConcordiaException
    */
    std::vector<TokenizedSentence> addAllExamples(
                      const std::vector<Example> & examples)
                                        throw(ConcordiaException);

    /*! Performs a simple substring lookup on the index.
        For more info see \ref tutorial1_2.
      \param pattern pattern to be searched in the index
      \param byWhitespace whether to tokenize the pattern by whitespace
      \returns matched pattern fragment containing a vector of occurrences
      \throws ConcordiaException
    */
    MatchedPatternFragment simpleSearch(const std::string & pattern,
                                        bool byWhitespace = false)
                                        throw(ConcordiaException);

    /*! Performs a substring lookup in the RAM-based index, returning
        all occurrences. The result contains no more than "limit"
        occurrences, starting at "offset".
      \param pattern string pattern to be searched in the index
      \param limit maximum number of occurrences to return
      \param offset starting occurrence
      \param byWhitespace whether to tokenize the pattern by whitespace
      \returns list of occurrences of the pattern in the index
      \throws ConcordiaException
    */
    OccurencesList fullSearch(const std::string & pattern,
                              SUFFIX_MARKER_TYPE limit,
                              SUFFIX_MARKER_TYPE offset,
                              bool byWhitespace = false)
                                        throw(ConcordiaException);

    /*! Performs a search useful for lexicons in the following scenario:
        Concordia gets fed with a lexicon (glossary) instead of a TM.
        The lexicon search works like simple search - it requires the match
        to cover the whole pattern - but additionally requires that the
        match covers the whole example source.
      \param pattern pattern to be searched in the index
      \param byWhitespace whether to tokenize the pattern by whitespace
      \returns matched pattern fragment containing a vector of occurrences
      \throws ConcordiaException
    */
    MatchedPatternFragment lexiconSearch(const std::string & pattern,
                                         bool byWhitespace = false)
                                        throw(ConcordiaException);

    /*! Counts the occurrences of the pattern in the index.
      \param pattern pattern to be counted in the index
      \returns number of occurrences of the pattern
      \throws ConcordiaException
    */
    SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern)
                                        throw(ConcordiaException);

    /*! \deprecated
        Finds the examples from the index whose resemblance to the pattern
        is maximal. This method may perform very slowly; try using
        concordiaSearch instead.
      \param pattern pattern to be searched in the index
      \returns vector of anubis results
      \throws ConcordiaException
    */
    std::vector<AnubisSearchResult> anubisSearch(const std::string & pattern)
                                        throw(ConcordiaException);

    /*! Performs concordia lookup on the index. This is a unique library
        functionality, designed to facilitate Computer-Aided Translation.
        For more info see \ref tutorial1_3.
      \param pattern pattern to be searched in the index
      \param byWhitespace whether to tokenize the pattern by whitespace
      \returns concordia search result
      \throws ConcordiaException
    */
    boost::shared_ptr<ConcordiaSearchResult> concordiaSearch(
                                        const std::string & pattern,
                                        bool byWhitespace = false)
                                        throw(ConcordiaException);

    /*! Loads the HDD-stored index files to RAM and generates the suffix
        array based on the RAM-stored data structures.
        For more info see \ref tutorial2.
      \throws ConcordiaException
    */
    void loadRAMIndexFromDisk() throw(ConcordiaException);

    /*! Generates the suffix array based on the RAM-stored data structures.
        For more info see \ref tutorial2.
      \throws ConcordiaException
    */
    void refreshSAfromRAM() throw(ConcordiaException);
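    /*
      Usage sketch (illustrative, not part of the API): searching the index.
      The index directory and configuration file name are placeholders; the
      accessors of the returned result objects are declared in their own
      headers and are not shown here.

        Concordia concordia("/tmp/concordia_index", "concordia.cfg");

        // Load the HDD-stored index into RAM and build the suffix array.
        concordia.loadRAMIndexFromDisk();

        // Simple substring lookup on the index.
        MatchedPatternFragment fragment = concordia.simpleSearch("has a cat");

        // Paged lookup: at most 10 occurrences, starting at offset 0.
        OccurencesList occurencesList = concordia.fullSearch("has a cat", 10, 0);

        // Concordia lookup designed for Computer-Aided Translation.
        boost::shared_ptr<ConcordiaSearchResult> result =
                concordia.concordiaSearch("Alice has a cat");
    */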
    /*! Clears all the examples from the index.
      \throws ConcordiaException
    */
    void clearIndex() throw(ConcordiaException);

private:
    std::string _getWordMapFilePath();

    std::string _getHashedIndexFilePath();

    std::string _getMarkersFilePath();

    void _initializeIndex() throw(ConcordiaException);

    static std::string _libraryVersion;

    std::string _indexPath;

    boost::shared_ptr<ConcordiaConfig> _config;

    boost::shared_ptr<ConcordiaIndex> _index;

    boost::shared_ptr<IndexSearcher> _searcher;

    boost::shared_ptr<HashGenerator> _hashGenerator;

    // The three data structures mentioned in the class description:
    // hashed index, suffix array and markers array.
    boost::shared_ptr<std::vector<sauchar_t> > _T;

    boost::shared_ptr<std::vector<saidx_t> > _SA;

    boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > _markers;
};

#endif