// concordia-library/concordia/concordia.hpp

#ifndef CONCORDIA_HDR
#define CONCORDIA_HDR

#include <string>
#include <vector>
#include <boost/shared_ptr.hpp>
#include <boost/filesystem.hpp>

#include "concordia/common/config.hpp"
#include "concordia/example.hpp"
#include "concordia/matched_pattern_fragment.hpp"
#include "concordia/occurences_list.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_index.hpp"
#include "concordia/index_searcher.hpp"
#include "concordia/concordia_search_result.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "concordia/anubis_search_result.hpp"
#include <divsufsort.h>


/*!
  The Concordia class is the main access point to the library.
  This class holds references to three out of four main data
  structures used by Concordia: hashed index, markers array
  and suffix array. Word map is maintained by the class
  HashGenerator. Concordia has references to:
  - the hash generator (HashGenerator)
  - concordia index (ConcordiaIndex)
  - index searcher (IndexSearcher)
  - configuration (ConcordiaConfig)

  Whenever it is necessary, the data structures and tools
  held by Concordia are passed by smart pointers to methods which
  carry out specific functionalities.

*/

class Concordia {
public:
    /*! Parameterless constructor.
    */
    Concordia();

    /*! Constructor.
      \param indexPath path to the index directory
      \param configFilePath path to the Concordia configuration file
      \throws ConcordiaException
    */
    explicit Concordia(const std::string & indexPath,
                       const std::string & configFilePath);
    /*! Destructor.
    */
    virtual ~Concordia();

    /*! Getter for version.
      \returns version of the Concordia library (returned as a
               non-const reference).
      NOTE(review): presumably this refers to the static
      _libraryVersion member, in which case callers are able to
      modify it through the returned reference - confirm.
    */
    std::string & getVersion();

    /*! Tokenizes the given sentence.
      \param sentence sentence to be tokenized
      \param byWhitespace whether to tokenize the sentence by whitespace
      \param generateCodes whether to generate codes for tokens using WordMap
      \returns tokenized sentence object,
               containing information about original word positions
      \throws ConcordiaException
    */
    TokenizedSentence tokenize(const std::string & sentence,
                               bool byWhitespace = false,
                               bool generateCodes = true);

    /*! Tokenizes all the given sentences.
      \param sentences vector of sentences to be tokenized
      \param byWhitespace whether to tokenize the sentences by whitespace
      \param generateCodes whether to generate codes for tokens using WordMap
      \returns vector of tokenized sentence objects
      \throws ConcordiaException
    */
    std::vector<TokenizedSentence> tokenizeAll(
                                   const std::vector<std::string> & sentences,
                                   bool byWhitespace = false,
                                   bool generateCodes = true);

    /*! Adds an Example to the index.
      \param example example to be added
      \returns tokenized sentence object,
               containing information about original word positions
      \throws ConcordiaException
    */
    TokenizedSentence addExample(const Example & example);

    /*! Adds a tokenized example to the index.
      \param tokenizedSentence tokenized sentence to be added
      \param id id of the sentence to be added
      \throws ConcordiaException
    */
    void addTokenizedExample(
                    const TokenizedSentence & tokenizedSentence,
                    const SUFFIX_MARKER_TYPE id);

    /*! Adds multiple tokenized examples to the index.
      \param tokenizedSentences vector of tokenized sentences to be added
      \param ids vector of ids of the sentences to be added
             (presumably ids[i] belongs to tokenizedSentences[i] -
             TODO confirm against the implementation)
      \throws ConcordiaException
    */
    void addAllTokenizedExamples(
                    const std::vector<TokenizedSentence> & tokenizedSentences,
                    const std::vector<SUFFIX_MARKER_TYPE> & ids);

    /*! Adds multiple examples to the index.
      \param examples vector of examples to be added
      \returns vector of tokenized sentence objects,
               containing information about original word positions
      \throws ConcordiaException
    */
    std::vector<TokenizedSentence> addAllExamples(
                                const std::vector<Example> & examples);

    /*! Performs a simple substring lookup on the index.
        For more info see \ref tutorial1_2.
      \param pattern pattern to be searched in the index
      \param byWhitespace whether to tokenize the pattern by white space
      \returns matched pattern fragment containing vector of occurrences
      \throws ConcordiaException
    */
    MatchedPatternFragment simpleSearch(const std::string & pattern,
                                          bool byWhitespace = false);

    /*! Performs a substring lookup in RAM-based index, returning all
        occurrences. The result contains no more than "limit" occurrences,
        starting at "offset".
      \param pattern string pattern to be searched in the index
      \param limit maximum number of occurrences to return
      \param offset starting occurrence
      \param byWhitespace whether to tokenize the pattern by white space
      \returns list of occurrences of the pattern in the index
      \throws ConcordiaException
    */
    OccurencesList fullSearch(
                    const std::string & pattern,
                    int limit,
                    int offset,
                    bool byWhitespace = false);

    /*! Performs a search useful for lexicons in the following scenario:
        Concordia gets fed by a lexicon (glossary) instead of a TM.
        The lexicon search performs as simple search - it requires
        the match to cover the whole pattern, but additionally
        the lexicon search requires that the match is the whole
        example source.
      \param pattern pattern to be searched in the index
      \param byWhitespace whether to tokenize the pattern by white space
      \returns matched pattern fragment containing vector of occurrences
      \throws ConcordiaException
    */
    MatchedPatternFragment lexiconSearch(const std::string & pattern,
                                        bool byWhitespace = false);

    /*! NOTE(review): this method was undocumented. Judging by its name
        and signature it presumably returns the number of occurrences
        of the pattern in the index - confirm against the implementation,
        including whether it can throw ConcordiaException like the
        other search methods.
      \param pattern pattern to be searched in the index
    */
    SUFFIX_MARKER_TYPE countOccurences(const std::string & pattern);

    /*! \deprecated
        Finds the examples from the index, whose resemblance to the
        pattern is maximal. This method may perform very slowly,
        try using concordiaSearch instead.
      \param pattern pattern to be searched in the index
      \returns vector of anubis results
      \throws ConcordiaException
    */
    std::vector<AnubisSearchResult> anubisSearch(const std::string & pattern);

    /*! Performs concordia lookup on the index. This is a unique library
        functionality, designed to facilitate Computer-Aided Translation.
        For more info see \ref tutorial1_3.
      \param pattern pattern to be searched in the index
      \param byWhitespace whether to tokenize the pattern by white space
      \returns concordia result
      \throws ConcordiaException
    */
    boost::shared_ptr<ConcordiaSearchResult> concordiaSearch(
                                                 const std::string & pattern,
                                                 bool byWhitespace = false);

    /*! Loads HDD stored index files to RAM and generates
        suffix array based on RAM stored data structures.
        For more info see \ref tutorial2.
      \throws ConcordiaException
    */
    void loadRAMIndexFromDisk();

    /*! Generates suffix array based on RAM stored data structures.
        For more info see \ref tutorial2.
      \throws ConcordiaException
    */
    void refreshSAfromRAM();

    /*! Clears all the examples from the index
      \throws ConcordiaException
    */
    void clearIndex();

private:
    /*! Path of the word map file (judging by the name; presumably
        located under _indexPath - TODO confirm).
    */
    std::string _getWordMapFilePath();

    /*! Path of the hashed index file (judging by the name; presumably
        located under _indexPath - TODO confirm).
    */
    std::string _getHashedIndexFilePath();

    /*! Path of the markers file (judging by the name; presumably
        located under _indexPath - TODO confirm).
    */
    std::string _getMarkersFilePath();

    /*! Index initialization helper; its exact steps are defined in the
        implementation file and are not visible from this header.
    */
    void _initializeIndex();

    /*! Library version string, shared by all Concordia instances. */
    static std::string _libraryVersion;

    /*! Index directory path (presumably the indexPath constructor
        argument - confirm).
    */
    std::string _indexPath;

    /*! Configuration object. */
    boost::shared_ptr<ConcordiaConfig> _config;

    /*! Concordia index. */
    boost::shared_ptr<ConcordiaIndex> _index;

    /*! Index searcher. */
    boost::shared_ptr<IndexSearcher> _searcher;

    /*! Hash generator. */
    boost::shared_ptr<HashGenerator> _hashGenerator;

    /*! Vector of sauchar_t values (sauchar_t comes from divsufsort.h).
        Presumably the hashed index - TODO confirm.
    */
    boost::shared_ptr<std::vector<sauchar_t> > _T;

    /*! Vector of saidx_t values (saidx_t comes from divsufsort.h).
        Presumably the suffix array - TODO confirm.
    */
    boost::shared_ptr<std::vector<saidx_t> > _SA;

    /*! Vector of SUFFIX_MARKER_TYPE values.
        Presumably the markers array - TODO confirm.
    */
    boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > _markers;
};

#endif