concordia-library/concordia/index_searcher.hpp

161 lines
7.1 KiB
C++

#ifndef INDEX_SEARCHER_HDR
#define INDEX_SEARCHER_HDR
#include <boost/shared_ptr.hpp>
#include <fstream>
#include <iostream>
#include <vector>
#include "concordia/common/config.hpp"
#include "concordia/matched_pattern_fragment.hpp"
#include "concordia/occurences_list.hpp"
#include "concordia/hash_generator.hpp"
#include "concordia/concordia_exception.hpp"
#include "concordia/concordia_searcher.hpp"
#include "concordia/anubis_search_result.hpp"
#include <divsufsort.h>
// NOTE(review): every search method in this class is declared with a dynamic
// exception specification, throw(ConcordiaException). That form was deprecated
// in C++11 and removed in C++17, so this header only compiles in pre-C++17
// modes. Dropping the specifications has to be done together with the
// out-of-line definitions, which are not visible from this header.
/*!
Class for searching the index with a sentence. In all searches the sentence
is first hashed and then used as a query.
IndexSearcher performs the simpleSearch on its own, but uses a
ConcordiaSearcher object to carry out concordiaSearch.
All search methods take the index data (T, markers, SA) and the hash
generator as arguments on every call.
*/
class IndexSearcher {
public:
/*! Constructor. Takes no arguments.
*/
explicit IndexSearcher();
/*! Destructor. Declared virtual.
*/
virtual ~IndexSearcher();
/*! Performs a simple substring lookup in RAM-based index.
For more info see \ref tutorial1_2.
\param hashGenerator hash generator to be used to convert
input sentence to a hash
\param T hashed index to search in
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern string pattern to be searched in the index.
\param byWhitespace whether to tokenize the pattern by white space
(defaults to false)
\returns matched pattern fragment, containing occurrences of the pattern in the index
\throws ConcordiaException
*/
MatchedPatternFragment simpleSearch(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern,
bool byWhitespace = false) throw(ConcordiaException);
/*! Performs a substring lookup in RAM-based index, returning all occurrences.
The result contains no more than "limit" occurrences, starting at "offset".
\param hashGenerator hash generator to be used to convert
input sentence to a hash
\param T hashed index to search in
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern string pattern to be searched in the index.
\param limit maximum number of occurrences to return
\param offset starting occurrence
\param byWhitespace whether to tokenize the pattern by white space
(defaults to false)
\returns list of occurrences of the pattern in the index
\throws ConcordiaException
*/
OccurencesList fullSearch(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern,
SUFFIX_MARKER_TYPE limit,
SUFFIX_MARKER_TYPE offset,
bool byWhitespace = false) throw(ConcordiaException);
/*! Performs a search useful for lexicons in the following scenario:
Concordia gets fed by a lexicon (glossary) instead of a TM.
The lexicon search works like the simple search - it requires
the match to cover the whole pattern - but additionally
requires that the match is the whole example source.
\param hashGenerator hash generator to be used to convert
input sentence to a hash
\param T hashed index to search in
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern string pattern to be searched in the index.
\param byWhitespace whether to tokenize the pattern by white space
(defaults to false)
\returns matched pattern fragment, containing occurrences of the pattern in the index
\throws ConcordiaException
*/
MatchedPatternFragment lexiconSearch(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern,
bool byWhitespace = false) throw(ConcordiaException);
/*! Counts occurrences of the pattern in the RAM-based index.
NOTE(review): this summary is inferred from the method name and signature
only; the implementation is not visible from this header, so confirm it
against the definition. Unlike simpleSearch, fullSearch, lexiconSearch
and concordiaSearch, this method takes no byWhitespace flag.
\param hashGenerator hash generator to be used to convert
input sentence to a hash
\param T hashed index to search in
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern string pattern to be searched in the index.
\returns presumably the number of occurrences of the pattern in the index
(TODO confirm)
\throws ConcordiaException
*/
SUFFIX_MARKER_TYPE countOccurences(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException);
/*! \deprecated
Finds the examples from the index whose resemblance to the
pattern is maximal. This method may be very slow;
try using concordiaSearch instead.
Note that this method takes no byWhitespace flag.
\param config concordia config object
(to read the anubis threshold parameter)
\param hashGenerator hash generator to be used to convert
input sentence to a hash
\param T hashed index to search in
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern string pattern to be searched in the index.
\returns vector of results
\throws ConcordiaException
*/
std::vector<AnubisSearchResult> anubisSearch(
boost::shared_ptr<ConcordiaConfig> config,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException);
/*! Performs concordia lookup on the RAM-based index.
This is a unique library functionality, designed
to facilitate Computer-Aided Translation.
For more info see \ref tutorial1_3.
\param hashGenerator hash generator to be used to convert
input sentence to a hash
\param T hashed index to search in
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern pattern to be searched in the index.
\param byWhitespace whether to tokenize the pattern by white space
(defaults to false)
\returns result of the search
\throws ConcordiaException
*/
boost::shared_ptr<ConcordiaSearchResult> concordiaSearch(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern,
bool byWhitespace = false) throw(ConcordiaException);
private:
/*! ConcordiaSearcher object used to carry out concordiaSearch
(see the class description).
*/
boost::shared_ptr<ConcordiaSearcher> _concordiaSearcher;
};
#endif