161 lines
7.1 KiB
C++
161 lines
7.1 KiB
C++
#ifndef INDEX_SEARCHER_HDR
|
|
#define INDEX_SEARCHER_HDR
|
|
|
|
#include <boost/shared_ptr.hpp>
|
|
#include <fstream>
|
|
#include <iostream>
|
|
#include <vector>
|
|
|
|
#include "concordia/common/config.hpp"
|
|
#include "concordia/matched_pattern_fragment.hpp"
|
|
#include "concordia/occurences_list.hpp"
|
|
#include "concordia/hash_generator.hpp"
|
|
#include "concordia/concordia_exception.hpp"
|
|
#include "concordia/concordia_searcher.hpp"
|
|
#include "concordia/anubis_search_result.hpp"
|
|
|
|
#include <divsufsort.h>
|
|
|
|
/*!
|
|
Class for searching the index with a sentence. In all searches the sentence
|
|
is first hashed and then used as a query.
|
|
|
|
IndexSearcher performs the simpleSearch on its own, but uses a
|
|
ConcordiaSearcher object to carry out concordiaSearch.
|
|
|
|
*/
|
|
|
|
class IndexSearcher {
|
|
public:
|
|
/*! Constructor.
|
|
*/
|
|
explicit IndexSearcher();
|
|
|
|
/*! Destructor.
|
|
*/
|
|
virtual ~IndexSearcher();
|
|
|
|
/*! Performs a simple substring lookup in RAM-based index.
|
|
For more info see \ref tutorial1_2.
|
|
\param hashGenerator hash generator to be used to convert
|
|
input sentence to a hash
|
|
\param T hashed index to search in
|
|
\param markers markers array for the needs of searching
|
|
\param SA suffix array for the needs of searching
|
|
\param pattern string pattern to be searched in the index.
|
|
\returns matched pattern fragment, containing occurences of the pattern in the index
|
|
\throws ConcordiaException
|
|
*/
|
|
MatchedPatternFragment simpleSearch(
|
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
|
const std::string & pattern,
|
|
bool byWhitespace = false) throw(ConcordiaException);
|
|
|
|
/*! Performs a substring lookup in RAM-based index, returning all occurences.
|
|
The result contains no more than "limit" occurences, starting at "offset".
|
|
\param hashGenerator hash generator to be used to convert
|
|
input sentence to a hash
|
|
\param T hashed index to search in
|
|
\param markers markers array for the needs of searching
|
|
\param SA suffix array for the needs of searching
|
|
\param pattern string pattern to be searched in the index.
|
|
\param limit maximum number of occurences to return
|
|
\param offset starting occurence
|
|
\param byWhitespace should the pattern by tokenized by white space
|
|
\returns list of occurences of the pattern in the index
|
|
\throws ConcordiaException
|
|
*/
|
|
OccurencesList fullSearch(
|
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
|
const std::string & pattern,
|
|
SUFFIX_MARKER_TYPE limit,
|
|
SUFFIX_MARKER_TYPE offset,
|
|
bool byWhitespace = false) throw(ConcordiaException);
|
|
|
|
/*! Performs a search useful for lexicons in the following scenario:
|
|
Concordia gets fed by a lexicon (glossary) instead of a TM.
|
|
The lexicon search performs as simple search - it requires
|
|
the match to cover the whole pattern, but additionally
|
|
the lexicon search requires that the match is the whole example source.
|
|
\param hashGenerator hash generator to be used to convert
|
|
input sentence to a hash
|
|
\param T hashed index to search in
|
|
\param markers markers array for the needs of searching
|
|
\param SA suffix array for the needs of searching
|
|
\param pattern string pattern to be searched in the index.
|
|
\returns matched pattern fragment, containing occurences of the pattern in the index
|
|
\throws ConcordiaException
|
|
*/
|
|
MatchedPatternFragment lexiconSearch(
|
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
|
const std::string & pattern,
|
|
bool byWhitespace = false) throw(ConcordiaException);
|
|
|
|
SUFFIX_MARKER_TYPE countOccurences(
|
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
|
const std::string & pattern) throw(ConcordiaException);
|
|
|
|
/*! \deprecated
|
|
Finds the examples from the index, whose resemblance to the
|
|
pattern is maximal. This method may perform very slow,
|
|
try using concordiaSearch instead.
|
|
\param config concordia config object
|
|
(to read the anubis threshold parameter)
|
|
\param hashGenerator hash generator to be used to convert
|
|
input sentence to a hash
|
|
\param T hashed index to search in
|
|
\param markers markers array for the needs of searching
|
|
\param SA suffix array for the needs of searching
|
|
\param pattern string pattern to be searched in the index.
|
|
\param byWhitespace whether to tokenize the pattern by white space
|
|
\returns vector of results
|
|
\throws ConcordiaException
|
|
*/
|
|
std::vector<AnubisSearchResult> anubisSearch(
|
|
boost::shared_ptr<ConcordiaConfig> config,
|
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
|
const std::string & pattern) throw(ConcordiaException);
|
|
|
|
/*! Performs concordia lookup on the RAM-based index.
|
|
This is a unique library functionality, designed
|
|
to facilitate Computer-Aided Translation.
|
|
For more info see \ref tutorial1_3.
|
|
\param hashGenerator hash generator to be used to convert
|
|
input sentence to a hash
|
|
\param T hashed index to search in
|
|
\param markers markers array for the needs of searching
|
|
\param SA suffix array for the needs of searching
|
|
\param pattern pattern to be searched in the index.
|
|
\param byWhitespace whether to tokenize the pattern by white space
|
|
\returns result of the search
|
|
\throws ConcordiaException
|
|
*/
|
|
boost::shared_ptr<ConcordiaSearchResult> concordiaSearch(
|
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
|
const std::string & pattern,
|
|
bool byWhitespace = false) throw(ConcordiaException);
|
|
|
|
private:
|
|
boost::shared_ptr<ConcordiaSearcher> _concordiaSearcher;
|
|
};
|
|
|
|
#endif
|