2013-11-28 16:47:57 +01:00
|
|
|
#ifndef INDEX_SEARCHER_HDR
|
|
|
|
#define INDEX_SEARCHER_HDR
|
|
|
|
|
|
|
|
#include <boost/shared_ptr.hpp>
|
|
|
|
#include <fstream>
|
|
|
|
#include <iostream>
|
2015-04-15 10:55:26 +02:00
|
|
|
#include <vector>
|
2013-11-28 16:47:57 +01:00
|
|
|
|
2013-12-06 22:29:25 +01:00
|
|
|
#include "concordia/common/config.hpp"
|
2015-08-07 12:54:57 +02:00
|
|
|
#include "concordia/matched_pattern_fragment.hpp"
|
2019-01-09 15:30:56 +01:00
|
|
|
#include "concordia/occurences_list.hpp"
|
2013-11-28 16:47:57 +01:00
|
|
|
#include "concordia/hash_generator.hpp"
|
|
|
|
#include "concordia/concordia_exception.hpp"
|
2015-04-24 11:48:32 +02:00
|
|
|
#include "concordia/concordia_searcher.hpp"
|
2014-03-11 14:29:30 +01:00
|
|
|
#include "concordia/anubis_search_result.hpp"
|
2013-11-28 16:47:57 +01:00
|
|
|
|
2014-04-29 14:46:04 +02:00
|
|
|
#include <divsufsort.h>
|
2014-02-20 18:46:04 +01:00
|
|
|
|
2013-11-28 16:47:57 +01:00
|
|
|
/*!
|
2015-05-01 14:52:53 +02:00
|
|
|
Class for searching the index with a sentence. In all searches the sentence
|
|
|
|
is first hashed and then used as a query.
|
2017-04-21 14:51:58 +02:00
|
|
|
|
2015-05-01 14:52:53 +02:00
|
|
|
IndexSearcher performs the simpleSearch on its own, but uses a
|
|
|
|
ConcordiaSearcher object to carry out concordiaSearch.
|
2013-11-28 16:47:57 +01:00
|
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
class IndexSearcher {
|
|
|
|
public:
|
2015-05-01 14:52:53 +02:00
|
|
|
/*! Constructor.
|
|
|
|
*/
|
2013-11-28 16:47:57 +01:00
|
|
|
explicit IndexSearcher();
|
|
|
|
|
|
|
|
/*! Destructor.
|
|
|
|
*/
|
|
|
|
virtual ~IndexSearcher();
|
|
|
|
|
2015-05-01 14:52:53 +02:00
|
|
|
/*! Performs a simple substring lookup in RAM-based index.
|
|
|
|
For more info see \ref tutorial1_2.
|
|
|
|
\param hashGenerator hash generator to be used to convert
|
|
|
|
input sentence to a hash
|
|
|
|
\param T hashed index to search in
|
|
|
|
\param markers markers array for the needs of searching
|
|
|
|
\param SA suffix array for the needs of searching
|
|
|
|
\param pattern string pattern to be searched in the index.
|
2017-04-21 14:51:58 +02:00
|
|
|
\returns matched pattern fragment, containing occurences of the pattern in the index
|
2015-05-01 14:52:53 +02:00
|
|
|
\throws ConcordiaException
|
|
|
|
*/
|
2017-04-21 14:51:58 +02:00
|
|
|
MatchedPatternFragment simpleSearch(
|
2013-12-14 15:23:17 +01:00
|
|
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
|
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
2014-02-20 10:49:17 +01:00
|
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
2017-10-10 15:39:47 +02:00
|
|
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
|
|
|
const std::string & pattern,
|
2019-01-18 13:30:51 +01:00
|
|
|
bool byWhitespace = false);
|
2017-10-10 15:39:47 +02:00
|
|
|
|
2019-01-09 15:30:56 +01:00
|
|
|
/*! Performs a substring lookup in RAM-based index, returning all occurences.
|
|
|
|
The result contains no more than "limit" occurences, starting at "offset".
|
|
|
|
\param hashGenerator hash generator to be used to convert
|
|
|
|
input sentence to a hash
|
|
|
|
\param T hashed index to search in
|
|
|
|
\param markers markers array for the needs of searching
|
|
|
|
\param SA suffix array for the needs of searching
|
|
|
|
\param pattern string pattern to be searched in the index.
|
|
|
|
\param limit maximum number of occurences to return
|
|
|
|
\param offset starting occurence
|
|
|
|
\param byWhitespace should the pattern by tokenized by white space
|
|
|
|
\returns list of occurences of the pattern in the index
|
|
|
|
\throws ConcordiaException
|
|
|
|
*/
|
|
|
|
OccurencesList fullSearch(
|
|
|
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
|
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
|
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
|
|
|
const std::string & pattern,
|
2019-01-09 18:31:52 +01:00
|
|
|
int limit,
|
|
|
|
int offset,
|
2019-01-18 13:30:51 +01:00
|
|
|
bool byWhitespace = false);
|
2019-01-09 15:30:56 +01:00
|
|
|
|
2017-10-10 15:39:47 +02:00
|
|
|
/*! Performs a search useful for lexicons in the following scenario:
|
|
|
|
Concordia gets fed by a lexicon (glossary) instead of a TM.
|
|
|
|
The lexicon search performs as simple search - it requires
|
|
|
|
the match to cover the whole pattern, but additionally
|
|
|
|
the lexicon search requires that the match is the whole example source.
|
|
|
|
\param hashGenerator hash generator to be used to convert
|
|
|
|
input sentence to a hash
|
|
|
|
\param T hashed index to search in
|
|
|
|
\param markers markers array for the needs of searching
|
|
|
|
\param SA suffix array for the needs of searching
|
|
|
|
\param pattern string pattern to be searched in the index.
|
|
|
|
\returns matched pattern fragment, containing occurences of the pattern in the index
|
|
|
|
\throws ConcordiaException
|
|
|
|
*/
|
|
|
|
MatchedPatternFragment lexiconSearch(
|
|
|
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
|
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
2013-12-14 15:23:17 +01:00
|
|
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
2017-04-22 23:45:51 +02:00
|
|
|
const std::string & pattern,
|
2019-01-18 13:30:51 +01:00
|
|
|
bool byWhitespace = false);
|
2014-03-11 14:29:30 +01:00
|
|
|
|
2015-10-01 13:36:54 +02:00
|
|
|
SUFFIX_MARKER_TYPE countOccurences(
|
|
|
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
|
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
|
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
2019-01-18 13:30:51 +01:00
|
|
|
const std::string & pattern);
|
2015-10-01 13:36:54 +02:00
|
|
|
|
2015-05-01 14:52:53 +02:00
|
|
|
/*! \deprecated
|
|
|
|
Finds the examples from the index, whose resemblance to the
|
|
|
|
pattern is maximal. This method may perform very slow,
|
|
|
|
try using concordiaSearch instead.
|
|
|
|
\param config concordia config object
|
|
|
|
(to read the anubis threshold parameter)
|
|
|
|
\param hashGenerator hash generator to be used to convert
|
|
|
|
input sentence to a hash
|
|
|
|
\param T hashed index to search in
|
|
|
|
\param markers markers array for the needs of searching
|
|
|
|
\param SA suffix array for the needs of searching
|
|
|
|
\param pattern string pattern to be searched in the index.
|
2017-04-22 23:45:51 +02:00
|
|
|
\param byWhitespace whether to tokenize the pattern by white space
|
2015-05-01 14:52:53 +02:00
|
|
|
\returns vector of results
|
|
|
|
\throws ConcordiaException
|
|
|
|
*/
|
2015-04-15 14:14:10 +02:00
|
|
|
std::vector<AnubisSearchResult> anubisSearch(
|
2015-04-16 11:39:39 +02:00
|
|
|
boost::shared_ptr<ConcordiaConfig> config,
|
2014-03-11 14:29:30 +01:00
|
|
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
|
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
|
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
2019-01-18 13:30:51 +01:00
|
|
|
const std::string & pattern);
|
2015-04-17 14:17:59 +02:00
|
|
|
|
2015-05-01 14:52:53 +02:00
|
|
|
/*! Performs concordia lookup on the RAM-based index.
|
|
|
|
This is a unique library functionality, designed
|
|
|
|
to facilitate Computer-Aided Translation.
|
|
|
|
For more info see \ref tutorial1_3.
|
|
|
|
\param hashGenerator hash generator to be used to convert
|
|
|
|
input sentence to a hash
|
|
|
|
\param T hashed index to search in
|
|
|
|
\param markers markers array for the needs of searching
|
|
|
|
\param SA suffix array for the needs of searching
|
|
|
|
\param pattern pattern to be searched in the index.
|
2017-04-22 23:45:51 +02:00
|
|
|
\param byWhitespace whether to tokenize the pattern by white space
|
2015-05-01 14:52:53 +02:00
|
|
|
\returns result of the search
|
|
|
|
\throws ConcordiaException
|
|
|
|
*/
|
2015-04-17 14:17:59 +02:00
|
|
|
boost::shared_ptr<ConcordiaSearchResult> concordiaSearch(
|
|
|
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
|
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
|
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
2017-04-22 23:45:51 +02:00
|
|
|
const std::string & pattern,
|
2019-01-18 13:30:51 +01:00
|
|
|
bool byWhitespace = false);
|
2015-04-17 14:17:59 +02:00
|
|
|
|
2013-11-28 16:47:57 +01:00
|
|
|
private:
|
2015-04-24 11:48:32 +02:00
|
|
|
boost::shared_ptr<ConcordiaSearcher> _concordiaSearcher;
|
2013-11-28 16:47:57 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
#endif
|