2014-05-14 16:29:44 +02:00
|
|
|
#ifndef ANUBIS_SEARCHER_HDR
|
|
|
|
#define ANUBIS_SEARCHER_HDR
|
|
|
|
|
|
|
|
#include <boost/shared_ptr.hpp>
|
|
|
|
|
|
|
|
#include "concordia/common/config.hpp"
|
2014-05-15 22:20:31 +02:00
|
|
|
#include "concordia/common/utils.hpp"
|
2014-05-14 16:29:44 +02:00
|
|
|
#include "concordia/substring_occurence.hpp"
|
|
|
|
#include "concordia/concordia_exception.hpp"
|
2015-04-16 11:39:39 +02:00
|
|
|
#include "concordia/concordia_config.hpp"
|
2015-04-17 14:17:59 +02:00
|
|
|
#include "concordia/concordia_search_result.hpp"
|
2014-05-14 16:29:44 +02:00
|
|
|
#include "concordia/anubis_search_result.hpp"
|
2015-04-14 20:14:30 +02:00
|
|
|
#include "concordia/tm_matches.hpp"
|
2014-05-14 16:29:44 +02:00
|
|
|
|
2015-04-17 14:17:59 +02:00
|
|
|
#include <vector>
|
2014-05-14 16:29:44 +02:00
|
|
|
#include <divsufsort.h>
|
|
|
|
|
|
|
|
/*!
|
2015-05-01 14:52:53 +02:00
|
|
|
Class for searching using Concordia algorithm. All searches are performed
|
|
|
|
on data structures passed to the methods of this class by smart pointers.
|
2014-05-14 16:29:44 +02:00
|
|
|
|
|
|
|
*/
|
|
|
|
|
2015-04-24 11:48:32 +02:00
|
|
|
class ConcordiaSearcher {
|
2014-05-14 16:29:44 +02:00
|
|
|
public:
|
2015-04-24 11:48:32 +02:00
|
|
|
explicit ConcordiaSearcher();
|
2014-05-14 16:29:44 +02:00
|
|
|
|
|
|
|
/*! Destructor.
|
|
|
|
*/
|
2015-04-24 11:48:32 +02:00
|
|
|
virtual ~ConcordiaSearcher();
|
2014-05-14 16:29:44 +02:00
|
|
|
|
2015-05-01 14:52:53 +02:00
|
|
|
/*! Performs concordia lookup on the RAM-based index.
|
|
|
|
This is a unique library functionality, designed
|
|
|
|
to facilitate Computer-Aided Translation.
|
|
|
|
For more info see \ref tutorial1_3.
|
|
|
|
\param result variable to store the result
|
|
|
|
\param T hashed index to search in
|
|
|
|
\param markers markers array for the needs of searching
|
|
|
|
\param SA suffix array for the needs of searching
|
|
|
|
\param pattern pattern to be searched in the index.
|
|
|
|
This pattern needs to be hashed.
|
|
|
|
*/
|
2015-04-17 14:17:59 +02:00
|
|
|
void concordiaSearch(
|
|
|
|
boost::shared_ptr<ConcordiaSearchResult> result,
|
|
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
|
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
2019-01-18 13:30:51 +01:00
|
|
|
const std::vector<INDEX_CHARACTER_TYPE> & pattern);
|
2015-04-17 14:17:59 +02:00
|
|
|
|
2015-05-01 14:52:53 +02:00
|
|
|
/*! \deprecated
|
|
|
|
Finds the examples from the index, whose resemblance to the
|
|
|
|
pattern is maximal. This method may perform very slow,
|
|
|
|
try using concordiaSearch instead.
|
|
|
|
\param config concordia config object
|
|
|
|
(to read the anubis threshold parameter)
|
|
|
|
\param T hashed index to search in
|
|
|
|
\param markers markers array for the needs of searching
|
|
|
|
\param SA suffix array for the needs of searching
|
|
|
|
\param pattern pattern to be searched in the index.
|
|
|
|
This pattern needs to be hashed.
|
|
|
|
\returns vector of results
|
|
|
|
\throws ConcordiaException
|
|
|
|
*/
|
2015-04-15 10:55:26 +02:00
|
|
|
std::vector<AnubisSearchResult> anubisSearch(
|
2015-04-16 11:39:39 +02:00
|
|
|
boost::shared_ptr<ConcordiaConfig> config,
|
2014-05-15 22:20:31 +02:00
|
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
|
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
2019-01-18 13:30:51 +01:00
|
|
|
const std::vector<INDEX_CHARACTER_TYPE> & pattern);
|
2014-05-15 22:20:31 +02:00
|
|
|
|
2015-05-01 14:52:53 +02:00
|
|
|
/*! Generates map of all examples in the index which have
|
|
|
|
at least one word in common with the pattern. This method
|
|
|
|
is internally used in anubisSearch and may perform slow.
|
|
|
|
\param T hashed index to search in
|
|
|
|
\param markers markers array for the needs of searching
|
|
|
|
\param SA suffix array for the needs of searching
|
|
|
|
\param pattern pattern to be searched in the index.
|
|
|
|
This pattern needs to be hashed.
|
|
|
|
\returns generated map
|
|
|
|
\throws ConcordiaException
|
|
|
|
*/
|
2015-04-14 20:14:30 +02:00
|
|
|
boost::shared_ptr<TmMatchesMap> getTmMatches(
|
|
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
|
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
2019-01-18 13:30:51 +01:00
|
|
|
const std::vector<INDEX_CHARACTER_TYPE> & pattern);
|
2015-04-14 20:14:30 +02:00
|
|
|
|
2015-05-01 14:52:53 +02:00
|
|
|
/*! Looks for fragments in the index which have the longest
|
|
|
|
common prefix with the pattern. This method return the list of
|
|
|
|
locations of these longest fragments (as return value) and their
|
|
|
|
length in the length parameter. There is a tight limit on the number
|
|
|
|
of longest fragments (currently set to 3). This method is used in
|
|
|
|
conordiaSearch.
|
|
|
|
\param T hashed index to search in
|
|
|
|
\param markers markers array for the needs of searching
|
|
|
|
\param SA suffix array for the needs of searching
|
|
|
|
\param pattern pattern to be searched in the index.
|
|
|
|
This pattern needs to be hashed.
|
|
|
|
\param length the returned length of the longest fragments
|
|
|
|
\returns list of locations of the longest fragments
|
|
|
|
\throws ConcordiaException
|
|
|
|
*/
|
2015-04-15 10:55:26 +02:00
|
|
|
std::vector<SubstringOccurence> lcpSearch(
|
2014-05-14 16:29:44 +02:00
|
|
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
|
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
|
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
2015-04-15 10:55:26 +02:00
|
|
|
const std::vector<sauchar_t> & pattern,
|
2019-01-18 13:30:51 +01:00
|
|
|
SUFFIX_MARKER_TYPE & length);
|
2014-05-15 22:20:31 +02:00
|
|
|
|
2014-05-14 16:29:44 +02:00
|
|
|
private:
|
2015-04-15 14:14:10 +02:00
|
|
|
void _collectResults(std::vector<SubstringOccurence> & result,
|
2014-05-15 22:20:31 +02:00
|
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
|
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
|
|
|
saidx_t left, saidx_t size);
|
2015-04-14 20:14:30 +02:00
|
|
|
|
|
|
|
void _addToMap(boost::shared_ptr<std::vector<saidx_t> > SA,
|
|
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
|
|
boost::shared_ptr<TmMatchesMap> tmMatchesMap,
|
|
|
|
saidx_t sa_pos,
|
|
|
|
SUFFIX_MARKER_TYPE totalPatternLength,
|
|
|
|
SUFFIX_MARKER_TYPE matchedFragmentLength,
|
|
|
|
SUFFIX_MARKER_TYPE patternOffset);
|
|
|
|
|
|
|
|
bool _getOccurenceFromSA(boost::shared_ptr<std::vector<saidx_t> > SA,
|
2015-04-15 14:14:10 +02:00
|
|
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
|
|
|
saidx_t sa_pos,
|
|
|
|
SubstringOccurence & occurence);
|
2015-04-14 20:14:30 +02:00
|
|
|
|
|
|
|
void _addOccurenceToMap(boost::shared_ptr<TmMatchesMap> tmMatchesMap,
|
|
|
|
SubstringOccurence & occurence,
|
|
|
|
SUFFIX_MARKER_TYPE totalPatternLength,
|
|
|
|
SUFFIX_MARKER_TYPE matchedFragmentLength,
|
|
|
|
SUFFIX_MARKER_TYPE patternOffset);
|
2014-05-14 16:29:44 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
#endif
|