// concordia-library/concordia/concordia_searcher.hpp

#ifndef ANUBIS_SEARCHER_HDR
#define ANUBIS_SEARCHER_HDR
#include <boost/shared_ptr.hpp>
#include "concordia/common/config.hpp"
#include "concordia/common/utils.hpp"
#include "concordia/substring_occurrence.hpp"
#include "concordia/concordia_exception.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_search_result.hpp"
#include "concordia/anubis_search_result.hpp"
#include "concordia/tm_matches.hpp"
#include <vector>
#include <divsufsort.h>
/*!
2015-05-01 14:52:53 +02:00
Class for searching using Concordia algorithm. All searches are performed
on data structures passed to the methods of this class by smart pointers.
*/
class ConcordiaSearcher {
public:
explicit ConcordiaSearcher();
/*! Destructor.
*/
virtual ~ConcordiaSearcher();
2015-05-01 14:52:53 +02:00
/*! Performs concordia lookup on the RAM-based index.
This is a unique library functionality, designed
to facilitate Computer-Aided Translation.
For more info see \ref tutorial1_3.
\param result variable to store the result
\param T hashed index to search in
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern pattern to be searched in the index.
This pattern needs to be hashed.
*/
void concordiaSearch(
boost::shared_ptr<ConcordiaSearchResult> result,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
2019-01-18 13:30:51 +01:00
const std::vector<INDEX_CHARACTER_TYPE> & pattern);
2015-05-01 14:52:53 +02:00
/*! \deprecated
Finds the examples from the index, whose resemblance to the
pattern is maximal. This method may perform very slow,
try using concordiaSearch instead.
\param config concordia config object
(to read the anubis threshold parameter)
\param T hashed index to search in
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern pattern to be searched in the index.
This pattern needs to be hashed.
\returns vector of results
\throws ConcordiaException
*/
std::vector<AnubisSearchResult> anubisSearch(
boost::shared_ptr<ConcordiaConfig> config,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
2019-01-18 13:30:51 +01:00
const std::vector<INDEX_CHARACTER_TYPE> & pattern);
2015-05-01 14:52:53 +02:00
/*! Generates map of all examples in the index which have
at least one word in common with the pattern. This method
is internally used in anubisSearch and may perform slow.
\param T hashed index to search in
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern pattern to be searched in the index.
This pattern needs to be hashed.
\returns generated map
\throws ConcordiaException
*/
boost::shared_ptr<TmMatchesMap> getTmMatches(
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
2019-01-18 13:30:51 +01:00
const std::vector<INDEX_CHARACTER_TYPE> & pattern);
2015-05-01 14:52:53 +02:00
/*! Looks for fragments in the index which have the longest
common prefix with the pattern. This method return the list of
locations of these longest fragments (as return value) and their
length in the length parameter. There is a tight limit on the number
of longest fragments (currently set to 3). This method is used in
conordiaSearch.
\param T hashed index to search in
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern pattern to be searched in the index.
This pattern needs to be hashed.
\param length the returned length of the longest fragments
\returns list of locations of the longest fragments
\throws ConcordiaException
*/
2019-01-22 14:07:28 +01:00
std::vector<SubstringOccurrence> lcpSearch(
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::vector<sauchar_t> & pattern,
2019-01-18 13:30:51 +01:00
SUFFIX_MARKER_TYPE & length);
private:
2019-01-22 14:07:28 +01:00
void _collectResults(std::vector<SubstringOccurrence> & result,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
saidx_t left, saidx_t size);
void _addToMap(boost::shared_ptr<std::vector<saidx_t> > SA,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TmMatchesMap> tmMatchesMap,
saidx_t sa_pos,
SUFFIX_MARKER_TYPE totalPatternLength,
SUFFIX_MARKER_TYPE matchedFragmentLength,
SUFFIX_MARKER_TYPE patternOffset);
2019-01-22 14:07:28 +01:00
bool _getOccurrenceFromSA(boost::shared_ptr<std::vector<saidx_t> > SA,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
saidx_t sa_pos,
2019-01-22 14:07:28 +01:00
SubstringOccurrence & occurrence);
2019-01-22 14:07:28 +01:00
void _addOccurrenceToMap(boost::shared_ptr<TmMatchesMap> tmMatchesMap,
SubstringOccurrence & occurrence,
SUFFIX_MARKER_TYPE totalPatternLength,
SUFFIX_MARKER_TYPE matchedFragmentLength,
SUFFIX_MARKER_TYPE patternOffset);
};
#endif