finished documentation
This commit is contained in:
parent
9e550ca1cf
commit
abbd5b1ae8
12
Doxyfile.in
12
Doxyfile.in
@ -1355,18 +1355,6 @@ GENERATE_XML = NO
|
|||||||
|
|
||||||
XML_OUTPUT = xml
|
XML_OUTPUT = xml
|
||||||
|
|
||||||
# The XML_SCHEMA tag can be used to specify an XML schema,
|
|
||||||
# which can be used by a validating XML parser to check the
|
|
||||||
# syntax of the XML files.
|
|
||||||
|
|
||||||
XML_SCHEMA =
|
|
||||||
|
|
||||||
# The XML_DTD tag can be used to specify an XML DTD,
|
|
||||||
# which can be used by a validating XML parser to check the
|
|
||||||
# syntax of the XML files.
|
|
||||||
|
|
||||||
XML_DTD =
|
|
||||||
|
|
||||||
# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
|
# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
|
||||||
# dump the program listings (including syntax highlighting
|
# dump the program listings (including syntax highlighting
|
||||||
# and cross-referencing information) to the XML output. Note that
|
# and cross-referencing information) to the XML output. Note that
|
||||||
|
@ -43,6 +43,7 @@ public:
|
|||||||
bool operator > (const AnubisSearchResult & other) const {
|
bool operator > (const AnubisSearchResult & other) const {
|
||||||
return (_score > other.getScore());
|
return (_score > other.getScore());
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
SUFFIX_MARKER_TYPE _exampleId;
|
SUFFIX_MARKER_TYPE _exampleId;
|
||||||
|
|
||||||
|
@ -20,7 +20,6 @@
|
|||||||
|
|
||||||
class Utils {
|
class Utils {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
/*! Constructor
|
/*! Constructor
|
||||||
*/
|
*/
|
||||||
explicit Utils();
|
explicit Utils();
|
||||||
|
@ -19,6 +19,18 @@
|
|||||||
|
|
||||||
/*!
|
/*!
|
||||||
The Concordia class is the main access point to the library.
|
The Concordia class is the main access point to the library.
|
||||||
|
This class holds references to three out of four main data
|
||||||
|
structures used by Concordia: hashed index, markers array
|
||||||
|
and suffix array. Word map is maintained by the class
|
||||||
|
HashGenerator. Concordia has references to:
|
||||||
|
- the hash generator (HashGenerator)
|
||||||
|
- concordia index (ConcordiaIndex)
|
||||||
|
- concordia searcher (ConcordiaSearcher)
|
||||||
|
- configuration (ConcordiaConfig)
|
||||||
|
|
||||||
|
Whenever it is necessary, the data structures and tools
|
||||||
|
held by Concordia are passed by smart pointers to methods which
|
||||||
|
carry out specific functionalities.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@ -85,7 +97,7 @@ public:
|
|||||||
|
|
||||||
/*! Loads HDD stored index files to RAM and generates
|
/*! Loads HDD stored index files to RAM and generates
|
||||||
suffix array based on RAM stored data structures.
|
suffix array based on RAM stored data structures.
|
||||||
For more info see \ref tutorial2.
|
For more info see \ref tutorial2.
|
||||||
\throws ConcordiaException
|
\throws ConcordiaException
|
||||||
*/
|
*/
|
||||||
void loadRAMIndexFromDisk() throw(ConcordiaException);
|
void loadRAMIndexFromDisk() throw(ConcordiaException);
|
||||||
|
@ -14,12 +14,22 @@
|
|||||||
#include <divsufsort.h>
|
#include <divsufsort.h>
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class for creating and maintaining the index.
|
Class for creating and maintaining the index. This class
|
||||||
|
does not hold the index data structures but only operates on
|
||||||
|
them when they are passed to ConcordiaIndex methods by
|
||||||
|
smart pointers. This class only remembers paths to two
|
||||||
|
files: hashed index and markers array, which are backups
|
||||||
|
of the respective data structures on HDD.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
class ConcordiaIndex {
|
class ConcordiaIndex {
|
||||||
public:
|
public:
|
||||||
|
/*! Constructor.
|
||||||
|
\param hashedIndexFilePath path to the hashed index file
|
||||||
|
\param markersFilePath path to the markers array
|
||||||
|
\throws ConcordiaException
|
||||||
|
*/
|
||||||
explicit ConcordiaIndex(const std::string & hashedIndexFilePath,
|
explicit ConcordiaIndex(const std::string & hashedIndexFilePath,
|
||||||
const std::string & markersFilePath)
|
const std::string & markersFilePath)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
@ -28,23 +38,50 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~ConcordiaIndex();
|
virtual ~ConcordiaIndex();
|
||||||
|
|
||||||
|
/*! Adds an Example to the index. Example is first hashed using
|
||||||
|
the hash generator passed to this method. Then, hashed index
|
||||||
|
and markers array (also passed to this method) are appended
|
||||||
|
with the hashed example. At the same time, HDD versions of these
|
||||||
|
two data structures are also appended with the same example.
|
||||||
|
\param hashGenerator hash generator to be used to prepare the hash
|
||||||
|
of the example
|
||||||
|
\param T RAM-based hash index to be appended to
|
||||||
|
\param markers RAM-based markers array to be appended to
|
||||||
|
\param example example to be added to index
|
||||||
|
\throws ConcordiaException
|
||||||
|
*/
|
||||||
void addExample(
|
void addExample(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
const Example & example);
|
const Example & example);
|
||||||
|
|
||||||
|
/*! Adds multiple examples to the index. Examples are first hashed using
|
||||||
|
the hash generator passed to this method. Then, hashed index
|
||||||
|
and markers array (also passed to this method) are appended
|
||||||
|
with the hashed examples. At the same time, HDD versions of these
|
||||||
|
two data structures are also appended with the same examples.
|
||||||
|
\param hashGenerator hash generator to be used to prepare the hash
|
||||||
|
of the example
|
||||||
|
\param T RAM-based hash index to be appended to
|
||||||
|
\param markers RAM-based markers array to be appended to
|
||||||
|
\param examples vector of examples to be added to index
|
||||||
|
\throws ConcordiaException
|
||||||
|
*/
|
||||||
void addAllExamples(
|
void addAllExamples(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
const std::vector<Example> & examples);
|
const std::vector<Example> & examples);
|
||||||
|
|
||||||
|
/*! Generates suffix array based on the passed hashed index.
|
||||||
|
\returns the generated suffix array
|
||||||
|
\throws ConcordiaException
|
||||||
|
*/
|
||||||
boost::shared_ptr<std::vector<saidx_t> > generateSuffixArray(
|
boost::shared_ptr<std::vector<saidx_t> > generateSuffixArray(
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T);
|
boost::shared_ptr<std::vector<sauchar_t> > T);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Add example to disk index and update RAM index.
|
|
||||||
void _addSingleExample(std::ofstream & hashedIndexFile,
|
void _addSingleExample(std::ofstream & hashedIndexFile,
|
||||||
std::ofstream & markersFile,
|
std::ofstream & markersFile,
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
|
@ -23,12 +23,11 @@ void ConcordiaSearchResult::sortFragments() {
|
|||||||
std::greater<MatchedPatternFragment>());
|
std::greater<MatchedPatternFragment>());
|
||||||
}
|
}
|
||||||
|
|
||||||
void ConcordiaSearchResult::computeBestOverlay(
|
void ConcordiaSearchResult::computeBestOverlay() {
|
||||||
SUFFIX_MARKER_TYPE patternSize) {
|
|
||||||
// the fragments are already sorted by their ends, ascending
|
// the fragments are already sorted by their ends, ascending
|
||||||
_checkPossibleOverlays(std::vector<MatchedPatternFragment>(),
|
_checkPossibleOverlays(std::vector<MatchedPatternFragment>(),
|
||||||
-1,
|
-1,
|
||||||
patternSize);
|
_tokenVector.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
void ConcordiaSearchResult::_checkPossibleOverlays(
|
void ConcordiaSearchResult::_checkPossibleOverlays(
|
||||||
|
@ -8,12 +8,22 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class representing result of concordia search.
|
Class representing result of concordia search. Contains the following
|
||||||
|
information:
|
||||||
|
- tokenized pattern which was used for searching
|
||||||
|
- list of longest matched fragments sorted in descending order by length
|
||||||
|
- the best overlay
|
||||||
|
- the score of the best overlay.
|
||||||
|
|
||||||
|
For more info about concordia searching see \ref tutorial1_3.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
class ConcordiaSearchResult {
|
class ConcordiaSearchResult {
|
||||||
public:
|
public:
|
||||||
|
/*! Constructor.
|
||||||
|
\param tokenVector tokenized pattern which was used for searching
|
||||||
|
*/
|
||||||
explicit ConcordiaSearchResult(
|
explicit ConcordiaSearchResult(
|
||||||
const std::vector<std::string> & tokenVector);
|
const std::vector<std::string> & tokenVector);
|
||||||
|
|
||||||
@ -21,24 +31,45 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~ConcordiaSearchResult();
|
virtual ~ConcordiaSearchResult();
|
||||||
|
|
||||||
|
/*! Adds a matched pattern fragment to the list.
|
||||||
|
\param fragment fragment to be added
|
||||||
|
*/
|
||||||
void addFragment(const MatchedPatternFragment & fragment);
|
void addFragment(const MatchedPatternFragment & fragment);
|
||||||
|
|
||||||
|
/*! Sorts the list of matched pattern fragments in descending order
|
||||||
|
by length.
|
||||||
|
*/
|
||||||
void sortFragments();
|
void sortFragments();
|
||||||
|
|
||||||
void computeBestOverlay(SUFFIX_MARKER_TYPE patternSize);
|
/*! Computes the best overlay by choosing appropriate fragments
|
||||||
|
from the fragments list. For more info see \ref tutorial1_3.
|
||||||
|
*/
|
||||||
|
void computeBestOverlay();
|
||||||
|
|
||||||
|
/*! Getter for tokenized pattern.
|
||||||
|
\returns tokenized search pattern
|
||||||
|
*/
|
||||||
std::vector<std::string> getTokenVector() const {
|
std::vector<std::string> getTokenVector() const {
|
||||||
return _tokenVector;
|
return _tokenVector;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Getter for all matched pattern fragments list.
|
||||||
|
\returns matched pattern fragments list
|
||||||
|
*/
|
||||||
std::vector<MatchedPatternFragment> getFragments() const {
|
std::vector<MatchedPatternFragment> getFragments() const {
|
||||||
return _matchedPatternFragments;
|
return _matchedPatternFragments;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Getter for best overlay.
|
||||||
|
\returns list of fragments that comprise the best overlay
|
||||||
|
*/
|
||||||
std::vector<MatchedPatternFragment> getBestOverlay() const {
|
std::vector<MatchedPatternFragment> getBestOverlay() const {
|
||||||
return _bestOverlay;
|
return _bestOverlay;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Getter for best overlay score.
|
||||||
|
\returns score of the best overlay
|
||||||
|
*/
|
||||||
double getBestOverlayScore() const {
|
double getBestOverlayScore() const {
|
||||||
return _bestOverlayScore;
|
return _bestOverlayScore;
|
||||||
}
|
}
|
||||||
|
@ -46,7 +46,7 @@ void ConcordiaSearcher::concordiaSearch(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// compute best overlay of the pattern by matched fragments
|
// compute best overlay of the pattern by matched fragments
|
||||||
result->computeBestOverlay(pattern.size());
|
result->computeBestOverlay();
|
||||||
|
|
||||||
result->sortFragments();
|
result->sortFragments();
|
||||||
}
|
}
|
||||||
|
@ -16,7 +16,8 @@
|
|||||||
#include <divsufsort.h>
|
#include <divsufsort.h>
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class for searching using Concordia algorithm.
|
Class for searching using Concordia algorithm. All searches are performed
|
||||||
|
on data structures passed to the methods of this class by smart pointers.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@ -28,6 +29,18 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~ConcordiaSearcher();
|
virtual ~ConcordiaSearcher();
|
||||||
|
|
||||||
|
/*! Performs concordia lookup on the RAM-based index.
|
||||||
|
This is a unique library functionality, designed
|
||||||
|
to facilitate Computer-Aided Translation.
|
||||||
|
For more info see \ref tutorial1_3.
|
||||||
|
\param result variable to store the result
|
||||||
|
\param T hashed index to search in
|
||||||
|
\param markers markers array for the needs of searching
|
||||||
|
\param SA suffix array for the needs of searching
|
||||||
|
\param pattern pattern to be searched in the index.
|
||||||
|
This pattern needs to be hashed.
|
||||||
|
\throws ConcordiaException
|
||||||
|
*/
|
||||||
void concordiaSearch(
|
void concordiaSearch(
|
||||||
boost::shared_ptr<ConcordiaSearchResult> result,
|
boost::shared_ptr<ConcordiaSearchResult> result,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
@ -36,6 +49,20 @@ public:
|
|||||||
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
|
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
|
/*! \deprecated
|
||||||
|
Finds the examples from the index, whose resemblance to the
|
||||||
|
pattern is maximal. This method may be very slow,
|
||||||
|
try using concordiaSearch instead.
|
||||||
|
\param config concordia config object
|
||||||
|
(to read the anubis threshold parameter)
|
||||||
|
\param T hashed index to search in
|
||||||
|
\param markers markers array for the needs of searching
|
||||||
|
\param SA suffix array for the needs of searching
|
||||||
|
\param pattern pattern to be searched in the index.
|
||||||
|
This pattern needs to be hashed.
|
||||||
|
\returns vector of results
|
||||||
|
\throws ConcordiaException
|
||||||
|
*/
|
||||||
std::vector<AnubisSearchResult> anubisSearch(
|
std::vector<AnubisSearchResult> anubisSearch(
|
||||||
boost::shared_ptr<ConcordiaConfig> config,
|
boost::shared_ptr<ConcordiaConfig> config,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
@ -44,6 +71,17 @@ public:
|
|||||||
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
|
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
|
/*! Generates map of all examples in the index which have
|
||||||
|
at least one word in common with the pattern. This method
|
||||||
|
is used internally by anubisSearch and may be slow.
|
||||||
|
\param T hashed index to search in
|
||||||
|
\param markers markers array for the needs of searching
|
||||||
|
\param SA suffix array for the needs of searching
|
||||||
|
\param pattern pattern to be searched in the index.
|
||||||
|
This pattern needs to be hashed.
|
||||||
|
\returns generated map
|
||||||
|
\throws ConcordiaException
|
||||||
|
*/
|
||||||
boost::shared_ptr<TmMatchesMap> getTmMatches(
|
boost::shared_ptr<TmMatchesMap> getTmMatches(
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
@ -51,6 +89,21 @@ public:
|
|||||||
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
|
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
|
/*! Looks for fragments in the index which have the longest
|
||||||
|
common prefix with the pattern. This method returns the list of
|
||||||
|
locations of these longest fragments (as return value) and their
|
||||||
|
length in the length parameter. There is a tight limit on the number
|
||||||
|
of longest fragments (currently set to 3). This method is used in
|
||||||
|
concordiaSearch.
|
||||||
|
\param T hashed index to search in
|
||||||
|
\param markers markers array for the needs of searching
|
||||||
|
\param SA suffix array for the needs of searching
|
||||||
|
\param pattern pattern to be searched in the index.
|
||||||
|
This pattern needs to be hashed.
|
||||||
|
\param length the returned length of the longest fragments
|
||||||
|
\returns list of locations of the longest fragments
|
||||||
|
\throws ConcordiaException
|
||||||
|
*/
|
||||||
std::vector<SubstringOccurence> lcpSearch(
|
std::vector<SubstringOccurence> lcpSearch(
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
@ -7,11 +7,17 @@
|
|||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class representing a single sentence to be added into index along with its id.
|
Class representing a single sentence to be added into index along with its id.
|
||||||
|
For more info see \ref tutorial1_2.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
class Example {
|
class Example {
|
||||||
public:
|
public:
|
||||||
|
/*!
|
||||||
|
Constructor.
|
||||||
|
\param sentence sentence to be added to index
|
||||||
|
\param id id of this sentence
|
||||||
|
*/
|
||||||
explicit Example(const std::string & sentence,
|
explicit Example(const std::string & sentence,
|
||||||
const SUFFIX_MARKER_TYPE & id)
|
const SUFFIX_MARKER_TYPE & id)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
@ -20,10 +26,16 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~Example();
|
virtual ~Example();
|
||||||
|
|
||||||
|
/*! Getter for sentence.
|
||||||
|
\return sentence
|
||||||
|
*/
|
||||||
std::string getSentence() const {
|
std::string getSentence() const {
|
||||||
return _sentence;
|
return _sentence;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Getter for sentence id.
|
||||||
|
\return sentence id
|
||||||
|
*/
|
||||||
SUFFIX_MARKER_TYPE getId() const {
|
SUFFIX_MARKER_TYPE getId() const {
|
||||||
return _id;
|
return _id;
|
||||||
}
|
}
|
||||||
|
@ -14,12 +14,24 @@
|
|||||||
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class for generating a sentence hash.
|
Class for generating a sentence hash. The hash is generated from a sentence
|
||||||
|
given in raw string. String is first anonymized and tokenized. After these
|
||||||
|
operations, each token is coded as an integer, according to WordMap.
|
||||||
|
Resulting hash is a vector of integers.
|
||||||
|
|
||||||
|
The sentence hash is used when adding a sentence to the index and during searching.
|
||||||
|
|
||||||
|
HashGenerator holds an instance of WordMap, used to code tokens as integers
|
||||||
|
and SentenceAnonymizer, used to preprocess the sentence string.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
class HashGenerator {
|
class HashGenerator {
|
||||||
public:
|
public:
|
||||||
|
/*!
|
||||||
|
Constructor.
|
||||||
|
\param config pointer to current config object
|
||||||
|
*/
|
||||||
explicit HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
|
explicit HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
@ -27,11 +39,28 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~HashGenerator();
|
virtual ~HashGenerator();
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Generates hash of a sentence.
|
||||||
|
\param sentence sentence to generate hash from
|
||||||
|
\returns vector of integers
|
||||||
|
*/
|
||||||
std::vector<INDEX_CHARACTER_TYPE> generateHash(const std::string & sentence)
|
std::vector<INDEX_CHARACTER_TYPE> generateHash(const std::string & sentence)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Generates vector of tokens from a sentence. This method is internally
|
||||||
|
used by generateHash. However, for the sake of concordiaSearch
|
||||||
|
(see \ref tutorial1_3), the vector of tokens resulting from sentence
|
||||||
|
anonymizing and tokenization is also needed.
|
||||||
|
\param sentence sentence to tokenize
|
||||||
|
\returns vector of tokens
|
||||||
|
*/
|
||||||
std::vector<std::string> generateTokenVector(const std::string & sentence);
|
std::vector<std::string> generateTokenVector(const std::string & sentence);
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Saves the contents of current WordMap to HDD.
|
||||||
|
*/
|
||||||
void serializeWordMap();
|
void serializeWordMap();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -16,18 +16,35 @@
|
|||||||
#include <divsufsort.h>
|
#include <divsufsort.h>
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class for searching the index with a sentence.
|
Class for searching the index with a sentence. In all searches the sentence
|
||||||
|
is first hashed and then used as a query.
|
||||||
|
|
||||||
|
IndexSearcher performs the simpleSearch on its own, but uses a
|
||||||
|
ConcordiaSearcher object to carry out concordiaSearch.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
class IndexSearcher {
|
class IndexSearcher {
|
||||||
public:
|
public:
|
||||||
|
/*! Constructor.
|
||||||
|
*/
|
||||||
explicit IndexSearcher();
|
explicit IndexSearcher();
|
||||||
|
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
virtual ~IndexSearcher();
|
virtual ~IndexSearcher();
|
||||||
|
|
||||||
|
/*! Performs a simple substring lookup in RAM-based index.
|
||||||
|
For more info see \ref tutorial1_2.
|
||||||
|
\param hashGenerator hash generator to be used to convert
|
||||||
|
input sentence to a hash
|
||||||
|
\param T hashed index to search in
|
||||||
|
\param markers markers array for the needs of searching
|
||||||
|
\param SA suffix array for the needs of searching
|
||||||
|
\param pattern string pattern to be searched in the index.
|
||||||
|
\returns vector of occurrences of the pattern in the index
|
||||||
|
\throws ConcordiaException
|
||||||
|
*/
|
||||||
std::vector<SubstringOccurence> simpleSearch(
|
std::vector<SubstringOccurence> simpleSearch(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
@ -35,6 +52,21 @@ public:
|
|||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
const std::string & pattern) throw(ConcordiaException);
|
const std::string & pattern) throw(ConcordiaException);
|
||||||
|
|
||||||
|
/*! \deprecated
|
||||||
|
Finds the examples from the index, whose resemblance to the
|
||||||
|
pattern is maximal. This method may be very slow,
|
||||||
|
try using concordiaSearch instead.
|
||||||
|
\param config concordia config object
|
||||||
|
(to read the anubis threshold parameter)
|
||||||
|
\param hashGenerator hash generator to be used to convert
|
||||||
|
input sentence to a hash
|
||||||
|
\param T hashed index to search in
|
||||||
|
\param markers markers array for the needs of searching
|
||||||
|
\param SA suffix array for the needs of searching
|
||||||
|
\param pattern string pattern to be searched in the index.
|
||||||
|
\returns vector of results
|
||||||
|
\throws ConcordiaException
|
||||||
|
*/
|
||||||
std::vector<AnubisSearchResult> anubisSearch(
|
std::vector<AnubisSearchResult> anubisSearch(
|
||||||
boost::shared_ptr<ConcordiaConfig> config,
|
boost::shared_ptr<ConcordiaConfig> config,
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
@ -43,6 +75,19 @@ public:
|
|||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
const std::string & pattern) throw(ConcordiaException);
|
const std::string & pattern) throw(ConcordiaException);
|
||||||
|
|
||||||
|
/*! Performs concordia lookup on the RAM-based index.
|
||||||
|
This is a unique library functionality, designed
|
||||||
|
to facilitate Computer-Aided Translation.
|
||||||
|
For more info see \ref tutorial1_3.
|
||||||
|
\param hashGenerator hash generator to be used to convert
|
||||||
|
input sentence to a hash
|
||||||
|
\param T hashed index to search in
|
||||||
|
\param markers markers array for the needs of searching
|
||||||
|
\param SA suffix array for the needs of searching
|
||||||
|
\param pattern pattern to be searched in the index.
|
||||||
|
\returns result of the search
|
||||||
|
\throws ConcordiaException
|
||||||
|
*/
|
||||||
boost::shared_ptr<ConcordiaSearchResult> concordiaSearch(
|
boost::shared_ptr<ConcordiaSearchResult> concordiaSearch(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
|
@ -4,12 +4,20 @@
|
|||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class representing word interval.
|
Class representing interval of a sentence, i.e. a sequence of words
|
||||||
|
coming from that sentence. An interval only has its start and end indexes,
|
||||||
|
where the start index is inclusive and end index is exclusive. For example,
|
||||||
|
the interval with start 2 and end 5 of the sentence "This is just for testing purposes" is:
|
||||||
|
"just for testing".
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
class Interval {
|
class Interval {
|
||||||
public:
|
public:
|
||||||
|
/*! Constructor.
|
||||||
|
\param start start index of the interval (0-based)
|
||||||
|
\param end end index of the interval (0-based)
|
||||||
|
*/
|
||||||
explicit Interval(const SUFFIX_MARKER_TYPE start,
|
explicit Interval(const SUFFIX_MARKER_TYPE start,
|
||||||
const SUFFIX_MARKER_TYPE end);
|
const SUFFIX_MARKER_TYPE end);
|
||||||
|
|
||||||
@ -17,14 +25,27 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~Interval();
|
virtual ~Interval();
|
||||||
|
|
||||||
|
/*! Checks if this interval intersects another.
|
||||||
|
\param interval another interval
|
||||||
|
\returns true if the two intervals intersect
|
||||||
|
*/
|
||||||
bool intersects(Interval & interval);
|
bool intersects(Interval & interval);
|
||||||
|
|
||||||
|
/*! Getter for interval length.
|
||||||
|
\returns end - start
|
||||||
|
*/
|
||||||
SUFFIX_MARKER_TYPE getLength();
|
SUFFIX_MARKER_TYPE getLength();
|
||||||
|
|
||||||
|
/*! Getter for interval start.
|
||||||
|
\returns start
|
||||||
|
*/
|
||||||
SUFFIX_MARKER_TYPE getStart() const {
|
SUFFIX_MARKER_TYPE getStart() const {
|
||||||
return _start;
|
return _start;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Getter for interval end.
|
||||||
|
\returns end
|
||||||
|
*/
|
||||||
SUFFIX_MARKER_TYPE getEnd() const {
|
SUFFIX_MARKER_TYPE getEnd() const {
|
||||||
return _end;
|
return _end;
|
||||||
}
|
}
|
||||||
|
@ -7,10 +7,21 @@
|
|||||||
/*!
|
/*!
|
||||||
Class representing matched pattern fragment in concordia search.
|
Class representing matched pattern fragment in concordia search.
|
||||||
This fragment can be seen as an interval of the pattern.
|
This fragment can be seen as an interval of the pattern.
|
||||||
|
|
||||||
|
This class holds information about:
|
||||||
|
- where the pattern fragment was matched (example id and example offset)
|
||||||
|
- where the fragment is located within the pattern
|
||||||
|
(patternOffset, matchedLength)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
class MatchedPatternFragment : public Interval {
|
class MatchedPatternFragment : public Interval {
|
||||||
public:
|
public:
|
||||||
|
/*! Constructor.
|
||||||
|
\param exampleId id of the example where the pattern fragment was matched
|
||||||
|
\param exampleOffset offset of the matched fragment in the example
|
||||||
|
\param patternOffset offset of the matched fragment in the pattern
|
||||||
|
\param matchedLength length of the matched pattern
|
||||||
|
*/
|
||||||
MatchedPatternFragment(const SUFFIX_MARKER_TYPE & exampleId,
|
MatchedPatternFragment(const SUFFIX_MARKER_TYPE & exampleId,
|
||||||
const SUFFIX_MARKER_TYPE & exampleOffset,
|
const SUFFIX_MARKER_TYPE & exampleOffset,
|
||||||
const SUFFIX_MARKER_TYPE & patternOffset,
|
const SUFFIX_MARKER_TYPE & patternOffset,
|
||||||
@ -19,22 +30,37 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~MatchedPatternFragment();
|
virtual ~MatchedPatternFragment();
|
||||||
|
|
||||||
|
/*! Getter for example id.
|
||||||
|
\returns example id
|
||||||
|
*/
|
||||||
SUFFIX_MARKER_TYPE getExampleId() const {
|
SUFFIX_MARKER_TYPE getExampleId() const {
|
||||||
return _exampleId;
|
return _exampleId;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Getter for example offset.
|
||||||
|
\returns example offset
|
||||||
|
*/
|
||||||
SUFFIX_MARKER_TYPE getExampleOffset() const {
|
SUFFIX_MARKER_TYPE getExampleOffset() const {
|
||||||
return _exampleOffset;
|
return _exampleOffset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Getter for pattern offset.
|
||||||
|
\returns pattern offset
|
||||||
|
*/
|
||||||
SUFFIX_MARKER_TYPE getPatternOffset() const {
|
SUFFIX_MARKER_TYPE getPatternOffset() const {
|
||||||
return _patternOffset;
|
return _patternOffset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Getter for matched length.
|
||||||
|
\returns matched fragment length
|
||||||
|
*/
|
||||||
SUFFIX_MARKER_TYPE getMatchedLength() const {
|
SUFFIX_MARKER_TYPE getMatchedLength() const {
|
||||||
return _matchedLength;
|
return _matchedLength;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Operator for comparing fragments by their length.
|
||||||
|
\returns true if current pattern is longer than the other
|
||||||
|
*/
|
||||||
bool operator > (const MatchedPatternFragment & other) const {
|
bool operator > (const MatchedPatternFragment & other) const {
|
||||||
return (_matchedLength > other.getMatchedLength());
|
return (_matchedLength > other.getMatchedLength());
|
||||||
}
|
}
|
||||||
|
@ -9,15 +9,23 @@
|
|||||||
#include <boost/regex/icu.hpp>
|
#include <boost/regex/icu.hpp>
|
||||||
|
|
||||||
|
|
||||||
/*!
|
|
||||||
Class for replacing string occurences.
|
|
||||||
|
|
||||||
*/
|
|
||||||
|
|
||||||
typedef boost::error_info<struct my_tag, std::string> my_tag_error_info;
|
typedef boost::error_info<struct my_tag, std::string> my_tag_error_info;
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Class for representing a regular expression replacement operation.
|
||||||
|
Holds regex pattern string for matching and replacement string for
|
||||||
|
replacing found matches.
|
||||||
|
|
||||||
|
*/
|
||||||
class RegexReplacement {
|
class RegexReplacement {
|
||||||
public:
|
public:
|
||||||
|
/*!
|
||||||
|
Constructor.
|
||||||
|
\param patternString regex pattern to match
|
||||||
|
\param replacement string to substitute the found match
|
||||||
|
\param caseSensitive case sensitivity of the pattern
|
||||||
|
*/
|
||||||
RegexReplacement(std::string patternString, std::string replacement,
|
RegexReplacement(std::string patternString, std::string replacement,
|
||||||
bool caseSensitive = true)
|
bool caseSensitive = true)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
@ -26,6 +34,10 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~RegexReplacement();
|
virtual ~RegexReplacement();
|
||||||
|
|
||||||
|
/*! Applies the operation on input string.
|
||||||
|
\param text the input string
|
||||||
|
\returns altered version of the input string
|
||||||
|
*/
|
||||||
std::string apply(const std::string & text);
|
std::string apply(const std::string & text);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -12,12 +12,20 @@
|
|||||||
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class for anonymizing sentence before adding to index.
|
Class for anonymizing sentence before generating hash.
|
||||||
|
This operation is used to
|
||||||
|
remove unnecessary symbols and possibly words from sentences added to index
|
||||||
|
and search patterns. Anonymizer removes html tags, substitutes predefined symbols
|
||||||
|
with a single space, removes stop words (if the option is enabled), as well as
|
||||||
|
named entities and special symbols. All these have to be listed in files
|
||||||
|
(see \ref tutorial3).
|
||||||
*/
|
*/
|
||||||
|
|
||||||
class SentenceAnonymizer {
|
class SentenceAnonymizer {
|
||||||
public:
|
public:
|
||||||
|
/*! Constructor.
|
||||||
|
\param config config object, holding paths to necessary files
|
||||||
|
*/
|
||||||
explicit SentenceAnonymizer(boost::shared_ptr<ConcordiaConfig> config)
|
explicit SentenceAnonymizer(boost::shared_ptr<ConcordiaConfig> config)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
@ -25,6 +33,10 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~SentenceAnonymizer();
|
virtual ~SentenceAnonymizer();
|
||||||
|
|
||||||
|
/*! Anonymizes the sentence.
|
||||||
|
\param sentence input sentence
|
||||||
|
\returns altered version of the input sentence
|
||||||
|
*/
|
||||||
std::string anonymize(const std::string & sentence);
|
std::string anonymize(const std::string & sentence);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -6,15 +6,32 @@
|
|||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class representing occurrence of a searched substring.
|
Class representing occurrence of a searched substring.
|
||||||
|
It holds the following information:
|
||||||
|
- id of the example where the substring was found
|
||||||
|
- offset of the matched substring in this example
|
||||||
|
- length of the example
|
||||||
*/
|
*/
|
||||||
|
|
||||||
class SubstringOccurence {
|
class SubstringOccurence {
|
||||||
public:
|
public:
|
||||||
|
/*!
|
||||||
|
Constructor.
|
||||||
|
|
||||||
|
*/
|
||||||
SubstringOccurence();
|
SubstringOccurence();
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Constructor taking data from a marker.
|
||||||
|
\param marker marker to read the data from
|
||||||
|
*/
|
||||||
explicit SubstringOccurence(const SUFFIX_MARKER_TYPE & marker);
|
explicit SubstringOccurence(const SUFFIX_MARKER_TYPE & marker);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Constructor with three arguments.
|
||||||
|
\param id example id
|
||||||
|
\param offset offset of the substring in the example
|
||||||
|
\param exampleLength length of the example
|
||||||
|
*/
|
||||||
SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
|
SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
|
||||||
const SUFFIX_MARKER_TYPE & offset,
|
const SUFFIX_MARKER_TYPE & offset,
|
||||||
const SUFFIX_MARKER_TYPE & exampleLength);
|
const SUFFIX_MARKER_TYPE & exampleLength);
|
||||||
@ -22,18 +39,30 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~SubstringOccurence();
|
virtual ~SubstringOccurence();
|
||||||
|
|
||||||
|
/*! Getter for example id.
|
||||||
|
\returns example id
|
||||||
|
*/
|
||||||
SUFFIX_MARKER_TYPE getId() const {
|
SUFFIX_MARKER_TYPE getId() const {
|
||||||
return _id;
|
return _id;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Getter for example offset.
|
||||||
|
\returns example offset
|
||||||
|
*/
|
||||||
SUFFIX_MARKER_TYPE getOffset() const {
|
SUFFIX_MARKER_TYPE getOffset() const {
|
||||||
return _offset;
|
return _offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Getter for example length.
|
||||||
|
\returns example length
|
||||||
|
*/
|
||||||
SUFFIX_MARKER_TYPE getExampleLength() const {
|
SUFFIX_MARKER_TYPE getExampleLength() const {
|
||||||
return _exampleLength;
|
return _exampleLength;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Setter of all the fields, based on input marker.
|
||||||
|
\param marker marker to read the data from
|
||||||
|
*/
|
||||||
void enterDataFromMarker(const SUFFIX_MARKER_TYPE & marker);
|
void enterDataFromMarker(const SUFFIX_MARKER_TYPE & marker);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -11,13 +11,25 @@
|
|||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class used within Anubis search algorithm to store partial results.
|
Class used within Anubis search algorithm to store partial results.
|
||||||
|
Holds information about mutual overlay of the pattern and found
|
||||||
|
example.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
class TmMatches {
|
class TmMatches {
|
||||||
public:
|
public:
|
||||||
|
/*!
|
||||||
|
Constructor.
|
||||||
|
|
||||||
|
*/
|
||||||
TmMatches();
|
TmMatches();
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Constructor setting basic information.
|
||||||
|
\param exampleId id of found example
|
||||||
|
\param exampleSize size of the found example
|
||||||
|
\param patternSize size of the searched pattern
|
||||||
|
*/
|
||||||
TmMatches(const SUFFIX_MARKER_TYPE exampleId,
|
TmMatches(const SUFFIX_MARKER_TYPE exampleId,
|
||||||
const SUFFIX_MARKER_TYPE exampleSize,
|
const SUFFIX_MARKER_TYPE exampleSize,
|
||||||
const SUFFIX_MARKER_TYPE patternSize);
|
const SUFFIX_MARKER_TYPE patternSize);
|
||||||
@ -26,28 +38,74 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~TmMatches();
|
virtual ~TmMatches();
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Getter for score of the mutual overlay.
|
||||||
|
\returns score
|
||||||
|
*/
|
||||||
double getScore() const {
|
double getScore() const {
|
||||||
return _score;
|
return _score;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Getter for the list of overlays of the example.
|
||||||
|
\returns example overlays list
|
||||||
|
*/
|
||||||
std::vector<Interval> getExampleIntervals() const {
|
std::vector<Interval> getExampleIntervals() const {
|
||||||
return _exampleMatchedRegions;
|
return _exampleMatchedRegions;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Getter for the list of overlays of the pattern.
|
||||||
|
\returns pattern overlays list
|
||||||
|
*/
|
||||||
std::vector<Interval> getPatternIntervals() const {
|
std::vector<Interval> getPatternIntervals() const {
|
||||||
return _patternMatchedRegions;
|
return _patternMatchedRegions;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Getter for example id.
|
||||||
|
\returns example id
|
||||||
|
*/
|
||||||
SUFFIX_MARKER_TYPE getExampleId() const {
|
SUFFIX_MARKER_TYPE getExampleId() const {
|
||||||
return _exampleId;
|
return _exampleId;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Calculates mutual overlay score in the scale [0,1].
|
||||||
|
Uses generalized Jaccard index for the computation.
|
||||||
|
Score 1 - perfect score - is assigned when the whole pattern
|
||||||
|
and the whole example are covered. Result of the computation
|
||||||
|
is stored in the score field, use getScore() to retrieve it.
|
||||||
|
*/
|
||||||
void calculateSimpleScore();
|
void calculateSimpleScore();
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Calculates mutual overlay score in the scale [0,1].
|
||||||
|
Takes into account the number and the length of the
|
||||||
|
fragments (the fewer fragments, the better).
|
||||||
|
Score 1 - perfect score - is assigned when the whole pattern
|
||||||
|
and the whole example are covered with only one fragment.
|
||||||
|
Result of the computation is stored in the score field,
|
||||||
|
use getScore() to retrieve it.
|
||||||
|
*/
|
||||||
void calculateScore();
|
void calculateScore();
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Adds information about covering of example. If the new
|
||||||
|
fragment intersects with any previous fragment, it is
|
||||||
|
not added.
|
||||||
|
\param start start of the example overlay fragment
|
||||||
|
\param end end of the example overlay fragment
|
||||||
|
*/
|
||||||
void addExampleInterval(int start, int end);
|
void addExampleInterval(int start, int end);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Adds information about covering of pattern. If the new
|
||||||
|
fragment intersects with any previous fragment, it is
|
||||||
|
not added.
|
||||||
|
\param start start of the pattern overlay fragment
|
||||||
|
\param end end of the pattern overlay fragment
|
||||||
|
*/
|
||||||
void addPatternInterval(int start, int end);
|
void addPatternInterval(int start, int end);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -10,18 +10,30 @@
|
|||||||
#include <boost/serialization/map.hpp>
|
#include <boost/serialization/map.hpp>
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class representing dictionary for word to int encoding.
|
Class representing dictionary for word to integer encoding.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
class WordMap {
|
class WordMap {
|
||||||
public:
|
public:
|
||||||
|
/*!
|
||||||
|
Constructor.
|
||||||
|
|
||||||
|
*/
|
||||||
explicit WordMap() throw(ConcordiaException);
|
explicit WordMap() throw(ConcordiaException);
|
||||||
|
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
virtual ~WordMap();
|
virtual ~WordMap();
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Gets the integer code of a token. If the token is found in
|
||||||
|
the dictionary, the dictionary code is returned. If not,
|
||||||
|
the word is added to the dictionary and its newly created
|
||||||
|
code is returned.
|
||||||
|
\param word token to generate the code for
|
||||||
|
\returns code of the token
|
||||||
|
*/
|
||||||
INDEX_CHARACTER_TYPE getWordCode(const std::string & word)
|
INDEX_CHARACTER_TYPE getWordCode(const std::string & word)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user