finished documentation
This commit is contained in:
parent
9e550ca1cf
commit
abbd5b1ae8
12
Doxyfile.in
12
Doxyfile.in
@ -1355,18 +1355,6 @@ GENERATE_XML = NO
|
||||
|
||||
XML_OUTPUT = xml
|
||||
|
||||
# The XML_SCHEMA tag can be used to specify an XML schema,
|
||||
# which can be used by a validating XML parser to check the
|
||||
# syntax of the XML files.
|
||||
|
||||
XML_SCHEMA =
|
||||
|
||||
# The XML_DTD tag can be used to specify an XML DTD,
|
||||
# which can be used by a validating XML parser to check the
|
||||
# syntax of the XML files.
|
||||
|
||||
XML_DTD =
|
||||
|
||||
# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
|
||||
# dump the program listings (including syntax highlighting
|
||||
# and cross-referencing information) to the XML output. Note that
|
||||
|
@ -43,6 +43,7 @@ public:
|
||||
bool operator > (const AnubisSearchResult & other) const {
|
||||
return (_score > other.getScore());
|
||||
}
|
||||
|
||||
private:
|
||||
SUFFIX_MARKER_TYPE _exampleId;
|
||||
|
||||
|
@ -20,7 +20,6 @@
|
||||
|
||||
class Utils {
|
||||
public:
|
||||
|
||||
/*! Constructor
|
||||
*/
|
||||
explicit Utils();
|
||||
|
@ -19,6 +19,18 @@
|
||||
|
||||
/*!
|
||||
The Concordia class is the main access point to the library.
|
||||
This class holds references to three out of four main data
|
||||
structures used by Concordia: hashed index, markers array
|
||||
and suffix array. Word map is maintained by the class
|
||||
HashGenerator. Concordia has references to:
|
||||
- the hash generator (HashGenerator)
|
||||
- concordia index (ConcordiaIndex)
|
||||
- concordia searcher (ConcordiaSearcher)
|
||||
- configuration (ConcordiaConfig)
|
||||
|
||||
Whenever it is necessary, the data structures and tools
|
||||
held by Concordia are passed by smart pointers to methods which
|
||||
carry out specific functionalities.
|
||||
|
||||
*/
|
||||
|
||||
@ -85,7 +97,7 @@ public:
|
||||
|
||||
/*! Loads HDD stored index files to RAM and generates
|
||||
suffix array based on RAM stored data structures.
|
||||
For more info see \ref tutorial2.
|
||||
For more info see \ref tutorial2.
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
void loadRAMIndexFromDisk() throw(ConcordiaException);
|
||||
|
@ -14,12 +14,22 @@
|
||||
#include <divsufsort.h>
|
||||
|
||||
/*!
|
||||
Class for creating and maintaining the index.
|
||||
Class for creating and maintaining the index. This class
|
||||
does not hold the index data structures but only operates on
|
||||
them when they are passed to ConcordiaIndex methods by
|
||||
smart pointers. This class only remembers paths to two
|
||||
files: hashed index and markers array, which are backups
|
||||
of the respective data structures on HDD.
|
||||
|
||||
*/
|
||||
|
||||
class ConcordiaIndex {
|
||||
public:
|
||||
/*! Constructor.
|
||||
\param hashedIndexFilePath path to the hashed index file
|
||||
\param markersFilePath path to the markers array
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
explicit ConcordiaIndex(const std::string & hashedIndexFilePath,
|
||||
const std::string & markersFilePath)
|
||||
throw(ConcordiaException);
|
||||
@ -28,23 +38,50 @@ public:
|
||||
*/
|
||||
virtual ~ConcordiaIndex();
|
||||
|
||||
/*! Adds an Example to the index. Example is first hashed using
|
||||
the hash generator passed to this method. Then, hashed index
|
||||
and markers array (also passed to this method) are appended
|
||||
with the hashed example. At the same time, HDD versions of these
|
||||
two data structures are also appended with the same example.
|
||||
\param hashGenerator hash generator to be used to prepare the hash
|
||||
of the example
|
||||
\param T RAM-based hash index to be appended to
|
||||
\param markers RAM-based markers array to be appended to
|
||||
\param example example to be added to index
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
void addExample(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
const Example & example);
|
||||
|
||||
/*! Adds multiple examples to the index. Examples are first hashed using
|
||||
the hash generator passed to this method. Then, hashed index
|
||||
and markers array (also passed to this method) are appended
|
||||
with the hashed examples. At the same time, HDD versions of these
|
||||
two data structures are also appended with the same examples.
|
||||
\param hashGenerator hash generator to be used to prepare the hash
|
||||
of the example
|
||||
\param T RAM-based hash index to be appended to
|
||||
\param markers RAM-based markers array to be appended to
|
||||
\param examples vector of examples to be added to index
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
void addAllExamples(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
const std::vector<Example> & examples);
|
||||
|
||||
/*! Generates suffix array based on the passed hashed index.
|
||||
\returns the generated suffix array
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
boost::shared_ptr<std::vector<saidx_t> > generateSuffixArray(
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T);
|
||||
|
||||
private:
|
||||
// Add example to disk index and update RAM index.
|
||||
void _addSingleExample(std::ofstream & hashedIndexFile,
|
||||
std::ofstream & markersFile,
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
|
@ -23,12 +23,11 @@ void ConcordiaSearchResult::sortFragments() {
|
||||
std::greater<MatchedPatternFragment>());
|
||||
}
|
||||
|
||||
void ConcordiaSearchResult::computeBestOverlay(
|
||||
SUFFIX_MARKER_TYPE patternSize) {
|
||||
void ConcordiaSearchResult::computeBestOverlay() {
|
||||
// the fragments are already sorted by their ends, ascending
|
||||
_checkPossibleOverlays(std::vector<MatchedPatternFragment>(),
|
||||
-1,
|
||||
patternSize);
|
||||
_tokenVector.size());
|
||||
}
|
||||
|
||||
void ConcordiaSearchResult::_checkPossibleOverlays(
|
||||
|
@ -8,12 +8,22 @@
|
||||
#include <string>
|
||||
|
||||
/*!
|
||||
Class representing result of concordia search.
|
||||
Class representing result of concordia search. Contains the following
|
||||
information:
|
||||
- tokenized pattern which was used for searching
|
||||
- list of longest matched fragments sorted in descending order by length
|
||||
- the best overlay
|
||||
- the score of the best overlay.
|
||||
|
||||
For more info about concordia searching see \ref tutorial1_3.
|
||||
|
||||
*/
|
||||
|
||||
class ConcordiaSearchResult {
|
||||
public:
|
||||
/*! Constructor.
|
||||
\param tokenVector tokenized pattern which was used for searching
|
||||
*/
|
||||
explicit ConcordiaSearchResult(
|
||||
const std::vector<std::string> & tokenVector);
|
||||
|
||||
@ -21,24 +31,45 @@ public:
|
||||
*/
|
||||
virtual ~ConcordiaSearchResult();
|
||||
|
||||
/*! Adds a matched pattern fragment to the list.
|
||||
\param fragment fragment to be added
|
||||
*/
|
||||
void addFragment(const MatchedPatternFragment & fragment);
|
||||
|
||||
/*! Sorts the list of matched pattern fragments in descending order
|
||||
by length.
|
||||
*/
|
||||
void sortFragments();
|
||||
|
||||
void computeBestOverlay(SUFFIX_MARKER_TYPE patternSize);
|
||||
/*! Computes the best overlay by choosing appropriate fragments
|
||||
from the fragments list. For more info see \ref tutorial1_3.
|
||||
*/
|
||||
void computeBestOverlay();
|
||||
|
||||
/*! Getter for tokenized pattern.
|
||||
\returns tokenized search pattern
|
||||
*/
|
||||
std::vector<std::string> getTokenVector() const {
|
||||
return _tokenVector;
|
||||
}
|
||||
|
||||
/*! Getter for all matched pattern fragments list.
|
||||
\returns matched pattern fragments list
|
||||
*/
|
||||
std::vector<MatchedPatternFragment> getFragments() const {
|
||||
return _matchedPatternFragments;
|
||||
}
|
||||
|
||||
/*! Getter for best overlay.
|
||||
\returns list of fragments that comprise the best overlay
|
||||
*/
|
||||
std::vector<MatchedPatternFragment> getBestOverlay() const {
|
||||
return _bestOverlay;
|
||||
}
|
||||
|
||||
/*! Getter for best overlay score.
|
||||
\returns score of the best overlay
|
||||
*/
|
||||
double getBestOverlayScore() const {
|
||||
return _bestOverlayScore;
|
||||
}
|
||||
|
@ -46,7 +46,7 @@ void ConcordiaSearcher::concordiaSearch(
|
||||
}
|
||||
|
||||
// compute best overlay of the pattern by matched fragments
|
||||
result->computeBestOverlay(pattern.size());
|
||||
result->computeBestOverlay();
|
||||
|
||||
result->sortFragments();
|
||||
}
|
||||
|
@ -16,7 +16,8 @@
|
||||
#include <divsufsort.h>
|
||||
|
||||
/*!
|
||||
Class for searching using Concordia algorithm.
|
||||
Class for searching using Concordia algorithm. All searches are performed
|
||||
on data structures passed to the methods of this class by smart pointers.
|
||||
|
||||
*/
|
||||
|
||||
@ -28,6 +29,18 @@ public:
|
||||
*/
|
||||
virtual ~ConcordiaSearcher();
|
||||
|
||||
/*! Performs concordia lookup on the RAM-based index.
|
||||
This is a unique library functionality, designed
|
||||
to facilitate Computer-Aided Translation.
|
||||
For more info see \ref tutorial1_3.
|
||||
\param result variable to store the result
|
||||
\param T hashed index to search in
|
||||
\param markers markers array for the needs of searching
|
||||
\param SA suffix array for the needs of searching
|
||||
\param pattern pattern to be searched in the index.
|
||||
This pattern needs to be hashed.
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
void concordiaSearch(
|
||||
boost::shared_ptr<ConcordiaSearchResult> result,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
@ -36,6 +49,20 @@ public:
|
||||
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*! \deprecated
|
||||
Finds the examples from the index, whose resemblance to the
|
||||
pattern is maximal. This method may perform very slow,
|
||||
try using concordiaSearch instead.
|
||||
\param config concordia config object
|
||||
(to read the anubis threshold parameter)
|
||||
\param T hashed index to search in
|
||||
\param markers markers array for the needs of searching
|
||||
\param SA suffix array for the needs of searching
|
||||
\param pattern pattern to be searched in the index.
|
||||
This pattern needs to be hashed.
|
||||
\returns vector of results
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
std::vector<AnubisSearchResult> anubisSearch(
|
||||
boost::shared_ptr<ConcordiaConfig> config,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
@ -44,6 +71,17 @@ public:
|
||||
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*! Generates map of all examples in the index which have
|
||||
at least one word in common with the pattern. This method
|
||||
is internally used in anubisSearch and may perform slow.
|
||||
\param T hashed index to search in
|
||||
\param markers markers array for the needs of searching
|
||||
\param SA suffix array for the needs of searching
|
||||
\param pattern pattern to be searched in the index.
|
||||
This pattern needs to be hashed.
|
||||
\returns generated map
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
boost::shared_ptr<TmMatchesMap> getTmMatches(
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
@ -51,6 +89,21 @@ public:
|
||||
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*! Looks for fragments in the index which have the longest
|
||||
common prefix with the pattern. This method returns the list of
|
||||
locations of these longest fragments (as return value) and their
|
||||
length in the length parameter. There is a tight limit on the number
|
||||
of longest fragments (currently set to 3). This method is used in
|
||||
concordiaSearch.
|
||||
\param T hashed index to search in
|
||||
\param markers markers array for the needs of searching
|
||||
\param SA suffix array for the needs of searching
|
||||
\param pattern pattern to be searched in the index.
|
||||
This pattern needs to be hashed.
|
||||
\param length the returned length of the longest fragments
|
||||
\returns list of locations of the longest fragments
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
std::vector<SubstringOccurence> lcpSearch(
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
|
@ -7,11 +7,17 @@
|
||||
|
||||
/*!
|
||||
Class representing a single sentence to be added into index along with its id.
|
||||
For more info see \ref tutorial1_2.
|
||||
|
||||
*/
|
||||
|
||||
class Example {
|
||||
public:
|
||||
/*!
|
||||
Constructor.
|
||||
\param sentence sentence to be added to index
|
||||
\param id id of this sentence
|
||||
*/
|
||||
explicit Example(const std::string & sentence,
|
||||
const SUFFIX_MARKER_TYPE & id)
|
||||
throw(ConcordiaException);
|
||||
@ -20,10 +26,16 @@ public:
|
||||
*/
|
||||
virtual ~Example();
|
||||
|
||||
/*! Getter for sentence.
|
||||
\return sentence
|
||||
*/
|
||||
std::string getSentence() const {
|
||||
return _sentence;
|
||||
}
|
||||
|
||||
/*! Getter for sentence id.
|
||||
\return sentence id
|
||||
*/
|
||||
SUFFIX_MARKER_TYPE getId() const {
|
||||
return _id;
|
||||
}
|
||||
|
@ -14,12 +14,24 @@
|
||||
|
||||
|
||||
/*!
|
||||
Class for generating a sentence hash.
|
||||
Class for generating a sentence hash. The hash is generated from a sentence
|
||||
given in raw string. String is first anonymized and tokenized. After these
|
||||
operations, each token is coded as an integer, according to WordMap.
|
||||
Resulting hash is a vector of integers.
|
||||
|
||||
The sentence hash is used when adding a sentence to the index and during searching.
|
||||
|
||||
HashGenerator holds an instance of WordMap, used to code tokens as integers
|
||||
and SentenceAnonymizer, used to preprocess the sentence string.
|
||||
|
||||
*/
|
||||
|
||||
class HashGenerator {
|
||||
public:
|
||||
/*!
|
||||
Constructor.
|
||||
\param config pointer to current config object
|
||||
*/
|
||||
explicit HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
|
||||
throw(ConcordiaException);
|
||||
|
||||
@ -27,11 +39,28 @@ public:
|
||||
*/
|
||||
virtual ~HashGenerator();
|
||||
|
||||
/*!
|
||||
Generates hash of a sentence.
|
||||
\param sentence sentence to generate hash from
|
||||
\returns vector of integers
|
||||
*/
|
||||
std::vector<INDEX_CHARACTER_TYPE> generateHash(const std::string & sentence)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*!
|
||||
Generates vector of tokens from a sentence. This method is internally
|
||||
used by generateHash. However, for the sake of concordiaSearch
|
||||
(see \ref tutorial1_3), the vector of tokens resulting from sentence
|
||||
anonymizing and tokenization is also needed.
|
||||
\param sentence sentence to tokenize
|
||||
\returns vector of tokens
|
||||
*/
|
||||
std::vector<std::string> generateTokenVector(const std::string & sentence);
|
||||
|
||||
|
||||
/*!
|
||||
Saves the contents of current WordMap to HDD.
|
||||
*/
|
||||
void serializeWordMap();
|
||||
|
||||
private:
|
||||
|
@ -16,18 +16,35 @@
|
||||
#include <divsufsort.h>
|
||||
|
||||
/*!
|
||||
Class for searching the index with a sentence.
|
||||
Class for searching the index with a sentence. In all searches the sentence
|
||||
is first hashed and then used as a query.
|
||||
|
||||
IndexSearcher performs the simpleSearch on its own, but uses a
|
||||
ConcordiaSearcher object to carry out concordiaSearch.
|
||||
|
||||
*/
|
||||
|
||||
class IndexSearcher {
|
||||
public:
|
||||
/*! Constructor.
|
||||
*/
|
||||
explicit IndexSearcher();
|
||||
|
||||
/*! Destructor.
|
||||
*/
|
||||
virtual ~IndexSearcher();
|
||||
|
||||
/*! Performs a simple substring lookup in RAM-based index.
|
||||
For more info see \ref tutorial1_2.
|
||||
\param hashGenerator hash generator to be used to convert
|
||||
input sentence to a hash
|
||||
\param T hashed index to search in
|
||||
\param markers markers array for the needs of searching
|
||||
\param SA suffix array for the needs of searching
|
||||
\param pattern string pattern to be searched in the index.
|
||||
\returns vector of occurrences of the pattern in the index
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
std::vector<SubstringOccurence> simpleSearch(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
@ -35,6 +52,21 @@ public:
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
const std::string & pattern) throw(ConcordiaException);
|
||||
|
||||
/*! \deprecated
|
||||
Finds the examples from the index, whose resemblance to the
|
||||
pattern is maximal. This method may perform very slow,
|
||||
try using concordiaSearch instead.
|
||||
\param config concordia config object
|
||||
(to read the anubis threshold parameter)
|
||||
\param hashGenerator hash generator to be used to convert
|
||||
input sentence to a hash
|
||||
\param T hashed index to search in
|
||||
\param markers markers array for the needs of searching
|
||||
\param SA suffix array for the needs of searching
|
||||
\param pattern string pattern to be searched in the index.
|
||||
\returns vector of results
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
std::vector<AnubisSearchResult> anubisSearch(
|
||||
boost::shared_ptr<ConcordiaConfig> config,
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
@ -43,6 +75,19 @@ public:
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
const std::string & pattern) throw(ConcordiaException);
|
||||
|
||||
/*! Performs concordia lookup on the RAM-based index.
|
||||
This is a unique library functionality, designed
|
||||
to facilitate Computer-Aided Translation.
|
||||
For more info see \ref tutorial1_3.
|
||||
\param hashGenerator hash generator to be used to convert
|
||||
input sentence to a hash
|
||||
\param T hashed index to search in
|
||||
\param markers markers array for the needs of searching
|
||||
\param SA suffix array for the needs of searching
|
||||
\param pattern pattern to be searched in the index.
|
||||
\returns result of the search
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
boost::shared_ptr<ConcordiaSearchResult> concordiaSearch(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
|
@ -4,12 +4,20 @@
|
||||
#include "concordia/common/config.hpp"
|
||||
|
||||
/*!
|
||||
Class representing word interval.
|
||||
Class representing interval of a sentence, i.e. a sequence of words
|
||||
coming from that sentence. An interval only has its start and end indexes,
|
||||
where the start index is inclusive and end index is exclusive. For example,
|
||||
an interval [2,5) of the sentence "This is just for testing purposes" is:
|
||||
"just for testing".
|
||||
|
||||
*/
|
||||
|
||||
class Interval {
|
||||
public:
|
||||
/*! Constructor.
|
||||
\param start start index of the interval (0-based)
|
||||
\param end end index of the interval (0-based)
|
||||
*/
|
||||
explicit Interval(const SUFFIX_MARKER_TYPE start,
|
||||
const SUFFIX_MARKER_TYPE end);
|
||||
|
||||
@ -17,14 +25,27 @@ public:
|
||||
*/
|
||||
virtual ~Interval();
|
||||
|
||||
/*! Checks if this interval intersects another.
|
||||
\param interval another interval
|
||||
\returns true if the two intervals intersect
|
||||
*/
|
||||
bool intersects(Interval & interval);
|
||||
|
||||
/*! Getter for interval length.
|
||||
\returns end - start
|
||||
*/
|
||||
SUFFIX_MARKER_TYPE getLength();
|
||||
|
||||
/*! Getter for interval start.
|
||||
\returns start
|
||||
*/
|
||||
SUFFIX_MARKER_TYPE getStart() const {
|
||||
return _start;
|
||||
}
|
||||
|
||||
/*! Getter for interval end.
|
||||
\returns end
|
||||
*/
|
||||
SUFFIX_MARKER_TYPE getEnd() const {
|
||||
return _end;
|
||||
}
|
||||
|
@ -7,10 +7,21 @@
|
||||
/*!
|
||||
Class representing matched pattern fragment in concordia search.
|
||||
This fragment can be seen as an interval of the pattern.
|
||||
|
||||
This class holds information about:
|
||||
- where the pattern fragment was matched (example id and example offset)
|
||||
- where the fragment is located within the pattern
|
||||
(patternOffset, matchedLength)
|
||||
*/
|
||||
|
||||
class MatchedPatternFragment : public Interval {
|
||||
public:
|
||||
/*! Constructor.
|
||||
\param exampleId id of the example where the pattern fragment was matched
|
||||
\param exampleOffset offset of the matched fragment in the example
|
||||
\param patternOffset offset of the matched fragment in the pattern
|
||||
\param matchedLength length of the matched pattern
|
||||
*/
|
||||
MatchedPatternFragment(const SUFFIX_MARKER_TYPE & exampleId,
|
||||
const SUFFIX_MARKER_TYPE & exampleOffset,
|
||||
const SUFFIX_MARKER_TYPE & patternOffset,
|
||||
@ -19,22 +30,37 @@ public:
|
||||
*/
|
||||
virtual ~MatchedPatternFragment();
|
||||
|
||||
/*! Getter for example id.
|
||||
\returns example id
|
||||
*/
|
||||
SUFFIX_MARKER_TYPE getExampleId() const {
|
||||
return _exampleId;
|
||||
}
|
||||
|
||||
/*! Getter for example offset.
|
||||
\returns example offset
|
||||
*/
|
||||
SUFFIX_MARKER_TYPE getExampleOffset() const {
|
||||
return _exampleOffset;
|
||||
}
|
||||
|
||||
/*! Getter for pattern offset.
|
||||
\returns pattern offset
|
||||
*/
|
||||
SUFFIX_MARKER_TYPE getPatternOffset() const {
|
||||
return _patternOffset;
|
||||
}
|
||||
|
||||
/*! Getter for matched length.
|
||||
\returns matched fragment length
|
||||
*/
|
||||
SUFFIX_MARKER_TYPE getMatchedLength() const {
|
||||
return _matchedLength;
|
||||
}
|
||||
|
||||
/*! Operator for comparing fragments by their length.
|
||||
\returns true if the current fragment is longer than the other
|
||||
*/
|
||||
bool operator > (const MatchedPatternFragment & other) const {
|
||||
return (_matchedLength > other.getMatchedLength());
|
||||
}
|
||||
|
@ -9,15 +9,23 @@
|
||||
#include <boost/regex/icu.hpp>
|
||||
|
||||
|
||||
/*!
|
||||
Class for replacing string occurences.
|
||||
|
||||
*/
|
||||
|
||||
typedef boost::error_info<struct my_tag, std::string> my_tag_error_info;
|
||||
|
||||
/*!
|
||||
Class for representing a regular expression replacement operation.
|
||||
Holds regex pattern string for matching and replacement string for
|
||||
replacing found matches.
|
||||
|
||||
*/
|
||||
class RegexReplacement {
|
||||
public:
|
||||
/*!
|
||||
Constructor.
|
||||
\param patternString regex pattern to match
|
||||
\param replacement string to substitute the found match
|
||||
\param caseSensitive case sensitivity of the pattern
|
||||
*/
|
||||
RegexReplacement(std::string patternString, std::string replacement,
|
||||
bool caseSensitive = true)
|
||||
throw(ConcordiaException);
|
||||
@ -26,6 +34,10 @@ public:
|
||||
*/
|
||||
virtual ~RegexReplacement();
|
||||
|
||||
/*! Applies the operation on input string.
|
||||
\param text the input string
|
||||
\returns altered version of the input string
|
||||
*/
|
||||
std::string apply(const std::string & text);
|
||||
|
||||
private:
|
||||
|
@ -12,12 +12,20 @@
|
||||
|
||||
|
||||
/*!
|
||||
Class for anonymizing sentence before adding to index.
|
||||
|
||||
Class for anonymizing sentence before generating hash.
|
||||
This operation is used to
|
||||
remove unnecessary symbols and possibly words from sentences added to index
|
||||
and search patterns. Anonymizer removes html tags, substitutes predefined symbols
|
||||
with a single space, removes stop words (if the option is enabled), as well as
|
||||
named entities and special symbols. All these have to be listed in files
|
||||
(see \ref tutorial3).
|
||||
*/
|
||||
|
||||
class SentenceAnonymizer {
|
||||
public:
|
||||
/*! Constructor.
|
||||
\param config config object, holding paths to necessary files
|
||||
*/
|
||||
explicit SentenceAnonymizer(boost::shared_ptr<ConcordiaConfig> config)
|
||||
throw(ConcordiaException);
|
||||
|
||||
@ -25,6 +33,10 @@ public:
|
||||
*/
|
||||
virtual ~SentenceAnonymizer();
|
||||
|
||||
/*! Anonymizes the sentence.
|
||||
\param sentence input sentence
|
||||
\returns altered version of the input sentence
|
||||
*/
|
||||
std::string anonymize(const std::string & sentence);
|
||||
|
||||
private:
|
||||
|
@ -6,15 +6,32 @@
|
||||
|
||||
/*!
|
||||
Class representing occurrence of a searched substring.
|
||||
|
||||
It holds the following information:
|
||||
- id of the example where the substring was found
|
||||
- offset of the matched substring in this example
|
||||
- length of the example
|
||||
*/
|
||||
|
||||
class SubstringOccurence {
|
||||
public:
|
||||
/*!
|
||||
Constructor.
|
||||
|
||||
*/
|
||||
SubstringOccurence();
|
||||
|
||||
/*!
|
||||
Constructor taking data from a marker.
|
||||
\param marker marker to read the data from
|
||||
*/
|
||||
explicit SubstringOccurence(const SUFFIX_MARKER_TYPE & marker);
|
||||
|
||||
/*!
|
||||
Constructor with three arguments.
|
||||
\param id example id
|
||||
\param offset offset of the substring in the example
|
||||
\param exampleLength length of the example
|
||||
*/
|
||||
SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
|
||||
const SUFFIX_MARKER_TYPE & offset,
|
||||
const SUFFIX_MARKER_TYPE & exampleLength);
|
||||
@ -22,18 +39,30 @@ public:
|
||||
*/
|
||||
virtual ~SubstringOccurence();
|
||||
|
||||
/*! Getter for example id.
|
||||
\returns example id
|
||||
*/
|
||||
SUFFIX_MARKER_TYPE getId() const {
|
||||
return _id;
|
||||
}
|
||||
|
||||
/*! Getter for example offset
|
||||
\returns example offset
|
||||
*/
|
||||
SUFFIX_MARKER_TYPE getOffset() const {
|
||||
return _offset;
|
||||
}
|
||||
|
||||
/*! Getter for example length.
|
||||
\returns example length
|
||||
*/
|
||||
SUFFIX_MARKER_TYPE getExampleLength() const {
|
||||
return _exampleLength;
|
||||
}
|
||||
|
||||
/*! Setter of all the fields, based on input marker.
|
||||
\param marker marker to read the data from
|
||||
*/
|
||||
void enterDataFromMarker(const SUFFIX_MARKER_TYPE & marker);
|
||||
|
||||
private:
|
||||
|
@ -11,13 +11,25 @@
|
||||
|
||||
/*!
|
||||
Class used within Anubis search algorithm to store partial results.
|
||||
Holds information about mutual overlay of the pattern and found
|
||||
example.
|
||||
|
||||
*/
|
||||
|
||||
class TmMatches {
|
||||
public:
|
||||
/*!
|
||||
Constructor.
|
||||
|
||||
*/
|
||||
TmMatches();
|
||||
|
||||
/*!
|
||||
Constructor setting basic information.
|
||||
\param exampleId id of found example
|
||||
\param exampleSize size of the found example
|
||||
\param patternSize size of the searched pattern
|
||||
*/
|
||||
TmMatches(const SUFFIX_MARKER_TYPE exampleId,
|
||||
const SUFFIX_MARKER_TYPE exampleSize,
|
||||
const SUFFIX_MARKER_TYPE patternSize);
|
||||
@ -26,28 +38,74 @@ public:
|
||||
*/
|
||||
virtual ~TmMatches();
|
||||
|
||||
/*!
|
||||
Getter for score of the mutual overlay.
|
||||
\returns score
|
||||
*/
|
||||
double getScore() const {
|
||||
return _score;
|
||||
}
|
||||
|
||||
/*!
|
||||
Getter for the list of overlays of the example.
|
||||
\returns example overlays list
|
||||
*/
|
||||
std::vector<Interval> getExampleIntervals() const {
|
||||
return _exampleMatchedRegions;
|
||||
}
|
||||
|
||||
/*!
|
||||
Getter for the list of overlays of the pattern.
|
||||
\returns pattern overlays list
|
||||
*/
|
||||
std::vector<Interval> getPatternIntervals() const {
|
||||
return _patternMatchedRegions;
|
||||
}
|
||||
|
||||
/*!
|
||||
Getter for example id.
|
||||
\returns example id
|
||||
*/
|
||||
SUFFIX_MARKER_TYPE getExampleId() const {
|
||||
return _exampleId;
|
||||
}
|
||||
|
||||
/*!
|
||||
Calculates mutual overlay score in the scale [0,1].
|
||||
Uses generalized Jaccard index for the computation.
|
||||
Score 1 - perfect score - is assigned when the whole pattern
|
||||
and the whole example are covered. Result of the computation
|
||||
is stored in the score field, use getScore() to retrieve it.
|
||||
*/
|
||||
void calculateSimpleScore();
|
||||
|
||||
/*!
|
||||
Calculates mutual overlay score in the scale [0,1].
|
||||
Takes into account the number and the length of the
|
||||
fragments (the fewer fragments, the better).
|
||||
Score 1 - perfect score - is assigned when the whole pattern
|
||||
and the whole example are covered with only one fragment.
|
||||
Result of the computation is stored in the score field,
|
||||
use getScore() to retrieve it.
|
||||
*/
|
||||
void calculateScore();
|
||||
|
||||
/*!
|
||||
Adds information about covering of example. If the new
|
||||
fragment intersects with any previous fragment, it is
|
||||
not added.
|
||||
\param start start of the example overlay fragment
|
||||
\param end end of the example overlay fragment
|
||||
*/
|
||||
void addExampleInterval(int start, int end);
|
||||
|
||||
/*!
|
||||
Adds information about covering of pattern. If the new
|
||||
fragment intersects with any previous fragment, it is
|
||||
not added.
|
||||
\param start start of the pattern overlay fragment
|
||||
\param end end of the pattern overlay fragment
|
||||
*/
|
||||
void addPatternInterval(int start, int end);
|
||||
|
||||
private:
|
||||
|
@ -10,18 +10,30 @@
|
||||
#include <boost/serialization/map.hpp>
|
||||
|
||||
/*!
|
||||
Class representing dictionary for word to int encoding.
|
||||
Class representing dictionary for word to integer encoding.
|
||||
|
||||
*/
|
||||
|
||||
class WordMap {
|
||||
public:
|
||||
/*!
|
||||
Constructor.
|
||||
|
||||
*/
|
||||
explicit WordMap() throw(ConcordiaException);
|
||||
|
||||
/*! Destructor.
|
||||
*/
|
||||
virtual ~WordMap();
|
||||
|
||||
/*!
|
||||
Gets the integer code of a token. If the token is found in
|
||||
the dictionary, the dictionary code is returned. If not,
|
||||
the word is added to the dictionary and its newly created
|
||||
code is returned.
|
||||
\param word token to generate the code
|
||||
\returns code of the token
|
||||
*/
|
||||
INDEX_CHARACTER_TYPE getWordCode(const std::string & word)
|
||||
throw(ConcordiaException);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user