finished documentation

This commit is contained in:
rjawor 2015-05-01 14:52:53 +02:00
parent 9e550ca1cf
commit abbd5b1ae8
19 changed files with 410 additions and 34 deletions

View File

@ -1355,18 +1355,6 @@ GENERATE_XML = NO
XML_OUTPUT = xml
# The XML_SCHEMA tag can be used to specify an XML schema,
# which can be used by a validating XML parser to check the
# syntax of the XML files.
XML_SCHEMA =
# The XML_DTD tag can be used to specify an XML DTD,
# which can be used by a validating XML parser to check the
# syntax of the XML files.
XML_DTD =
# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
# dump the program listings (including syntax highlighting
# and cross-referencing information) to the XML output. Note that

View File

@ -43,6 +43,7 @@ public:
bool operator > (const AnubisSearchResult & other) const {
return (_score > other.getScore());
}
private:
SUFFIX_MARKER_TYPE _exampleId;

View File

@ -20,7 +20,6 @@
class Utils {
public:
/*! Constructor
*/
explicit Utils();

View File

@ -19,6 +19,18 @@
/*!
The Concordia class is the main access point to the library.
This class holds references to three out of four main data
structures used by Concordia: hashed index, markers array
and suffix array. Word map is maintained by the class
HashGenerator. Concordia has references to:
- the hash generator (HashGenerator)
- concordia index (ConcordiaIndex)
- concordia searcher (ConcordiaSearcher)
- configuration (ConcordiaConfig)
Whenever it is necessary, the data structures and tools
held by Concordia are passed by smart pointers to methods which
carry out specific functionalities.
*/
@ -85,7 +97,7 @@ public:
/*! Loads HDD stored index files to RAM and generates
suffix array based on RAM stored data structures.
For more info see \ref tutorial2.
For more info see \ref tutorial2.
\throws ConcordiaException
*/
void loadRAMIndexFromDisk() throw(ConcordiaException);

View File

@ -14,12 +14,22 @@
#include <divsufsort.h>
/*!
Class for creating and maintaining the index.
Class for creating and maintaining the index. This class
does not hold the index data structures but only operates on
them when they are passed to ConcordiaIndex methods by
smart pointers. This class only remembers paths to two
files: hashed index and markers array, which are backups
of the respective data structures on HDD.
*/
class ConcordiaIndex {
public:
/*! Constructor.
\param hashedIndexFilePath path to the hashed index file
\param markersFilePath path to the markers array
\throws ConcordiaException
*/
explicit ConcordiaIndex(const std::string & hashedIndexFilePath,
const std::string & markersFilePath)
throw(ConcordiaException);
@ -28,23 +38,50 @@ public:
*/
virtual ~ConcordiaIndex();
/*! Adds an Example to the index. Example is first hashed using
the hash generator passed to this method. Then, hashed index
and markers array (also passed to this method) are appended
with the hashed example. At the same time, HDD versions of these
two data structures are also appended with the same example.
\param hashGenerator hash generator to be used to prepare the hash
of the example
\param T RAM-based hash index to be appended to
\param markers RAM-based markers array to be appended to
\param example example to be added to index
\throws ConcordiaException
*/
void addExample(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example);
/*! Adds multiple examples to the index. Examples are first hashed using
the hash generator passed to this method. Then, hashed index
and markers array (also passed to this method) are appended
with the hashed examples. At the same time, HDD versions of these
two data structures are also appended with the same examples.
\param hashGenerator hash generator to be used to prepare the hash
of the example
\param T RAM-based hash index to be appended to
\param markers RAM-based markers array to be appended to
\param examples vector of examples to be added to index
\throws ConcordiaException
*/
void addAllExamples(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const std::vector<Example> & examples);
/*! Generates suffix array based on the passed hashed index.
\returns the generated suffix array
\throws ConcordiaException
*/
boost::shared_ptr<std::vector<saidx_t> > generateSuffixArray(
boost::shared_ptr<std::vector<sauchar_t> > T);
private:
// Add example to disk index and update RAM index.
void _addSingleExample(std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,

View File

@ -23,12 +23,11 @@ void ConcordiaSearchResult::sortFragments() {
std::greater<MatchedPatternFragment>());
}
void ConcordiaSearchResult::computeBestOverlay(
SUFFIX_MARKER_TYPE patternSize) {
void ConcordiaSearchResult::computeBestOverlay() {
// the fragments are already sorted by their ends, ascending
_checkPossibleOverlays(std::vector<MatchedPatternFragment>(),
-1,
patternSize);
_tokenVector.size());
}
void ConcordiaSearchResult::_checkPossibleOverlays(

View File

@ -8,12 +8,22 @@
#include <string>
/*!
Class representing result of concordia search.
Class representing result of concordia search. Contains the following
information:
- tokenized pattern which was used for searching
- list of longest matched fragments sorted in descending order by length
- the best overlay
- the score of the best overlay.
For more info about concordia searching see \ref tutorial1_3.
*/
class ConcordiaSearchResult {
public:
/*! Constructor.
\param tokenVector tokenized pattern which was used for searching
*/
explicit ConcordiaSearchResult(
const std::vector<std::string> & tokenVector);
@ -21,24 +31,45 @@ public:
*/
virtual ~ConcordiaSearchResult();
/*! Adds a matched pattern fragment to the list.
\param fragment fragment to be added
*/
void addFragment(const MatchedPatternFragment & fragment);
/*! Sorts the list of matched pattern fragments in descending order
by length.
*/
void sortFragments();
void computeBestOverlay(SUFFIX_MARKER_TYPE patternSize);
/*! Computes the best overlay by choosing appropriate fragments
from the fragments list. For more info see \ref tutorial1_3.
*/
void computeBestOverlay();
/*! Getter for tokenized pattern.
\returns tokenized search pattern
*/
std::vector<std::string> getTokenVector() const {
return _tokenVector;
}
/*! Getter for all matched pattern fragments list.
\returns matched pattern fragments list
*/
std::vector<MatchedPatternFragment> getFragments() const {
return _matchedPatternFragments;
}
/*! Getter for best overlay.
\returns list of fragments that comprise the best overlay
*/
std::vector<MatchedPatternFragment> getBestOverlay() const {
return _bestOverlay;
}
/*! Getter for best overlay score.
\returns score of the best overlay
*/
double getBestOverlayScore() const {
return _bestOverlayScore;
}

View File

@ -46,7 +46,7 @@ void ConcordiaSearcher::concordiaSearch(
}
// compute best overlay of the pattern by matched fragments
result->computeBestOverlay(pattern.size());
result->computeBestOverlay();
result->sortFragments();
}

View File

@ -16,7 +16,8 @@
#include <divsufsort.h>
/*!
Class for searching using Concordia algorithm.
Class for searching using Concordia algorithm. All searches are performed
on data structures passed to the methods of this class by smart pointers.
*/
@ -28,6 +29,18 @@ public:
*/
virtual ~ConcordiaSearcher();
/*! Performs concordia lookup on the RAM-based index.
This is a unique library functionality, designed
to facilitate Computer-Aided Translation.
For more info see \ref tutorial1_3.
\param result variable to store the result
\param T hashed index to search in
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern pattern to be searched in the index.
This pattern needs to be hashed.
\throws ConcordiaException
*/
void concordiaSearch(
boost::shared_ptr<ConcordiaSearchResult> result,
boost::shared_ptr<std::vector<sauchar_t> > T,
@ -36,6 +49,20 @@ public:
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
throw(ConcordiaException);
/*! \deprecated
Finds the examples from the index, whose resemblance to the
pattern is maximal. This method may be very slow,
try using concordiaSearch instead.
\param config concordia config object
(to read the anubis threshold parameter)
\param T hashed index to search in
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern pattern to be searched in the index.
This pattern needs to be hashed.
\returns vector of results
\throws ConcordiaException
*/
std::vector<AnubisSearchResult> anubisSearch(
boost::shared_ptr<ConcordiaConfig> config,
boost::shared_ptr<std::vector<sauchar_t> > T,
@ -44,6 +71,17 @@ public:
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
throw(ConcordiaException);
/*! Generates map of all examples in the index which have
at least one word in common with the pattern. This method
is used internally by anubisSearch and may be slow.
\param T hashed index to search in
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern pattern to be searched in the index.
This pattern needs to be hashed.
\returns generated map
\throws ConcordiaException
*/
boost::shared_ptr<TmMatchesMap> getTmMatches(
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@ -51,6 +89,21 @@ public:
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
throw(ConcordiaException);
/*! Looks for fragments in the index which have the longest
common prefix with the pattern. This method returns the list of
locations of these longest fragments (as return value) and their
length in the length parameter. There is a tight limit on the number
of longest fragments (currently set to 3). This method is used in
concordiaSearch.
\param T hashed index to search in
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern pattern to be searched in the index.
This pattern needs to be hashed.
\param length the returned length of the longest fragments
\returns list of locations of the longest fragments
\throws ConcordiaException
*/
std::vector<SubstringOccurence> lcpSearch(
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,

View File

@ -7,11 +7,17 @@
/*!
Class representing a single sentence to be added into index along with its id.
For more info see \ref tutorial1_2.
*/
class Example {
public:
/*!
Constructor.
\param sentence sentence to be added to index
\param id id of this sentence
*/
explicit Example(const std::string & sentence,
const SUFFIX_MARKER_TYPE & id)
throw(ConcordiaException);
@ -20,10 +26,16 @@ public:
*/
virtual ~Example();
/*! Getter for sentence.
\return sentence
*/
std::string getSentence() const {
return _sentence;
}
/*! Getter for sentence id.
\return sentence id
*/
SUFFIX_MARKER_TYPE getId() const {
return _id;
}

View File

@ -14,12 +14,24 @@
/*!
Class for generating a sentence hash.
Class for generating a sentence hash. The hash is generated from a sentence
given in raw string. String is first anonymized and tokenized. After these
operations, each token is coded as an integer, according to WordMap.
Resulting hash is a vector of integers.
The sentence hash is used when adding a sentence to the index and during searching.
HashGenerator holds an instance of WordMap, used to code tokens as integers
and SentenceAnonymizer, used to preprocess the sentence string.
*/
class HashGenerator {
public:
/*!
Constructor.
\param config pointer to current config object
*/
explicit HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
throw(ConcordiaException);
@ -27,11 +39,28 @@ public:
*/
virtual ~HashGenerator();
/*!
Generates hash of a sentence.
\param sentence sentence to generate hash from
\returns vector of integers
*/
std::vector<INDEX_CHARACTER_TYPE> generateHash(const std::string & sentence)
throw(ConcordiaException);
/*!
Generates vector of tokens from a sentence. This method is internally
used by generateHash. However, for the sake of concordiaSearch
(see \ref tutorial1_3), the vector of tokens resulting from sentence
anonymizing and tokenization is also needed.
\param sentence sentence to tokenize
\returns vector of tokens
*/
std::vector<std::string> generateTokenVector(const std::string & sentence);
/*!
Saves the contents of current WordMap to HDD.
*/
void serializeWordMap();
private:

View File

@ -16,18 +16,35 @@
#include <divsufsort.h>
/*!
Class for searching the index with a sentence.
Class for searching the index with a sentence. In all searches the sentence
is first hashed and then used as a query.
IndexSearcher performs the simpleSearch on its own, but uses a
ConcordiaSearcher object to carry out concordiaSearch.
*/
class IndexSearcher {
public:
/*! Constructor.
*/
explicit IndexSearcher();
/*! Destructor.
*/
virtual ~IndexSearcher();
/*! Performs a simple substring lookup in RAM-based index.
For more info see \ref tutorial1_2.
\param hashGenerator hash generator to be used to convert
input sentence to a hash
\param T hashed index to search in
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern string pattern to be searched in the index.
\returns vector of occurrences of the pattern in the index
\throws ConcordiaException
*/
std::vector<SubstringOccurence> simpleSearch(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
@ -35,6 +52,21 @@ public:
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException);
/*! \deprecated
Finds the examples from the index, whose resemblance to the
pattern is maximal. This method may be very slow,
try using concordiaSearch instead.
\param config concordia config object
(to read the anubis threshold parameter)
\param hashGenerator hash generator to be used to convert
input sentence to a hash
\param T hashed index to search in
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern string pattern to be searched in the index.
\returns vector of results
\throws ConcordiaException
*/
std::vector<AnubisSearchResult> anubisSearch(
boost::shared_ptr<ConcordiaConfig> config,
boost::shared_ptr<HashGenerator> hashGenerator,
@ -43,6 +75,19 @@ public:
boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException);
/*! Performs concordia lookup on the RAM-based index.
This is a unique library functionality, designed
to facilitate Computer-Aided Translation.
For more info see \ref tutorial1_3.
\param hashGenerator hash generator to be used to convert
input sentence to a hash
\param T hashed index to search in
\param markers markers array for the needs of searching
\param SA suffix array for the needs of searching
\param pattern pattern to be searched in the index.
\returns result of the search
\throws ConcordiaException
*/
boost::shared_ptr<ConcordiaSearchResult> concordiaSearch(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,

View File

@ -4,12 +4,20 @@
#include "concordia/common/config.hpp"
/*!
Class representing word interval.
Class representing interval of a sentence, i.e. a sequence of words
coming from that sentence. An interval only has its start and end indexes,
where the start index is inclusive and end index is exclusive. For example,
the interval [2,5) of the sentence "This is just for testing purposes" is:
"just for testing".
*/
class Interval {
public:
/*! Constructor.
\param start start index of the interval (0-based)
\param end end index of the interval (0-based)
*/
explicit Interval(const SUFFIX_MARKER_TYPE start,
const SUFFIX_MARKER_TYPE end);
@ -17,14 +25,27 @@ public:
*/
virtual ~Interval();
/*! Checks if this interval intersects another.
\param interval another interval
\returns true if the two intervals intersect
*/
bool intersects(Interval & interval);
/*! Getter for interval length.
\returns end - start
*/
SUFFIX_MARKER_TYPE getLength();
/*! Getter for interval start.
\returns start
*/
SUFFIX_MARKER_TYPE getStart() const {
return _start;
}
/*! Getter for interval end.
\returns end
*/
SUFFIX_MARKER_TYPE getEnd() const {
return _end;
}

View File

@ -7,10 +7,21 @@
/*!
Class representing matched pattern fragment in concordia search.
This fragment can be seen as an interval of the pattern.
This class holds information about:
- where the pattern fragment was matched (example id and example offset)
- where the fragment is located within the pattern
(patternOffset, matchedLength)
*/
class MatchedPatternFragment : public Interval {
public:
/*! Constructor.
\param exampleId id of the example where the pattern fragment was matched
\param exampleOffset offset of the matched fragment in the example
\param patternOffset offset of the matched fragment in the pattern
\param matchedLength length of the matched pattern
*/
MatchedPatternFragment(const SUFFIX_MARKER_TYPE & exampleId,
const SUFFIX_MARKER_TYPE & exampleOffset,
const SUFFIX_MARKER_TYPE & patternOffset,
@ -19,22 +30,37 @@ public:
*/
virtual ~MatchedPatternFragment();
/*! Getter for example id.
\returns example id
*/
SUFFIX_MARKER_TYPE getExampleId() const {
return _exampleId;
}
/*! Getter for example offset.
\returns example offset
*/
SUFFIX_MARKER_TYPE getExampleOffset() const {
return _exampleOffset;
}
/*! Getter for pattern offset.
\returns pattern offset
*/
SUFFIX_MARKER_TYPE getPatternOffset() const {
return _patternOffset;
}
/*! Getter for matched length.
\returns matched fragment length
*/
SUFFIX_MARKER_TYPE getMatchedLength() const {
return _matchedLength;
}
/*! Operator for comparing fragments by their length.
\returns true if this fragment is longer than the other
*/
bool operator > (const MatchedPatternFragment & other) const {
return (_matchedLength > other.getMatchedLength());
}

View File

@ -9,15 +9,23 @@
#include <boost/regex/icu.hpp>
/*!
Class for replacing string occurrences.
*/
typedef boost::error_info<struct my_tag, std::string> my_tag_error_info;
/*!
Class for representing a regular expression replacement operation.
Holds regex pattern string for matching and replacement string for
replacing found matches.
*/
class RegexReplacement {
public:
/*!
Constructor.
\param patternString regex pattern to match
\param replacement string to substitute the found match
\param caseSensitive case sensitivity of the pattern
*/
RegexReplacement(std::string patternString, std::string replacement,
bool caseSensitive = true)
throw(ConcordiaException);
@ -26,6 +34,10 @@ public:
*/
virtual ~RegexReplacement();
/*! Applies the operation on input string.
\param text the input string
\returns altered version of the input string
*/
std::string apply(const std::string & text);
private:

View File

@ -12,12 +12,20 @@
/*!
Class for anonymizing sentence before adding to index.
Class for anonymizing sentence before generating hash.
This operation is used to
remove unnecessary symbols and possibly words from sentences added to index
and search patterns. Anonymizer removes HTML tags, substitutes predefined symbols
with a single space, removes stop words (if the option is enabled), as well as
named entities and special symbols. All these have to be listed in files
(see \ref tutorial3).
*/
class SentenceAnonymizer {
public:
/*! Constructor.
\param config config object, holding paths to necessary files
*/
explicit SentenceAnonymizer(boost::shared_ptr<ConcordiaConfig> config)
throw(ConcordiaException);
@ -25,6 +33,10 @@ public:
*/
virtual ~SentenceAnonymizer();
/*! Anonymizes the sentence.
\param sentence input sentence
\returns altered version of the input sentence
*/
std::string anonymize(const std::string & sentence);
private:

View File

@ -6,15 +6,32 @@
/*!
Class representing occurrence of a searched substring.
It holds the following information:
- id of the example where the substring was found
- offset of the matched substring in this example
- length of the example
*/
class SubstringOccurence {
public:
/*!
Constructor.
*/
SubstringOccurence();
/*!
Constructor taking data from a marker.
\param marker marker to read the data from
*/
explicit SubstringOccurence(const SUFFIX_MARKER_TYPE & marker);
/*!
Constructor with three arguments.
\param id example id
\param offset offset of the substring in the example
\param exampleLength length of the example
*/
SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
const SUFFIX_MARKER_TYPE & offset,
const SUFFIX_MARKER_TYPE & exampleLength);
@ -22,18 +39,30 @@ public:
*/
virtual ~SubstringOccurence();
/*! Getter for example id.
\returns example id
*/
SUFFIX_MARKER_TYPE getId() const {
return _id;
}
/*! Getter for example offset.
\returns example offset
*/
SUFFIX_MARKER_TYPE getOffset() const {
return _offset;
}
/*! Getter for example length.
\returns example length
*/
SUFFIX_MARKER_TYPE getExampleLength() const {
return _exampleLength;
}
/*! Setter of all the fields, based on input marker.
\param marker marker to read the data from
*/
void enterDataFromMarker(const SUFFIX_MARKER_TYPE & marker);
private:

View File

@ -11,13 +11,25 @@
/*!
Class used within Anubis search algorithm to store partial results.
Holds information about mutual overlay of the pattern and found
example.
*/
class TmMatches {
public:
/*!
Constructor.
*/
TmMatches();
/*!
Constructor setting basic information.
\param exampleId id of found example
\param exampleSize size of the found example
\param patternSize size of the searched pattern
*/
TmMatches(const SUFFIX_MARKER_TYPE exampleId,
const SUFFIX_MARKER_TYPE exampleSize,
const SUFFIX_MARKER_TYPE patternSize);
@ -26,28 +38,74 @@ public:
*/
virtual ~TmMatches();
/*!
Getter for score of the mutual overlay.
\returns score
*/
double getScore() const {
return _score;
}
/*!
Getter for the list of overlays of the example.
\returns example overlays list
*/
std::vector<Interval> getExampleIntervals() const {
return _exampleMatchedRegions;
}
/*!
Getter for the list of overlays of the pattern.
\returns pattern overlays list
*/
std::vector<Interval> getPatternIntervals() const {
return _patternMatchedRegions;
}
/*!
Getter for example id.
\returns example id
*/
SUFFIX_MARKER_TYPE getExampleId() const {
return _exampleId;
}
/*!
Calculates mutual overlay score in the scale [0,1].
Uses generalized Jaccard index for the computation.
Score 1 - perfect score - is assigned when the whole pattern
and the whole example are covered. Result of the computation
is stored in the score field, use getScore() to retrieve it.
*/
void calculateSimpleScore();
/*!
Calculates mutual overlay score in the scale [0,1].
Takes into account the number and the length of the
fragments (the fewer fragments, the better).
Score 1 - perfect score - is assigned when the whole pattern
and the whole example are covered with only one fragment.
Result of the computation is stored in the score field,
use getScore() to retrieve it.
*/
void calculateScore();
/*!
Adds information about covering of example. If the new
fragment intersects with any previous fragment, it is
not added.
\param start start of the example overlay fragment
\param end end of the example overlay fragment
*/
void addExampleInterval(int start, int end);
/*!
Adds information about covering of pattern. If the new
fragment intersects with any previous fragment, it is
not added.
\param start start of the pattern overlay fragment
\param end end of the pattern overlay fragment
*/
void addPatternInterval(int start, int end);
private:

View File

@ -10,18 +10,30 @@
#include <boost/serialization/map.hpp>
/*!
Class representing dictionary for word to int encoding.
Class representing dictionary for word to integer encoding.
*/
class WordMap {
public:
/*!
Constructor.
*/
explicit WordMap() throw(ConcordiaException);
/*! Destructor.
*/
virtual ~WordMap();
/*!
Gets the integer code of a token. If the token is found in
the dictionary, the dictionary code is returned. If not,
the word is added to the dictionary and its newly created
code is returned.
\param word token to generate the code
\returns code of the token
*/
INDEX_CHARACTER_TYPE getWordCode(const std::string & word)
throw(ConcordiaException);