finished original word positions

This commit is contained in:
rjawor 2015-06-27 12:40:24 +02:00
parent a8c5fa0c75
commit 5a57406875
22 changed files with 238 additions and 150 deletions

View File

@ -28,7 +28,8 @@ void checkConcordiaResults(
long baseLineCount) { long baseLineCount) {
long lineIndex = 1; long lineIndex = 1;
BOOST_FOREACH(ConcordiaSearchResult result, results) { BOOST_FOREACH(ConcordiaSearchResult result, results) {
SUFFIX_MARKER_TYPE patternSize = result.getTokenizedPattern()->getTokens().size(); SUFFIX_MARKER_TYPE patternSize =
result.getTokenizedPattern()->getTokens().size();
if (patternSize > 0) { if (patternSize > 0) {
if (result.getBestOverlay().size() != 1) { if (result.getBestOverlay().size() != 1) {
reportError(baseLineCount + lineIndex, reportError(baseLineCount + lineIndex,
@ -37,7 +38,7 @@ void checkConcordiaResults(
if (result.getBestOverlay().at(0).getMatchedLength() if (result.getBestOverlay().at(0).getMatchedLength()
!= patternSize) { != patternSize) {
reportError(baseLineCount + lineIndex, reportError(baseLineCount + lineIndex,
"best overlay fragment has different size than pattern."); "best overlay fragment has different size than pattern.");
} }
if (result.getBestOverlayScore() != 1) { if (result.getBestOverlayScore() != 1) {
reportError(baseLineCount + lineIndex, reportError(baseLineCount + lineIndex,
@ -201,7 +202,8 @@ int main(int argc, char** argv) {
msdiff = time_end - time_start; msdiff = time_end - time_start;
std::cout << "\tPattern used: " << std::endl << "\t\t"; std::cout << "\tPattern used: " << std::endl << "\t\t";
BOOST_FOREACH(TokenAnnotation annotation, result->getTokenizedPattern()->getTokens()) { BOOST_FOREACH(TokenAnnotation annotation,
result->getTokenizedPattern()->getTokens()) {
std::cout << annotation.getValue() << " "; std::cout << annotation.getValue() << " ";
} }
std::cout << std::endl; std::cout << std::endl;

View File

@ -9,6 +9,7 @@ Before you compile, make sure you have these installed:
- cmake - cmake
- Boost library - Boost library
- Log4cpp - Log4cpp
- ICU
- (optional) Doxygen - (optional) Doxygen
- (optional) TeX - (optional) TeX

View File

@ -44,15 +44,17 @@ std::string _createLibraryVersion() {
// Sentences are written to disk and added to T. // Sentences are written to disk and added to T.
// SA is generated on command by other methods. // SA is generated on command by other methods.
boost::shared_ptr<TokenizedSentence> Concordia::addExample(const Example & example) boost::shared_ptr<TokenizedSentence> Concordia::addExample(
throw(ConcordiaException) { const Example & example)
throw(ConcordiaException) {
return _index->addExample(_hashGenerator, _T, _markers, example); return _index->addExample(_hashGenerator, _T, _markers, example);
} }
// Sentences are written to disk and added to T. // Sentences are written to disk and added to T.
// SA is generated on command by other methods. // SA is generated on command by other methods.
std::vector<TokenizedSentence> Concordia::addAllExamples(const std::vector<Example> & examples) std::vector<TokenizedSentence> Concordia::addAllExamples(
throw(ConcordiaException) { const std::vector<Example> & examples)
throw(ConcordiaException) {
return _index->addAllExamples(_hashGenerator, _T, _markers, examples); return _index->addAllExamples(_hashGenerator, _T, _markers, examples);
} }
@ -165,7 +167,8 @@ boost::shared_ptr<ConcordiaSearchResult> Concordia::concordiaSearch(
} else { } else {
std::string empty; std::string empty;
return boost::shared_ptr<ConcordiaSearchResult>( return boost::shared_ptr<ConcordiaSearchResult>(
new ConcordiaSearchResult(boost::shared_ptr<TokenizedSentence>(new TokenizedSentence(empty)))); new ConcordiaSearchResult(boost::shared_ptr<TokenizedSentence>(
new TokenizedSentence(empty))));
} }
} }
@ -182,4 +185,3 @@ void Concordia::clearIndex() throw(ConcordiaException) {
boost::filesystem::remove(_config->getMarkersFilePath()); boost::filesystem::remove(_config->getMarkersFilePath());
} }

View File

@ -54,16 +54,22 @@ public:
/*! Adds an Example to the index. /*! Adds an Example to the index.
\param example example to be added \param example example to be added
\returns tokenized sentence object,
containing information about original word positions
\throws ConcordiaException \throws ConcordiaException
*/ */
boost::shared_ptr<TokenizedSentence> addExample(const Example & example) throw(ConcordiaException); boost::shared_ptr<TokenizedSentence> addExample(const Example & example)
throw(ConcordiaException);
/*! Adds multiple examples to the index. /*! Adds multiple examples to the index.
\param examples vector of examples to be added \param examples vector of examples to be added
\returns vector of tokenized sentence objects,
containing information about original word positions
\throws ConcordiaException \throws ConcordiaException
*/ */
std::vector<TokenizedSentence> addAllExamples(const std::vector<Example> & examples) std::vector<TokenizedSentence> addAllExamples(
throw(ConcordiaException); const std::vector<Example> & examples)
throw(ConcordiaException);
/*! Performs a simple substring lookup on the index. /*! Performs a simple substring lookup on the index.
For more info see \ref tutorial1_2. For more info see \ref tutorial1_2.

View File

@ -25,7 +25,7 @@ boost::shared_ptr<std::vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
} }
boost::shared_ptr<std::vector<saidx_t> > result = boost::shared_ptr<std::vector<saidx_t> > result =
boost::shared_ptr<std::vector<saidx_t> >(new std::vector<saidx_t>); boost::shared_ptr<std::vector<saidx_t> >(new std::vector<saidx_t>);
for (int i = 0; i < T->size(); i++) { for (int i = 0; i < T->size(); i++) {
result->push_back(SA_array[i]); result->push_back(SA_array[i]);
} }
@ -48,7 +48,8 @@ std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
std::vector<TokenizedSentence> hashedPatterns; std::vector<TokenizedSentence> hashedPatterns;
BOOST_FOREACH(Example example, examples) { BOOST_FOREACH(Example example, examples) {
boost::shared_ptr<TokenizedSentence> hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator, boost::shared_ptr<TokenizedSentence> hashedPattern =
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, example); T, markers, example);
hashedPatterns.push_back(*hashedPattern); hashedPatterns.push_back(*hashedPattern);
} }
@ -71,7 +72,8 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample(
std::ofstream markersFile; std::ofstream markersFile;
markersFile.open(_markersFilePath.c_str(), std::ios::out| markersFile.open(_markersFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary); std::ios::app|std::ios::binary);
boost::shared_ptr<TokenizedSentence> hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator, boost::shared_ptr<TokenizedSentence> hashedPattern =
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, example); T, markers, example);
hashedIndexFile.close(); hashedIndexFile.close();
markersFile.close(); markersFile.close();
@ -87,7 +89,8 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example) { const Example & example) {
boost::shared_ptr<TokenizedSentence> hashedPattern = hashGenerator->generateHash(example.getSentence()); boost::shared_ptr<TokenizedSentence> hashedPattern =
hashGenerator->generateHash(example.getSentence());
std::vector<INDEX_CHARACTER_TYPE> hash = hashedPattern->getCodes(); std::vector<INDEX_CHARACTER_TYPE> hash = hashedPattern->getCodes();
int offset = 0; int offset = 0;

View File

@ -44,11 +44,13 @@ public:
and markers array (also passed to this method) are appended and markers array (also passed to this method) are appended
with the hashed example. At the same time, HDD versions of these with the hashed example. At the same time, HDD versions of these
two data structures are also appended with the same example. two data structures are also appended with the same example.
The method returns a tokenized version of the example.
\param hashGenerator hash generator to be used to prepare the hash \param hashGenerator hash generator to be used to prepare the hash
of the example of the example
\param T RAM-based hash index to be appended to \param T RAM-based hash index to be appended to
\param markers RAM-based markers array to be appended to \param markers RAM-based markers array to be appended to
\param example example to be added to index \param example example to be added to index
\returns tokenized example
\throws ConcordiaException \throws ConcordiaException
*/ */
boost::shared_ptr<TokenizedSentence> addExample( boost::shared_ptr<TokenizedSentence> addExample(
@ -62,11 +64,13 @@ public:
and markers array (also passed to this method) are appended and markers array (also passed to this method) are appended
with the hashed examples. At the same time, HDD versions of these with the hashed examples. At the same time, HDD versions of these
two data structures are also appended with the same examples. two data structures are also appended with the same examples.
The method returns a vector of tokenized examples.
\param hashGenerator hash generator to be used to prepare the hash \param hashGenerator hash generator to be used to prepare the hash
of the example of the example
\param T RAM-based hash index to be appended to \param T RAM-based hash index to be appended to
\param markers RAM-based markers array to be appended to \param markers RAM-based markers array to be appended to
\param examples vector of examples to be added to index \param examples vector of examples to be added to index
\returns vector of tokenized examples
\throws ConcordiaException \throws ConcordiaException
*/ */
std::vector<TokenizedSentence> addAllExamples( std::vector<TokenizedSentence> addAllExamples(
@ -83,7 +87,8 @@ public:
boost::shared_ptr<std::vector<sauchar_t> > T); boost::shared_ptr<std::vector<sauchar_t> > T);
private: private:
boost::shared_ptr<TokenizedSentence> _addSingleExample(std::ofstream & hashedIndexFile, boost::shared_ptr<TokenizedSentence> _addSingleExample(
std::ofstream & hashedIndexFile,
std::ofstream & markersFile, std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,

View File

@ -29,7 +29,8 @@ HashGenerator::~HashGenerator() {
boost::shared_ptr<TokenizedSentence> HashGenerator::generateHash( boost::shared_ptr<TokenizedSentence> HashGenerator::generateHash(
const std::string & sentence) throw(ConcordiaException) { const std::string & sentence) throw(ConcordiaException) {
boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence); boost::shared_ptr<TokenizedSentence> ts =
_sentenceTokenizer->tokenize(sentence);
ts->generateHash(_wordMap); ts->generateHash(_wordMap);
if (ts->getTokens().size() > Utils::maxSentenceSize) { if (ts->getTokens().size() > Utils::maxSentenceSize) {

View File

@ -15,14 +15,14 @@
/*! /*!
Class for generating a sentence hash. The hash is generated from a sentence Class for generating a sentence hash. The hash is generated from a sentence
given in raw string. String is first anonymized and tokenized. After these given in raw string. String is first tokenized by SentenceTokenizer and
operations, each token is coded as an integer, according to WordMap. then each token is coded as an integer, according to WordMap.
Resulting hash is a vector of integers. Resulting hash is an instance of TokenizedSentence.
Sentence hashed is used when adding a sentence to index and during searching. Hashed sentence is used when adding a sentence to index and during searching.
HashGenerator holds an instance of WordMap, used to code tokens as integers HashGenerator holds an instance of WordMap, used to code tokens as integers
and SentenceAnonymizer, used to preprocess the sentence string. and SentenceTokenizer, used to tokenize the sentence string.
*/ */
@ -42,9 +42,10 @@ public:
/*! /*!
Generates hash of a sentence. Generates hash of a sentence.
\param sentence sentence to generate hash from \param sentence sentence to generate hash from
\returns vector of integers \returns tokenized sentence, containing the hash
*/ */
boost::shared_ptr<TokenizedSentence> generateHash(const std::string & sentence) boost::shared_ptr<TokenizedSentence> generateHash(
const std::string & sentence)
throw(ConcordiaException); throw(ConcordiaException);
/*! /*!

View File

@ -73,6 +73,7 @@ boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
boost::shared_ptr<ConcordiaSearchResult>( boost::shared_ptr<ConcordiaSearchResult>(
new ConcordiaSearchResult(hashedPattern)); new ConcordiaSearchResult(hashedPattern));
_concordiaSearcher->concordiaSearch(result, T, markers, SA, hashedPattern->getCodes()); _concordiaSearcher->concordiaSearch(result, T, markers,
SA, hashedPattern->getCodes());
return result; return result;
} }

View File

@ -51,9 +51,12 @@ public:
return _end; return _end;
} }
friend std::ostream & operator << (std::ostream & o, const Interval & interval) { friend std::ostream & operator << (std::ostream & o,
return o << "[" << interval.getStart() << "," << interval.getEnd() << ")"; const Interval & interval) {
return o << "[" << interval.getStart()
<< "," << interval.getEnd() << ")";
} }
protected: protected:
SUFFIX_MARKER_TYPE _start; SUFFIX_MARKER_TYPE _start;

View File

@ -6,7 +6,7 @@
/*! /*!
Class representing matched pattern fragment in concordia search. Class representing matched pattern fragment in concordia search.
This fragment can be seen as an interval of the pattern. This fragment can be seen as a word interval of the pattern.
This class holds information about: This class holds information about:
- where the pattern fragment was matched (example id and example offset) - where the pattern fragment was matched (example id and example offset)

View File

@ -13,9 +13,11 @@ RegexRule::RegexRule(std::string patternString,
_value(value) { _value(value) {
try { try {
if (caseSensitive) { if (caseSensitive) {
_pattern = boost::make_u32regex(UnicodeString(patternString.c_str())); _pattern = boost::make_u32regex(
UnicodeString(patternString.c_str()));
} else { } else {
_pattern = boost::make_u32regex(UnicodeString(patternString.c_str()), boost::regex::icase); _pattern = boost::make_u32regex(
UnicodeString(patternString.c_str()), boost::regex::icase);
} }
} catch(const std::exception & e) { } catch(const std::exception & e) {
std::stringstream ss; std::stringstream ss;
@ -37,7 +39,8 @@ RegexRule::~RegexRule() {
void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) { void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
try { try {
UnicodeString s(sentence->getSentence().c_str()); UnicodeString s(sentence->getSentence().c_str());
boost::u32regex_iterator<const UChar*> begin(boost::make_u32regex_iterator(s, _pattern)); boost::u32regex_iterator<const UChar*> begin(
boost::make_u32regex_iterator(s, _pattern));
boost::u32regex_iterator<const UChar*> end; boost::u32regex_iterator<const UChar*> end;
std::vector<TokenAnnotation> annotations; std::vector<TokenAnnotation> annotations;
for (; begin != end; ++begin) { for (; begin != end; ++begin) {
@ -51,14 +54,16 @@ void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
} else { } else {
value = _value; value = _value;
} }
TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, value); TokenAnnotation annotation(matchBegin, matchEnd,
_annotationType, value);
annotations.push_back(annotation); annotations.push_back(annotation);
} }
sentence->addAnnotations(annotations); sentence->addAnnotations(annotations);
} catch(const std::exception & e) { } catch(const std::exception & e) {
std::stringstream ss; std::stringstream ss;
ss << "Exception while applying regex rule: " ss << "Exception while applying regex rule: "
<< _annotationType << " to text: " << sentence->getSentence(); << _annotationType << " to text: "
<< sentence->getSentence();
ss << ", message: " << e.what(); ss << ", message: " << e.what();
throw ConcordiaException(ss.str()); throw ConcordiaException(ss.str());
} }

View File

@ -15,8 +15,9 @@ typedef boost::error_info<struct my_tag, std::string> my_tag_error_info;
/*! /*!
Class for representing a regular expression annotation rule. Class for representing a regular expression annotation rule.
Holds regex pattern string for matching and replacement string for Holds regex pattern string for matching and default value to assign
annotating found matches. to the annotations. Rule also has a type, given to all annotations
produced by it.
*/ */
class RegexRule { class RegexRule {
@ -25,6 +26,7 @@ public:
Constructor. Constructor.
\param patternString regex pattern to match \param patternString regex pattern to match
\param annoationType type of annotation \param annoationType type of annotation
\param value value to be assigned to the annotation
\param caseSensitive case sensitivity of the pattern \param caseSensitive case sensitivity of the pattern
*/ */
RegexRule(std::string patternString, RegexRule(std::string patternString,
@ -37,7 +39,7 @@ public:
*/ */
virtual ~RegexRule(); virtual ~RegexRule();
/*! Applies the operation on anonymized sentence. /*! Applies regex annotation on tokenized sentence.
\param sentence the input sentence \param sentence the input sentence
*/ */
void apply(boost::shared_ptr<TokenizedSentence> sentence); void apply(boost::shared_ptr<TokenizedSentence> sentence);

View File

@ -42,7 +42,8 @@ boost::shared_ptr<TokenizedSentence>
} }
boost::shared_ptr<RegexRule> wordsRule( boost::shared_ptr<RegexRule> wordsRule(
new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}", TokenAnnotation::WORD, "")); new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}",
TokenAnnotation::WORD, ""));
wordsRule->apply(result); wordsRule->apply(result);
boost::shared_ptr<RegexRule> singleLetterWordsRule( boost::shared_ptr<RegexRule> singleLetterWordsRule(
new RegexRule("\\p{L}", TokenAnnotation::WORD, "")); new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
@ -103,7 +104,8 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1); tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
tagsExpression += "br).*?>"; tagsExpression += "br).*?>";
_htmlTags = boost::shared_ptr<RegexRule>( _htmlTags = boost::shared_ptr<RegexRule>(
new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG, "", false)); new RegexRule(tagsExpression,
TokenAnnotation::HTML_TAG, "", false));
} }
boost::shared_ptr<RegexRule> boost::shared_ptr<RegexRule>
@ -137,6 +139,6 @@ boost::shared_ptr<RegexRule>
expression = expression.substr(0, expression.size()-1); expression = expression.substr(0, expression.size()-1);
expression += ")"; expression += ")";
return boost::shared_ptr<RegexRule>( return boost::shared_ptr<RegexRule>(
new RegexRule(expression, annotationType, value, false)); new RegexRule(expression, annotationType, value, false));
} }

View File

@ -14,10 +14,9 @@
/*! /*!
Class for tokenizing sentence before generating hash. Class for tokenizing sentence before generating hash.
This operation is is used to Tokenizer ignores unnecessary symbols, html tags and possibly stop words
remove unnecessary symbols and possibly words from sentences added to index (if the option is enabled) in sentences added to index
and search patterns. Tokenizer annotates html tags, removes stop words (if the option is enabled), as well as annotates named entities. All these have to be listed in files
as well as annotates named entities and special symbols. All these have to be listed in files
(see \ref tutorial3). (see \ref tutorial3).
*/ */
@ -35,7 +34,7 @@ public:
/*! Tokenizes the sentence. /*! Tokenizes the sentence.
\param sentence input sentence \param sentence input sentence
\returns altered version of the input sentence \returns tokenized sentence object build on the input sentence
*/ */
boost::shared_ptr<TokenizedSentence> boost::shared_ptr<TokenizedSentence>
tokenize(const std::string & sentence); tokenize(const std::string & sentence);
@ -58,7 +57,6 @@ private:
bool _stopWordsEnabled; bool _stopWordsEnabled;
boost::shared_ptr<RegexRule> _stopWords; boost::shared_ptr<RegexRule> _stopWords;
}; };
#endif #endif

View File

@ -21,19 +21,19 @@ BOOST_AUTO_TEST_CASE( SimpleAnnotation )
BOOST_CHECK_EQUAL(iter->getStart(),7); BOOST_CHECK_EQUAL(iter->getStart(),7);
BOOST_CHECK_EQUAL(iter->getEnd(),8); BOOST_CHECK_EQUAL(iter->getEnd(),8);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),11); BOOST_CHECK_EQUAL(iter->getStart(),11);
BOOST_CHECK_EQUAL(iter->getEnd(),12); BOOST_CHECK_EQUAL(iter->getEnd(),12);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),14); BOOST_CHECK_EQUAL(iter->getStart(),14);
BOOST_CHECK_EQUAL(iter->getEnd(),15); BOOST_CHECK_EQUAL(iter->getEnd(),15);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),16); BOOST_CHECK_EQUAL(iter->getStart(),16);
BOOST_CHECK_EQUAL(iter->getEnd(),17); BOOST_CHECK_EQUAL(iter->getEnd(),17);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),18); BOOST_CHECK_EQUAL(iter->getStart(),18);
BOOST_CHECK_EQUAL(iter->getEnd(),19); BOOST_CHECK_EQUAL(iter->getEnd(),19);
@ -64,19 +64,19 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation )
BOOST_CHECK_EQUAL(iter->getStart(),3); BOOST_CHECK_EQUAL(iter->getStart(),3);
BOOST_CHECK_EQUAL(iter->getEnd(),4); BOOST_CHECK_EQUAL(iter->getEnd(),4);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),19); BOOST_CHECK_EQUAL(iter->getStart(),19);
BOOST_CHECK_EQUAL(iter->getEnd(),20); BOOST_CHECK_EQUAL(iter->getEnd(),20);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),21); BOOST_CHECK_EQUAL(iter->getStart(),21);
BOOST_CHECK_EQUAL(iter->getEnd(),22); BOOST_CHECK_EQUAL(iter->getEnd(),22);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),44); BOOST_CHECK_EQUAL(iter->getStart(),44);
BOOST_CHECK_EQUAL(iter->getEnd(),45); BOOST_CHECK_EQUAL(iter->getEnd(),45);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),45); BOOST_CHECK_EQUAL(iter->getStart(),45);
BOOST_CHECK_EQUAL(iter->getEnd(),46); BOOST_CHECK_EQUAL(iter->getEnd(),46);
@ -94,15 +94,15 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation )
BOOST_CHECK_EQUAL(iter->getStart(),8); BOOST_CHECK_EQUAL(iter->getStart(),8);
BOOST_CHECK_EQUAL(iter->getEnd(),11); BOOST_CHECK_EQUAL(iter->getEnd(),11);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),16); BOOST_CHECK_EQUAL(iter->getStart(),16);
BOOST_CHECK_EQUAL(iter->getEnd(),19); BOOST_CHECK_EQUAL(iter->getEnd(),19);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),24); BOOST_CHECK_EQUAL(iter->getStart(),24);
BOOST_CHECK_EQUAL(iter->getEnd(),27); BOOST_CHECK_EQUAL(iter->getEnd(),27);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),32); BOOST_CHECK_EQUAL(iter->getStart(),32);
BOOST_CHECK_EQUAL(iter->getEnd(),35); BOOST_CHECK_EQUAL(iter->getEnd(),35);
@ -132,7 +132,7 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation )
BOOST_CHECK_EQUAL(iter->getStart(),11); BOOST_CHECK_EQUAL(iter->getStart(),11);
BOOST_CHECK_EQUAL(iter->getEnd(),12); BOOST_CHECK_EQUAL(iter->getEnd(),12);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),29); BOOST_CHECK_EQUAL(iter->getStart(),29);
BOOST_CHECK_EQUAL(iter->getEnd(),30); BOOST_CHECK_EQUAL(iter->getEnd(),30);
@ -149,71 +149,71 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
BOOST_CHECK_EQUAL(iter->getStart(),2); BOOST_CHECK_EQUAL(iter->getStart(),2);
BOOST_CHECK_EQUAL(iter->getEnd(),3); BOOST_CHECK_EQUAL(iter->getEnd(),3);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),3); BOOST_CHECK_EQUAL(iter->getStart(),3);
BOOST_CHECK_EQUAL(iter->getEnd(),4); BOOST_CHECK_EQUAL(iter->getEnd(),4);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),4); BOOST_CHECK_EQUAL(iter->getStart(),4);
BOOST_CHECK_EQUAL(iter->getEnd(),5); BOOST_CHECK_EQUAL(iter->getEnd(),5);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),5); BOOST_CHECK_EQUAL(iter->getStart(),5);
BOOST_CHECK_EQUAL(iter->getEnd(),6); BOOST_CHECK_EQUAL(iter->getEnd(),6);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),8); BOOST_CHECK_EQUAL(iter->getStart(),8);
BOOST_CHECK_EQUAL(iter->getEnd(),9); BOOST_CHECK_EQUAL(iter->getEnd(),9);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),9); BOOST_CHECK_EQUAL(iter->getStart(),9);
BOOST_CHECK_EQUAL(iter->getEnd(),10); BOOST_CHECK_EQUAL(iter->getEnd(),10);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),11); BOOST_CHECK_EQUAL(iter->getStart(),11);
BOOST_CHECK_EQUAL(iter->getEnd(),12); BOOST_CHECK_EQUAL(iter->getEnd(),12);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),15); BOOST_CHECK_EQUAL(iter->getStart(),15);
BOOST_CHECK_EQUAL(iter->getEnd(),16); BOOST_CHECK_EQUAL(iter->getEnd(),16);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),16); BOOST_CHECK_EQUAL(iter->getStart(),16);
BOOST_CHECK_EQUAL(iter->getEnd(),17); BOOST_CHECK_EQUAL(iter->getEnd(),17);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),20); BOOST_CHECK_EQUAL(iter->getStart(),20);
BOOST_CHECK_EQUAL(iter->getEnd(),21); BOOST_CHECK_EQUAL(iter->getEnd(),21);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),21); BOOST_CHECK_EQUAL(iter->getStart(),21);
BOOST_CHECK_EQUAL(iter->getEnd(),22); BOOST_CHECK_EQUAL(iter->getEnd(),22);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),22); BOOST_CHECK_EQUAL(iter->getStart(),22);
BOOST_CHECK_EQUAL(iter->getEnd(),23); BOOST_CHECK_EQUAL(iter->getEnd(),23);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),23); BOOST_CHECK_EQUAL(iter->getStart(),23);
BOOST_CHECK_EQUAL(iter->getEnd(),24); BOOST_CHECK_EQUAL(iter->getEnd(),24);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),26); BOOST_CHECK_EQUAL(iter->getStart(),26);
BOOST_CHECK_EQUAL(iter->getEnd(),27); BOOST_CHECK_EQUAL(iter->getEnd(),27);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),27); BOOST_CHECK_EQUAL(iter->getStart(),27);
BOOST_CHECK_EQUAL(iter->getEnd(),28); BOOST_CHECK_EQUAL(iter->getEnd(),28);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),29); BOOST_CHECK_EQUAL(iter->getStart(),29);
BOOST_CHECK_EQUAL(iter->getEnd(),30); BOOST_CHECK_EQUAL(iter->getEnd(),30);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),33); BOOST_CHECK_EQUAL(iter->getStart(),33);
BOOST_CHECK_EQUAL(iter->getEnd(),34); BOOST_CHECK_EQUAL(iter->getEnd(),34);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),34); BOOST_CHECK_EQUAL(iter->getStart(),34);
BOOST_CHECK_EQUAL(iter->getEnd(),35); BOOST_CHECK_EQUAL(iter->getEnd(),35);

View File

@ -47,79 +47,79 @@ BOOST_AUTO_TEST_CASE( NETest )
BOOST_CHECK_EQUAL(iter->getEnd(),4); BOOST_CHECK_EQUAL(iter->getEnd(),4);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "date"); BOOST_CHECK_EQUAL(iter->getValue(), "date");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),6); BOOST_CHECK_EQUAL(iter->getStart(),6);
BOOST_CHECK_EQUAL(iter->getEnd(),16); BOOST_CHECK_EQUAL(iter->getEnd(),16);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_date"); BOOST_CHECK_EQUAL(iter->getValue(), "ne_date");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),18); BOOST_CHECK_EQUAL(iter->getStart(),18);
BOOST_CHECK_EQUAL(iter->getEnd(),22); BOOST_CHECK_EQUAL(iter->getEnd(),22);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "mail"); BOOST_CHECK_EQUAL(iter->getValue(), "mail");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),24); BOOST_CHECK_EQUAL(iter->getStart(),24);
BOOST_CHECK_EQUAL(iter->getEnd(),40); BOOST_CHECK_EQUAL(iter->getEnd(),40);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_email"); BOOST_CHECK_EQUAL(iter->getValue(), "ne_email");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),42); BOOST_CHECK_EQUAL(iter->getStart(),42);
BOOST_CHECK_EQUAL(iter->getEnd(),48); BOOST_CHECK_EQUAL(iter->getEnd(),48);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "number"); BOOST_CHECK_EQUAL(iter->getValue(), "number");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),50); BOOST_CHECK_EQUAL(iter->getStart(),50);
BOOST_CHECK_EQUAL(iter->getEnd(),54); BOOST_CHECK_EQUAL(iter->getEnd(),54);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number"); BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),56); BOOST_CHECK_EQUAL(iter->getStart(),56);
BOOST_CHECK_EQUAL(iter->getEnd(),61); BOOST_CHECK_EQUAL(iter->getEnd(),61);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "hello"); BOOST_CHECK_EQUAL(iter->getValue(), "hello");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),61); BOOST_CHECK_EQUAL(iter->getStart(),61);
BOOST_CHECK_EQUAL(iter->getEnd(),62); BOOST_CHECK_EQUAL(iter->getEnd(),62);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number"); BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),63); BOOST_CHECK_EQUAL(iter->getStart(),63);
BOOST_CHECK_EQUAL(iter->getEnd(),69); BOOST_CHECK_EQUAL(iter->getEnd(),69);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć"); BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),70); BOOST_CHECK_EQUAL(iter->getStart(),70);
BOOST_CHECK_EQUAL(iter->getEnd(),75); BOOST_CHECK_EQUAL(iter->getEnd(),75);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą"); BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),76); BOOST_CHECK_EQUAL(iter->getStart(),76);
BOOST_CHECK_EQUAL(iter->getEnd(),80); BOOST_CHECK_EQUAL(iter->getEnd(),80);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "jaźń"); BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),82); BOOST_CHECK_EQUAL(iter->getStart(),82);
BOOST_CHECK_EQUAL(iter->getEnd(),88); BOOST_CHECK_EQUAL(iter->getEnd(),88);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć"); BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),89); BOOST_CHECK_EQUAL(iter->getStart(),89);
BOOST_CHECK_EQUAL(iter->getEnd(),94); BOOST_CHECK_EQUAL(iter->getEnd(),94);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą"); BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),95); BOOST_CHECK_EQUAL(iter->getStart(),95);
BOOST_CHECK_EQUAL(iter->getEnd(),99); BOOST_CHECK_EQUAL(iter->getEnd(),99);
@ -156,52 +156,52 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
BOOST_CHECK_EQUAL(iter->getStart(),0); BOOST_CHECK_EQUAL(iter->getStart(),0);
BOOST_CHECK_EQUAL(iter->getEnd(),23); BOOST_CHECK_EQUAL(iter->getEnd(),23);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),23); BOOST_CHECK_EQUAL(iter->getStart(),23);
BOOST_CHECK_EQUAL(iter->getEnd(),27); BOOST_CHECK_EQUAL(iter->getEnd(),27);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"link"); BOOST_CHECK_EQUAL(iter->getValue(),"link");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),27); BOOST_CHECK_EQUAL(iter->getStart(),27);
BOOST_CHECK_EQUAL(iter->getEnd(),31); BOOST_CHECK_EQUAL(iter->getEnd(),31);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),32); BOOST_CHECK_EQUAL(iter->getStart(),32);
BOOST_CHECK_EQUAL(iter->getEnd(),35); BOOST_CHECK_EQUAL(iter->getEnd(),35);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"and"); BOOST_CHECK_EQUAL(iter->getValue(),"and");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),36); BOOST_CHECK_EQUAL(iter->getStart(),36);
BOOST_CHECK_EQUAL(iter->getEnd(),39); BOOST_CHECK_EQUAL(iter->getEnd(),39);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),39); BOOST_CHECK_EQUAL(iter->getStart(),39);
BOOST_CHECK_EQUAL(iter->getEnd(),43); BOOST_CHECK_EQUAL(iter->getEnd(),43);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"bold"); BOOST_CHECK_EQUAL(iter->getValue(),"bold");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),43); BOOST_CHECK_EQUAL(iter->getStart(),43);
BOOST_CHECK_EQUAL(iter->getEnd(),47); BOOST_CHECK_EQUAL(iter->getEnd(),47);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),48); BOOST_CHECK_EQUAL(iter->getStart(),48);
BOOST_CHECK_EQUAL(iter->getEnd(),51); BOOST_CHECK_EQUAL(iter->getEnd(),51);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"and"); BOOST_CHECK_EQUAL(iter->getValue(),"and");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),52); BOOST_CHECK_EQUAL(iter->getStart(),52);
BOOST_CHECK_EQUAL(iter->getEnd(),59); BOOST_CHECK_EQUAL(iter->getEnd(),59);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"newline"); BOOST_CHECK_EQUAL(iter->getValue(),"newline");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),60); BOOST_CHECK_EQUAL(iter->getStart(),60);
BOOST_CHECK_EQUAL(iter->getEnd(),65); BOOST_CHECK_EQUAL(iter->getEnd(),65);
@ -240,79 +240,79 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
BOOST_CHECK_EQUAL(iter->getEnd(),4); BOOST_CHECK_EQUAL(iter->getEnd(),4);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"this"); BOOST_CHECK_EQUAL(iter->getValue(),"this");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),5); BOOST_CHECK_EQUAL(iter->getStart(),5);
BOOST_CHECK_EQUAL(iter->getEnd(),7); BOOST_CHECK_EQUAL(iter->getEnd(),7);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"is"); BOOST_CHECK_EQUAL(iter->getValue(),"is");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),8); BOOST_CHECK_EQUAL(iter->getStart(),8);
BOOST_CHECK_EQUAL(iter->getEnd(),9); BOOST_CHECK_EQUAL(iter->getEnd(),9);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"a"); BOOST_CHECK_EQUAL(iter->getValue(),"a");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),10); BOOST_CHECK_EQUAL(iter->getStart(),10);
BOOST_CHECK_EQUAL(iter->getEnd(),18); BOOST_CHECK_EQUAL(iter->getEnd(),18);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"sentence"); BOOST_CHECK_EQUAL(iter->getValue(),"sentence");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),20); BOOST_CHECK_EQUAL(iter->getStart(),20);
BOOST_CHECK_EQUAL(iter->getEnd(),25); BOOST_CHECK_EQUAL(iter->getEnd(),25);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"don't"); BOOST_CHECK_EQUAL(iter->getValue(),"don't");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),26); BOOST_CHECK_EQUAL(iter->getStart(),26);
BOOST_CHECK_EQUAL(iter->getEnd(),38); BOOST_CHECK_EQUAL(iter->getEnd(),38);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"over-analyze"); BOOST_CHECK_EQUAL(iter->getValue(),"over-analyze");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),39); BOOST_CHECK_EQUAL(iter->getStart(),39);
BOOST_CHECK_EQUAL(iter->getEnd(),41); BOOST_CHECK_EQUAL(iter->getEnd(),41);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"it"); BOOST_CHECK_EQUAL(iter->getValue(),"it");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),43); BOOST_CHECK_EQUAL(iter->getStart(),43);
BOOST_CHECK_EQUAL(iter->getEnd(),49); BOOST_CHECK_EQUAL(iter->getEnd(),49);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"zażółć"); BOOST_CHECK_EQUAL(iter->getValue(),"zażółć");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),51); BOOST_CHECK_EQUAL(iter->getStart(),51);
BOOST_CHECK_EQUAL(iter->getEnd(),57); BOOST_CHECK_EQUAL(iter->getEnd(),57);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"gęś'lą"); BOOST_CHECK_EQUAL(iter->getValue(),"gęś'lą");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),59); BOOST_CHECK_EQUAL(iter->getStart(),59);
BOOST_CHECK_EQUAL(iter->getEnd(),63); BOOST_CHECK_EQUAL(iter->getEnd(),63);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"jaźń"); BOOST_CHECK_EQUAL(iter->getValue(),"jaźń");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),64); BOOST_CHECK_EQUAL(iter->getStart(),64);
BOOST_CHECK_EQUAL(iter->getEnd(),71); BOOST_CHECK_EQUAL(iter->getEnd(),71);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"zaż-ółć"); BOOST_CHECK_EQUAL(iter->getValue(),"zaż-ółć");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),72); BOOST_CHECK_EQUAL(iter->getStart(),72);
BOOST_CHECK_EQUAL(iter->getEnd(),77); BOOST_CHECK_EQUAL(iter->getEnd(),77);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"gęślą"); BOOST_CHECK_EQUAL(iter->getValue(),"gęślą");
iter++; ++iter;
BOOST_CHECK_EQUAL(iter->getStart(),78); BOOST_CHECK_EQUAL(iter->getStart(),78);
BOOST_CHECK_EQUAL(iter->getEnd(),83); BOOST_CHECK_EQUAL(iter->getEnd(),83);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"jaź'ń"); BOOST_CHECK_EQUAL(iter->getValue(),"jaź'ń");
iter++; ++iter;
} }

View File

@ -7,7 +7,7 @@
#include <string> #include <string>
/*! /*!
Class representing annotatio of char sequence as a token. Class representing annotation of char sequence as a token.
It is a type of interval that is also storing information It is a type of interval that is also storing information
about the annotation type and value. about the annotation type and value.
@ -18,7 +18,7 @@ public:
/*! Constructor. /*! Constructor.
\param start start index of the annotation (char-level, 0-based) \param start start index of the annotation (char-level, 0-based)
\param end end index of the annotation (char-level, 0-based) \param end end index of the annotation (char-level, 0-based)
\param type annotation type \param annotationType annotation type
\param value annotation value \param value annotation value
*/ */
TokenAnnotation(const SUFFIX_MARKER_TYPE start, TokenAnnotation(const SUFFIX_MARKER_TYPE start,
@ -44,12 +44,20 @@ public:
return _value; return _value;
} }
/*! Named entity annotation type
*/
static int NE; static int NE;
/*! Word annotation type
*/
static int WORD; static int WORD;
/*! Html tag annotation type
*/
static int HTML_TAG; static int HTML_TAG;
/*! Stop word annotation type
*/
static int STOP_WORD; static int STOP_WORD;
protected: protected:

View File

@ -11,37 +11,43 @@ TokenizedSentence::TokenizedSentence(std::string sentence):
TokenizedSentence::~TokenizedSentence() { TokenizedSentence::~TokenizedSentence() {
} }
void TokenizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) { void TokenizedSentence::addAnnotations(
std::vector<TokenAnnotation>::iterator newAnnotation = annotations.begin(); std::vector<TokenAnnotation> annotations) {
std::list<TokenAnnotation>::iterator existingAnnotation = _tokenAnnotations.begin(); std::vector<TokenAnnotation>::iterator newAnnotation =
annotations.begin();
std::list<TokenAnnotation>::iterator existingAnnotation =
_tokenAnnotations.begin();
while(newAnnotation != annotations.end()) { while (newAnnotation != annotations.end()) {
if (existingAnnotation != _tokenAnnotations.end()) { if (existingAnnotation != _tokenAnnotations.end()) {
// there are still some existing annotations, so perform checks // there are still some existing annotations, so perform checks
if (newAnnotation->intersects(*existingAnnotation)) { if (newAnnotation->intersects(*existingAnnotation)) {
// The new annotation intersects with the existing. // The new annotation intersects with the existing.
// We can not add it, so let us just move on to the // We can not add it, so let us just move on to the
// next new annotation. // next new annotation.
newAnnotation++; ++newAnnotation;
} else { } else {
// it is now important whether the new interval is before // it is now important whether the new interval is before
// or after existing // or after existing
if (newAnnotation->getStart() < existingAnnotation->getStart()) { if (newAnnotation->getStart() <
// New interval does not intersect and is before existing. We add it. existingAnnotation->getStart()) {
_tokenAnnotations.insert(existingAnnotation, *newAnnotation); // New interval does not intersect and is
newAnnotation++; // before existing. We add it.
_tokenAnnotations.insert(existingAnnotation,
*newAnnotation);
++newAnnotation;
} else { } else {
// If the new interval is after existing we move to the next existing annotation. // If the new interval is after existing
existingAnnotation++; // we move to the next existing annotation.
++existingAnnotation;
} }
} }
} else { } else {
// no more existing annotations, so just add the new annotation // no more existing annotations, so just add the new annotation
_tokenAnnotations.push_back(*newAnnotation); _tokenAnnotations.push_back(*newAnnotation);
newAnnotation++; ++newAnnotation;
} }
} }
} }
void TokenizedSentence::toLowerCase() { void TokenizedSentence::toLowerCase() {
@ -56,6 +62,5 @@ void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
_tokens.push_back(annotation); _tokens.push_back(annotation);
} }
} }
} }

View File

@ -11,9 +11,12 @@
#include <list> #include <list>
/*! /*!
A sentence after anonymization operations. The class A sentence after tokenizing operations. The class
holds the current string representation of the sentence holds the current string representation of the sentence
along with the annotations list. along with the annotations list. The class also allows
for generating hash. After that operation the class
also holds the list of hashed codes and corresponding
tokens.
*/ */
class TokenizedSentence { class TokenizedSentence {
@ -22,7 +25,7 @@ public:
Constructor. Constructor.
*/ */
TokenizedSentence(std::string sentence); explicit TokenizedSentence(std::string sentence);
/*! Destructor. /*! Destructor.
*/ */
@ -35,21 +38,40 @@ public:
return _sentence; return _sentence;
} }
/*! Getter for annotations list /*! Getter for all annotations list. This method returns
all annotations, including those which are not considered
in the hash, i.e. stop words and html tags.
\returns annotations list \returns annotations list
*/ */
std::list<TokenAnnotation> getAnnotations() const { std::list<TokenAnnotation> getAnnotations() const {
return _tokenAnnotations; return _tokenAnnotations;
} }
/*! Getter for codes list. This data is available after calling
the generateHash method.
\returns codes list
*/
std::vector<INDEX_CHARACTER_TYPE> getCodes() const { std::vector<INDEX_CHARACTER_TYPE> getCodes() const {
return _codes; return _codes;
} }
/*! Getter for tokens list. This method returns
only those annotations considered
in the hash, i.e. words and named entities.
\returns tokens list
*/
std::vector<TokenAnnotation> getTokens() const { std::vector<TokenAnnotation> getTokens() const {
return _tokens; return _tokens;
} }
/*! Method for generating hash based on annotations.
This method takes into account annotations of type
word and named entity. These are encoded and added
to the code list. Annotations corresponding to these
tokens are added to the tokens list.
\param wordMap word map to use when encoding tokens
\returns tokens list
*/
void generateHash(boost::shared_ptr<WordMap> wordMap); void generateHash(boost::shared_ptr<WordMap> wordMap);
/*! /*!

View File

@ -98,12 +98,15 @@ Concordia is equipped with a unique functionality of so called Concordia search,
Additionally, the score for this best overlay is computed. The score is a real number between 0 and 1, where 0 indicates that the pattern is not covered at all (i.e. not a single word from this pattern is found in the index). The score 1 represents the perfect match - pattern is covered completely by just one fragment, which means that the pattern is found in the index as one of the examples. Additionally, the score for this best overlay is computed. The score is a real number between 0 and 1, where 0 indicates that the pattern is not covered at all (i.e. not a single word from this pattern is found in the index). The score 1 represents the perfect match - pattern is covered completely by just one fragment, which means that the pattern is found in the index as one of the examples.
Moreover, the example below demonstrates retrieving a tokenized version of the example.
File concordia_searching.cpp: File concordia_searching.cpp:
\verbatim \verbatim
#include <concordia/concordia.hpp> #include <concordia/concordia.hpp>
#include <concordia/concordia_search_result.hpp> #include <concordia/concordia_search_result.hpp>
#include <concordia/matched_pattern_fragment.hpp> #include <concordia/matched_pattern_fragment.hpp>
#include <concordia/example.hpp> #include <concordia/example.hpp>
#include <concordia/tokenized_sentence.hpp>
#include "config.hpp" #include "config.hpp"
@ -115,7 +118,13 @@ using namespace std;
int main() { int main() {
Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg"); Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg");
concordia.addExample(Example("Alice has a cat", 56)); boost::shared_ptr<TokenizedSentence> ts = concordia.addExample(Example("Alice has a cat", 56));
cout << "Added the following tokens: " << endl;
BOOST_FOREACH(TokenAnnotation token, ts->getTokens()) {
cout << "\"" << token.getValue() << "\"" << " at positions: [" << token.getStart() << ","
<< token.getEnd() << ")" << endl;
}
concordia.addExample(Example("Alice has a dog", 23)); concordia.addExample(Example("Alice has a dog", 23));
concordia.addExample(Example("New test product has a mistake", 321)); concordia.addExample(Example("New test product has a mistake", 321));
concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14)); concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14));
@ -153,6 +162,11 @@ int main() {
This program should print: This program should print:
\verbatim \verbatim
Added the following tokens:
"alice" at positions: [0,5)
"has" at positions: [6,9)
"a" at positions: [10,11)
"cat" at positions: [12,15)
Searching for pattern: Our new test product has nothing to do with computers Searching for pattern: Our new test product has nothing to do with computers
Printing all matched fragments: Printing all matched fragments:
Matched pattern fragment found. Pattern fragment: [4,9] in sentence 14, at offset: 6 Matched pattern fragment found. Pattern fragment: [4,9] in sentence 14, at offset: 6

View File

@ -2,6 +2,7 @@
#include <concordia/concordia_search_result.hpp> #include <concordia/concordia_search_result.hpp>
#include <concordia/matched_pattern_fragment.hpp> #include <concordia/matched_pattern_fragment.hpp>
#include <concordia/example.hpp> #include <concordia/example.hpp>
#include <concordia/tokenized_sentence.hpp>
#include "config.hpp" #include "config.hpp"
@ -13,7 +14,13 @@ using namespace std;
int main() { int main() {
Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg"); Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg");
concordia.addExample(Example("Alice has a cat", 56)); boost::shared_ptr<TokenizedSentence> ts = concordia.addExample(Example("Alice has a cat", 56));
cout << "Added the following tokens: " << endl;
BOOST_FOREACH(TokenAnnotation token, ts->getTokens()) {
cout << "\"" << token.getValue() << "\"" << " at positions: [" << token.getStart() << ","
<< token.getEnd() << ")" << endl;
}
concordia.addExample(Example("Alice has a dog", 23)); concordia.addExample(Example("Alice has a dog", 23));
concordia.addExample(Example("New test product has a mistake", 321)); concordia.addExample(Example("New test product has a mistake", 321));
concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14)); concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14));