From 5a57406875d80234b39b82126a9d0e1790877429 Mon Sep 17 00:00:00 2001 From: rjawor Date: Sat, 27 Jun 2015 12:40:24 +0200 Subject: [PATCH] finished original word positions --- concordia-console/concordia-console.cpp | 10 ++-- concordia/compilation.dox | 1 + concordia/concordia.cpp | 14 ++--- concordia/concordia.hpp | 12 +++-- concordia/concordia_index.cpp | 19 ++++--- concordia/concordia_index.hpp | 7 ++- concordia/hash_generator.cpp | 5 +- concordia/hash_generator.hpp | 15 +++--- concordia/index_searcher.cpp | 3 +- concordia/interval.hpp | 7 ++- concordia/matched_pattern_fragment.hpp | 2 +- concordia/regex_rule.cpp | 17 +++--- concordia/regex_rule.hpp | 10 ++-- concordia/sentence_tokenizer.cpp | 12 +++-- concordia/sentence_tokenizer.hpp | 10 ++-- concordia/t/test_regex_rule.cpp | 58 ++++++++++---------- concordia/t/test_sentence_tokenizer.cpp | 70 ++++++++++++------------- concordia/token_annotation.hpp | 14 +++-- concordia/tokenized_sentence.cpp | 37 +++++++------ concordia/tokenized_sentence.hpp | 40 ++++++++++---- concordia/tutorial.dox | 16 +++++- examples/concordia_search.cpp | 9 +++- 22 files changed, 238 insertions(+), 150 deletions(-) diff --git a/concordia-console/concordia-console.cpp b/concordia-console/concordia-console.cpp index dbb44cd..8f69b48 100644 --- a/concordia-console/concordia-console.cpp +++ b/concordia-console/concordia-console.cpp @@ -28,8 +28,9 @@ void checkConcordiaResults( long baseLineCount) { long lineIndex = 1; BOOST_FOREACH(ConcordiaSearchResult result, results) { - SUFFIX_MARKER_TYPE patternSize = result.getTokenizedPattern()->getTokens().size(); - if (patternSize > 0) { + SUFFIX_MARKER_TYPE patternSize = + result.getTokenizedPattern()->getTokens().size(); + if (patternSize > 0) { if (result.getBestOverlay().size() != 1) { reportError(baseLineCount + lineIndex, "best overlay has more than one fragment."); @@ -37,7 +38,7 @@ void checkConcordiaResults( if (result.getBestOverlay().at(0).getMatchedLength() != patternSize) { 
reportError(baseLineCount + lineIndex, - "best overlay fragment has different size than pattern."); + "best overlay fragment has different size than pattern."); } if (result.getBestOverlayScore() != 1) { reportError(baseLineCount + lineIndex, @@ -201,7 +202,8 @@ int main(int argc, char** argv) { msdiff = time_end - time_start; std::cout << "\tPattern used: " << std::endl << "\t\t"; - BOOST_FOREACH(TokenAnnotation annotation, result->getTokenizedPattern()->getTokens()) { + BOOST_FOREACH(TokenAnnotation annotation, + result->getTokenizedPattern()->getTokens()) { std::cout << annotation.getValue() << " "; } std::cout << std::endl; diff --git a/concordia/compilation.dox b/concordia/compilation.dox index 27c834b..68c5b33 100644 --- a/concordia/compilation.dox +++ b/concordia/compilation.dox @@ -9,6 +9,7 @@ Before you compile, make sure you have these installed: - cmake - Boost library - Log4cpp +- ICU - (optional) Doxygen - (optional) TeX diff --git a/concordia/concordia.cpp b/concordia/concordia.cpp index be762a4..3c6d4b6 100644 --- a/concordia/concordia.cpp +++ b/concordia/concordia.cpp @@ -44,15 +44,17 @@ std::string _createLibraryVersion() { // Sentences are written to disk and added to T. // SA is generated on command by other methods. -boost::shared_ptr Concordia::addExample(const Example & example) - throw(ConcordiaException) { +boost::shared_ptr Concordia::addExample( + const Example & example) + throw(ConcordiaException) { return _index->addExample(_hashGenerator, _T, _markers, example); } // Sentences are written to disk and added to T. // SA is generated on command by other methods. 
-std::vector Concordia::addAllExamples(const std::vector & examples) - throw(ConcordiaException) { +std::vector Concordia::addAllExamples( + const std::vector & examples) + throw(ConcordiaException) { return _index->addAllExamples(_hashGenerator, _T, _markers, examples); } @@ -165,7 +167,8 @@ boost::shared_ptr Concordia::concordiaSearch( } else { std::string empty; return boost::shared_ptr( - new ConcordiaSearchResult(boost::shared_ptr(new TokenizedSentence(empty)))); + new ConcordiaSearchResult(boost::shared_ptr( + new TokenizedSentence(empty)))); } } @@ -182,4 +185,3 @@ void Concordia::clearIndex() throw(ConcordiaException) { boost::filesystem::remove(_config->getMarkersFilePath()); } - diff --git a/concordia/concordia.hpp b/concordia/concordia.hpp index 5051de3..9c707e9 100644 --- a/concordia/concordia.hpp +++ b/concordia/concordia.hpp @@ -54,16 +54,22 @@ public: /*! Adds an Example to the index. \param example example to be added + \returns tokenized sentence object, + containing information about original word positions \throws ConcordiaException */ - boost::shared_ptr addExample(const Example & example) throw(ConcordiaException); + boost::shared_ptr addExample(const Example & example) + throw(ConcordiaException); /*! Adds multiple examples to the index. \param examples vector of examples to be added + \returns vector of tokenized sentence objects, + containing information about original word positions \throws ConcordiaException */ - std::vector addAllExamples(const std::vector & examples) - throw(ConcordiaException); + std::vector addAllExamples( + const std::vector & examples) + throw(ConcordiaException); /*! Performs a simple substring lookup on the index. For more info see \ref tutorial1_2. 
diff --git a/concordia/concordia_index.cpp b/concordia/concordia_index.cpp index 3d65797..23c4ca4 100644 --- a/concordia/concordia_index.cpp +++ b/concordia/concordia_index.cpp @@ -25,7 +25,7 @@ boost::shared_ptr > ConcordiaIndex::generateSuffixArray( } boost::shared_ptr > result = - boost::shared_ptr >(new std::vector); + boost::shared_ptr >(new std::vector); for (int i = 0; i < T->size(); i++) { result->push_back(SA_array[i]); } @@ -48,7 +48,8 @@ std::vector ConcordiaIndex::addAllExamples( std::vector hashedPatterns; BOOST_FOREACH(Example example, examples) { - boost::shared_ptr hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator, + boost::shared_ptr hashedPattern = + _addSingleExample(hashedIndexFile, markersFile, hashGenerator, T, markers, example); hashedPatterns.push_back(*hashedPattern); } @@ -56,7 +57,7 @@ std::vector ConcordiaIndex::addAllExamples( hashedIndexFile.close(); markersFile.close(); hashGenerator->serializeWordMap(); - + return hashedPatterns; } @@ -71,12 +72,13 @@ boost::shared_ptr ConcordiaIndex::addExample( std::ofstream markersFile; markersFile.open(_markersFilePath.c_str(), std::ios::out| std::ios::app|std::ios::binary); - boost::shared_ptr hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator, + boost::shared_ptr hashedPattern = + _addSingleExample(hashedIndexFile, markersFile, hashGenerator, T, markers, example); hashedIndexFile.close(); markersFile.close(); hashGenerator->serializeWordMap(); - + return hashedPattern; } @@ -87,9 +89,10 @@ boost::shared_ptr ConcordiaIndex::_addSingleExample( boost::shared_ptr > T, boost::shared_ptr > markers, const Example & example) { - boost::shared_ptr hashedPattern = hashGenerator->generateHash(example.getSentence()); + boost::shared_ptr hashedPattern = + hashGenerator->generateHash(example.getSentence()); std::vector hash = hashedPattern->getCodes(); - + int offset = 0; for (std::vector::iterator it = hash.begin(); it != hash.end(); ++it) { @@ -117,7 
+120,7 @@ boost::shared_ptr ConcordiaIndex::_addSingleExample( SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE; Utils::writeMarker(markersFile, sentenceBoundaryMA); markers->push_back(sentenceBoundaryMA); - + return hashedPattern; } diff --git a/concordia/concordia_index.hpp b/concordia/concordia_index.hpp index e90f546..6d79cb9 100644 --- a/concordia/concordia_index.hpp +++ b/concordia/concordia_index.hpp @@ -44,11 +44,13 @@ public: and markers array (also passed to this method) are appended with the hashed example. At the same time, HDD versions of these two data structures are also appended with the same example. + The method returns a tokenized version of the example. \param hashGenerator hash generator to be used to prepare the hash of the example \param T RAM-based hash index to be appended to \param markers RAM-based markers array to be appended to \param example example to be added to index + \returns tokenized example \throws ConcordiaException */ boost::shared_ptr addExample( @@ -62,11 +64,13 @@ public: and markers array (also passed to this method) are appended with the hashed examples. At the same time, HDD versions of these two data structures are also appended with the same examples. + The method returns a vector of tokenized examples. 
\param hashGenerator hash generator to be used to prepare the hash of the example \param T RAM-based hash index to be appended to \param markers RAM-based markers array to be appended to \param examples vector of examples to be added to index + \returns vector of tokenized examples \throws ConcordiaException */ std::vector addAllExamples( @@ -83,7 +87,8 @@ public: boost::shared_ptr > T); private: - boost::shared_ptr _addSingleExample(std::ofstream & hashedIndexFile, + boost::shared_ptr _addSingleExample( + std::ofstream & hashedIndexFile, std::ofstream & markersFile, boost::shared_ptr hashGenerator, boost::shared_ptr > T, diff --git a/concordia/hash_generator.cpp b/concordia/hash_generator.cpp index eb69ce9..8b93ce4 100644 --- a/concordia/hash_generator.cpp +++ b/concordia/hash_generator.cpp @@ -29,9 +29,10 @@ HashGenerator::~HashGenerator() { boost::shared_ptr HashGenerator::generateHash( const std::string & sentence) throw(ConcordiaException) { - boost::shared_ptr ts = _sentenceTokenizer->tokenize(sentence); + boost::shared_ptr ts = + _sentenceTokenizer->tokenize(sentence); ts->generateHash(_wordMap); - + if (ts->getTokens().size() > Utils::maxSentenceSize) { throw ConcordiaException("Trying to add too long sentence."); } diff --git a/concordia/hash_generator.hpp b/concordia/hash_generator.hpp index 6c8a752..6528dcf 100644 --- a/concordia/hash_generator.hpp +++ b/concordia/hash_generator.hpp @@ -15,14 +15,14 @@ /*! Class for generating a sentence hash. The hash is generated from a sentence - given in raw string. String is first anonymized and tokenized. After these - operations, each token is coded as an integer, according to WordMap. - Resulting hash is a vector of integers. + given in raw string. String is first tokenized by SentenceTokenizer and + then each token is coded as an integer, according to WordMap. + Resulting hash is an instance of TokenizedSentence. - Sentence hashed is used when adding a sentence to index and during searching. 
+ Hashed sentence is used when adding a sentence to index and during searching. HashGenerator holds an instance of WordMap, used to code tokens as integers - and SentenceAnonymizer, used to preprocess the sentence string. + and SentenceTokenizer, used to tokenize the sentence string. */ @@ -42,9 +42,10 @@ public: /*! Generates hash of a sentence. \param sentence sentence to generate hash from - \returns vector of integers + \returns tokenized sentence, containing the hash */ - boost::shared_ptr generateHash(const std::string & sentence) + boost::shared_ptr generateHash( + const std::string & sentence) throw(ConcordiaException); /*! diff --git a/concordia/index_searcher.cpp b/concordia/index_searcher.cpp index 45c9559..c2c119e 100644 --- a/concordia/index_searcher.cpp +++ b/concordia/index_searcher.cpp @@ -73,6 +73,7 @@ boost::shared_ptr IndexSearcher::concordiaSearch( boost::shared_ptr( new ConcordiaSearchResult(hashedPattern)); - _concordiaSearcher->concordiaSearch(result, T, markers, SA, hashedPattern->getCodes()); + _concordiaSearcher->concordiaSearch(result, T, markers, + SA, hashedPattern->getCodes()); return result; } diff --git a/concordia/interval.hpp b/concordia/interval.hpp index c06dfec..e77fb40 100644 --- a/concordia/interval.hpp +++ b/concordia/interval.hpp @@ -51,9 +51,12 @@ public: return _end; } - friend std::ostream & operator << (std::ostream & o, const Interval & interval) { - return o << "[" << interval.getStart() << "," << interval.getEnd() << ")"; + friend std::ostream & operator << (std::ostream & o, + const Interval & interval) { + return o << "[" << interval.getStart() + << "," << interval.getEnd() << ")"; } + protected: SUFFIX_MARKER_TYPE _start; diff --git a/concordia/matched_pattern_fragment.hpp b/concordia/matched_pattern_fragment.hpp index e45bdcb..01d3ac9 100644 --- a/concordia/matched_pattern_fragment.hpp +++ b/concordia/matched_pattern_fragment.hpp @@ -6,7 +6,7 @@ /*! Class representing matched pattern fragment in concordia search. 
- This fragment can be seen as an interval of the pattern. + This fragment can be seen as a word interval of the pattern. This class holds information about: - where the pattern fragment was matched (example id and example offset) diff --git a/concordia/regex_rule.cpp b/concordia/regex_rule.cpp index 062c118..04bb825 100644 --- a/concordia/regex_rule.cpp +++ b/concordia/regex_rule.cpp @@ -13,9 +13,11 @@ RegexRule::RegexRule(std::string patternString, _value(value) { try { if (caseSensitive) { - _pattern = boost::make_u32regex(UnicodeString(patternString.c_str())); + _pattern = boost::make_u32regex( + UnicodeString(patternString.c_str())); } else { - _pattern = boost::make_u32regex(UnicodeString(patternString.c_str()), boost::regex::icase); + _pattern = boost::make_u32regex( + UnicodeString(patternString.c_str()), boost::regex::icase); } } catch(const std::exception & e) { std::stringstream ss; @@ -37,7 +39,8 @@ RegexRule::~RegexRule() { void RegexRule::apply(boost::shared_ptr sentence) { try { UnicodeString s(sentence->getSentence().c_str()); - boost::u32regex_iterator begin(boost::make_u32regex_iterator(s, _pattern)); + boost::u32regex_iterator begin( + boost::make_u32regex_iterator(s, _pattern)); boost::u32regex_iterator end; std::vector annotations; for (; begin != end; ++begin) { @@ -46,19 +49,21 @@ void RegexRule::apply(boost::shared_ptr sentence) { std::string value; if (_annotationType == TokenAnnotation::WORD) { UnicodeString unicodeValue; - s.extract(begin->position(), begin->length(), unicodeValue); + s.extract(begin->position(), begin->length(), unicodeValue); unicodeValue.toUTF8String(value); } else { value = _value; } - TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, value); + TokenAnnotation annotation(matchBegin, matchEnd, + _annotationType, value); annotations.push_back(annotation); } sentence->addAnnotations(annotations); } catch(const std::exception & e) { std::stringstream ss; ss << "Exception while applying regex rule: " - << 
_annotationType << " to text: " << sentence->getSentence(); + << _annotationType << " to text: " + << sentence->getSentence(); ss << ", message: " << e.what(); throw ConcordiaException(ss.str()); } diff --git a/concordia/regex_rule.hpp b/concordia/regex_rule.hpp index dbc509a..ce62fd1 100644 --- a/concordia/regex_rule.hpp +++ b/concordia/regex_rule.hpp @@ -15,8 +15,9 @@ typedef boost::error_info my_tag_error_info; /*! Class for representing a regular expression annotation rule. - Holds regex pattern string for matching and replacement string for - annotating found matches. + Holds regex pattern string for matching and default value to assign + to the annotations. Rule also has a type, given to all annotations + produced by it. */ class RegexRule { @@ -25,6 +26,7 @@ public: Constructor. \param patternString regex pattern to match \param annoationType type of annotation + \param value value to be assigned to the annotation \param caseSensitive case sensitivity of the pattern */ RegexRule(std::string patternString, @@ -37,7 +39,7 @@ public: */ virtual ~RegexRule(); - /*! Applies the operation on anonymized sentence. + /*! Applies regex annotation on tokenized sentence. 
\param sentence the input sentence */ void apply(boost::shared_ptr sentence); @@ -46,7 +48,7 @@ private: int _annotationType; std::string _value; - + boost::u32regex _pattern; }; diff --git a/concordia/sentence_tokenizer.cpp b/concordia/sentence_tokenizer.cpp index 2adfbcd..9ffe173 100644 --- a/concordia/sentence_tokenizer.cpp +++ b/concordia/sentence_tokenizer.cpp @@ -26,7 +26,7 @@ SentenceTokenizer::~SentenceTokenizer() { boost::shared_ptr SentenceTokenizer::tokenize(const std::string & sentence) { - boost::shared_ptr + boost::shared_ptr result(new TokenizedSentence(sentence)); _htmlTags->apply(result); @@ -40,9 +40,10 @@ boost::shared_ptr if (_stopWordsEnabled) { _stopWords->apply(result); } - + boost::shared_ptr wordsRule( - new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}", TokenAnnotation::WORD, "")); + new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}", + TokenAnnotation::WORD, "")); wordsRule->apply(result); boost::shared_ptr singleLetterWordsRule( new RegexRule("\\p{L}", TokenAnnotation::WORD, "")); @@ -103,7 +104,8 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) { tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1); tagsExpression += "br).*?>"; _htmlTags = boost::shared_ptr( - new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG, "", false)); + new RegexRule(tagsExpression, + TokenAnnotation::HTML_TAG, "", false)); } boost::shared_ptr @@ -137,6 +139,6 @@ boost::shared_ptr expression = expression.substr(0, expression.size()-1); expression += ")"; return boost::shared_ptr( - new RegexRule(expression, annotationType, value, false)); + new RegexRule(expression, annotationType, value, false)); } diff --git a/concordia/sentence_tokenizer.hpp b/concordia/sentence_tokenizer.hpp index be60061..7e354eb 100644 --- a/concordia/sentence_tokenizer.hpp +++ b/concordia/sentence_tokenizer.hpp @@ -14,10 +14,9 @@ /*! Class for tokenizing sentence before generating hash. 
- This operation is is used to - remove unnecessary symbols and possibly words from sentences added to index - and search patterns. Tokenizer annotates html tags, removes stop words (if the option is enabled), - as well as annotates named entities and special symbols. All these have to be listed in files + Tokenizer ignores unnecessary symbols, html tags and possibly stop words + (if the option is enabled) in sentences added to index + as well as annotates named entities. All these have to be listed in files (see \ref tutorial3). */ @@ -35,7 +34,7 @@ public: /*! Tokenizes the sentence. \param sentence input sentence - \returns altered version of the input sentence + \returns tokenized sentence object build on the input sentence */ boost::shared_ptr tokenize(const std::string & sentence); @@ -58,7 +57,6 @@ private: bool _stopWordsEnabled; boost::shared_ptr _stopWords; - }; #endif diff --git a/concordia/t/test_regex_rule.cpp b/concordia/t/test_regex_rule.cpp index 78685cf..ada81eb 100644 --- a/concordia/t/test_regex_rule.cpp +++ b/concordia/t/test_regex_rule.cpp @@ -21,19 +21,19 @@ BOOST_AUTO_TEST_CASE( SimpleAnnotation ) BOOST_CHECK_EQUAL(iter->getStart(),7); BOOST_CHECK_EQUAL(iter->getEnd(),8); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),11); BOOST_CHECK_EQUAL(iter->getEnd(),12); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),14); BOOST_CHECK_EQUAL(iter->getEnd(),15); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),16); BOOST_CHECK_EQUAL(iter->getEnd(),17); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),18); BOOST_CHECK_EQUAL(iter->getEnd(),19); @@ -64,19 +64,19 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation ) BOOST_CHECK_EQUAL(iter->getStart(),3); BOOST_CHECK_EQUAL(iter->getEnd(),4); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),19); BOOST_CHECK_EQUAL(iter->getEnd(),20); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),21); BOOST_CHECK_EQUAL(iter->getEnd(),22); - iter++; + ++iter; 
BOOST_CHECK_EQUAL(iter->getStart(),44); BOOST_CHECK_EQUAL(iter->getEnd(),45); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),45); BOOST_CHECK_EQUAL(iter->getEnd(),46); @@ -94,15 +94,15 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation ) BOOST_CHECK_EQUAL(iter->getStart(),8); BOOST_CHECK_EQUAL(iter->getEnd(),11); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),16); BOOST_CHECK_EQUAL(iter->getEnd(),19); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),24); BOOST_CHECK_EQUAL(iter->getEnd(),27); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),32); BOOST_CHECK_EQUAL(iter->getEnd(),35); @@ -132,7 +132,7 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation ) BOOST_CHECK_EQUAL(iter->getStart(),11); BOOST_CHECK_EQUAL(iter->getEnd(),12); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),29); BOOST_CHECK_EQUAL(iter->getEnd(),30); @@ -149,71 +149,71 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement ) BOOST_CHECK_EQUAL(iter->getStart(),2); BOOST_CHECK_EQUAL(iter->getEnd(),3); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),3); BOOST_CHECK_EQUAL(iter->getEnd(),4); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),4); BOOST_CHECK_EQUAL(iter->getEnd(),5); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),5); BOOST_CHECK_EQUAL(iter->getEnd(),6); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),8); BOOST_CHECK_EQUAL(iter->getEnd(),9); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),9); BOOST_CHECK_EQUAL(iter->getEnd(),10); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),11); BOOST_CHECK_EQUAL(iter->getEnd(),12); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),15); BOOST_CHECK_EQUAL(iter->getEnd(),16); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),16); BOOST_CHECK_EQUAL(iter->getEnd(),17); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),20); BOOST_CHECK_EQUAL(iter->getEnd(),21); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),21); 
BOOST_CHECK_EQUAL(iter->getEnd(),22); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),22); BOOST_CHECK_EQUAL(iter->getEnd(),23); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),23); BOOST_CHECK_EQUAL(iter->getEnd(),24); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),26); BOOST_CHECK_EQUAL(iter->getEnd(),27); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),27); BOOST_CHECK_EQUAL(iter->getEnd(),28); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),29); BOOST_CHECK_EQUAL(iter->getEnd(),30); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),33); BOOST_CHECK_EQUAL(iter->getEnd(),34); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),34); BOOST_CHECK_EQUAL(iter->getEnd(),35); diff --git a/concordia/t/test_sentence_tokenizer.cpp b/concordia/t/test_sentence_tokenizer.cpp index cc4ce11..626fdc9 100644 --- a/concordia/t/test_sentence_tokenizer.cpp +++ b/concordia/t/test_sentence_tokenizer.cpp @@ -47,79 +47,79 @@ BOOST_AUTO_TEST_CASE( NETest ) BOOST_CHECK_EQUAL(iter->getEnd(),4); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(), "date"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),6); BOOST_CHECK_EQUAL(iter->getEnd(),16); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE); BOOST_CHECK_EQUAL(iter->getValue(), "ne_date"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),18); BOOST_CHECK_EQUAL(iter->getEnd(),22); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(), "mail"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),24); BOOST_CHECK_EQUAL(iter->getEnd(),40); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE); BOOST_CHECK_EQUAL(iter->getValue(), "ne_email"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),42); BOOST_CHECK_EQUAL(iter->getEnd(),48); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(), "number"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),50); 
BOOST_CHECK_EQUAL(iter->getEnd(),54); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE); BOOST_CHECK_EQUAL(iter->getValue(), "ne_number"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),56); BOOST_CHECK_EQUAL(iter->getEnd(),61); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(), "hello"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),61); BOOST_CHECK_EQUAL(iter->getEnd(),62); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE); BOOST_CHECK_EQUAL(iter->getValue(), "ne_number"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),63); BOOST_CHECK_EQUAL(iter->getEnd(),69); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(), "zażółć"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),70); BOOST_CHECK_EQUAL(iter->getEnd(),75); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(), "gęślą"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),76); BOOST_CHECK_EQUAL(iter->getEnd(),80); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(), "jaźń"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),82); BOOST_CHECK_EQUAL(iter->getEnd(),88); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(), "zażółć"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),89); BOOST_CHECK_EQUAL(iter->getEnd(),94); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(), "gęślą"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),95); BOOST_CHECK_EQUAL(iter->getEnd(),99); @@ -156,52 +156,52 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest ) BOOST_CHECK_EQUAL(iter->getStart(),0); BOOST_CHECK_EQUAL(iter->getEnd(),23); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),23); BOOST_CHECK_EQUAL(iter->getEnd(),27); 
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(),"link"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),27); BOOST_CHECK_EQUAL(iter->getEnd(),31); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),32); BOOST_CHECK_EQUAL(iter->getEnd(),35); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(),"and"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),36); BOOST_CHECK_EQUAL(iter->getEnd(),39); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),39); BOOST_CHECK_EQUAL(iter->getEnd(),43); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(),"bold"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),43); BOOST_CHECK_EQUAL(iter->getEnd(),47); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),48); BOOST_CHECK_EQUAL(iter->getEnd(),51); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(),"and"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),52); BOOST_CHECK_EQUAL(iter->getEnd(),59); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(),"newline"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),60); BOOST_CHECK_EQUAL(iter->getEnd(),65); @@ -240,79 +240,79 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest ) BOOST_CHECK_EQUAL(iter->getEnd(),4); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(),"this"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),5); BOOST_CHECK_EQUAL(iter->getEnd(),7); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(),"is"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),8); BOOST_CHECK_EQUAL(iter->getEnd(),9); 
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(),"a"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),10); BOOST_CHECK_EQUAL(iter->getEnd(),18); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(),"sentence"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),20); BOOST_CHECK_EQUAL(iter->getEnd(),25); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(),"don't"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),26); BOOST_CHECK_EQUAL(iter->getEnd(),38); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(),"over-analyze"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),39); BOOST_CHECK_EQUAL(iter->getEnd(),41); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(),"it"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),43); BOOST_CHECK_EQUAL(iter->getEnd(),49); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(),"zażółć"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),51); BOOST_CHECK_EQUAL(iter->getEnd(),57); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(),"gęś'lą"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),59); BOOST_CHECK_EQUAL(iter->getEnd(),63); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(),"jaźń"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),64); BOOST_CHECK_EQUAL(iter->getEnd(),71); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(),"zaż-ółć"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),72); BOOST_CHECK_EQUAL(iter->getEnd(),77); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(),"gęślą"); - iter++; + ++iter; BOOST_CHECK_EQUAL(iter->getStart(),78); 
BOOST_CHECK_EQUAL(iter->getEnd(),83); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getValue(),"jaź'ń"); - iter++; + ++iter; } diff --git a/concordia/token_annotation.hpp b/concordia/token_annotation.hpp index 11153e5..e90f539 100644 --- a/concordia/token_annotation.hpp +++ b/concordia/token_annotation.hpp @@ -7,7 +7,7 @@ #include /*! - Class representing annotatio of char sequence as a token. + Class representing annotation of char sequence as a token. It is a type of interval that is also storing information about the annoation type and value. @@ -18,7 +18,7 @@ public: /*! Constructor. \param start start index of the annotation (char-level, 0-based) \param end end index of the annotation (char-level, 0-based) - \param type annotation type + \param annotationType annotation type \param value annotation value */ TokenAnnotation(const SUFFIX_MARKER_TYPE start, @@ -44,14 +44,22 @@ public: return _value; } + /*! Named entity annotation type + */ static int NE; + /*! Word annotation type + */ static int WORD; + /*! Html tag annotation type + */ static int HTML_TAG; + /*! 
Stop word annotation type + */ static int STOP_WORD; - + protected: int _annotationType; diff --git a/concordia/tokenized_sentence.cpp b/concordia/tokenized_sentence.cpp index 6302567..964e5e3 100644 --- a/concordia/tokenized_sentence.cpp +++ b/concordia/tokenized_sentence.cpp @@ -11,37 +11,43 @@ TokenizedSentence::TokenizedSentence(std::string sentence): TokenizedSentence::~TokenizedSentence() { } -void TokenizedSentence::addAnnotations(std::vector annotations) { - std::vector::iterator newAnnotation = annotations.begin(); - std::list::iterator existingAnnotation = _tokenAnnotations.begin(); - - while(newAnnotation != annotations.end()) { +void TokenizedSentence::addAnnotations( + std::vector annotations) { + std::vector::iterator newAnnotation = + annotations.begin(); + std::list::iterator existingAnnotation = + _tokenAnnotations.begin(); + + while (newAnnotation != annotations.end()) { if (existingAnnotation != _tokenAnnotations.end()) { // there are still some existing annotations, so perform checks if (newAnnotation->intersects(*existingAnnotation)) { // The new annotation intersects with the existing. // We can not add it, so let us just move on to the // next new annoation. - newAnnotation++; + ++newAnnotation; } else { // it is now important whether the new interval is before // or after existing - if (newAnnotation->getStart() < existingAnnotation->getStart()) { - // New interval does not intersect and is before existing. We add it. - _tokenAnnotations.insert(existingAnnotation, *newAnnotation); - newAnnotation++; + if (newAnnotation->getStart() < + existingAnnotation->getStart()) { + // New interval does not intersect and is + // before existing. We add it. + _tokenAnnotations.insert(existingAnnotation, + *newAnnotation); + ++newAnnotation; } else { - // If the new interval is after existing we move to the next existing annoation. - existingAnnotation++; + // If the new interval is after existing + // we move to the next existing annoation. 
+ ++existingAnnotation; } } } else { // no more existing annotations, so just add the new annotation _tokenAnnotations.push_back(*newAnnotation); - newAnnotation++; + ++newAnnotation; } } - } void TokenizedSentence::toLowerCase() { @@ -54,8 +60,7 @@ void TokenizedSentence::generateHash(boost::shared_ptr wordMap) { annotation.getType() == TokenAnnotation::NE) { _codes.push_back(wordMap->getWordCode(annotation.getValue())); _tokens.push_back(annotation); - } + } } - } diff --git a/concordia/tokenized_sentence.hpp b/concordia/tokenized_sentence.hpp index a0ff96b..345c5fb 100644 --- a/concordia/tokenized_sentence.hpp +++ b/concordia/tokenized_sentence.hpp @@ -11,9 +11,12 @@ #include /*! - A sentence after anonymization operations. The class + A sentence after tokenizing operations. The class holds the current string represenation of the sentence - along with the annotations list. + along with the annotations list. The class also allows + for generating hash. After that operation the class + also holds the list of hashed codes and corresponding + tokens. */ class TokenizedSentence { @@ -22,7 +25,7 @@ public: Constructor. */ - TokenizedSentence(std::string sentence); + explicit TokenizedSentence(std::string sentence); /*! Destructor. */ @@ -35,21 +38,40 @@ public: return _sentence; } - /*! Getter for annotations list + /*! Getter for all annotations list. This method returns + all annotations, including those which are not considered + in the hash, i.e. stop words and html tags. \returns annotations list */ std::list getAnnotations() const { return _tokenAnnotations; } + /*! Getter for codes list. This data is available after calling + the generateHash method. + \returns codes list + */ std::vector getCodes() const { return _codes; } - + + /*! Getter for tokens list. This method returns + only those annotations considered + in the hash, i.e. words and named entities. + \returns tokens list + */ std::vector getTokens() const { return _tokens; } - + + /*! 
Method for generating hash based on annotations. + This method takes into account annotations of type + word and named entity. These are encoded and added + to the code list. Annotations corresponding to these + tokens are added to the tokens list. + \param wordMap word map to use when encoding tokens + The method does not return a value. + */ void generateHash(boost::shared_ptr wordMap); /*! @@ -66,15 +88,15 @@ public: \param annotations list of annotations to be added */ - void addAnnotations(std::vector annotations); + void addAnnotations(std::vector annotations); private: std::string _sentence; std::list _tokenAnnotations; - + std::vector _codes; - + std::vector _tokens; }; diff --git a/concordia/tutorial.dox b/concordia/tutorial.dox index 6438efc..00d65dc 100644 --- a/concordia/tutorial.dox +++ b/concordia/tutorial.dox @@ -98,12 +98,15 @@ Concordia is equipped with a unique functionality of so called Concordia search, Additionally, the score for this best overlay is computed. The score is a real number between 0 and 1, where 0 indicates, that the pattern is not covered at all (i.e. not a single word from this pattern is found in the index). The score 1 represents the perfect match - pattern is covered completely by just one fragment, which means that the pattern is found in the index as one of the examples. +Moreover, the below example presents the feature of retrieving a tokenized version of the example. 
+ File concordia_searching.cpp: \verbatim #include #include #include #include +#include #include "config.hpp" @@ -115,7 +118,13 @@ using namespace std; int main() { Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg"); - concordia.addExample(Example("Alice has a cat", 56)); + boost::shared_ptr ts = concordia.addExample(Example("Alice has a cat", 56)); + cout << "Added the following tokens: " << endl; + BOOST_FOREACH(TokenAnnotation token, ts->getTokens()) { + cout << "\"" << token.getValue() << "\"" << " at positions: [" << token.getStart() << "," + << token.getEnd() << ")" << endl; + } + concordia.addExample(Example("Alice has a dog", 23)); concordia.addExample(Example("New test product has a mistake", 321)); concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14)); @@ -153,6 +162,11 @@ int main() { This program should print: \verbatim +Added the following tokens: +"alice" at positions: [0,5) +"has" at positions: [6,9) +"a" at positions: [10,11) +"cat" at positions: [12,15) Searching for pattern: Our new test product has nothing to do with computers Printing all matched fragments: Matched pattern fragment found. 
Pattern fragment: [4,9] in sentence 14, at offset: 6 diff --git a/examples/concordia_search.cpp b/examples/concordia_search.cpp index 5a5cae8..b8ee36a 100644 --- a/examples/concordia_search.cpp +++ b/examples/concordia_search.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include "config.hpp" @@ -13,7 +14,13 @@ using namespace std; int main() { Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg"); - concordia.addExample(Example("Alice has a cat", 56)); + boost::shared_ptr ts = concordia.addExample(Example("Alice has a cat", 56)); + cout << "Added the following tokens: " << endl; + BOOST_FOREACH(TokenAnnotation token, ts->getTokens()) { + cout << "\"" << token.getValue() << "\"" << " at positions: [" << token.getStart() << "," + << token.getEnd() << ")" << endl; + } + concordia.addExample(Example("Alice has a dog", 23)); concordia.addExample(Example("New test product has a mistake", 321)); concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14));