finished original word positions

This commit is contained in:
rjawor 2015-06-27 12:40:24 +02:00
parent a8c5fa0c75
commit 5a57406875
22 changed files with 238 additions and 150 deletions

View File

@ -28,8 +28,9 @@ void checkConcordiaResults(
long baseLineCount) {
long lineIndex = 1;
BOOST_FOREACH(ConcordiaSearchResult result, results) {
SUFFIX_MARKER_TYPE patternSize = result.getTokenizedPattern()->getTokens().size();
if (patternSize > 0) {
SUFFIX_MARKER_TYPE patternSize =
result.getTokenizedPattern()->getTokens().size();
if (patternSize > 0) {
if (result.getBestOverlay().size() != 1) {
reportError(baseLineCount + lineIndex,
"best overlay has more than one fragment.");
@ -37,7 +38,7 @@ void checkConcordiaResults(
if (result.getBestOverlay().at(0).getMatchedLength()
!= patternSize) {
reportError(baseLineCount + lineIndex,
"best overlay fragment has different size than pattern.");
"best overlay fragment has different size than pattern.");
}
if (result.getBestOverlayScore() != 1) {
reportError(baseLineCount + lineIndex,
@ -201,7 +202,8 @@ int main(int argc, char** argv) {
msdiff = time_end - time_start;
std::cout << "\tPattern used: " << std::endl << "\t\t";
BOOST_FOREACH(TokenAnnotation annotation, result->getTokenizedPattern()->getTokens()) {
BOOST_FOREACH(TokenAnnotation annotation,
result->getTokenizedPattern()->getTokens()) {
std::cout << annotation.getValue() << " ";
}
std::cout << std::endl;

View File

@ -9,6 +9,7 @@ Before you compile, make sure you have these installed:
- cmake
- Boost library
- Log4cpp
- ICU
- (optional) Doxygen
- (optional) TeX

View File

@ -44,15 +44,17 @@ std::string _createLibraryVersion() {
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
boost::shared_ptr<TokenizedSentence> Concordia::addExample(const Example & example)
throw(ConcordiaException) {
boost::shared_ptr<TokenizedSentence> Concordia::addExample(
const Example & example)
throw(ConcordiaException) {
return _index->addExample(_hashGenerator, _T, _markers, example);
}
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
std::vector<TokenizedSentence> Concordia::addAllExamples(const std::vector<Example> & examples)
throw(ConcordiaException) {
std::vector<TokenizedSentence> Concordia::addAllExamples(
const std::vector<Example> & examples)
throw(ConcordiaException) {
return _index->addAllExamples(_hashGenerator, _T, _markers, examples);
}
@ -165,7 +167,8 @@ boost::shared_ptr<ConcordiaSearchResult> Concordia::concordiaSearch(
} else {
std::string empty;
return boost::shared_ptr<ConcordiaSearchResult>(
new ConcordiaSearchResult(boost::shared_ptr<TokenizedSentence>(new TokenizedSentence(empty))));
new ConcordiaSearchResult(boost::shared_ptr<TokenizedSentence>(
new TokenizedSentence(empty))));
}
}
@ -182,4 +185,3 @@ void Concordia::clearIndex() throw(ConcordiaException) {
boost::filesystem::remove(_config->getMarkersFilePath());
}

View File

@ -54,16 +54,22 @@ public:
/*! Adds an Example to the index.
\param example example to be added
\returns tokenized sentence object,
containing information about original word positions
\throws ConcordiaException
*/
boost::shared_ptr<TokenizedSentence> addExample(const Example & example) throw(ConcordiaException);
boost::shared_ptr<TokenizedSentence> addExample(const Example & example)
throw(ConcordiaException);
/*! Adds multiple examples to the index.
\param examples vector of examples to be added
\returns vector of tokenized sentence objects,
containing information about original word positions
\throws ConcordiaException
*/
std::vector<TokenizedSentence> addAllExamples(const std::vector<Example> & examples)
throw(ConcordiaException);
std::vector<TokenizedSentence> addAllExamples(
const std::vector<Example> & examples)
throw(ConcordiaException);
/*! Performs a simple substring lookup on the index.
For more info see \ref tutorial1_2.

View File

@ -25,7 +25,7 @@ boost::shared_ptr<std::vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
}
boost::shared_ptr<std::vector<saidx_t> > result =
boost::shared_ptr<std::vector<saidx_t> >(new std::vector<saidx_t>);
boost::shared_ptr<std::vector<saidx_t> >(new std::vector<saidx_t>);
for (int i = 0; i < T->size(); i++) {
result->push_back(SA_array[i]);
}
@ -48,7 +48,8 @@ std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
std::vector<TokenizedSentence> hashedPatterns;
BOOST_FOREACH(Example example, examples) {
boost::shared_ptr<TokenizedSentence> hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
boost::shared_ptr<TokenizedSentence> hashedPattern =
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, example);
hashedPatterns.push_back(*hashedPattern);
}
@ -56,7 +57,7 @@ std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
hashedIndexFile.close();
markersFile.close();
hashGenerator->serializeWordMap();
return hashedPatterns;
}
@ -71,12 +72,13 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample(
std::ofstream markersFile;
markersFile.open(_markersFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
boost::shared_ptr<TokenizedSentence> hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
boost::shared_ptr<TokenizedSentence> hashedPattern =
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, example);
hashedIndexFile.close();
markersFile.close();
hashGenerator->serializeWordMap();
return hashedPattern;
}
@ -87,9 +89,10 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example) {
boost::shared_ptr<TokenizedSentence> hashedPattern = hashGenerator->generateHash(example.getSentence());
boost::shared_ptr<TokenizedSentence> hashedPattern =
hashGenerator->generateHash(example.getSentence());
std::vector<INDEX_CHARACTER_TYPE> hash = hashedPattern->getCodes();
int offset = 0;
for (std::vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
it != hash.end(); ++it) {
@ -117,7 +120,7 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
Utils::writeMarker(markersFile, sentenceBoundaryMA);
markers->push_back(sentenceBoundaryMA);
return hashedPattern;
}

View File

@ -44,11 +44,13 @@ public:
and markers array (also passed to this method) are appended
with the hashed example. At the same time, HDD versions of these
two data structures are also appended with the same example.
The method returns a tokenized version of the example.
\param hashGenerator hash generator to be used to prepare the hash
of the example
\param T RAM-based hash index to be appended to
\param markers RAM-based markers array to be appended to
\param example example to be added to index
\returns tokenized example
\throws ConcordiaException
*/
boost::shared_ptr<TokenizedSentence> addExample(
@ -62,11 +64,13 @@ public:
and markers array (also passed to this method) are appended
with the hashed examples. At the same time, HDD versions of these
two data structures are also appended with the same examples.
The method returns a vector of tokenized examples.
\param hashGenerator hash generator to be used to prepare the hash
of the example
\param T RAM-based hash index to be appended to
\param markers RAM-based markers array to be appended to
\param examples vector of examples to be added to index
\returns vector of tokenized examples
\throws ConcordiaException
*/
std::vector<TokenizedSentence> addAllExamples(
@ -83,7 +87,8 @@ public:
boost::shared_ptr<std::vector<sauchar_t> > T);
private:
boost::shared_ptr<TokenizedSentence> _addSingleExample(std::ofstream & hashedIndexFile,
boost::shared_ptr<TokenizedSentence> _addSingleExample(
std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,

View File

@ -29,9 +29,10 @@ HashGenerator::~HashGenerator() {
boost::shared_ptr<TokenizedSentence> HashGenerator::generateHash(
const std::string & sentence) throw(ConcordiaException) {
boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
boost::shared_ptr<TokenizedSentence> ts =
_sentenceTokenizer->tokenize(sentence);
ts->generateHash(_wordMap);
if (ts->getTokens().size() > Utils::maxSentenceSize) {
throw ConcordiaException("Trying to add too long sentence.");
}

View File

@ -15,14 +15,14 @@
/*!
Class for generating a sentence hash. The hash is generated from a sentence
given in raw string. String is first anonymized and tokenized. After these
operations, each token is coded as an integer, according to WordMap.
Resulting hash is a vector of integers.
given in raw string. String is first tokenized by SentenceTokenizer and
then each token is coded as an integer, according to WordMap.
Resulting hash is an instance of TokenizedSentence.
Sentence hashed is used when adding a sentence to index and during searching.
Hashed sentence is used when adding a sentence to index and during searching.
HashGenerator holds an instance of WordMap, used to code tokens as integers
and SentenceAnonymizer, used to preprocess the sentence string.
and SentenceTokenizer, used to tokenize the sentence string.
*/
@ -42,9 +42,10 @@ public:
/*!
Generates hash of a sentence.
\param sentence sentence to generate hash from
\returns vector of integers
\returns tokenized sentence, containing the hash
*/
boost::shared_ptr<TokenizedSentence> generateHash(const std::string & sentence)
boost::shared_ptr<TokenizedSentence> generateHash(
const std::string & sentence)
throw(ConcordiaException);
/*!

View File

@ -73,6 +73,7 @@ boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
boost::shared_ptr<ConcordiaSearchResult>(
new ConcordiaSearchResult(hashedPattern));
_concordiaSearcher->concordiaSearch(result, T, markers, SA, hashedPattern->getCodes());
_concordiaSearcher->concordiaSearch(result, T, markers,
SA, hashedPattern->getCodes());
return result;
}

View File

@ -51,9 +51,12 @@ public:
return _end;
}
friend std::ostream & operator << (std::ostream & o, const Interval & interval) {
return o << "[" << interval.getStart() << "," << interval.getEnd() << ")";
friend std::ostream & operator << (std::ostream & o,
const Interval & interval) {
return o << "[" << interval.getStart()
<< "," << interval.getEnd() << ")";
}
protected:
SUFFIX_MARKER_TYPE _start;

View File

@ -6,7 +6,7 @@
/*!
Class representing matched pattern fragment in concordia search.
This fragment can be seen as an interval of the pattern.
This fragment can be seen as a word interval of the pattern.
This class holds information about:
- where the pattern fragment was matched (example id and example offset)

View File

@ -13,9 +13,11 @@ RegexRule::RegexRule(std::string patternString,
_value(value) {
try {
if (caseSensitive) {
_pattern = boost::make_u32regex(UnicodeString(patternString.c_str()));
_pattern = boost::make_u32regex(
UnicodeString(patternString.c_str()));
} else {
_pattern = boost::make_u32regex(UnicodeString(patternString.c_str()), boost::regex::icase);
_pattern = boost::make_u32regex(
UnicodeString(patternString.c_str()), boost::regex::icase);
}
} catch(const std::exception & e) {
std::stringstream ss;
@ -37,7 +39,8 @@ RegexRule::~RegexRule() {
void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
try {
UnicodeString s(sentence->getSentence().c_str());
boost::u32regex_iterator<const UChar*> begin(boost::make_u32regex_iterator(s, _pattern));
boost::u32regex_iterator<const UChar*> begin(
boost::make_u32regex_iterator(s, _pattern));
boost::u32regex_iterator<const UChar*> end;
std::vector<TokenAnnotation> annotations;
for (; begin != end; ++begin) {
@ -46,19 +49,21 @@ void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
std::string value;
if (_annotationType == TokenAnnotation::WORD) {
UnicodeString unicodeValue;
s.extract(begin->position(), begin->length(), unicodeValue);
s.extract(begin->position(), begin->length(), unicodeValue);
unicodeValue.toUTF8String(value);
} else {
value = _value;
}
TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, value);
TokenAnnotation annotation(matchBegin, matchEnd,
_annotationType, value);
annotations.push_back(annotation);
}
sentence->addAnnotations(annotations);
} catch(const std::exception & e) {
std::stringstream ss;
ss << "Exception while applying regex rule: "
<< _annotationType << " to text: " << sentence->getSentence();
<< _annotationType << " to text: "
<< sentence->getSentence();
ss << ", message: " << e.what();
throw ConcordiaException(ss.str());
}

View File

@ -15,8 +15,9 @@ typedef boost::error_info<struct my_tag, std::string> my_tag_error_info;
/*!
Class for representing a regular expression annotation rule.
Holds regex pattern string for matching and replacement string for
annotating found matches.
Holds regex pattern string for matching and default value to assign
to the annotations. Rule also has a type, given to all annotations
produced by it.
*/
class RegexRule {
@ -25,6 +26,7 @@ public:
Constructor.
\param patternString regex pattern to match
\param annoationType type of annotation
\param value value to be assigned to the annotation
\param caseSensitive case sensitivity of the pattern
*/
RegexRule(std::string patternString,
@ -37,7 +39,7 @@ public:
*/
virtual ~RegexRule();
/*! Applies the operation on anonymized sentence.
/*! Applies regex annotation on tokenized sentence.
\param sentence the input sentence
*/
void apply(boost::shared_ptr<TokenizedSentence> sentence);
@ -46,7 +48,7 @@ private:
int _annotationType;
std::string _value;
boost::u32regex _pattern;
};

View File

@ -26,7 +26,7 @@ SentenceTokenizer::~SentenceTokenizer() {
boost::shared_ptr<TokenizedSentence>
SentenceTokenizer::tokenize(const std::string & sentence) {
boost::shared_ptr<TokenizedSentence>
boost::shared_ptr<TokenizedSentence>
result(new TokenizedSentence(sentence));
_htmlTags->apply(result);
@ -40,9 +40,10 @@ boost::shared_ptr<TokenizedSentence>
if (_stopWordsEnabled) {
_stopWords->apply(result);
}
boost::shared_ptr<RegexRule> wordsRule(
new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}", TokenAnnotation::WORD, ""));
new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}",
TokenAnnotation::WORD, ""));
wordsRule->apply(result);
boost::shared_ptr<RegexRule> singleLetterWordsRule(
new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
@ -103,7 +104,8 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
tagsExpression += "br).*?>";
_htmlTags = boost::shared_ptr<RegexRule>(
new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG, "", false));
new RegexRule(tagsExpression,
TokenAnnotation::HTML_TAG, "", false));
}
boost::shared_ptr<RegexRule>
@ -137,6 +139,6 @@ boost::shared_ptr<RegexRule>
expression = expression.substr(0, expression.size()-1);
expression += ")";
return boost::shared_ptr<RegexRule>(
new RegexRule(expression, annotationType, value, false));
new RegexRule(expression, annotationType, value, false));
}

View File

@ -14,10 +14,9 @@
/*!
Class for tokenizing sentence before generating hash.
This operation is used to
remove unnecessary symbols and possibly words from sentences added to index
and search patterns. Tokenizer annotates html tags, removes stop words (if the option is enabled),
as well as annotates named entities and special symbols. All these have to be listed in files
Tokenizer ignores unnecessary symbols, html tags and possibly stop words
(if the option is enabled) in sentences added to index
as well as annotates named entities. All these have to be listed in files
(see \ref tutorial3).
*/
@ -35,7 +34,7 @@ public:
/*! Tokenizes the sentence.
\param sentence input sentence
\returns altered version of the input sentence
\returns tokenized sentence object build on the input sentence
*/
boost::shared_ptr<TokenizedSentence>
tokenize(const std::string & sentence);
@ -58,7 +57,6 @@ private:
bool _stopWordsEnabled;
boost::shared_ptr<RegexRule> _stopWords;
};
#endif

View File

@ -21,19 +21,19 @@ BOOST_AUTO_TEST_CASE( SimpleAnnotation )
BOOST_CHECK_EQUAL(iter->getStart(),7);
BOOST_CHECK_EQUAL(iter->getEnd(),8);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),11);
BOOST_CHECK_EQUAL(iter->getEnd(),12);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),14);
BOOST_CHECK_EQUAL(iter->getEnd(),15);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),16);
BOOST_CHECK_EQUAL(iter->getEnd(),17);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),18);
BOOST_CHECK_EQUAL(iter->getEnd(),19);
@ -64,19 +64,19 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation )
BOOST_CHECK_EQUAL(iter->getStart(),3);
BOOST_CHECK_EQUAL(iter->getEnd(),4);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),19);
BOOST_CHECK_EQUAL(iter->getEnd(),20);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),21);
BOOST_CHECK_EQUAL(iter->getEnd(),22);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),44);
BOOST_CHECK_EQUAL(iter->getEnd(),45);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),45);
BOOST_CHECK_EQUAL(iter->getEnd(),46);
@ -94,15 +94,15 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation )
BOOST_CHECK_EQUAL(iter->getStart(),8);
BOOST_CHECK_EQUAL(iter->getEnd(),11);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),16);
BOOST_CHECK_EQUAL(iter->getEnd(),19);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),24);
BOOST_CHECK_EQUAL(iter->getEnd(),27);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),32);
BOOST_CHECK_EQUAL(iter->getEnd(),35);
@ -132,7 +132,7 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation )
BOOST_CHECK_EQUAL(iter->getStart(),11);
BOOST_CHECK_EQUAL(iter->getEnd(),12);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),29);
BOOST_CHECK_EQUAL(iter->getEnd(),30);
@ -149,71 +149,71 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
BOOST_CHECK_EQUAL(iter->getStart(),2);
BOOST_CHECK_EQUAL(iter->getEnd(),3);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),3);
BOOST_CHECK_EQUAL(iter->getEnd(),4);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),4);
BOOST_CHECK_EQUAL(iter->getEnd(),5);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),5);
BOOST_CHECK_EQUAL(iter->getEnd(),6);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),8);
BOOST_CHECK_EQUAL(iter->getEnd(),9);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),9);
BOOST_CHECK_EQUAL(iter->getEnd(),10);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),11);
BOOST_CHECK_EQUAL(iter->getEnd(),12);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),15);
BOOST_CHECK_EQUAL(iter->getEnd(),16);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),16);
BOOST_CHECK_EQUAL(iter->getEnd(),17);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),20);
BOOST_CHECK_EQUAL(iter->getEnd(),21);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),21);
BOOST_CHECK_EQUAL(iter->getEnd(),22);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),22);
BOOST_CHECK_EQUAL(iter->getEnd(),23);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),23);
BOOST_CHECK_EQUAL(iter->getEnd(),24);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),26);
BOOST_CHECK_EQUAL(iter->getEnd(),27);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),27);
BOOST_CHECK_EQUAL(iter->getEnd(),28);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),29);
BOOST_CHECK_EQUAL(iter->getEnd(),30);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),33);
BOOST_CHECK_EQUAL(iter->getEnd(),34);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),34);
BOOST_CHECK_EQUAL(iter->getEnd(),35);

View File

@ -47,79 +47,79 @@ BOOST_AUTO_TEST_CASE( NETest )
BOOST_CHECK_EQUAL(iter->getEnd(),4);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "date");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),6);
BOOST_CHECK_EQUAL(iter->getEnd(),16);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_date");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),18);
BOOST_CHECK_EQUAL(iter->getEnd(),22);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "mail");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),24);
BOOST_CHECK_EQUAL(iter->getEnd(),40);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_email");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),42);
BOOST_CHECK_EQUAL(iter->getEnd(),48);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "number");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),50);
BOOST_CHECK_EQUAL(iter->getEnd(),54);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),56);
BOOST_CHECK_EQUAL(iter->getEnd(),61);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "hello");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),61);
BOOST_CHECK_EQUAL(iter->getEnd(),62);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),63);
BOOST_CHECK_EQUAL(iter->getEnd(),69);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),70);
BOOST_CHECK_EQUAL(iter->getEnd(),75);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),76);
BOOST_CHECK_EQUAL(iter->getEnd(),80);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),82);
BOOST_CHECK_EQUAL(iter->getEnd(),88);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),89);
BOOST_CHECK_EQUAL(iter->getEnd(),94);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),95);
BOOST_CHECK_EQUAL(iter->getEnd(),99);
@ -156,52 +156,52 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
BOOST_CHECK_EQUAL(iter->getStart(),0);
BOOST_CHECK_EQUAL(iter->getEnd(),23);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),23);
BOOST_CHECK_EQUAL(iter->getEnd(),27);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"link");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),27);
BOOST_CHECK_EQUAL(iter->getEnd(),31);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),32);
BOOST_CHECK_EQUAL(iter->getEnd(),35);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"and");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),36);
BOOST_CHECK_EQUAL(iter->getEnd(),39);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),39);
BOOST_CHECK_EQUAL(iter->getEnd(),43);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"bold");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),43);
BOOST_CHECK_EQUAL(iter->getEnd(),47);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),48);
BOOST_CHECK_EQUAL(iter->getEnd(),51);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"and");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),52);
BOOST_CHECK_EQUAL(iter->getEnd(),59);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"newline");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),60);
BOOST_CHECK_EQUAL(iter->getEnd(),65);
@ -240,79 +240,79 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
BOOST_CHECK_EQUAL(iter->getEnd(),4);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"this");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),5);
BOOST_CHECK_EQUAL(iter->getEnd(),7);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"is");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),8);
BOOST_CHECK_EQUAL(iter->getEnd(),9);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"a");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),10);
BOOST_CHECK_EQUAL(iter->getEnd(),18);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"sentence");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),20);
BOOST_CHECK_EQUAL(iter->getEnd(),25);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"don't");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),26);
BOOST_CHECK_EQUAL(iter->getEnd(),38);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"over-analyze");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),39);
BOOST_CHECK_EQUAL(iter->getEnd(),41);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"it");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),43);
BOOST_CHECK_EQUAL(iter->getEnd(),49);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"zażółć");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),51);
BOOST_CHECK_EQUAL(iter->getEnd(),57);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"gęś'lą");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),59);
BOOST_CHECK_EQUAL(iter->getEnd(),63);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"jaźń");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),64);
BOOST_CHECK_EQUAL(iter->getEnd(),71);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"zaż-ółć");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),72);
BOOST_CHECK_EQUAL(iter->getEnd(),77);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"gęślą");
iter++;
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),78);
BOOST_CHECK_EQUAL(iter->getEnd(),83);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"jaź'ń");
iter++;
++iter;
}

View File

@ -7,7 +7,7 @@
#include <string>
/*!
Class representing annotatio of char sequence as a token.
Class representing annotation of char sequence as a token.
It is a type of interval that is also storing information
about the annotation type and value.
@ -18,7 +18,7 @@ public:
/*! Constructor.
\param start start index of the annotation (char-level, 0-based)
\param end end index of the annotation (char-level, 0-based)
\param type annotation type
\param annotationType annotation type
\param value annotation value
*/
TokenAnnotation(const SUFFIX_MARKER_TYPE start,
@ -44,14 +44,22 @@ public:
return _value;
}
/*! Named entity annotation type
*/
static int NE;
/*! Word annotation type
*/
static int WORD;
/*! Html tag annotation type
*/
static int HTML_TAG;
/*! Stop word annotation type
*/
static int STOP_WORD;
protected:
int _annotationType;

View File

@ -11,37 +11,43 @@ TokenizedSentence::TokenizedSentence(std::string sentence):
TokenizedSentence::~TokenizedSentence() {
}
void TokenizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) {
std::vector<TokenAnnotation>::iterator newAnnotation = annotations.begin();
std::list<TokenAnnotation>::iterator existingAnnotation = _tokenAnnotations.begin();
while(newAnnotation != annotations.end()) {
void TokenizedSentence::addAnnotations(
std::vector<TokenAnnotation> annotations) {
std::vector<TokenAnnotation>::iterator newAnnotation =
annotations.begin();
std::list<TokenAnnotation>::iterator existingAnnotation =
_tokenAnnotations.begin();
while (newAnnotation != annotations.end()) {
if (existingAnnotation != _tokenAnnotations.end()) {
// there are still some existing annotations, so perform checks
if (newAnnotation->intersects(*existingAnnotation)) {
// The new annotation intersects with the existing.
// We can not add it, so let us just move on to the
// next new annotation.
newAnnotation++;
++newAnnotation;
} else {
// it is now important whether the new interval is before
// or after existing
if (newAnnotation->getStart() < existingAnnotation->getStart()) {
// New interval does not intersect and is before existing. We add it.
_tokenAnnotations.insert(existingAnnotation, *newAnnotation);
newAnnotation++;
if (newAnnotation->getStart() <
existingAnnotation->getStart()) {
// New interval does not intersect and is
// before existing. We add it.
_tokenAnnotations.insert(existingAnnotation,
*newAnnotation);
++newAnnotation;
} else {
// If the new interval is after existing we move to the next existing annotation.
existingAnnotation++;
// If the new interval is after existing
// we move to the next existing annotation.
++existingAnnotation;
}
}
} else {
// no more existing annotations, so just add the new annotation
_tokenAnnotations.push_back(*newAnnotation);
newAnnotation++;
++newAnnotation;
}
}
}
void TokenizedSentence::toLowerCase() {
@ -54,8 +60,7 @@ void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
annotation.getType() == TokenAnnotation::NE) {
_codes.push_back(wordMap->getWordCode(annotation.getValue()));
_tokens.push_back(annotation);
}
}
}
}

View File

@ -11,9 +11,12 @@
#include <list>
/*!
A sentence after anonymization operations. The class
A sentence after tokenizing operations. The class
holds the current string representation of the sentence
along with the annotations list.
along with the annotations list. The class also allows
for generating hash. After that operation the class
also holds the list of hashed codes and corresponding
tokens.
*/
class TokenizedSentence {
@ -22,7 +25,7 @@ public:
Constructor.
*/
TokenizedSentence(std::string sentence);
explicit TokenizedSentence(std::string sentence);
/*! Destructor.
*/
@ -35,21 +38,40 @@ public:
return _sentence;
}
/*! Getter for annotations list
/*! Getter for all annotations list. This method returns
all annotations, including those which are not considered
in the hash, i.e. stop words and html tags.
\returns annotations list
*/
std::list<TokenAnnotation> getAnnotations() const {
return _tokenAnnotations;
}
/*! Getter for codes list. This data is available after calling
the hashGenerator method.
\returns codes list
*/
std::vector<INDEX_CHARACTER_TYPE> getCodes() const {
return _codes;
}
/*! Getter for tokens list. This method returns
only those annotations considered
in the hash, i.e. words and named entities.
\returns tokens list
*/
std::vector<TokenAnnotation> getTokens() const {
return _tokens;
}
/*! Method for generating hash based on annotations.
This method takes into account annotations of type
word and named entity. These are encoded and added
to to code list. Annotations corresponding to these
tokens are added to the tokens list.
\param wordMap word map to use when encoding tokens
\returns tokens list
*/
void generateHash(boost::shared_ptr<WordMap> wordMap);
/*!
@ -66,15 +88,15 @@ public:
\param annotations list of annotations to be added
*/
void addAnnotations(std::vector<TokenAnnotation> annotations);
void addAnnotations(std::vector<TokenAnnotation> annotations);
private:
std::string _sentence;
std::list<TokenAnnotation> _tokenAnnotations;
std::vector<INDEX_CHARACTER_TYPE> _codes;
std::vector<TokenAnnotation> _tokens;
};

View File

@ -98,12 +98,15 @@ Concordia is equipped with a unique functionality of so called Concordia search,
Additionally, the score for this best overlay is computed. The score is a real number between 0 and 1, where 0 indicates, that the pattern is not covered at all (i.e. not a single word from this pattern is found in the index). The score 1 represents the perfect match - pattern is covered completely by just one fragment, which means that the pattern is found in the index as one of the examples.
Moreover, the below example presents the feature of retrieving a tokenized version of the example.
File concordia_searching.cpp:
\verbatim
#include <concordia/concordia.hpp>
#include <concordia/concordia_search_result.hpp>
#include <concordia/matched_pattern_fragment.hpp>
#include <concordia/example.hpp>
#include <concordia/tokenized_sentence.hpp>
#include "config.hpp"
@ -115,7 +118,13 @@ using namespace std;
int main() {
Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg");
concordia.addExample(Example("Alice has a cat", 56));
boost::shared_ptr<TokenizedSentence> ts = concordia.addExample(Example("Alice has a cat", 56));
cout << "Added the following tokens: " << endl;
BOOST_FOREACH(TokenAnnotation token, ts->getTokens()) {
cout << "\"" << token.getValue() << "\"" << " at positions: [" << token.getStart() << ","
<< token.getEnd() << ")" << endl;
}
concordia.addExample(Example("Alice has a dog", 23));
concordia.addExample(Example("New test product has a mistake", 321));
concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14));
@ -153,6 +162,11 @@ int main() {
This program should print:
\verbatim
Added the following tokens:
"alice" at positions: [0,5)
"has" at positions: [6,9)
"a" at positions: [10,11)
"cat" at positions: [12,15)
Searching for pattern: Our new test product has nothing to do with computers
Printing all matched fragments:
Matched pattern fragment found. Pattern fragment: [4,9] in sentence 14, at offset: 6

View File

@ -2,6 +2,7 @@
#include <concordia/concordia_search_result.hpp>
#include <concordia/matched_pattern_fragment.hpp>
#include <concordia/example.hpp>
#include <concordia/tokenized_sentence.hpp>
#include "config.hpp"
@ -13,7 +14,13 @@ using namespace std;
int main() {
Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg");
concordia.addExample(Example("Alice has a cat", 56));
boost::shared_ptr<TokenizedSentence> ts = concordia.addExample(Example("Alice has a cat", 56));
cout << "Added the following tokens: " << endl;
BOOST_FOREACH(TokenAnnotation token, ts->getTokens()) {
cout << "\"" << token.getValue() << "\"" << " at positions: [" << token.getStart() << ","
<< token.getEnd() << ")" << endl;
}
concordia.addExample(Example("Alice has a dog", 23));
concordia.addExample(Example("New test product has a mistake", 321));
concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14));