finished original word positions
This commit is contained in:
parent
a8c5fa0c75
commit
5a57406875
@ -28,8 +28,9 @@ void checkConcordiaResults(
|
||||
long baseLineCount) {
|
||||
long lineIndex = 1;
|
||||
BOOST_FOREACH(ConcordiaSearchResult result, results) {
|
||||
SUFFIX_MARKER_TYPE patternSize = result.getTokenizedPattern()->getTokens().size();
|
||||
if (patternSize > 0) {
|
||||
SUFFIX_MARKER_TYPE patternSize =
|
||||
result.getTokenizedPattern()->getTokens().size();
|
||||
if (patternSize > 0) {
|
||||
if (result.getBestOverlay().size() != 1) {
|
||||
reportError(baseLineCount + lineIndex,
|
||||
"best overlay has more than one fragment.");
|
||||
@ -37,7 +38,7 @@ void checkConcordiaResults(
|
||||
if (result.getBestOverlay().at(0).getMatchedLength()
|
||||
!= patternSize) {
|
||||
reportError(baseLineCount + lineIndex,
|
||||
"best overlay fragment has different size than pattern.");
|
||||
"best overlay fragment has different size than pattern.");
|
||||
}
|
||||
if (result.getBestOverlayScore() != 1) {
|
||||
reportError(baseLineCount + lineIndex,
|
||||
@ -201,7 +202,8 @@ int main(int argc, char** argv) {
|
||||
msdiff = time_end - time_start;
|
||||
|
||||
std::cout << "\tPattern used: " << std::endl << "\t\t";
|
||||
BOOST_FOREACH(TokenAnnotation annotation, result->getTokenizedPattern()->getTokens()) {
|
||||
BOOST_FOREACH(TokenAnnotation annotation,
|
||||
result->getTokenizedPattern()->getTokens()) {
|
||||
std::cout << annotation.getValue() << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
|
@ -9,6 +9,7 @@ Before you compile, make sure you have these installed:
|
||||
- cmake
|
||||
- Boost library
|
||||
- Log4cpp
|
||||
- ICU
|
||||
- (optional) Doxygen
|
||||
- (optional) TeX
|
||||
|
||||
|
@ -44,15 +44,17 @@ std::string _createLibraryVersion() {
|
||||
|
||||
// Sentences are written to disk and added to T.
|
||||
// SA is generated on command by other methods.
|
||||
boost::shared_ptr<TokenizedSentence> Concordia::addExample(const Example & example)
|
||||
throw(ConcordiaException) {
|
||||
boost::shared_ptr<TokenizedSentence> Concordia::addExample(
|
||||
const Example & example)
|
||||
throw(ConcordiaException) {
|
||||
return _index->addExample(_hashGenerator, _T, _markers, example);
|
||||
}
|
||||
|
||||
// Sentences are written to disk and added to T.
|
||||
// SA is generated on command by other methods.
|
||||
std::vector<TokenizedSentence> Concordia::addAllExamples(const std::vector<Example> & examples)
|
||||
throw(ConcordiaException) {
|
||||
std::vector<TokenizedSentence> Concordia::addAllExamples(
|
||||
const std::vector<Example> & examples)
|
||||
throw(ConcordiaException) {
|
||||
return _index->addAllExamples(_hashGenerator, _T, _markers, examples);
|
||||
}
|
||||
|
||||
@ -165,7 +167,8 @@ boost::shared_ptr<ConcordiaSearchResult> Concordia::concordiaSearch(
|
||||
} else {
|
||||
std::string empty;
|
||||
return boost::shared_ptr<ConcordiaSearchResult>(
|
||||
new ConcordiaSearchResult(boost::shared_ptr<TokenizedSentence>(new TokenizedSentence(empty))));
|
||||
new ConcordiaSearchResult(boost::shared_ptr<TokenizedSentence>(
|
||||
new TokenizedSentence(empty))));
|
||||
}
|
||||
}
|
||||
|
||||
@ -182,4 +185,3 @@ void Concordia::clearIndex() throw(ConcordiaException) {
|
||||
boost::filesystem::remove(_config->getMarkersFilePath());
|
||||
}
|
||||
|
||||
|
||||
|
@ -54,16 +54,22 @@ public:
|
||||
|
||||
/*! Adds an Example to the index.
|
||||
\param example example to be added
|
||||
\returns tokenized sentence object,
|
||||
containing information about original word positions
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
boost::shared_ptr<TokenizedSentence> addExample(const Example & example) throw(ConcordiaException);
|
||||
boost::shared_ptr<TokenizedSentence> addExample(const Example & example)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*! Adds multiple examples to the index.
|
||||
\param examples vector of examples to be added
|
||||
\returns vector of tokenized sentence objects,
|
||||
containing information about original word positions
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
std::vector<TokenizedSentence> addAllExamples(const std::vector<Example> & examples)
|
||||
throw(ConcordiaException);
|
||||
std::vector<TokenizedSentence> addAllExamples(
|
||||
const std::vector<Example> & examples)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*! Performs a simple substring lookup on the index.
|
||||
For more info see \ref tutorial1_2.
|
||||
|
@ -25,7 +25,7 @@ boost::shared_ptr<std::vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
|
||||
}
|
||||
|
||||
boost::shared_ptr<std::vector<saidx_t> > result =
|
||||
boost::shared_ptr<std::vector<saidx_t> >(new std::vector<saidx_t>);
|
||||
boost::shared_ptr<std::vector<saidx_t> >(new std::vector<saidx_t>);
|
||||
for (int i = 0; i < T->size(); i++) {
|
||||
result->push_back(SA_array[i]);
|
||||
}
|
||||
@ -48,7 +48,8 @@ std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
|
||||
|
||||
std::vector<TokenizedSentence> hashedPatterns;
|
||||
BOOST_FOREACH(Example example, examples) {
|
||||
boost::shared_ptr<TokenizedSentence> hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
|
||||
boost::shared_ptr<TokenizedSentence> hashedPattern =
|
||||
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
|
||||
T, markers, example);
|
||||
hashedPatterns.push_back(*hashedPattern);
|
||||
}
|
||||
@ -56,7 +57,7 @@ std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
|
||||
hashedIndexFile.close();
|
||||
markersFile.close();
|
||||
hashGenerator->serializeWordMap();
|
||||
|
||||
|
||||
return hashedPatterns;
|
||||
}
|
||||
|
||||
@ -71,12 +72,13 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample(
|
||||
std::ofstream markersFile;
|
||||
markersFile.open(_markersFilePath.c_str(), std::ios::out|
|
||||
std::ios::app|std::ios::binary);
|
||||
boost::shared_ptr<TokenizedSentence> hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
|
||||
boost::shared_ptr<TokenizedSentence> hashedPattern =
|
||||
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
|
||||
T, markers, example);
|
||||
hashedIndexFile.close();
|
||||
markersFile.close();
|
||||
hashGenerator->serializeWordMap();
|
||||
|
||||
|
||||
return hashedPattern;
|
||||
}
|
||||
|
||||
@ -87,9 +89,10 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
const Example & example) {
|
||||
boost::shared_ptr<TokenizedSentence> hashedPattern = hashGenerator->generateHash(example.getSentence());
|
||||
boost::shared_ptr<TokenizedSentence> hashedPattern =
|
||||
hashGenerator->generateHash(example.getSentence());
|
||||
std::vector<INDEX_CHARACTER_TYPE> hash = hashedPattern->getCodes();
|
||||
|
||||
|
||||
int offset = 0;
|
||||
for (std::vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
|
||||
it != hash.end(); ++it) {
|
||||
@ -117,7 +120,7 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
|
||||
SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
|
||||
Utils::writeMarker(markersFile, sentenceBoundaryMA);
|
||||
markers->push_back(sentenceBoundaryMA);
|
||||
|
||||
|
||||
return hashedPattern;
|
||||
}
|
||||
|
||||
|
@ -44,11 +44,13 @@ public:
|
||||
and markers array (also passed to this method) are appended
|
||||
with the hashed example. At the same time, HDD versions of these
|
||||
two data structures are also appended with the same example.
|
||||
The method returns a tokenized version of the example.
|
||||
\param hashGenerator hash generator to be used to prepare the hash
|
||||
of the example
|
||||
\param T RAM-based hash index to be appended to
|
||||
\param markers RAM-based markers array to be appended to
|
||||
\param example example to be added to index
|
||||
\returns tokenized example
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
boost::shared_ptr<TokenizedSentence> addExample(
|
||||
@ -62,11 +64,13 @@ public:
|
||||
and markers array (also passed to this method) are appended
|
||||
with the hashed examples. At the same time, HDD versions of these
|
||||
two data structures are also appended with the same examples.
|
||||
The method returns a vector of tokenized examples.
|
||||
\param hashGenerator hash generator to be used to prepare the hash
|
||||
of the example
|
||||
\param T RAM-based hash index to be appended to
|
||||
\param markers RAM-based markers array to be appended to
|
||||
\param examples vector of examples to be added to index
|
||||
\returns vector of tokenized examples
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
std::vector<TokenizedSentence> addAllExamples(
|
||||
@ -83,7 +87,8 @@ public:
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T);
|
||||
|
||||
private:
|
||||
boost::shared_ptr<TokenizedSentence> _addSingleExample(std::ofstream & hashedIndexFile,
|
||||
boost::shared_ptr<TokenizedSentence> _addSingleExample(
|
||||
std::ofstream & hashedIndexFile,
|
||||
std::ofstream & markersFile,
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
|
@ -29,9 +29,10 @@ HashGenerator::~HashGenerator() {
|
||||
|
||||
boost::shared_ptr<TokenizedSentence> HashGenerator::generateHash(
|
||||
const std::string & sentence) throw(ConcordiaException) {
|
||||
boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
|
||||
boost::shared_ptr<TokenizedSentence> ts =
|
||||
_sentenceTokenizer->tokenize(sentence);
|
||||
ts->generateHash(_wordMap);
|
||||
|
||||
|
||||
if (ts->getTokens().size() > Utils::maxSentenceSize) {
|
||||
throw ConcordiaException("Trying to add too long sentence.");
|
||||
}
|
||||
|
@ -15,14 +15,14 @@
|
||||
|
||||
/*!
|
||||
Class for generating a sentence hash. The hash is generated from a sentence
|
||||
given in raw string. String is first anonymized and tokenized. After these
|
||||
operations, each token is coded as an integer, according to WordMap.
|
||||
Resulting hash is a vector of integers.
|
||||
given in raw string. String is first tokenized by SentenceTokenizer and
|
||||
then each token is coded as an integer, according to WordMap.
|
||||
Resulting hash is an instance of TokenizedSentence.
|
||||
|
||||
Sentence hashed is used when adding a sentence to index and during searching.
|
||||
Hashed sentence is used when adding a sentence to index and during searching.
|
||||
|
||||
HashGenerator holds an instance of WordMap, used to code tokens as integers
|
||||
and SentenceAnonymizer, used to preprocess the sentence string.
|
||||
and SentenceTokenizer, used to tokenize the sentence string.
|
||||
|
||||
*/
|
||||
|
||||
@ -42,9 +42,10 @@ public:
|
||||
/*!
|
||||
Generates hash of a sentence.
|
||||
\param sentence sentence to generate hash from
|
||||
\returns vector of integers
|
||||
\returns tokenized sentence, containing the hash
|
||||
*/
|
||||
boost::shared_ptr<TokenizedSentence> generateHash(const std::string & sentence)
|
||||
boost::shared_ptr<TokenizedSentence> generateHash(
|
||||
const std::string & sentence)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*!
|
||||
|
@ -73,6 +73,7 @@ boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
|
||||
boost::shared_ptr<ConcordiaSearchResult>(
|
||||
new ConcordiaSearchResult(hashedPattern));
|
||||
|
||||
_concordiaSearcher->concordiaSearch(result, T, markers, SA, hashedPattern->getCodes());
|
||||
_concordiaSearcher->concordiaSearch(result, T, markers,
|
||||
SA, hashedPattern->getCodes());
|
||||
return result;
|
||||
}
|
||||
|
@ -51,9 +51,12 @@ public:
|
||||
return _end;
|
||||
}
|
||||
|
||||
friend std::ostream & operator << (std::ostream & o, const Interval & interval) {
|
||||
return o << "[" << interval.getStart() << "," << interval.getEnd() << ")";
|
||||
friend std::ostream & operator << (std::ostream & o,
|
||||
const Interval & interval) {
|
||||
return o << "[" << interval.getStart()
|
||||
<< "," << interval.getEnd() << ")";
|
||||
}
|
||||
|
||||
protected:
|
||||
SUFFIX_MARKER_TYPE _start;
|
||||
|
||||
|
@ -6,7 +6,7 @@
|
||||
|
||||
/*!
|
||||
Class representing matched pattern fragment in concordia search.
|
||||
This fragment can be seen as an interval of the pattern.
|
||||
This fragment can be seen as a word interval of the pattern.
|
||||
|
||||
This class holds information about:
|
||||
- where the pattern fragment was matched (example id and example offset)
|
||||
|
@ -13,9 +13,11 @@ RegexRule::RegexRule(std::string patternString,
|
||||
_value(value) {
|
||||
try {
|
||||
if (caseSensitive) {
|
||||
_pattern = boost::make_u32regex(UnicodeString(patternString.c_str()));
|
||||
_pattern = boost::make_u32regex(
|
||||
UnicodeString(patternString.c_str()));
|
||||
} else {
|
||||
_pattern = boost::make_u32regex(UnicodeString(patternString.c_str()), boost::regex::icase);
|
||||
_pattern = boost::make_u32regex(
|
||||
UnicodeString(patternString.c_str()), boost::regex::icase);
|
||||
}
|
||||
} catch(const std::exception & e) {
|
||||
std::stringstream ss;
|
||||
@ -37,7 +39,8 @@ RegexRule::~RegexRule() {
|
||||
void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
|
||||
try {
|
||||
UnicodeString s(sentence->getSentence().c_str());
|
||||
boost::u32regex_iterator<const UChar*> begin(boost::make_u32regex_iterator(s, _pattern));
|
||||
boost::u32regex_iterator<const UChar*> begin(
|
||||
boost::make_u32regex_iterator(s, _pattern));
|
||||
boost::u32regex_iterator<const UChar*> end;
|
||||
std::vector<TokenAnnotation> annotations;
|
||||
for (; begin != end; ++begin) {
|
||||
@ -46,19 +49,21 @@ void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
|
||||
std::string value;
|
||||
if (_annotationType == TokenAnnotation::WORD) {
|
||||
UnicodeString unicodeValue;
|
||||
s.extract(begin->position(), begin->length(), unicodeValue);
|
||||
s.extract(begin->position(), begin->length(), unicodeValue);
|
||||
unicodeValue.toUTF8String(value);
|
||||
} else {
|
||||
value = _value;
|
||||
}
|
||||
TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, value);
|
||||
TokenAnnotation annotation(matchBegin, matchEnd,
|
||||
_annotationType, value);
|
||||
annotations.push_back(annotation);
|
||||
}
|
||||
sentence->addAnnotations(annotations);
|
||||
} catch(const std::exception & e) {
|
||||
std::stringstream ss;
|
||||
ss << "Exception while applying regex rule: "
|
||||
<< _annotationType << " to text: " << sentence->getSentence();
|
||||
<< _annotationType << " to text: "
|
||||
<< sentence->getSentence();
|
||||
ss << ", message: " << e.what();
|
||||
throw ConcordiaException(ss.str());
|
||||
}
|
||||
|
@ -15,8 +15,9 @@ typedef boost::error_info<struct my_tag, std::string> my_tag_error_info;
|
||||
|
||||
/*!
|
||||
Class for representing a regular expression annotation rule.
|
||||
Holds regex pattern string for matching and replacement string for
|
||||
annotating found matches.
|
||||
Holds regex pattern string for matching and default value to assign
|
||||
to the annotations. Rule also has a type, given to all annotations
|
||||
produced by it.
|
||||
|
||||
*/
|
||||
class RegexRule {
|
||||
@ -25,6 +26,7 @@ public:
|
||||
Constructor.
|
||||
\param patternString regex pattern to match
|
||||
\param annotationType type of annotation
|
||||
\param value value to be assigned to the annotation
|
||||
\param caseSensitive case sensitivity of the pattern
|
||||
*/
|
||||
RegexRule(std::string patternString,
|
||||
@ -37,7 +39,7 @@ public:
|
||||
*/
|
||||
virtual ~RegexRule();
|
||||
|
||||
/*! Applies the operation on anonymized sentence.
|
||||
/*! Applies regex annotation on tokenized sentence.
|
||||
\param sentence the input sentence
|
||||
*/
|
||||
void apply(boost::shared_ptr<TokenizedSentence> sentence);
|
||||
@ -46,7 +48,7 @@ private:
|
||||
int _annotationType;
|
||||
|
||||
std::string _value;
|
||||
|
||||
|
||||
boost::u32regex _pattern;
|
||||
};
|
||||
|
||||
|
@ -26,7 +26,7 @@ SentenceTokenizer::~SentenceTokenizer() {
|
||||
|
||||
boost::shared_ptr<TokenizedSentence>
|
||||
SentenceTokenizer::tokenize(const std::string & sentence) {
|
||||
boost::shared_ptr<TokenizedSentence>
|
||||
boost::shared_ptr<TokenizedSentence>
|
||||
result(new TokenizedSentence(sentence));
|
||||
|
||||
_htmlTags->apply(result);
|
||||
@ -40,9 +40,10 @@ boost::shared_ptr<TokenizedSentence>
|
||||
if (_stopWordsEnabled) {
|
||||
_stopWords->apply(result);
|
||||
}
|
||||
|
||||
|
||||
boost::shared_ptr<RegexRule> wordsRule(
|
||||
new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}", TokenAnnotation::WORD, ""));
|
||||
new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}",
|
||||
TokenAnnotation::WORD, ""));
|
||||
wordsRule->apply(result);
|
||||
boost::shared_ptr<RegexRule> singleLetterWordsRule(
|
||||
new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
|
||||
@ -103,7 +104,8 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
|
||||
tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
|
||||
tagsExpression += "br).*?>";
|
||||
_htmlTags = boost::shared_ptr<RegexRule>(
|
||||
new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG, "", false));
|
||||
new RegexRule(tagsExpression,
|
||||
TokenAnnotation::HTML_TAG, "", false));
|
||||
}
|
||||
|
||||
boost::shared_ptr<RegexRule>
|
||||
@ -137,6 +139,6 @@ boost::shared_ptr<RegexRule>
|
||||
expression = expression.substr(0, expression.size()-1);
|
||||
expression += ")";
|
||||
return boost::shared_ptr<RegexRule>(
|
||||
new RegexRule(expression, annotationType, value, false));
|
||||
new RegexRule(expression, annotationType, value, false));
|
||||
}
|
||||
|
||||
|
@ -14,10 +14,9 @@
|
||||
|
||||
/*!
|
||||
Class for tokenizing sentence before generating hash.
|
||||
This operation is is used to
|
||||
remove unnecessary symbols and possibly words from sentences added to index
|
||||
and search patterns. Tokenizer annotates html tags, removes stop words (if the option is enabled),
|
||||
as well as annotates named entities and special symbols. All these have to be listed in files
|
||||
Tokenizer ignores unnecessary symbols, html tags and possibly stop words
|
||||
(if the option is enabled) in sentences added to index
|
||||
as well as annotates named entities. All these have to be listed in files
|
||||
(see \ref tutorial3).
|
||||
*/
|
||||
|
||||
@ -35,7 +34,7 @@ public:
|
||||
|
||||
/*! Tokenizes the sentence.
|
||||
\param sentence input sentence
|
||||
\returns altered version of the input sentence
|
||||
\returns tokenized sentence object built from the input sentence
|
||||
*/
|
||||
boost::shared_ptr<TokenizedSentence>
|
||||
tokenize(const std::string & sentence);
|
||||
@ -58,7 +57,6 @@ private:
|
||||
bool _stopWordsEnabled;
|
||||
|
||||
boost::shared_ptr<RegexRule> _stopWords;
|
||||
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -21,19 +21,19 @@ BOOST_AUTO_TEST_CASE( SimpleAnnotation )
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),7);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),8);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),11);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),12);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),14);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),15);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),16);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),17);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),18);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),19);
|
||||
@ -64,19 +64,19 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation )
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),3);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),4);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),19);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),20);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),21);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),22);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),44);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),45);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),45);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),46);
|
||||
@ -94,15 +94,15 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation )
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),8);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),11);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),16);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),19);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),24);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),27);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),32);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),35);
|
||||
@ -132,7 +132,7 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation )
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),11);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),12);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),29);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),30);
|
||||
@ -149,71 +149,71 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),2);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),3);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),3);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),4);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),4);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),5);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),5);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),6);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),8);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),9);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),9);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),10);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),11);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),12);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),15);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),16);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),16);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),17);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),20);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),21);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),21);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),22);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),22);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),23);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),23);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),24);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),26);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),27);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),27);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),28);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),29);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),30);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),33);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),34);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),34);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),35);
|
||||
|
@ -47,79 +47,79 @@ BOOST_AUTO_TEST_CASE( NETest )
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),4);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "date");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),6);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),16);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "ne_date");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),18);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),22);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "mail");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),24);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),40);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "ne_email");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),42);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),48);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "number");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),50);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),54);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),56);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),61);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "hello");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),61);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),62);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),63);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),69);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),70);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),75);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),76);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),80);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),82);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),88);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),89);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),94);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),95);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),99);
|
||||
@ -156,52 +156,52 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),0);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),23);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),23);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),27);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(),"link");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),27);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),31);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),32);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),35);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(),"and");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),36);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),39);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),39);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),43);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(),"bold");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),43);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),47);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),48);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),51);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(),"and");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),52);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),59);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(),"newline");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),60);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),65);
|
||||
@ -240,79 +240,79 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),4);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(),"this");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),5);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),7);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(),"is");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),8);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),9);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(),"a");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),10);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),18);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(),"sentence");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),20);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),25);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(),"don't");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),26);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),38);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(),"over-analyze");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),39);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),41);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(),"it");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),43);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),49);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(),"zażółć");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),51);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),57);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(),"gęś'lą");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),59);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),63);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(),"jaźń");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),64);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),71);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(),"zaż-ółć");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),72);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),77);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(),"gęślą");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),78);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),83);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(),"jaź'ń");
|
||||
iter++;
|
||||
++iter;
|
||||
|
||||
}
|
||||
|
||||
|
@ -7,7 +7,7 @@
|
||||
#include <string>
|
||||
|
||||
/*!
|
||||
Class representing annotatio of char sequence as a token.
|
||||
Class representing annotation of char sequence as a token.
|
||||
It is a type of interval that is also storing information
|
||||
about the annotation type and value.
|
||||
|
||||
@ -18,7 +18,7 @@ public:
|
||||
/*! Constructor.
|
||||
\param start start index of the annotation (char-level, 0-based)
|
||||
\param end end index of the annotation (char-level, 0-based)
|
||||
\param type annotation type
|
||||
\param annotationType annotation type
|
||||
\param value annotation value
|
||||
*/
|
||||
TokenAnnotation(const SUFFIX_MARKER_TYPE start,
|
||||
@ -44,14 +44,22 @@ public:
|
||||
return _value;
|
||||
}
|
||||
|
||||
/*! Named entity annotation type
|
||||
*/
|
||||
static int NE;
|
||||
|
||||
/*! Word annotation type
|
||||
*/
|
||||
static int WORD;
|
||||
|
||||
/*! Html tag annotation type
|
||||
*/
|
||||
static int HTML_TAG;
|
||||
|
||||
/*! Stop word annotation type
|
||||
*/
|
||||
static int STOP_WORD;
|
||||
|
||||
|
||||
protected:
|
||||
int _annotationType;
|
||||
|
||||
|
@ -11,37 +11,43 @@ TokenizedSentence::TokenizedSentence(std::string sentence):
|
||||
TokenizedSentence::~TokenizedSentence() {
|
||||
}
|
||||
|
||||
void TokenizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) {
|
||||
std::vector<TokenAnnotation>::iterator newAnnotation = annotations.begin();
|
||||
std::list<TokenAnnotation>::iterator existingAnnotation = _tokenAnnotations.begin();
|
||||
|
||||
while(newAnnotation != annotations.end()) {
|
||||
void TokenizedSentence::addAnnotations(
|
||||
std::vector<TokenAnnotation> annotations) {
|
||||
std::vector<TokenAnnotation>::iterator newAnnotation =
|
||||
annotations.begin();
|
||||
std::list<TokenAnnotation>::iterator existingAnnotation =
|
||||
_tokenAnnotations.begin();
|
||||
|
||||
while (newAnnotation != annotations.end()) {
|
||||
if (existingAnnotation != _tokenAnnotations.end()) {
|
||||
// there are still some existing annotations, so perform checks
|
||||
if (newAnnotation->intersects(*existingAnnotation)) {
|
||||
// The new annotation intersects with the existing.
|
||||
// We can not add it, so let us just move on to the
|
||||
// next new annotation.
|
||||
newAnnotation++;
|
||||
++newAnnotation;
|
||||
} else {
|
||||
// it is now important whether the new interval is before
|
||||
// or after existing
|
||||
if (newAnnotation->getStart() < existingAnnotation->getStart()) {
|
||||
// New interval does not intersect and is before existing. We add it.
|
||||
_tokenAnnotations.insert(existingAnnotation, *newAnnotation);
|
||||
newAnnotation++;
|
||||
if (newAnnotation->getStart() <
|
||||
existingAnnotation->getStart()) {
|
||||
// New interval does not intersect and is
|
||||
// before existing. We add it.
|
||||
_tokenAnnotations.insert(existingAnnotation,
|
||||
*newAnnotation);
|
||||
++newAnnotation;
|
||||
} else {
|
||||
// If the new interval is after existing we move to the next existing annoation.
|
||||
existingAnnotation++;
|
||||
// If the new interval is after existing
|
||||
// we move to the next existing annotation.
|
||||
++existingAnnotation;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// no more existing annotations, so just add the new annotation
|
||||
_tokenAnnotations.push_back(*newAnnotation);
|
||||
newAnnotation++;
|
||||
++newAnnotation;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void TokenizedSentence::toLowerCase() {
|
||||
@ -54,8 +60,7 @@ void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
|
||||
annotation.getType() == TokenAnnotation::NE) {
|
||||
_codes.push_back(wordMap->getWordCode(annotation.getValue()));
|
||||
_tokens.push_back(annotation);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -11,9 +11,12 @@
|
||||
#include <list>
|
||||
|
||||
/*!
|
||||
A sentence after anonymization operations. The class
|
||||
A sentence after tokenizing operations. The class
|
||||
holds the current string representation of the sentence
|
||||
along with the annotations list.
|
||||
along with the annotations list. The class also allows
|
||||
for generating hash. After that operation the class
|
||||
also holds the list of hashed codes and corresponding
|
||||
tokens.
|
||||
*/
|
||||
|
||||
class TokenizedSentence {
|
||||
@ -22,7 +25,7 @@ public:
|
||||
Constructor.
|
||||
|
||||
*/
|
||||
TokenizedSentence(std::string sentence);
|
||||
explicit TokenizedSentence(std::string sentence);
|
||||
|
||||
/*! Destructor.
|
||||
*/
|
||||
@ -35,21 +38,40 @@ public:
|
||||
return _sentence;
|
||||
}
|
||||
|
||||
/*! Getter for annotations list
|
||||
/*! Getter for all annotations list. This method returns
|
||||
all annotations, including those which are not considered
|
||||
in the hash, i.e. stop words and html tags.
|
||||
\returns annotations list
|
||||
*/
|
||||
std::list<TokenAnnotation> getAnnotations() const {
|
||||
return _tokenAnnotations;
|
||||
}
|
||||
|
||||
/*! Getter for codes list. This data is available after calling
|
||||
the generateHash method.
|
||||
\returns codes list
|
||||
*/
|
||||
std::vector<INDEX_CHARACTER_TYPE> getCodes() const {
|
||||
return _codes;
|
||||
}
|
||||
|
||||
|
||||
/*! Getter for tokens list. This method returns
|
||||
only those annotations considered
|
||||
in the hash, i.e. words and named entities.
|
||||
\returns tokens list
|
||||
*/
|
||||
std::vector<TokenAnnotation> getTokens() const {
|
||||
return _tokens;
|
||||
}
|
||||
|
||||
|
||||
/*! Method for generating hash based on annotations.
|
||||
This method takes into account annotations of type
|
||||
word and named entity. These are encoded and added
|
||||
to the code list. Annotations corresponding to these
|
||||
tokens are added to the tokens list.
|
||||
\param wordMap word map to use when encoding tokens
|
||||
\returns tokens list
|
||||
*/
|
||||
void generateHash(boost::shared_ptr<WordMap> wordMap);
|
||||
|
||||
/*!
|
||||
@ -66,15 +88,15 @@ public:
|
||||
|
||||
\param annotations list of annotations to be added
|
||||
*/
|
||||
void addAnnotations(std::vector<TokenAnnotation> annotations);
|
||||
void addAnnotations(std::vector<TokenAnnotation> annotations);
|
||||
|
||||
private:
|
||||
std::string _sentence;
|
||||
|
||||
std::list<TokenAnnotation> _tokenAnnotations;
|
||||
|
||||
|
||||
std::vector<INDEX_CHARACTER_TYPE> _codes;
|
||||
|
||||
|
||||
std::vector<TokenAnnotation> _tokens;
|
||||
};
|
||||
|
||||
|
@ -98,12 +98,15 @@ Concordia is equipped with a unique functionality of so called Concordia search,
|
||||
|
||||
Additionally, the score for this best overlay is computed. The score is a real number between 0 and 1, where 0 indicates that the pattern is not covered at all (i.e. not a single word from this pattern is found in the index). The score 1 represents the perfect match - the pattern is covered completely by just one fragment, which means that the pattern is found in the index as one of the examples.
|
||||
|
||||
Moreover, the example below shows how to retrieve the tokenized version of an added example.
|
||||
|
||||
File concordia_searching.cpp:
|
||||
\verbatim
|
||||
#include <concordia/concordia.hpp>
|
||||
#include <concordia/concordia_search_result.hpp>
|
||||
#include <concordia/matched_pattern_fragment.hpp>
|
||||
#include <concordia/example.hpp>
|
||||
#include <concordia/tokenized_sentence.hpp>
|
||||
|
||||
#include "config.hpp"
|
||||
|
||||
@ -115,7 +118,13 @@ using namespace std;
|
||||
int main() {
|
||||
Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg");
|
||||
|
||||
concordia.addExample(Example("Alice has a cat", 56));
|
||||
boost::shared_ptr<TokenizedSentence> ts = concordia.addExample(Example("Alice has a cat", 56));
|
||||
cout << "Added the following tokens: " << endl;
|
||||
BOOST_FOREACH(TokenAnnotation token, ts->getTokens()) {
|
||||
cout << "\"" << token.getValue() << "\"" << " at positions: [" << token.getStart() << ","
|
||||
<< token.getEnd() << ")" << endl;
|
||||
}
|
||||
|
||||
concordia.addExample(Example("Alice has a dog", 23));
|
||||
concordia.addExample(Example("New test product has a mistake", 321));
|
||||
concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14));
|
||||
@ -153,6 +162,11 @@ int main() {
|
||||
This program should print:
|
||||
|
||||
\verbatim
|
||||
Added the following tokens:
|
||||
"alice" at positions: [0,5)
|
||||
"has" at positions: [6,9)
|
||||
"a" at positions: [10,11)
|
||||
"cat" at positions: [12,15)
|
||||
Searching for pattern: Our new test product has nothing to do with computers
|
||||
Printing all matched fragments:
|
||||
Matched pattern fragment found. Pattern fragment: [4,9] in sentence 14, at offset: 6
|
||||
|
@ -2,6 +2,7 @@
|
||||
#include <concordia/concordia_search_result.hpp>
|
||||
#include <concordia/matched_pattern_fragment.hpp>
|
||||
#include <concordia/example.hpp>
|
||||
#include <concordia/tokenized_sentence.hpp>
|
||||
|
||||
#include "config.hpp"
|
||||
|
||||
@ -13,7 +14,13 @@ using namespace std;
|
||||
int main() {
|
||||
Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg");
|
||||
|
||||
concordia.addExample(Example("Alice has a cat", 56));
|
||||
boost::shared_ptr<TokenizedSentence> ts = concordia.addExample(Example("Alice has a cat", 56));
|
||||
cout << "Added the following tokens: " << endl;
|
||||
BOOST_FOREACH(TokenAnnotation token, ts->getTokens()) {
|
||||
cout << "\"" << token.getValue() << "\"" << " at positions: [" << token.getStart() << ","
|
||||
<< token.getEnd() << ")" << endl;
|
||||
}
|
||||
|
||||
concordia.addExample(Example("Alice has a dog", 23));
|
||||
concordia.addExample(Example("New test product has a mistake", 321));
|
||||
concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14));
|
||||
|
Loading…
Reference in New Issue
Block a user