finished original word positions
This commit is contained in:
parent
a8c5fa0c75
commit
5a57406875
@ -28,8 +28,9 @@ void checkConcordiaResults(
|
|||||||
long baseLineCount) {
|
long baseLineCount) {
|
||||||
long lineIndex = 1;
|
long lineIndex = 1;
|
||||||
BOOST_FOREACH(ConcordiaSearchResult result, results) {
|
BOOST_FOREACH(ConcordiaSearchResult result, results) {
|
||||||
SUFFIX_MARKER_TYPE patternSize = result.getTokenizedPattern()->getTokens().size();
|
SUFFIX_MARKER_TYPE patternSize =
|
||||||
if (patternSize > 0) {
|
result.getTokenizedPattern()->getTokens().size();
|
||||||
|
if (patternSize > 0) {
|
||||||
if (result.getBestOverlay().size() != 1) {
|
if (result.getBestOverlay().size() != 1) {
|
||||||
reportError(baseLineCount + lineIndex,
|
reportError(baseLineCount + lineIndex,
|
||||||
"best overlay has more than one fragment.");
|
"best overlay has more than one fragment.");
|
||||||
@ -37,7 +38,7 @@ void checkConcordiaResults(
|
|||||||
if (result.getBestOverlay().at(0).getMatchedLength()
|
if (result.getBestOverlay().at(0).getMatchedLength()
|
||||||
!= patternSize) {
|
!= patternSize) {
|
||||||
reportError(baseLineCount + lineIndex,
|
reportError(baseLineCount + lineIndex,
|
||||||
"best overlay fragment has different size than pattern.");
|
"best overlay fragment has different size than pattern.");
|
||||||
}
|
}
|
||||||
if (result.getBestOverlayScore() != 1) {
|
if (result.getBestOverlayScore() != 1) {
|
||||||
reportError(baseLineCount + lineIndex,
|
reportError(baseLineCount + lineIndex,
|
||||||
@ -201,7 +202,8 @@ int main(int argc, char** argv) {
|
|||||||
msdiff = time_end - time_start;
|
msdiff = time_end - time_start;
|
||||||
|
|
||||||
std::cout << "\tPattern used: " << std::endl << "\t\t";
|
std::cout << "\tPattern used: " << std::endl << "\t\t";
|
||||||
BOOST_FOREACH(TokenAnnotation annotation, result->getTokenizedPattern()->getTokens()) {
|
BOOST_FOREACH(TokenAnnotation annotation,
|
||||||
|
result->getTokenizedPattern()->getTokens()) {
|
||||||
std::cout << annotation.getValue() << " ";
|
std::cout << annotation.getValue() << " ";
|
||||||
}
|
}
|
||||||
std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
|
@ -9,6 +9,7 @@ Before you compile, make sure you have these installed:
|
|||||||
- cmake
|
- cmake
|
||||||
- Boost library
|
- Boost library
|
||||||
- Log4cpp
|
- Log4cpp
|
||||||
|
- ICU
|
||||||
- (optional) Doxygen
|
- (optional) Doxygen
|
||||||
- (optional) TeX
|
- (optional) TeX
|
||||||
|
|
||||||
|
@ -44,15 +44,17 @@ std::string _createLibraryVersion() {
|
|||||||
|
|
||||||
// Sentences are written to disk and added to T.
|
// Sentences are written to disk and added to T.
|
||||||
// SA is generated on command by other methods.
|
// SA is generated on command by other methods.
|
||||||
boost::shared_ptr<TokenizedSentence> Concordia::addExample(const Example & example)
|
boost::shared_ptr<TokenizedSentence> Concordia::addExample(
|
||||||
throw(ConcordiaException) {
|
const Example & example)
|
||||||
|
throw(ConcordiaException) {
|
||||||
return _index->addExample(_hashGenerator, _T, _markers, example);
|
return _index->addExample(_hashGenerator, _T, _markers, example);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sentences are written to disk and added to T.
|
// Sentences are written to disk and added to T.
|
||||||
// SA is generated on command by other methods.
|
// SA is generated on command by other methods.
|
||||||
std::vector<TokenizedSentence> Concordia::addAllExamples(const std::vector<Example> & examples)
|
std::vector<TokenizedSentence> Concordia::addAllExamples(
|
||||||
throw(ConcordiaException) {
|
const std::vector<Example> & examples)
|
||||||
|
throw(ConcordiaException) {
|
||||||
return _index->addAllExamples(_hashGenerator, _T, _markers, examples);
|
return _index->addAllExamples(_hashGenerator, _T, _markers, examples);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -165,7 +167,8 @@ boost::shared_ptr<ConcordiaSearchResult> Concordia::concordiaSearch(
|
|||||||
} else {
|
} else {
|
||||||
std::string empty;
|
std::string empty;
|
||||||
return boost::shared_ptr<ConcordiaSearchResult>(
|
return boost::shared_ptr<ConcordiaSearchResult>(
|
||||||
new ConcordiaSearchResult(boost::shared_ptr<TokenizedSentence>(new TokenizedSentence(empty))));
|
new ConcordiaSearchResult(boost::shared_ptr<TokenizedSentence>(
|
||||||
|
new TokenizedSentence(empty))));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -182,4 +185,3 @@ void Concordia::clearIndex() throw(ConcordiaException) {
|
|||||||
boost::filesystem::remove(_config->getMarkersFilePath());
|
boost::filesystem::remove(_config->getMarkersFilePath());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -54,16 +54,22 @@ public:
|
|||||||
|
|
||||||
/*! Adds an Example to the index.
|
/*! Adds an Example to the index.
|
||||||
\param example example to be added
|
\param example example to be added
|
||||||
|
\returns tokenized sentence object,
|
||||||
|
containing information about original word positions
|
||||||
\throws ConcordiaException
|
\throws ConcordiaException
|
||||||
*/
|
*/
|
||||||
boost::shared_ptr<TokenizedSentence> addExample(const Example & example) throw(ConcordiaException);
|
boost::shared_ptr<TokenizedSentence> addExample(const Example & example)
|
||||||
|
throw(ConcordiaException);
|
||||||
|
|
||||||
/*! Adds multiple examples to the index.
|
/*! Adds multiple examples to the index.
|
||||||
\param examples vector of examples to be added
|
\param examples vector of examples to be added
|
||||||
|
\returns vector of tokenized sentence objects,
|
||||||
|
containing information about original word positions
|
||||||
\throws ConcordiaException
|
\throws ConcordiaException
|
||||||
*/
|
*/
|
||||||
std::vector<TokenizedSentence> addAllExamples(const std::vector<Example> & examples)
|
std::vector<TokenizedSentence> addAllExamples(
|
||||||
throw(ConcordiaException);
|
const std::vector<Example> & examples)
|
||||||
|
throw(ConcordiaException);
|
||||||
|
|
||||||
/*! Performs a simple substring lookup on the index.
|
/*! Performs a simple substring lookup on the index.
|
||||||
For more info see \ref tutorial1_2.
|
For more info see \ref tutorial1_2.
|
||||||
|
@ -25,7 +25,7 @@ boost::shared_ptr<std::vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
|
|||||||
}
|
}
|
||||||
|
|
||||||
boost::shared_ptr<std::vector<saidx_t> > result =
|
boost::shared_ptr<std::vector<saidx_t> > result =
|
||||||
boost::shared_ptr<std::vector<saidx_t> >(new std::vector<saidx_t>);
|
boost::shared_ptr<std::vector<saidx_t> >(new std::vector<saidx_t>);
|
||||||
for (int i = 0; i < T->size(); i++) {
|
for (int i = 0; i < T->size(); i++) {
|
||||||
result->push_back(SA_array[i]);
|
result->push_back(SA_array[i]);
|
||||||
}
|
}
|
||||||
@ -48,7 +48,8 @@ std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
|
|||||||
|
|
||||||
std::vector<TokenizedSentence> hashedPatterns;
|
std::vector<TokenizedSentence> hashedPatterns;
|
||||||
BOOST_FOREACH(Example example, examples) {
|
BOOST_FOREACH(Example example, examples) {
|
||||||
boost::shared_ptr<TokenizedSentence> hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
|
boost::shared_ptr<TokenizedSentence> hashedPattern =
|
||||||
|
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
|
||||||
T, markers, example);
|
T, markers, example);
|
||||||
hashedPatterns.push_back(*hashedPattern);
|
hashedPatterns.push_back(*hashedPattern);
|
||||||
}
|
}
|
||||||
@ -56,7 +57,7 @@ std::vector<TokenizedSentence> ConcordiaIndex::addAllExamples(
|
|||||||
hashedIndexFile.close();
|
hashedIndexFile.close();
|
||||||
markersFile.close();
|
markersFile.close();
|
||||||
hashGenerator->serializeWordMap();
|
hashGenerator->serializeWordMap();
|
||||||
|
|
||||||
return hashedPatterns;
|
return hashedPatterns;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -71,12 +72,13 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample(
|
|||||||
std::ofstream markersFile;
|
std::ofstream markersFile;
|
||||||
markersFile.open(_markersFilePath.c_str(), std::ios::out|
|
markersFile.open(_markersFilePath.c_str(), std::ios::out|
|
||||||
std::ios::app|std::ios::binary);
|
std::ios::app|std::ios::binary);
|
||||||
boost::shared_ptr<TokenizedSentence> hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator,
|
boost::shared_ptr<TokenizedSentence> hashedPattern =
|
||||||
|
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
|
||||||
T, markers, example);
|
T, markers, example);
|
||||||
hashedIndexFile.close();
|
hashedIndexFile.close();
|
||||||
markersFile.close();
|
markersFile.close();
|
||||||
hashGenerator->serializeWordMap();
|
hashGenerator->serializeWordMap();
|
||||||
|
|
||||||
return hashedPattern;
|
return hashedPattern;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -87,9 +89,10 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
|
|||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
const Example & example) {
|
const Example & example) {
|
||||||
boost::shared_ptr<TokenizedSentence> hashedPattern = hashGenerator->generateHash(example.getSentence());
|
boost::shared_ptr<TokenizedSentence> hashedPattern =
|
||||||
|
hashGenerator->generateHash(example.getSentence());
|
||||||
std::vector<INDEX_CHARACTER_TYPE> hash = hashedPattern->getCodes();
|
std::vector<INDEX_CHARACTER_TYPE> hash = hashedPattern->getCodes();
|
||||||
|
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
for (std::vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
|
for (std::vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
|
||||||
it != hash.end(); ++it) {
|
it != hash.end(); ++it) {
|
||||||
@ -117,7 +120,7 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
|
|||||||
SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
|
SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
|
||||||
Utils::writeMarker(markersFile, sentenceBoundaryMA);
|
Utils::writeMarker(markersFile, sentenceBoundaryMA);
|
||||||
markers->push_back(sentenceBoundaryMA);
|
markers->push_back(sentenceBoundaryMA);
|
||||||
|
|
||||||
return hashedPattern;
|
return hashedPattern;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -44,11 +44,13 @@ public:
|
|||||||
and markers array (also passed to this method) are appended
|
and markers array (also passed to this method) are appended
|
||||||
with the hashed example. At the same time, HDD versions of these
|
with the hashed example. At the same time, HDD versions of these
|
||||||
two data structures are also appended with the same example.
|
two data structures are also appended with the same example.
|
||||||
|
The method returns a tokenized version of the example.
|
||||||
\param hashGenerator hash generator to be used to prepare the hash
|
\param hashGenerator hash generator to be used to prepare the hash
|
||||||
of the example
|
of the example
|
||||||
\param T RAM-based hash index to be appended to
|
\param T RAM-based hash index to be appended to
|
||||||
\param markers RAM-based markers array to be appended to
|
\param markers RAM-based markers array to be appended to
|
||||||
\param example example to be added to index
|
\param example example to be added to index
|
||||||
|
\returns tokenized example
|
||||||
\throws ConcordiaException
|
\throws ConcordiaException
|
||||||
*/
|
*/
|
||||||
boost::shared_ptr<TokenizedSentence> addExample(
|
boost::shared_ptr<TokenizedSentence> addExample(
|
||||||
@ -62,11 +64,13 @@ public:
|
|||||||
and markers array (also passed to this method) are appended
|
and markers array (also passed to this method) are appended
|
||||||
with the hashed examples. At the same time, HDD versions of these
|
with the hashed examples. At the same time, HDD versions of these
|
||||||
two data structures are also appended with the same examples.
|
two data structures are also appended with the same examples.
|
||||||
|
The method returns a vector of tokenized examples.
|
||||||
\param hashGenerator hash generator to be used to prepare the hash
|
\param hashGenerator hash generator to be used to prepare the hash
|
||||||
of the example
|
of the example
|
||||||
\param T RAM-based hash index to be appended to
|
\param T RAM-based hash index to be appended to
|
||||||
\param markers RAM-based markers array to be appended to
|
\param markers RAM-based markers array to be appended to
|
||||||
\param examples vector of examples to be added to index
|
\param examples vector of examples to be added to index
|
||||||
|
\returns vector of tokenized examples
|
||||||
\throws ConcordiaException
|
\throws ConcordiaException
|
||||||
*/
|
*/
|
||||||
std::vector<TokenizedSentence> addAllExamples(
|
std::vector<TokenizedSentence> addAllExamples(
|
||||||
@ -83,7 +87,8 @@ public:
|
|||||||
boost::shared_ptr<std::vector<sauchar_t> > T);
|
boost::shared_ptr<std::vector<sauchar_t> > T);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
boost::shared_ptr<TokenizedSentence> _addSingleExample(std::ofstream & hashedIndexFile,
|
boost::shared_ptr<TokenizedSentence> _addSingleExample(
|
||||||
|
std::ofstream & hashedIndexFile,
|
||||||
std::ofstream & markersFile,
|
std::ofstream & markersFile,
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
|
@ -29,9 +29,10 @@ HashGenerator::~HashGenerator() {
|
|||||||
|
|
||||||
boost::shared_ptr<TokenizedSentence> HashGenerator::generateHash(
|
boost::shared_ptr<TokenizedSentence> HashGenerator::generateHash(
|
||||||
const std::string & sentence) throw(ConcordiaException) {
|
const std::string & sentence) throw(ConcordiaException) {
|
||||||
boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
|
boost::shared_ptr<TokenizedSentence> ts =
|
||||||
|
_sentenceTokenizer->tokenize(sentence);
|
||||||
ts->generateHash(_wordMap);
|
ts->generateHash(_wordMap);
|
||||||
|
|
||||||
if (ts->getTokens().size() > Utils::maxSentenceSize) {
|
if (ts->getTokens().size() > Utils::maxSentenceSize) {
|
||||||
throw ConcordiaException("Trying to add too long sentence.");
|
throw ConcordiaException("Trying to add too long sentence.");
|
||||||
}
|
}
|
||||||
|
@ -15,14 +15,14 @@
|
|||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class for generating a sentence hash. The hash is generated from a sentence
|
Class for generating a sentence hash. The hash is generated from a sentence
|
||||||
given in raw string. String is first anonymized and tokenized. After these
|
given as a raw string. The string is first tokenized by SentenceTokenizer and
|
||||||
operations, each token is coded as an integer, according to WordMap.
|
then each token is coded as an integer, according to WordMap.
|
||||||
Resulting hash is a vector of integers.
|
Resulting hash is an instance of TokenizedSentence.
|
||||||
|
|
||||||
Sentence hashed is used when adding a sentence to index and during searching.
|
Hashed sentence is used when adding a sentence to index and during searching.
|
||||||
|
|
||||||
HashGenerator holds an instance of WordMap, used to code tokens as integers
|
HashGenerator holds an instance of WordMap, used to code tokens as integers
|
||||||
and SentenceAnonymizer, used to preprocess the sentence string.
|
and SentenceTokenizer, used to tokenize the sentence string.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@ -42,9 +42,10 @@ public:
|
|||||||
/*!
|
/*!
|
||||||
Generates hash of a sentence.
|
Generates hash of a sentence.
|
||||||
\param sentence sentence to generate hash from
|
\param sentence sentence to generate hash from
|
||||||
\returns vector of integers
|
\returns tokenized sentence, containing the hash
|
||||||
*/
|
*/
|
||||||
boost::shared_ptr<TokenizedSentence> generateHash(const std::string & sentence)
|
boost::shared_ptr<TokenizedSentence> generateHash(
|
||||||
|
const std::string & sentence)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
|
@ -73,6 +73,7 @@ boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
|
|||||||
boost::shared_ptr<ConcordiaSearchResult>(
|
boost::shared_ptr<ConcordiaSearchResult>(
|
||||||
new ConcordiaSearchResult(hashedPattern));
|
new ConcordiaSearchResult(hashedPattern));
|
||||||
|
|
||||||
_concordiaSearcher->concordiaSearch(result, T, markers, SA, hashedPattern->getCodes());
|
_concordiaSearcher->concordiaSearch(result, T, markers,
|
||||||
|
SA, hashedPattern->getCodes());
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
@ -51,9 +51,12 @@ public:
|
|||||||
return _end;
|
return _end;
|
||||||
}
|
}
|
||||||
|
|
||||||
friend std::ostream & operator << (std::ostream & o, const Interval & interval) {
|
friend std::ostream & operator << (std::ostream & o,
|
||||||
return o << "[" << interval.getStart() << "," << interval.getEnd() << ")";
|
const Interval & interval) {
|
||||||
|
return o << "[" << interval.getStart()
|
||||||
|
<< "," << interval.getEnd() << ")";
|
||||||
}
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
SUFFIX_MARKER_TYPE _start;
|
SUFFIX_MARKER_TYPE _start;
|
||||||
|
|
||||||
|
@ -6,7 +6,7 @@
|
|||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class representing matched pattern fragment in concordia search.
|
Class representing matched pattern fragment in concordia search.
|
||||||
This fragment can be seen as an interval of the pattern.
|
This fragment can be seen as a word interval of the pattern.
|
||||||
|
|
||||||
This class holds information about:
|
This class holds information about:
|
||||||
- where the pattern fragment was matched (example id and example offset)
|
- where the pattern fragment was matched (example id and example offset)
|
||||||
|
@ -13,9 +13,11 @@ RegexRule::RegexRule(std::string patternString,
|
|||||||
_value(value) {
|
_value(value) {
|
||||||
try {
|
try {
|
||||||
if (caseSensitive) {
|
if (caseSensitive) {
|
||||||
_pattern = boost::make_u32regex(UnicodeString(patternString.c_str()));
|
_pattern = boost::make_u32regex(
|
||||||
|
UnicodeString(patternString.c_str()));
|
||||||
} else {
|
} else {
|
||||||
_pattern = boost::make_u32regex(UnicodeString(patternString.c_str()), boost::regex::icase);
|
_pattern = boost::make_u32regex(
|
||||||
|
UnicodeString(patternString.c_str()), boost::regex::icase);
|
||||||
}
|
}
|
||||||
} catch(const std::exception & e) {
|
} catch(const std::exception & e) {
|
||||||
std::stringstream ss;
|
std::stringstream ss;
|
||||||
@ -37,7 +39,8 @@ RegexRule::~RegexRule() {
|
|||||||
void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
|
void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
|
||||||
try {
|
try {
|
||||||
UnicodeString s(sentence->getSentence().c_str());
|
UnicodeString s(sentence->getSentence().c_str());
|
||||||
boost::u32regex_iterator<const UChar*> begin(boost::make_u32regex_iterator(s, _pattern));
|
boost::u32regex_iterator<const UChar*> begin(
|
||||||
|
boost::make_u32regex_iterator(s, _pattern));
|
||||||
boost::u32regex_iterator<const UChar*> end;
|
boost::u32regex_iterator<const UChar*> end;
|
||||||
std::vector<TokenAnnotation> annotations;
|
std::vector<TokenAnnotation> annotations;
|
||||||
for (; begin != end; ++begin) {
|
for (; begin != end; ++begin) {
|
||||||
@ -46,19 +49,21 @@ void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
|
|||||||
std::string value;
|
std::string value;
|
||||||
if (_annotationType == TokenAnnotation::WORD) {
|
if (_annotationType == TokenAnnotation::WORD) {
|
||||||
UnicodeString unicodeValue;
|
UnicodeString unicodeValue;
|
||||||
s.extract(begin->position(), begin->length(), unicodeValue);
|
s.extract(begin->position(), begin->length(), unicodeValue);
|
||||||
unicodeValue.toUTF8String(value);
|
unicodeValue.toUTF8String(value);
|
||||||
} else {
|
} else {
|
||||||
value = _value;
|
value = _value;
|
||||||
}
|
}
|
||||||
TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, value);
|
TokenAnnotation annotation(matchBegin, matchEnd,
|
||||||
|
_annotationType, value);
|
||||||
annotations.push_back(annotation);
|
annotations.push_back(annotation);
|
||||||
}
|
}
|
||||||
sentence->addAnnotations(annotations);
|
sentence->addAnnotations(annotations);
|
||||||
} catch(const std::exception & e) {
|
} catch(const std::exception & e) {
|
||||||
std::stringstream ss;
|
std::stringstream ss;
|
||||||
ss << "Exception while applying regex rule: "
|
ss << "Exception while applying regex rule: "
|
||||||
<< _annotationType << " to text: " << sentence->getSentence();
|
<< _annotationType << " to text: "
|
||||||
|
<< sentence->getSentence();
|
||||||
ss << ", message: " << e.what();
|
ss << ", message: " << e.what();
|
||||||
throw ConcordiaException(ss.str());
|
throw ConcordiaException(ss.str());
|
||||||
}
|
}
|
||||||
|
@ -15,8 +15,9 @@ typedef boost::error_info<struct my_tag, std::string> my_tag_error_info;
|
|||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class for representing a regular expression annotation rule.
|
Class for representing a regular expression annotation rule.
|
||||||
Holds regex pattern string for matching and replacement string for
|
Holds regex pattern string for matching and default value to assign
|
||||||
annotating found matches.
|
to the annotations. Rule also has a type, given to all annotations
|
||||||
|
produced by it.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
class RegexRule {
|
class RegexRule {
|
||||||
@ -25,6 +26,7 @@ public:
|
|||||||
Constructor.
|
Constructor.
|
||||||
\param patternString regex pattern to match
|
\param patternString regex pattern to match
|
||||||
\param annotationType type of annotation
|
\param annotationType type of annotation
|
||||||
|
\param value value to be assigned to the annotation
|
||||||
\param caseSensitive case sensitivity of the pattern
|
\param caseSensitive case sensitivity of the pattern
|
||||||
*/
|
*/
|
||||||
RegexRule(std::string patternString,
|
RegexRule(std::string patternString,
|
||||||
@ -37,7 +39,7 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~RegexRule();
|
virtual ~RegexRule();
|
||||||
|
|
||||||
/*! Applies the operation on anonymized sentence.
|
/*! Applies regex annotation on tokenized sentence.
|
||||||
\param sentence the input sentence
|
\param sentence the input sentence
|
||||||
*/
|
*/
|
||||||
void apply(boost::shared_ptr<TokenizedSentence> sentence);
|
void apply(boost::shared_ptr<TokenizedSentence> sentence);
|
||||||
@ -46,7 +48,7 @@ private:
|
|||||||
int _annotationType;
|
int _annotationType;
|
||||||
|
|
||||||
std::string _value;
|
std::string _value;
|
||||||
|
|
||||||
boost::u32regex _pattern;
|
boost::u32regex _pattern;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -26,7 +26,7 @@ SentenceTokenizer::~SentenceTokenizer() {
|
|||||||
|
|
||||||
boost::shared_ptr<TokenizedSentence>
|
boost::shared_ptr<TokenizedSentence>
|
||||||
SentenceTokenizer::tokenize(const std::string & sentence) {
|
SentenceTokenizer::tokenize(const std::string & sentence) {
|
||||||
boost::shared_ptr<TokenizedSentence>
|
boost::shared_ptr<TokenizedSentence>
|
||||||
result(new TokenizedSentence(sentence));
|
result(new TokenizedSentence(sentence));
|
||||||
|
|
||||||
_htmlTags->apply(result);
|
_htmlTags->apply(result);
|
||||||
@ -40,9 +40,10 @@ boost::shared_ptr<TokenizedSentence>
|
|||||||
if (_stopWordsEnabled) {
|
if (_stopWordsEnabled) {
|
||||||
_stopWords->apply(result);
|
_stopWords->apply(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
boost::shared_ptr<RegexRule> wordsRule(
|
boost::shared_ptr<RegexRule> wordsRule(
|
||||||
new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}", TokenAnnotation::WORD, ""));
|
new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}",
|
||||||
|
TokenAnnotation::WORD, ""));
|
||||||
wordsRule->apply(result);
|
wordsRule->apply(result);
|
||||||
boost::shared_ptr<RegexRule> singleLetterWordsRule(
|
boost::shared_ptr<RegexRule> singleLetterWordsRule(
|
||||||
new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
|
new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
|
||||||
@ -103,7 +104,8 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
|
|||||||
tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
|
tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
|
||||||
tagsExpression += "br).*?>";
|
tagsExpression += "br).*?>";
|
||||||
_htmlTags = boost::shared_ptr<RegexRule>(
|
_htmlTags = boost::shared_ptr<RegexRule>(
|
||||||
new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG, "", false));
|
new RegexRule(tagsExpression,
|
||||||
|
TokenAnnotation::HTML_TAG, "", false));
|
||||||
}
|
}
|
||||||
|
|
||||||
boost::shared_ptr<RegexRule>
|
boost::shared_ptr<RegexRule>
|
||||||
@ -137,6 +139,6 @@ boost::shared_ptr<RegexRule>
|
|||||||
expression = expression.substr(0, expression.size()-1);
|
expression = expression.substr(0, expression.size()-1);
|
||||||
expression += ")";
|
expression += ")";
|
||||||
return boost::shared_ptr<RegexRule>(
|
return boost::shared_ptr<RegexRule>(
|
||||||
new RegexRule(expression, annotationType, value, false));
|
new RegexRule(expression, annotationType, value, false));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -14,10 +14,9 @@
|
|||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class for tokenizing sentence before generating hash.
|
Class for tokenizing sentence before generating hash.
|
||||||
This operation is is used to
|
Tokenizer ignores unnecessary symbols, html tags and possibly stop words
|
||||||
remove unnecessary symbols and possibly words from sentences added to index
|
(if the option is enabled) in sentences added to index
|
||||||
and search patterns. Tokenizer annotates html tags, removes stop words (if the option is enabled),
|
as well as annotates named entities. All these have to be listed in files
|
||||||
as well as annotates named entities and special symbols. All these have to be listed in files
|
|
||||||
(see \ref tutorial3).
|
(see \ref tutorial3).
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@ -35,7 +34,7 @@ public:
|
|||||||
|
|
||||||
/*! Tokenizes the sentence.
|
/*! Tokenizes the sentence.
|
||||||
\param sentence input sentence
|
\param sentence input sentence
|
||||||
\returns altered version of the input sentence
|
\returns tokenized sentence object built on the input sentence
|
||||||
*/
|
*/
|
||||||
boost::shared_ptr<TokenizedSentence>
|
boost::shared_ptr<TokenizedSentence>
|
||||||
tokenize(const std::string & sentence);
|
tokenize(const std::string & sentence);
|
||||||
@ -58,7 +57,6 @@ private:
|
|||||||
bool _stopWordsEnabled;
|
bool _stopWordsEnabled;
|
||||||
|
|
||||||
boost::shared_ptr<RegexRule> _stopWords;
|
boost::shared_ptr<RegexRule> _stopWords;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -21,19 +21,19 @@ BOOST_AUTO_TEST_CASE( SimpleAnnotation )
|
|||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),7);
|
BOOST_CHECK_EQUAL(iter->getStart(),7);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),8);
|
BOOST_CHECK_EQUAL(iter->getEnd(),8);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),11);
|
BOOST_CHECK_EQUAL(iter->getStart(),11);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),12);
|
BOOST_CHECK_EQUAL(iter->getEnd(),12);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),14);
|
BOOST_CHECK_EQUAL(iter->getStart(),14);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),15);
|
BOOST_CHECK_EQUAL(iter->getEnd(),15);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),16);
|
BOOST_CHECK_EQUAL(iter->getStart(),16);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),17);
|
BOOST_CHECK_EQUAL(iter->getEnd(),17);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),18);
|
BOOST_CHECK_EQUAL(iter->getStart(),18);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),19);
|
BOOST_CHECK_EQUAL(iter->getEnd(),19);
|
||||||
@ -64,19 +64,19 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation )
|
|||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),3);
|
BOOST_CHECK_EQUAL(iter->getStart(),3);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),4);
|
BOOST_CHECK_EQUAL(iter->getEnd(),4);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),19);
|
BOOST_CHECK_EQUAL(iter->getStart(),19);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),20);
|
BOOST_CHECK_EQUAL(iter->getEnd(),20);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),21);
|
BOOST_CHECK_EQUAL(iter->getStart(),21);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),22);
|
BOOST_CHECK_EQUAL(iter->getEnd(),22);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),44);
|
BOOST_CHECK_EQUAL(iter->getStart(),44);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),45);
|
BOOST_CHECK_EQUAL(iter->getEnd(),45);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),45);
|
BOOST_CHECK_EQUAL(iter->getStart(),45);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),46);
|
BOOST_CHECK_EQUAL(iter->getEnd(),46);
|
||||||
@ -94,15 +94,15 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation )
|
|||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),8);
|
BOOST_CHECK_EQUAL(iter->getStart(),8);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),11);
|
BOOST_CHECK_EQUAL(iter->getEnd(),11);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),16);
|
BOOST_CHECK_EQUAL(iter->getStart(),16);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),19);
|
BOOST_CHECK_EQUAL(iter->getEnd(),19);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),24);
|
BOOST_CHECK_EQUAL(iter->getStart(),24);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),27);
|
BOOST_CHECK_EQUAL(iter->getEnd(),27);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),32);
|
BOOST_CHECK_EQUAL(iter->getStart(),32);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),35);
|
BOOST_CHECK_EQUAL(iter->getEnd(),35);
|
||||||
@ -132,7 +132,7 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation )
|
|||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),11);
|
BOOST_CHECK_EQUAL(iter->getStart(),11);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),12);
|
BOOST_CHECK_EQUAL(iter->getEnd(),12);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),29);
|
BOOST_CHECK_EQUAL(iter->getStart(),29);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),30);
|
BOOST_CHECK_EQUAL(iter->getEnd(),30);
|
||||||
@ -149,71 +149,71 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
|
|||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),2);
|
BOOST_CHECK_EQUAL(iter->getStart(),2);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),3);
|
BOOST_CHECK_EQUAL(iter->getEnd(),3);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),3);
|
BOOST_CHECK_EQUAL(iter->getStart(),3);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),4);
|
BOOST_CHECK_EQUAL(iter->getEnd(),4);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),4);
|
BOOST_CHECK_EQUAL(iter->getStart(),4);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),5);
|
BOOST_CHECK_EQUAL(iter->getEnd(),5);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),5);
|
BOOST_CHECK_EQUAL(iter->getStart(),5);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),6);
|
BOOST_CHECK_EQUAL(iter->getEnd(),6);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),8);
|
BOOST_CHECK_EQUAL(iter->getStart(),8);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),9);
|
BOOST_CHECK_EQUAL(iter->getEnd(),9);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),9);
|
BOOST_CHECK_EQUAL(iter->getStart(),9);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),10);
|
BOOST_CHECK_EQUAL(iter->getEnd(),10);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),11);
|
BOOST_CHECK_EQUAL(iter->getStart(),11);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),12);
|
BOOST_CHECK_EQUAL(iter->getEnd(),12);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),15);
|
BOOST_CHECK_EQUAL(iter->getStart(),15);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),16);
|
BOOST_CHECK_EQUAL(iter->getEnd(),16);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),16);
|
BOOST_CHECK_EQUAL(iter->getStart(),16);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),17);
|
BOOST_CHECK_EQUAL(iter->getEnd(),17);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),20);
|
BOOST_CHECK_EQUAL(iter->getStart(),20);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),21);
|
BOOST_CHECK_EQUAL(iter->getEnd(),21);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),21);
|
BOOST_CHECK_EQUAL(iter->getStart(),21);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),22);
|
BOOST_CHECK_EQUAL(iter->getEnd(),22);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),22);
|
BOOST_CHECK_EQUAL(iter->getStart(),22);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),23);
|
BOOST_CHECK_EQUAL(iter->getEnd(),23);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),23);
|
BOOST_CHECK_EQUAL(iter->getStart(),23);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),24);
|
BOOST_CHECK_EQUAL(iter->getEnd(),24);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),26);
|
BOOST_CHECK_EQUAL(iter->getStart(),26);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),27);
|
BOOST_CHECK_EQUAL(iter->getEnd(),27);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),27);
|
BOOST_CHECK_EQUAL(iter->getStart(),27);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),28);
|
BOOST_CHECK_EQUAL(iter->getEnd(),28);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),29);
|
BOOST_CHECK_EQUAL(iter->getStart(),29);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),30);
|
BOOST_CHECK_EQUAL(iter->getEnd(),30);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),33);
|
BOOST_CHECK_EQUAL(iter->getStart(),33);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),34);
|
BOOST_CHECK_EQUAL(iter->getEnd(),34);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),34);
|
BOOST_CHECK_EQUAL(iter->getStart(),34);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),35);
|
BOOST_CHECK_EQUAL(iter->getEnd(),35);
|
||||||
|
@ -47,79 +47,79 @@ BOOST_AUTO_TEST_CASE( NETest )
|
|||||||
BOOST_CHECK_EQUAL(iter->getEnd(),4);
|
BOOST_CHECK_EQUAL(iter->getEnd(),4);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "date");
|
BOOST_CHECK_EQUAL(iter->getValue(), "date");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),6);
|
BOOST_CHECK_EQUAL(iter->getStart(),6);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),16);
|
BOOST_CHECK_EQUAL(iter->getEnd(),16);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "ne_date");
|
BOOST_CHECK_EQUAL(iter->getValue(), "ne_date");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),18);
|
BOOST_CHECK_EQUAL(iter->getStart(),18);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),22);
|
BOOST_CHECK_EQUAL(iter->getEnd(),22);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "mail");
|
BOOST_CHECK_EQUAL(iter->getValue(), "mail");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),24);
|
BOOST_CHECK_EQUAL(iter->getStart(),24);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),40);
|
BOOST_CHECK_EQUAL(iter->getEnd(),40);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "ne_email");
|
BOOST_CHECK_EQUAL(iter->getValue(), "ne_email");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),42);
|
BOOST_CHECK_EQUAL(iter->getStart(),42);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),48);
|
BOOST_CHECK_EQUAL(iter->getEnd(),48);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "number");
|
BOOST_CHECK_EQUAL(iter->getValue(), "number");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),50);
|
BOOST_CHECK_EQUAL(iter->getStart(),50);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),54);
|
BOOST_CHECK_EQUAL(iter->getEnd(),54);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
|
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),56);
|
BOOST_CHECK_EQUAL(iter->getStart(),56);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),61);
|
BOOST_CHECK_EQUAL(iter->getEnd(),61);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "hello");
|
BOOST_CHECK_EQUAL(iter->getValue(), "hello");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),61);
|
BOOST_CHECK_EQUAL(iter->getStart(),61);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),62);
|
BOOST_CHECK_EQUAL(iter->getEnd(),62);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
|
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),63);
|
BOOST_CHECK_EQUAL(iter->getStart(),63);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),69);
|
BOOST_CHECK_EQUAL(iter->getEnd(),69);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
|
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),70);
|
BOOST_CHECK_EQUAL(iter->getStart(),70);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),75);
|
BOOST_CHECK_EQUAL(iter->getEnd(),75);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
|
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),76);
|
BOOST_CHECK_EQUAL(iter->getStart(),76);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),80);
|
BOOST_CHECK_EQUAL(iter->getEnd(),80);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
|
BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),82);
|
BOOST_CHECK_EQUAL(iter->getStart(),82);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),88);
|
BOOST_CHECK_EQUAL(iter->getEnd(),88);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
|
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),89);
|
BOOST_CHECK_EQUAL(iter->getStart(),89);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),94);
|
BOOST_CHECK_EQUAL(iter->getEnd(),94);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
|
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),95);
|
BOOST_CHECK_EQUAL(iter->getStart(),95);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),99);
|
BOOST_CHECK_EQUAL(iter->getEnd(),99);
|
||||||
@ -156,52 +156,52 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
|
|||||||
BOOST_CHECK_EQUAL(iter->getStart(),0);
|
BOOST_CHECK_EQUAL(iter->getStart(),0);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),23);
|
BOOST_CHECK_EQUAL(iter->getEnd(),23);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),23);
|
BOOST_CHECK_EQUAL(iter->getStart(),23);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),27);
|
BOOST_CHECK_EQUAL(iter->getEnd(),27);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(),"link");
|
BOOST_CHECK_EQUAL(iter->getValue(),"link");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),27);
|
BOOST_CHECK_EQUAL(iter->getStart(),27);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),31);
|
BOOST_CHECK_EQUAL(iter->getEnd(),31);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),32);
|
BOOST_CHECK_EQUAL(iter->getStart(),32);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),35);
|
BOOST_CHECK_EQUAL(iter->getEnd(),35);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(),"and");
|
BOOST_CHECK_EQUAL(iter->getValue(),"and");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),36);
|
BOOST_CHECK_EQUAL(iter->getStart(),36);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),39);
|
BOOST_CHECK_EQUAL(iter->getEnd(),39);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),39);
|
BOOST_CHECK_EQUAL(iter->getStart(),39);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),43);
|
BOOST_CHECK_EQUAL(iter->getEnd(),43);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(),"bold");
|
BOOST_CHECK_EQUAL(iter->getValue(),"bold");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),43);
|
BOOST_CHECK_EQUAL(iter->getStart(),43);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),47);
|
BOOST_CHECK_EQUAL(iter->getEnd(),47);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),48);
|
BOOST_CHECK_EQUAL(iter->getStart(),48);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),51);
|
BOOST_CHECK_EQUAL(iter->getEnd(),51);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(),"and");
|
BOOST_CHECK_EQUAL(iter->getValue(),"and");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),52);
|
BOOST_CHECK_EQUAL(iter->getStart(),52);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),59);
|
BOOST_CHECK_EQUAL(iter->getEnd(),59);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(),"newline");
|
BOOST_CHECK_EQUAL(iter->getValue(),"newline");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),60);
|
BOOST_CHECK_EQUAL(iter->getStart(),60);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),65);
|
BOOST_CHECK_EQUAL(iter->getEnd(),65);
|
||||||
@ -240,79 +240,79 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
|
|||||||
BOOST_CHECK_EQUAL(iter->getEnd(),4);
|
BOOST_CHECK_EQUAL(iter->getEnd(),4);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(),"this");
|
BOOST_CHECK_EQUAL(iter->getValue(),"this");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),5);
|
BOOST_CHECK_EQUAL(iter->getStart(),5);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),7);
|
BOOST_CHECK_EQUAL(iter->getEnd(),7);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(),"is");
|
BOOST_CHECK_EQUAL(iter->getValue(),"is");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),8);
|
BOOST_CHECK_EQUAL(iter->getStart(),8);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),9);
|
BOOST_CHECK_EQUAL(iter->getEnd(),9);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(),"a");
|
BOOST_CHECK_EQUAL(iter->getValue(),"a");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),10);
|
BOOST_CHECK_EQUAL(iter->getStart(),10);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),18);
|
BOOST_CHECK_EQUAL(iter->getEnd(),18);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(),"sentence");
|
BOOST_CHECK_EQUAL(iter->getValue(),"sentence");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),20);
|
BOOST_CHECK_EQUAL(iter->getStart(),20);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),25);
|
BOOST_CHECK_EQUAL(iter->getEnd(),25);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(),"don't");
|
BOOST_CHECK_EQUAL(iter->getValue(),"don't");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),26);
|
BOOST_CHECK_EQUAL(iter->getStart(),26);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),38);
|
BOOST_CHECK_EQUAL(iter->getEnd(),38);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(),"over-analyze");
|
BOOST_CHECK_EQUAL(iter->getValue(),"over-analyze");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),39);
|
BOOST_CHECK_EQUAL(iter->getStart(),39);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),41);
|
BOOST_CHECK_EQUAL(iter->getEnd(),41);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(),"it");
|
BOOST_CHECK_EQUAL(iter->getValue(),"it");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),43);
|
BOOST_CHECK_EQUAL(iter->getStart(),43);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),49);
|
BOOST_CHECK_EQUAL(iter->getEnd(),49);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(),"zażółć");
|
BOOST_CHECK_EQUAL(iter->getValue(),"zażółć");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),51);
|
BOOST_CHECK_EQUAL(iter->getStart(),51);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),57);
|
BOOST_CHECK_EQUAL(iter->getEnd(),57);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(),"gęś'lą");
|
BOOST_CHECK_EQUAL(iter->getValue(),"gęś'lą");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),59);
|
BOOST_CHECK_EQUAL(iter->getStart(),59);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),63);
|
BOOST_CHECK_EQUAL(iter->getEnd(),63);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(),"jaźń");
|
BOOST_CHECK_EQUAL(iter->getValue(),"jaźń");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),64);
|
BOOST_CHECK_EQUAL(iter->getStart(),64);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),71);
|
BOOST_CHECK_EQUAL(iter->getEnd(),71);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(),"zaż-ółć");
|
BOOST_CHECK_EQUAL(iter->getValue(),"zaż-ółć");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),72);
|
BOOST_CHECK_EQUAL(iter->getStart(),72);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),77);
|
BOOST_CHECK_EQUAL(iter->getEnd(),77);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(),"gęślą");
|
BOOST_CHECK_EQUAL(iter->getValue(),"gęślą");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),78);
|
BOOST_CHECK_EQUAL(iter->getStart(),78);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),83);
|
BOOST_CHECK_EQUAL(iter->getEnd(),83);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(),"jaź'ń");
|
BOOST_CHECK_EQUAL(iter->getValue(),"jaź'ń");
|
||||||
iter++;
|
++iter;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7,7 +7,7 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class representing annotatio of char sequence as a token.
|
Class representing annotation of char sequence as a token.
|
||||||
It is a type of interval that is also storing information
|
It is a type of interval that is also storing information
|
||||||
about the annotation type and value.
|
about the annotation type and value.
|
||||||
|
|
||||||
@ -18,7 +18,7 @@ public:
|
|||||||
/*! Constructor.
|
/*! Constructor.
|
||||||
\param start start index of the annotation (char-level, 0-based)
|
\param start start index of the annotation (char-level, 0-based)
|
||||||
\param end end index of the annotation (char-level, 0-based)
|
\param end end index of the annotation (char-level, 0-based)
|
||||||
\param type annotation type
|
\param annotationType annotation type
|
||||||
\param value annotation value
|
\param value annotation value
|
||||||
*/
|
*/
|
||||||
TokenAnnotation(const SUFFIX_MARKER_TYPE start,
|
TokenAnnotation(const SUFFIX_MARKER_TYPE start,
|
||||||
@ -44,14 +44,22 @@ public:
|
|||||||
return _value;
|
return _value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Named entity annotation type
|
||||||
|
*/
|
||||||
static int NE;
|
static int NE;
|
||||||
|
|
||||||
|
/*! Word annotation type
|
||||||
|
*/
|
||||||
static int WORD;
|
static int WORD;
|
||||||
|
|
||||||
|
/*! Html tag annotation type
|
||||||
|
*/
|
||||||
static int HTML_TAG;
|
static int HTML_TAG;
|
||||||
|
|
||||||
|
/*! Stop word annotation type
|
||||||
|
*/
|
||||||
static int STOP_WORD;
|
static int STOP_WORD;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
int _annotationType;
|
int _annotationType;
|
||||||
|
|
||||||
|
@ -11,37 +11,43 @@ TokenizedSentence::TokenizedSentence(std::string sentence):
|
|||||||
TokenizedSentence::~TokenizedSentence() {
|
TokenizedSentence::~TokenizedSentence() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void TokenizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) {
|
void TokenizedSentence::addAnnotations(
|
||||||
std::vector<TokenAnnotation>::iterator newAnnotation = annotations.begin();
|
std::vector<TokenAnnotation> annotations) {
|
||||||
std::list<TokenAnnotation>::iterator existingAnnotation = _tokenAnnotations.begin();
|
std::vector<TokenAnnotation>::iterator newAnnotation =
|
||||||
|
annotations.begin();
|
||||||
while(newAnnotation != annotations.end()) {
|
std::list<TokenAnnotation>::iterator existingAnnotation =
|
||||||
|
_tokenAnnotations.begin();
|
||||||
|
|
||||||
|
while (newAnnotation != annotations.end()) {
|
||||||
if (existingAnnotation != _tokenAnnotations.end()) {
|
if (existingAnnotation != _tokenAnnotations.end()) {
|
||||||
// there are still some existing annotations, so perform checks
|
// there are still some existing annotations, so perform checks
|
||||||
if (newAnnotation->intersects(*existingAnnotation)) {
|
if (newAnnotation->intersects(*existingAnnotation)) {
|
||||||
// The new annotation intersects with the existing.
|
// The new annotation intersects with the existing.
|
||||||
// We can not add it, so let us just move on to the
|
// We can not add it, so let us just move on to the
|
||||||
// next new annotation.
|
// next new annotation.
|
||||||
newAnnotation++;
|
++newAnnotation;
|
||||||
} else {
|
} else {
|
||||||
// it is now important whether the new interval is before
|
// it is now important whether the new interval is before
|
||||||
// or after existing
|
// or after existing
|
||||||
if (newAnnotation->getStart() < existingAnnotation->getStart()) {
|
if (newAnnotation->getStart() <
|
||||||
// New interval does not intersect and is before existing. We add it.
|
existingAnnotation->getStart()) {
|
||||||
_tokenAnnotations.insert(existingAnnotation, *newAnnotation);
|
// New interval does not intersect and is
|
||||||
newAnnotation++;
|
// before existing. We add it.
|
||||||
|
_tokenAnnotations.insert(existingAnnotation,
|
||||||
|
*newAnnotation);
|
||||||
|
++newAnnotation;
|
||||||
} else {
|
} else {
|
||||||
// If the new interval is after existing we move to the next existing annoation.
|
// If the new interval is after existing
|
||||||
existingAnnotation++;
|
// we move to the next existing annotation.
|
||||||
|
++existingAnnotation;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// no more existing annotations, so just add the new annotation
|
// no more existing annotations, so just add the new annotation
|
||||||
_tokenAnnotations.push_back(*newAnnotation);
|
_tokenAnnotations.push_back(*newAnnotation);
|
||||||
newAnnotation++;
|
++newAnnotation;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void TokenizedSentence::toLowerCase() {
|
void TokenizedSentence::toLowerCase() {
|
||||||
@ -54,8 +60,7 @@ void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
|
|||||||
annotation.getType() == TokenAnnotation::NE) {
|
annotation.getType() == TokenAnnotation::NE) {
|
||||||
_codes.push_back(wordMap->getWordCode(annotation.getValue()));
|
_codes.push_back(wordMap->getWordCode(annotation.getValue()));
|
||||||
_tokens.push_back(annotation);
|
_tokens.push_back(annotation);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -11,9 +11,12 @@
|
|||||||
#include <list>
|
#include <list>
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
A sentence after anonymization operations. The class
|
A sentence after tokenizing operations. The class
|
||||||
holds the current string representation of the sentence
|
holds the current string representation of the sentence
|
||||||
along with the annotations list.
|
along with the annotations list. The class also allows
|
||||||
|
for generating hash. After that operation the class
|
||||||
|
also holds the list of hashed codes and corresponding
|
||||||
|
tokens.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
class TokenizedSentence {
|
class TokenizedSentence {
|
||||||
@ -22,7 +25,7 @@ public:
|
|||||||
Constructor.
|
Constructor.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
TokenizedSentence(std::string sentence);
|
explicit TokenizedSentence(std::string sentence);
|
||||||
|
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
@ -35,21 +38,40 @@ public:
|
|||||||
return _sentence;
|
return _sentence;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*! Getter for annotations list
|
/*! Getter for all annotations list. This method returns
|
||||||
|
all annotations, including those which are not considered
|
||||||
|
in the hash, i.e. stop words and html tags.
|
||||||
\returns annotations list
|
\returns annotations list
|
||||||
*/
|
*/
|
||||||
std::list<TokenAnnotation> getAnnotations() const {
|
std::list<TokenAnnotation> getAnnotations() const {
|
||||||
return _tokenAnnotations;
|
return _tokenAnnotations;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Getter for codes list. This data is available after calling
|
||||||
|
the generateHash method.
|
||||||
|
\returns codes list
|
||||||
|
*/
|
||||||
std::vector<INDEX_CHARACTER_TYPE> getCodes() const {
|
std::vector<INDEX_CHARACTER_TYPE> getCodes() const {
|
||||||
return _codes;
|
return _codes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Getter for tokens list. This method returns
|
||||||
|
only those annotations considered
|
||||||
|
in the hash, i.e. words and named entities.
|
||||||
|
\returns tokens list
|
||||||
|
*/
|
||||||
std::vector<TokenAnnotation> getTokens() const {
|
std::vector<TokenAnnotation> getTokens() const {
|
||||||
return _tokens;
|
return _tokens;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Method for generating hash based on annotations.
|
||||||
|
This method takes into account annotations of type
|
||||||
|
word and named entity. These are encoded and added
|
||||||
|
to the code list. Annotations corresponding to these
|
||||||
|
tokens are added to the tokens list.
|
||||||
|
\param wordMap word map to use when encoding tokens
|
||||||
|
\returns tokens list
|
||||||
|
*/
|
||||||
void generateHash(boost::shared_ptr<WordMap> wordMap);
|
void generateHash(boost::shared_ptr<WordMap> wordMap);
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
@ -66,15 +88,15 @@ public:
|
|||||||
|
|
||||||
\param annotations list of annotations to be added
|
\param annotations list of annotations to be added
|
||||||
*/
|
*/
|
||||||
void addAnnotations(std::vector<TokenAnnotation> annotations);
|
void addAnnotations(std::vector<TokenAnnotation> annotations);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::string _sentence;
|
std::string _sentence;
|
||||||
|
|
||||||
std::list<TokenAnnotation> _tokenAnnotations;
|
std::list<TokenAnnotation> _tokenAnnotations;
|
||||||
|
|
||||||
std::vector<INDEX_CHARACTER_TYPE> _codes;
|
std::vector<INDEX_CHARACTER_TYPE> _codes;
|
||||||
|
|
||||||
std::vector<TokenAnnotation> _tokens;
|
std::vector<TokenAnnotation> _tokens;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -98,12 +98,15 @@ Concordia is equipped with a unique functionality of so called Concordia search,
|
|||||||
|
|
||||||
Additionally, the score for this best overlay is computed. The score is a real number between 0 and 1, where 0 indicates that the pattern is not covered at all (i.e. not a single word from this pattern is found in the index). The score 1 represents the perfect match - the pattern is covered completely by just one fragment, which means that the pattern is found in the index as one of the examples.
|
Additionally, the score for this best overlay is computed. The score is a real number between 0 and 1, where 0 indicates that the pattern is not covered at all (i.e. not a single word from this pattern is found in the index). The score 1 represents the perfect match - the pattern is covered completely by just one fragment, which means that the pattern is found in the index as one of the examples.
|
||||||
|
|
||||||
|
Moreover, the example below demonstrates how to retrieve the tokenized version of an added example.
|
||||||
|
|
||||||
File concordia_searching.cpp:
|
File concordia_searching.cpp:
|
||||||
\verbatim
|
\verbatim
|
||||||
#include <concordia/concordia.hpp>
|
#include <concordia/concordia.hpp>
|
||||||
#include <concordia/concordia_search_result.hpp>
|
#include <concordia/concordia_search_result.hpp>
|
||||||
#include <concordia/matched_pattern_fragment.hpp>
|
#include <concordia/matched_pattern_fragment.hpp>
|
||||||
#include <concordia/example.hpp>
|
#include <concordia/example.hpp>
|
||||||
|
#include <concordia/tokenized_sentence.hpp>
|
||||||
|
|
||||||
#include "config.hpp"
|
#include "config.hpp"
|
||||||
|
|
||||||
@ -115,7 +118,13 @@ using namespace std;
|
|||||||
int main() {
|
int main() {
|
||||||
Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg");
|
Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg");
|
||||||
|
|
||||||
concordia.addExample(Example("Alice has a cat", 56));
|
boost::shared_ptr<TokenizedSentence> ts = concordia.addExample(Example("Alice has a cat", 56));
|
||||||
|
cout << "Added the following tokens: " << endl;
|
||||||
|
BOOST_FOREACH(TokenAnnotation token, ts->getTokens()) {
|
||||||
|
cout << "\"" << token.getValue() << "\"" << " at positions: [" << token.getStart() << ","
|
||||||
|
<< token.getEnd() << ")" << endl;
|
||||||
|
}
|
||||||
|
|
||||||
concordia.addExample(Example("Alice has a dog", 23));
|
concordia.addExample(Example("Alice has a dog", 23));
|
||||||
concordia.addExample(Example("New test product has a mistake", 321));
|
concordia.addExample(Example("New test product has a mistake", 321));
|
||||||
concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14));
|
concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14));
|
||||||
@ -153,6 +162,11 @@ int main() {
|
|||||||
This program should print:
|
This program should print:
|
||||||
|
|
||||||
\verbatim
|
\verbatim
|
||||||
|
Added the following tokens:
|
||||||
|
"alice" at positions: [0,5)
|
||||||
|
"has" at positions: [6,9)
|
||||||
|
"a" at positions: [10,11)
|
||||||
|
"cat" at positions: [12,15)
|
||||||
Searching for pattern: Our new test product has nothing to do with computers
|
Searching for pattern: Our new test product has nothing to do with computers
|
||||||
Printing all matched fragments:
|
Printing all matched fragments:
|
||||||
Matched pattern fragment found. Pattern fragment: [4,9] in sentence 14, at offset: 6
|
Matched pattern fragment found. Pattern fragment: [4,9] in sentence 14, at offset: 6
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
#include <concordia/concordia_search_result.hpp>
|
#include <concordia/concordia_search_result.hpp>
|
||||||
#include <concordia/matched_pattern_fragment.hpp>
|
#include <concordia/matched_pattern_fragment.hpp>
|
||||||
#include <concordia/example.hpp>
|
#include <concordia/example.hpp>
|
||||||
|
#include <concordia/tokenized_sentence.hpp>
|
||||||
|
|
||||||
#include "config.hpp"
|
#include "config.hpp"
|
||||||
|
|
||||||
@ -13,7 +14,13 @@ using namespace std;
|
|||||||
int main() {
|
int main() {
|
||||||
Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg");
|
Concordia concordia(EXAMPLES_DIR"/../tests/resources/concordia-config/concordia.cfg");
|
||||||
|
|
||||||
concordia.addExample(Example("Alice has a cat", 56));
|
boost::shared_ptr<TokenizedSentence> ts = concordia.addExample(Example("Alice has a cat", 56));
|
||||||
|
cout << "Added the following tokens: " << endl;
|
||||||
|
BOOST_FOREACH(TokenAnnotation token, ts->getTokens()) {
|
||||||
|
cout << "\"" << token.getValue() << "\"" << " at positions: [" << token.getStart() << ","
|
||||||
|
<< token.getEnd() << ")" << endl;
|
||||||
|
}
|
||||||
|
|
||||||
concordia.addExample(Example("Alice has a dog", 23));
|
concordia.addExample(Example("Alice has a dog", 23));
|
||||||
concordia.addExample(Example("New test product has a mistake", 321));
|
concordia.addExample(Example("New test product has a mistake", 321));
|
||||||
concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14));
|
concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14));
|
||||||
|
Loading…
Reference in New Issue
Block a user