2015-06-25 10:12:51 +02:00
|
|
|
#include "concordia/tokenized_sentence.hpp"
|
2015-06-22 13:52:56 +02:00
|
|
|
#include "concordia/common/text_utils.hpp"
|
|
|
|
|
|
|
|
#include <iostream>
|
2015-06-25 20:49:22 +02:00
|
|
|
#include <boost/foreach.hpp>
|
2015-06-22 13:52:56 +02:00
|
|
|
|
2015-06-25 10:12:51 +02:00
|
|
|
// Constructs a TokenizedSentence whose _sentence member is initialized
// from the given text. No other work is done in the constructor body.
TokenizedSentence::TokenizedSentence(std::string sentence)
    : _sentence(sentence) {
}
|
|
|
|
|
2015-06-25 10:12:51 +02:00
|
|
|
// Destructor; its body is intentionally empty.
TokenizedSentence::~TokenizedSentence() {}
|
|
|
|
|
2015-06-25 10:12:51 +02:00
|
|
|
void TokenizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) {
|
2015-06-22 13:52:56 +02:00
|
|
|
std::vector<TokenAnnotation>::iterator newAnnotation = annotations.begin();
|
|
|
|
std::list<TokenAnnotation>::iterator existingAnnotation = _tokenAnnotations.begin();
|
|
|
|
|
|
|
|
while(newAnnotation != annotations.end()) {
|
|
|
|
if (existingAnnotation != _tokenAnnotations.end()) {
|
|
|
|
// there are still some existing annotations, so perform checks
|
|
|
|
if (newAnnotation->intersects(*existingAnnotation)) {
|
|
|
|
// The new annotation intersects with the existing.
|
|
|
|
// We can not add it, so let us just move on to the
|
|
|
|
// next new annoation.
|
|
|
|
newAnnotation++;
|
|
|
|
} else {
|
|
|
|
// it is now important whether the new interval is before
|
|
|
|
// or after existing
|
|
|
|
if (newAnnotation->getStart() < existingAnnotation->getStart()) {
|
|
|
|
// New interval does not intersect and is before existing. We add it.
|
|
|
|
_tokenAnnotations.insert(existingAnnotation, *newAnnotation);
|
|
|
|
newAnnotation++;
|
|
|
|
} else {
|
|
|
|
// If the new interval is after existing we move to the next existing annoation.
|
|
|
|
existingAnnotation++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// no more existing annotations, so just add the new annotation
|
|
|
|
_tokenAnnotations.push_back(*newAnnotation);
|
|
|
|
newAnnotation++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2015-06-25 10:12:51 +02:00
|
|
|
void TokenizedSentence::toLowerCase() {
|
2015-06-22 13:52:56 +02:00
|
|
|
_sentence = TextUtils::getInstance().toLowerCase(_sentence);
|
|
|
|
}
|
2015-06-26 15:38:24 +02:00
|
|
|
|
|
|
|
// Populates _codes and _tokens from the stored token annotations.
//
// Every annotation in _tokenAnnotations whose type is TokenAnnotation::WORD
// or TokenAnnotation::NE contributes one entry to each container: the code
// returned by wordMap->getWordCode() for the annotation's value is appended
// to _codes, and the annotation itself is appended to _tokens. Annotations
// of any other type are skipped.
//
// \param wordMap map queried for the code of each annotation value;
//                it is dereferenced without a null check
//
// Entries are appended, not replaced: nothing in this method clears _codes
// or _tokens first.
void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
    // Walk the list in place; the previous by-value BOOST_FOREACH variable
    // copy-constructed every annotation only to inspect it.
    for (std::list<TokenAnnotation>::iterator annotation = _tokenAnnotations.begin();
         annotation != _tokenAnnotations.end();
         ++annotation) {
        if (annotation->getType() == TokenAnnotation::WORD ||
            annotation->getType() == TokenAnnotation::NE) {
            _codes.push_back(wordMap->getWordCode(annotation->getValue()));
            _tokens.push_back(*annotation);
        }
    }
}
|
|
|
|
|