#include "concordia/tokenized_sentence.hpp"
#include "concordia/common/text_utils.hpp"

// NOTE(review): the two angle-bracket include targets were empty in the
// reviewed text. <boost/foreach.hpp> is required by BOOST_FOREACH below; the
// remaining headers are the ones this file visibly uses. Confirm the list
// against version control.
#include <boost/foreach.hpp>
#include <boost/shared_ptr.hpp>
#include <list>
#include <string>
#include <vector>

// Creates a tokenized sentence holding a copy of the given text.
// No annotations, codes or tokens are produced here.
TokenizedSentence::TokenizedSentence(std::string sentence):
    _sentence(sentence) {
}

TokenizedSentence::~TokenizedSentence() {
}

// Merges the given annotations into _tokenAnnotations.
//
// Both sequences are walked front to back with one cursor each. A new
// annotation is:
//   - appended, when no existing annotations are left to compare against;
//   - dropped, when it intersects the existing annotation under the cursor;
//   - inserted before that existing annotation, when it starts earlier;
//   - otherwise kept for the next round while the existing cursor advances.
//
// NOTE(review): the single forward pass only compares a new annotation with
// the existing annotation currently under the cursor, so this presumably
// relies on both sequences being ordered by start position - confirm with
// the callers.
void TokenizedSentence::addAnnotations(
                              std::vector<TokenAnnotation> annotations) {
    std::vector<TokenAnnotation>::iterator newAnnotation =
                                                    annotations.begin();
    std::list<TokenAnnotation>::iterator existingAnnotation =
                                              _tokenAnnotations.begin();

    while (newAnnotation != annotations.end()) {
        if (existingAnnotation == _tokenAnnotations.end()) {
            // No more existing annotations, so just add the new one.
            // push_back on a std::list leaves the end iterator valid.
            _tokenAnnotations.push_back(*newAnnotation);
            ++newAnnotation;
        } else if (newAnnotation->intersects(*existingAnnotation)) {
            // The new annotation intersects with the existing one.
            // It can not be added, so move on to the next new annotation.
            ++newAnnotation;
        } else if (newAnnotation->getStart()
                       < existingAnnotation->getStart()) {
            // The new annotation does not intersect and lies before the
            // existing one. Insert it in front; the cursor keeps pointing
            // at the same existing annotation.
            _tokenAnnotations.insert(existingAnnotation, *newAnnotation);
            ++newAnnotation;
        } else {
            // The new annotation lies after the existing one, so compare it
            // against the next existing annotation.
            ++existingAnnotation;
        }
    }
}

// Replaces the stored sentence with the result of TextUtils::toLowerCase.
void TokenizedSentence::toLowerCase() {
    _sentence = TextUtils::getInstance().toLowerCase(_sentence);
}

// For every WORD or NE annotation, in the order stored in _tokenAnnotations,
// appends the code returned by wordMap->getWordCode() to _codes and the
// annotation itself to _tokens. Annotations of other types are skipped.
//
// NOTE(review): the template argument of the parameter was missing in the
// reviewed text; WordMap is inferred from the getWordCode() call - confirm
// against the declaration in tokenized_sentence.hpp.
// NOTE(review): generateTokens() appends to _tokens as well, so calling both
// methods on the same object stores every token twice.
void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
    BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {
        if (annotation.getType() == TokenAnnotation::WORD ||
            annotation.getType() == TokenAnnotation::NE) {
            _codes.push_back(wordMap->getWordCode(annotation.getValue()));
            _tokens.push_back(annotation);
        }
    }
}

// Appends every WORD or NE annotation from _tokenAnnotations to _tokens,
// without touching _codes. Annotations of other types are skipped.
void TokenizedSentence::generateTokens() {
    BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {
        if (annotation.getType() == TokenAnnotation::WORD ||
            annotation.getType() == TokenAnnotation::NE) {
            _tokens.push_back(annotation);
        }
    }
}