// concordia-library/concordia/tokenized_sentence.cpp

#include "concordia/tokenized_sentence.hpp"
#include "concordia/common/text_utils.hpp"

#include <iostream>
#include <boost/foreach.hpp>

// Builds a tokenized sentence around the given text.
// Only _sentence is initialized explicitly, as a copy of the argument;
// the constructor body itself does nothing.
TokenizedSentence::TokenizedSentence(std::string sentence)
    : _sentence(sentence) {
}

// The destructor body is empty: nothing is released explicitly here.
TokenizedSentence::~TokenizedSentence() {}

void TokenizedSentence::addAnnotations(
                            std::vector<TokenAnnotation> annotations) {
    std::vector<TokenAnnotation>::iterator newAnnotation =
                                           annotations.begin();
    std::list<TokenAnnotation>::iterator existingAnnotation =
                                           _tokenAnnotations.begin();

    while (newAnnotation != annotations.end()) {
        if (existingAnnotation != _tokenAnnotations.end()) {
            // there are still some existing annotations, so perform checks
            if (newAnnotation->intersects(*existingAnnotation)) {
                // The new annotation intersects with the existing.
                // We can not add it, so let us just move on to the
                // next new annoation.
                ++newAnnotation;
            } else {
                // it is now important whether the new interval is before
                // or after existing
                if (newAnnotation->getStart() <
                             existingAnnotation->getStart()) {
                    // New interval does not intersect and is
                    // before existing. We add it.
                    _tokenAnnotations.insert(existingAnnotation,
                                             *newAnnotation);
                    ++newAnnotation;
                } else {
                    // If the new interval is after existing
                    // we move to the next existing annoation.
                    ++existingAnnotation;
                }
            }
        } else {
            // no more existing annotations, so just add the new annotation
            _tokenAnnotations.push_back(*newAnnotation);
            ++newAnnotation;
        }
    }
}

// Replaces the stored sentence text with the result of running it
// through TextUtils::toLowerCase.
void TokenizedSentence::toLowerCase() {
    const std::string lowered =
        TextUtils::getInstance().toLowerCase(_sentence);
    _sentence = lowered;
}

// Walks the stored annotations in order. For every annotation whose type
// is WORD or NE, the code returned by wordMap->getWordCode() for the
// annotation value is appended to _codes and the annotation itself is
// appended to _tokens. Annotations of any other type are skipped.
// Neither _codes nor _tokens is cleared first, so results are appended
// to whatever they already hold.
void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
    typedef std::list<TokenAnnotation>::iterator AnnotationIter;

    for (AnnotationIter it = _tokenAnnotations.begin();
         it != _tokenAnnotations.end(); ++it) {
        const bool isToken = it->getType() == TokenAnnotation::WORD
                             || it->getType() == TokenAnnotation::NE;
        if (!isToken) {
            continue;
        }

        _codes.push_back(wordMap->getWordCode(it->getValue()));
        _tokens.push_back(*it);
    }
}

void TokenizedSentence::generateTokens() {
    BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {
        if (annotation.getType() == TokenAnnotation::WORD ||
               annotation.getType() == TokenAnnotation::NE) {
            _tokens.push_back(annotation);
        }
    }
}
tokenizer in progress 2015-06-25 10:12:51 +02:00			`#include "concordia/tokenized_sentence.hpp"`
character intervals in progress 2015-06-22 13:52:56 +02:00			`#include "concordia/common/text_utils.hpp"`

			`#include <iostream>`
working sentence tokenizer 2015-06-25 20:49:22 +02:00			`#include <boost/foreach.hpp>`
character intervals in progress 2015-06-22 13:52:56 +02:00
tokenizer in progress 2015-06-25 10:12:51 +02:00			`TokenizedSentence::TokenizedSentence(std::string sentence):`
character intervals in progress 2015-06-22 13:52:56 +02:00			`_sentence(sentence) {`
			`}`

tokenizer in progress 2015-06-25 10:12:51 +02:00			`TokenizedSentence::~TokenizedSentence() {`
character intervals in progress 2015-06-22 13:52:56 +02:00			`}`

finished original word positions 2015-06-27 12:40:24 +02:00			`void TokenizedSentence::addAnnotations(`
			`std::vector<TokenAnnotation> annotations) {`
			`std::vector<TokenAnnotation>::iterator newAnnotation =`
			`annotations.begin();`
			`std::list<TokenAnnotation>::iterator existingAnnotation =`
			`_tokenAnnotations.begin();`

			`while (newAnnotation != annotations.end()) {`
character intervals in progress 2015-06-22 13:52:56 +02:00			`if (existingAnnotation != _tokenAnnotations.end()) {`
			`// there are still some existing annotations, so perform checks`
			`if (newAnnotation->intersects(*existingAnnotation)) {`
			`// The new annotation intersects with the existing.`
			`// We can not add it, so let us just move on to the`
			`// next new annoation.`
finished original word positions 2015-06-27 12:40:24 +02:00			`++newAnnotation;`
character intervals in progress 2015-06-22 13:52:56 +02:00			`} else {`
			`// it is now important whether the new interval is before`
			`// or after existing`
finished original word positions 2015-06-27 12:40:24 +02:00			`if (newAnnotation->getStart() <`
			`existingAnnotation->getStart()) {`
			`// New interval does not intersect and is`
			`// before existing. We add it.`
			`_tokenAnnotations.insert(existingAnnotation,`
			`*newAnnotation);`
			`++newAnnotation;`
character intervals in progress 2015-06-22 13:52:56 +02:00			`} else {`
finished original word positions 2015-06-27 12:40:24 +02:00			`// If the new interval is after existing`
			`// we move to the next existing annoation.`
			`++existingAnnotation;`
character intervals in progress 2015-06-22 13:52:56 +02:00			`}`
			`}`
			`} else {`
			`// no more existing annotations, so just add the new annotation`
			`_tokenAnnotations.push_back(*newAnnotation);`
finished original word positions 2015-06-27 12:40:24 +02:00			`++newAnnotation;`
character intervals in progress 2015-06-22 13:52:56 +02:00			`}`
			`}`
			`}`

tokenizer in progress 2015-06-25 10:12:51 +02:00			`void TokenizedSentence::toLowerCase() {`
character intervals in progress 2015-06-22 13:52:56 +02:00			`_sentence = TextUtils::getInstance().toLowerCase(_sentence);`
			`}`
new responsibilities of tokenized sentence 2015-06-26 15:38:24 +02:00
			`void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {`
			`BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {`
			`if (annotation.getType() == TokenAnnotation::WORD \|\|`
			`annotation.getType() == TokenAnnotation::NE) {`
			`_codes.push_back(wordMap->getWordCode(annotation.getValue()));`
			`_tokens.push_back(annotation);`
finished original word positions 2015-06-27 12:40:24 +02:00			`}`
new responsibilities of tokenized sentence 2015-06-26 15:38:24 +02:00			`}`
			`}`

tokenize only option - no word map 2016-01-01 20:45:07 +01:00			`void TokenizedSentence::generateTokens() {`
			`BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {`
			`if (annotation.getType() == TokenAnnotation::WORD \|\|`
			`annotation.getType() == TokenAnnotation::NE) {`
			`_tokens.push_back(annotation);`
			`}`
			`}`
			`}`