concordia-library/concordia/tokenized_sentence.cpp

91 lines
3.4 KiB
C++
Raw Permalink Normal View History

2015-06-25 10:12:51 +02:00
#include "concordia/tokenized_sentence.hpp"
2015-06-22 13:52:56 +02:00
#include "concordia/common/text_utils.hpp"
#include <iostream>
2017-04-26 17:02:18 +02:00
#include <sstream>
2015-06-25 20:49:22 +02:00
#include <boost/foreach.hpp>
2017-04-26 17:02:18 +02:00
#include <boost/algorithm/string.hpp>
2015-06-22 13:52:56 +02:00
2015-06-25 10:12:51 +02:00
/*!
  Constructor. Keeps two copies of the input: _sentence, the working
  text that later transformations (e.g. toLowerCase) operate on, and
  _originalSentence, an untouched copy of the input as received.
  \param text the sentence to be tokenized
*/
TokenizedSentence::TokenizedSentence(std::string text):
    _sentence(text),
    _originalSentence(text) {
}
2015-06-25 10:12:51 +02:00
// Destructor. The class owns no resources beyond standard containers,
// so member cleanup is fully handled by their own destructors.
TokenizedSentence::~TokenizedSentence() {
}
2015-06-27 12:40:24 +02:00
void TokenizedSentence::addAnnotations(
std::vector<TokenAnnotation> annotations) {
std::vector<TokenAnnotation>::iterator newAnnotation =
annotations.begin();
std::list<TokenAnnotation>::iterator existingAnnotation =
_tokenAnnotations.begin();
while (newAnnotation != annotations.end()) {
2015-06-22 13:52:56 +02:00
if (existingAnnotation != _tokenAnnotations.end()) {
// there are still some existing annotations, so perform checks
if (newAnnotation->intersects(*existingAnnotation)) {
// The new annotation intersects with the existing.
// We can not add it, so let us just move on to the
// next new annoation.
2015-06-27 12:40:24 +02:00
++newAnnotation;
2015-06-22 13:52:56 +02:00
} else {
// it is now important whether the new interval is before
// or after existing
2015-06-27 12:40:24 +02:00
if (newAnnotation->getStart() <
existingAnnotation->getStart()) {
// New interval does not intersect and is
// before existing. We add it.
_tokenAnnotations.insert(existingAnnotation,
*newAnnotation);
++newAnnotation;
2015-06-22 13:52:56 +02:00
} else {
2015-06-27 12:40:24 +02:00
// If the new interval is after existing
// we move to the next existing annoation.
++existingAnnotation;
2015-06-22 13:52:56 +02:00
}
}
} else {
// no more existing annotations, so just add the new annotation
_tokenAnnotations.push_back(*newAnnotation);
2015-06-27 12:40:24 +02:00
++newAnnotation;
2015-06-22 13:52:56 +02:00
}
}
}
2015-06-25 10:12:51 +02:00
void TokenizedSentence::toLowerCase() {
2015-06-22 13:52:56 +02:00
_sentence = TextUtils::getInstance().toLowerCase(_sentence);
}
/*!
  Fills _codes with the word codes of this sentence's WORD and NE
  annotations (as produced by the given word map) and records the
  matching annotations in _tokens, keeping the two in step.
  \param wordMap word map used to translate token values into codes
*/
void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
    // Iterate by const reference: a by-value BOOST_FOREACH variable
    // would copy every TokenAnnotation on each iteration.
    BOOST_FOREACH(const TokenAnnotation & annotation, _tokenAnnotations) {
        if (annotation.getType() == TokenAnnotation::WORD ||
            annotation.getType() == TokenAnnotation::NE) {
            _codes.push_back(wordMap->getWordCode(annotation.getValue()));
            _tokens.push_back(annotation);
        }
    }
}
2016-01-01 20:45:07 +01:00
void TokenizedSentence::generateTokens() {
BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {
if (annotation.getType() == TokenAnnotation::WORD ||
annotation.getType() == TokenAnnotation::NE) {
_tokens.push_back(annotation);
}
}
}
2017-04-26 17:02:18 +02:00
std::string TokenizedSentence::getTokenizedSentence() const {
std::stringstream ss;
BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {
if (annotation.getType() == TokenAnnotation::WORD ||
annotation.getType() == TokenAnnotation::NE) {
ss << annotation.getValue() << " ";
}
}
std::string result = ss.str();
boost::trim_right(result);
return result;
}