2015-06-25 10:12:51 +02:00
|
|
|
#ifndef TOKENIZED_SENTENCE_HDR
|
|
|
|
#define TOKENIZED_SENTENCE_HDR
|
2015-06-22 13:52:56 +02:00
|
|
|
|
|
|
|
#include "concordia/common/config.hpp"
|
|
|
|
#include "concordia/token_annotation.hpp"
|
2015-06-26 15:38:24 +02:00
|
|
|
#include "concordia/word_map.hpp"
|
|
|
|
|
|
|
|
#include <boost/shared_ptr.hpp>
|
2015-06-22 13:52:56 +02:00
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
|
|
|
#include <list>
|
|
|
|
|
|
|
|
/*!
|
|
|
|
A sentence after anonymization operations. The class
|
|
|
|
holds the current string representation of the sentence
|
|
|
|
along with the annotations list.
|
|
|
|
*/
|
|
|
|
|
2015-06-25 10:12:51 +02:00
|
|
|
class TokenizedSentence {
|
2015-06-22 13:52:56 +02:00
|
|
|
public:
|
|
|
|
/*!
|
|
|
|
Constructor.
|
|
|
|
|
|
|
|
*/
|
2015-06-25 10:12:51 +02:00
|
|
|
TokenizedSentence(std::string sentence);
|
2015-06-22 13:52:56 +02:00
|
|
|
|
|
|
|
/*! Destructor.
|
|
|
|
*/
|
2015-06-25 10:12:51 +02:00
|
|
|
virtual ~TokenizedSentence();
|
2015-06-22 13:52:56 +02:00
|
|
|
|
|
|
|
/*! Getter for sentence
|
|
|
|
\returns sentence
|
|
|
|
*/
|
|
|
|
std::string getSentence() const {
|
|
|
|
return _sentence;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*! Getter for annotations list
|
|
|
|
\returns annotations list
|
|
|
|
*/
|
|
|
|
std::list<TokenAnnotation> getAnnotations() const {
|
|
|
|
return _tokenAnnotations;
|
|
|
|
}
|
|
|
|
|
2015-06-26 15:38:24 +02:00
|
|
|
std::vector<INDEX_CHARACTER_TYPE> getCodes() const {
|
|
|
|
return _codes;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<TokenAnnotation> getTokens() const {
|
|
|
|
return _tokens;
|
|
|
|
}
|
|
|
|
|
|
|
|
void generateHash(boost::shared_ptr<WordMap> wordMap);
|
|
|
|
|
2015-06-22 13:52:56 +02:00
|
|
|
/*!
|
|
|
|
Transform the sentence to lower case.
|
|
|
|
*/
|
|
|
|
void toLowerCase();
|
|
|
|
|
|
|
|
/*!
|
|
|
|
Add new annotations to the existing annotations list. Assumptions:
|
|
|
|
1. existing _tokenAnnotations vector contains disjoint, sorted intervals;
|
|
|
|
2. the annotations to be added list also has the above properties.
|
|
|
|
The below algorithm will only add the annotations that do not
|
|
|
|
intersect with any of the existing ones.
|
|
|
|
|
|
|
|
\param annotations list of annotations to be added
|
|
|
|
*/
|
|
|
|
void addAnnotations(std::vector<TokenAnnotation> annotations);
|
|
|
|
|
|
|
|
private:
|
|
|
|
std::string _sentence;
|
|
|
|
|
|
|
|
std::list<TokenAnnotation> _tokenAnnotations;
|
2015-06-26 15:38:24 +02:00
|
|
|
|
|
|
|
std::vector<INDEX_CHARACTER_TYPE> _codes;
|
|
|
|
|
|
|
|
std::vector<TokenAnnotation> _tokens;
|
2015-06-22 13:52:56 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
#endif
|