2015-06-25 10:12:51 +02:00
|
|
|
#ifndef TOKENIZED_SENTENCE_HDR
|
|
|
|
#define TOKENIZED_SENTENCE_HDR
|
2015-06-22 13:52:56 +02:00
|
|
|
|
|
|
|
#include "concordia/common/config.hpp"
|
|
|
|
#include "concordia/token_annotation.hpp"
|
2015-06-26 15:38:24 +02:00
|
|
|
#include "concordia/word_map.hpp"
|
|
|
|
|
|
|
|
#include <boost/shared_ptr.hpp>
|
2015-06-22 13:52:56 +02:00
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
|
|
|
#include <list>
|
|
|
|
|
|
|
|
/*!
  A sentence after tokenizing operations. The class
  holds the current string representation of the sentence
  along with the annotations list. The class also allows
  for generating hash. After that operation the class
  also holds the list of hashed codes and corresponding
  tokens.
*/
|
|
|
|
|
2015-06-25 10:12:51 +02:00
|
|
|
class TokenizedSentence {
|
2015-06-22 13:52:56 +02:00
|
|
|
public:
|
|
|
|
/*!
|
|
|
|
Constructor.
|
|
|
|
|
|
|
|
*/
|
2015-06-27 12:40:24 +02:00
|
|
|
explicit TokenizedSentence(std::string sentence);
|
2015-06-22 13:52:56 +02:00
|
|
|
|
|
|
|
/*! Destructor.
|
|
|
|
*/
|
2015-06-25 10:12:51 +02:00
|
|
|
virtual ~TokenizedSentence();
|
2015-06-22 13:52:56 +02:00
|
|
|
|
|
|
|
/*! Getter for sentence
|
|
|
|
\returns sentence
|
|
|
|
*/
|
|
|
|
std::string getSentence() const {
|
|
|
|
return _sentence;
|
|
|
|
}
|
|
|
|
|
2015-06-27 12:40:24 +02:00
|
|
|
/*! Getter for all annotations list. This method returns
|
|
|
|
all annotations, including those which are not considered
|
|
|
|
in the hash, i.e. stop words and html tags.
|
2015-06-22 13:52:56 +02:00
|
|
|
\returns annotations list
|
|
|
|
*/
|
|
|
|
std::list<TokenAnnotation> getAnnotations() const {
|
|
|
|
return _tokenAnnotations;
|
|
|
|
}
|
|
|
|
|
2015-06-27 12:40:24 +02:00
|
|
|
/*! Getter for codes list. This data is available after calling
|
|
|
|
the hashGenerator method.
|
|
|
|
\returns codes list
|
|
|
|
*/
|
2015-06-26 15:38:24 +02:00
|
|
|
std::vector<INDEX_CHARACTER_TYPE> getCodes() const {
|
|
|
|
return _codes;
|
|
|
|
}
|
2015-06-27 12:40:24 +02:00
|
|
|
|
|
|
|
/*! Getter for tokens list. This method returns
|
|
|
|
only those annotations considered
|
|
|
|
in the hash, i.e. words and named entities.
|
|
|
|
\returns tokens list
|
|
|
|
*/
|
2015-06-26 15:38:24 +02:00
|
|
|
std::vector<TokenAnnotation> getTokens() const {
|
|
|
|
return _tokens;
|
|
|
|
}
|
2015-06-27 12:40:24 +02:00
|
|
|
|
|
|
|
/*! Method for generating hash based on annotations.
|
|
|
|
This method takes into account annotations of type
|
|
|
|
word and named entity. These are encoded and added
|
2016-01-01 20:45:07 +01:00
|
|
|
to code list. Annotations corresponding to these
|
2015-06-27 12:40:24 +02:00
|
|
|
tokens are added to the tokens list.
|
|
|
|
\param wordMap word map to use when encoding tokens
|
|
|
|
*/
|
2015-06-26 15:38:24 +02:00
|
|
|
void generateHash(boost::shared_ptr<WordMap> wordMap);
|
|
|
|
|
2016-01-01 20:45:07 +01:00
|
|
|
/*! Method for generating tokens based on annotations.
|
|
|
|
This method takes into account annotations of type
|
|
|
|
word and named entity. Unlike in generateHash,
|
|
|
|
these are not encoded or added to code list.
|
|
|
|
Annotations corresponding to these
|
|
|
|
tokens are added to the tokens list.
|
|
|
|
*/
|
|
|
|
void generateTokens();
|
|
|
|
|
2015-06-22 13:52:56 +02:00
|
|
|
/*!
|
|
|
|
Transform the sentence to lower case.
|
|
|
|
*/
|
|
|
|
void toLowerCase();
|
|
|
|
|
|
|
|
/*!
|
|
|
|
Add new annotations to the existing annotations list. Assumptions:
|
|
|
|
1. existing _tokenAnnotations vector contains disjoint, sorted intervals;
|
|
|
|
2. the annotations to be added list also has the above properties.
|
|
|
|
The below algorithm will only add the annotations that do not
|
|
|
|
intersect with any of the existing ones.
|
|
|
|
|
|
|
|
\param annotations list of annotations to be added
|
|
|
|
*/
|
2015-06-27 12:40:24 +02:00
|
|
|
void addAnnotations(std::vector<TokenAnnotation> annotations);
|
2015-06-22 13:52:56 +02:00
|
|
|
|
|
|
|
private:
|
|
|
|
std::string _sentence;
|
|
|
|
|
|
|
|
std::list<TokenAnnotation> _tokenAnnotations;
|
2015-06-27 12:40:24 +02:00
|
|
|
|
2015-06-26 15:38:24 +02:00
|
|
|
std::vector<INDEX_CHARACTER_TYPE> _codes;
|
2015-06-27 12:40:24 +02:00
|
|
|
|
2015-06-26 15:38:24 +02:00
|
|
|
std::vector<TokenAnnotation> _tokens;
|
2015-06-22 13:52:56 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
#endif
|