#ifndef TOKENIZED_SENTENCE_HDR
#define TOKENIZED_SENTENCE_HDR
#include "concordia/common/config.hpp"
#include "concordia/token_annotation.hpp"
#include "concordia/word_map.hpp"
#include <boost/shared_ptr.hpp>
#include <string>
#include <vector>
#include <list>
#include <iostream>
#include <boost/foreach.hpp>
/*!
A sentence after tokenizing operations. The class
holds the current string representation of the sentence
along with the annotations list. The class also allows
for generating a hash. After that operation the class
also holds the list of hashed codes and the corresponding
tokens.
*/
class TokenizedSentence {
public:
/*!
Constructor.
\param sentence the input sentence string
*/
explicit TokenizedSentence(std::string sentence);
/*! Destructor.
*/
virtual ~TokenizedSentence();
/*! Getter for the string sentence,
which might have been modified during tokenization.
\returns sentence
*/
std::string getSentence() const {
return _sentence;
}
/*! Getter for the original string sentence,
which was used for extracting tokens.
\returns originalSentence
*/
std::string getOriginalSentence() const {
return _originalSentence;
}
/*! Method for getting the tokenized sentence in a string format
(tokens separated by single spaces).
\returns tokenized sentence
*/
std::string getTokenizedSentence() const;
/*! Getter for the full annotations list. This method returns
all annotations, including those which are not considered
in the hash, i.e. stop words and html tags.
\returns annotations list
*/
std::list<TokenAnnotation> getAnnotations() const {
return _tokenAnnotations;
}
/*! Getter for the codes list. This data is available after calling
the generateHash method.
\returns codes list
*/
std::vector<INDEX_CHARACTER_TYPE> getCodes() const {
return _codes;
}
/*! Getter for the tokens list. This method returns
only those annotations considered
in the hash, i.e. words and named entities.
\returns tokens list
*/
std::vector<TokenAnnotation> getTokens() const {
return _tokens;
}
/*! Method for generating the hash based on annotations.
This method takes into account annotations of type
word and named entity. These are encoded and added
to the code list. Annotations corresponding to these
tokens are added to the tokens list.
\param wordMap word map to use when encoding tokens
*/
void generateHash(boost::shared_ptr<WordMap> wordMap);
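
// A minimal usage sketch for the hashing step (an illustration under
// assumptions, not Concordia's canonical flow: the word map is assumed to
// come from an existing index and the sentence is assumed to have been
// annotated beforehand, e.g. by the sentence tokenizer):
//
//   boost::shared_ptr<WordMap> wordMap;  // assumed: obtained elsewhere
//   TokenizedSentence ts("Alice has a cat");
//   // ... annotations added via addAnnotations(...) ...
//   ts.generateHash(wordMap);
//   std::vector<INDEX_CHARACTER_TYPE> codes = ts.getCodes();
//   std::vector<TokenAnnotation> tokens = ts.getTokens();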
/*! Method for generating tokens based on annotations.
This method takes into account annotations of type
word and named entity. Unlike in generateHash,
these are not encoded or added to the code list.
Annotations corresponding to these
tokens are added to the tokens list.
*/
void generateTokens();
/*!
Transform the sentence to lower case.
*/
void toLowerCase();
/*!
Add new annotations to the existing annotations list. Assumptions:
1. the existing _tokenAnnotations list contains disjoint, sorted intervals;
2. the list of annotations to be added also has the above properties.
Only annotations that do not intersect with any of the existing
ones will be added.

\param annotations list of annotations to be added
*/
void addAnnotations(std::vector<TokenAnnotation> annotations);
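
// Worked illustration of the non-intersection rule (hypothetical positions):
// if _tokenAnnotations already covers character ranges 0-5 and 10-15, then
// from a new list with annotations at 3-7, 10-15 and 16-20 only the one at
// 16-20 is added; the first two intersect existing annotations and are skipped.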
friend std::ostream & operator << (std::ostream & o,
                                   const TokenizedSentence & ts) {
    // Print each annotation as [start,end][type][value],
    // separating consecutive annotations with single spaces.
    std::list<TokenAnnotation> annotations = ts.getAnnotations();
    unsigned int index = 0;
    BOOST_FOREACH(TokenAnnotation token, annotations) {
        o << "[" << token.getStart() << "," << token.getEnd() << "]["
          << token.getType() << "][" << token.getValue() << "]";
        if (index < annotations.size() - 1) {
            o << " ";
        }
        index++;
    }
    return o;
}
private:
std::string _sentence;
std::string _originalSentence;
std::list<TokenAnnotation> _tokenAnnotations;
std::vector<INDEX_CHARACTER_TYPE> _codes;
std::vector<TokenAnnotation> _tokens;
};
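
/*
  A rough end-to-end sketch (hedged: in Concordia a TokenizedSentence is
  normally created and annotated by the sentence tokenizer, so the direct
  calls below only illustrate the intended order of operations):

    TokenizedSentence ts("Alice has a cat");
    ts.toLowerCase();
    // annotations are added by the tokenizer through addAnnotations(...)
    std::cout << ts.getTokenizedSentence() << std::endl;  // tokens separated by single spaces
    std::cout << ts << std::endl;  // [start,end][type][value] for each annotation
*/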
#endif