original sentence in tokenized sentence

This commit is contained in:
rjawor 2017-04-28 13:48:32 +02:00
parent 4faae4e91a
commit 96a5bc3108
2 changed files with 14 additions and 2 deletions

View File

@ -7,7 +7,8 @@
#include <boost/algorithm/string.hpp> #include <boost/algorithm/string.hpp>
TokenizedSentence::TokenizedSentence(std::string sentence): TokenizedSentence::TokenizedSentence(std::string sentence):
_sentence(sentence) { _sentence(sentence),
_originalSentence(sentence) {
} }
TokenizedSentence::~TokenizedSentence() { TokenizedSentence::~TokenizedSentence() {

View File

@ -35,13 +35,22 @@ public:
*/ */
virtual ~TokenizedSentence(); virtual ~TokenizedSentence();
/*! Getter for the string sentence, which is used for extracting tokens. /*! Getter for the string sentence,
which might have been modified during tokenization.
\returns sentence \returns sentence
*/ */
std::string getSentence() const { std::string getSentence() const {
return _sentence; return _sentence;
} }
/*! Getter for the original string sentence, i.e. the text this
object was constructed with. The constructor initializes both
_sentence and _originalSentence from the same argument; since
getSentence() is documented as possibly returning text modified
during tokenization, use this getter when the unmodified input
is needed. (That _originalSentence is never changed afterwards
is presumed from its name - confirm against the implementation.)
\returns copy of the original sentence (returned by value)
*/
std::string getOriginalSentence() const {
return _originalSentence;
}
/*! Method for getting tokenized sentence in a string format ( /*! Method for getting tokenized sentence in a string format (
tokens separated by single spaces. tokens separated by single spaces.
\returns tokenized sentence \returns tokenized sentence
@ -126,6 +135,8 @@ public:
private: private:
std::string _sentence; std::string _sentence;
std::string _originalSentence;
std::list<TokenAnnotation> _tokenAnnotations; std::list<TokenAnnotation> _tokenAnnotations;
std::vector<INDEX_CHARACTER_TYPE> _codes; std::vector<INDEX_CHARACTER_TYPE> _codes;