#ifndef TOKENIZED_SENTENCE_HDR
#define TOKENIZED_SENTENCE_HDR
#include "concordia/common/config.hpp"
#include "concordia/token_annotation.hpp"
#include "concordia/word_map.hpp"
#include <boost/shared_ptr.hpp>
#include <string>
#include <vector>
#include <list>
#include <iostream>
#include <boost/foreach.hpp>
/*!
A sentence after tokenizing operations. The class
holds the current string representation of the sentence
along with the annotations list. The class also allows
for generating a hash. After that operation the class
also holds the list of hashed codes and the corresponding
tokens.
*/
class TokenizedSentence {
public:
/*!
Constructor.
\param sentence the input sentence string
*/
explicit TokenizedSentence(std::string sentence);
/*! Destructor.
*/
virtual ~TokenizedSentence();
/*! Getter for the string sentence,
which might have been modified during tokenization.
\returns sentence
*/
std::string getSentence() const {
return _sentence;
}
/*! Getter for the original string sentence,
which was used for extracting tokens.
\returns originalSentence
*/
std::string getOriginalSentence() const {
return _originalSentence;
}
/*! Method for getting the tokenized sentence in a string format
(tokens separated by single spaces).
\returns tokenized sentence
*/
std::string getTokenizedSentence() const;
/*! Getter for the full annotations list. This method returns
all annotations, including those which are not considered
in the hash, i.e. stop words and html tags.
\returns annotations list
*/
std::list<TokenAnnotation> getAnnotations() const {
return _tokenAnnotations;
}
/*! Getter for the codes list. This data is available after calling
the generateHash method.
\returns codes list
*/
std::vector<INDEX_CHARACTER_TYPE> getCodes() const {
return _codes;
}
/*! Getter for the tokens list. This method returns
only those annotations considered
in the hash, i.e. words and named entities.
\returns tokens list
*/
std::vector<TokenAnnotation> getTokens() const {
return _tokens;
}
/*! Method for generating the hash based on annotations.
This method takes into account annotations of type
word and named entity. These are encoded and added
to the code list. Annotations corresponding to these
tokens are added to the tokens list.
\param wordMap word map to use when encoding tokens
*/
void generateHash(boost::shared_ptr<WordMap> wordMap);
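
// A minimal usage sketch for the hashing step (an illustration under
// assumptions, not Concordia's canonical flow: the word map is assumed to
// come from an existing index and the sentence is assumed to have been
// annotated beforehand, e.g. by the sentence tokenizer):
//
//   boost::shared_ptr<WordMap> wordMap;  // assumed: obtained elsewhere
//   TokenizedSentence ts("Alice has a cat");
//   // ... annotations added via addAnnotations(...) ...
//   ts.generateHash(wordMap);
//   std::vector<INDEX_CHARACTER_TYPE> codes = ts.getCodes();
//   std::vector<TokenAnnotation> tokens = ts.getTokens();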
/*! Method for generating tokens based on annotations.
This method takes into account annotations of type
word and named entity. Unlike in generateHash,
these are not encoded or added to the code list.
Annotations corresponding to these
tokens are added to the tokens list.
*/
void generateTokens();
/*!
Transform the sentence to lower case.
*/
void toLowerCase();
/*!
Add new annotations to the existing annotations list. Assumptions:
1. the existing _tokenAnnotations list contains disjoint, sorted intervals;
2. the list of annotations to be added also has the above properties.
Only annotations that do not intersect with any of the existing
ones will be added.

\param annotations list of annotations to be added
*/
void addAnnotations(std::vector<TokenAnnotation> annotations);
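
// Worked illustration of the non-intersection rule (hypothetical positions):
// if _tokenAnnotations already covers character ranges 0-5 and 10-15, then
// from a new list with annotations at 3-7, 10-15 and 16-20 only the one at
// 16-20 is added; the first two intersect existing annotations and are skipped.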
friend std::ostream & operator << (std::ostream & o,
                                   const TokenizedSentence & ts) {
    // Print each annotation as [start,end][type][value],
    // separating consecutive annotations with single spaces.
    std::list<TokenAnnotation> annotations = ts.getAnnotations();
    unsigned int index = 0;
    BOOST_FOREACH(TokenAnnotation token, annotations) {
        o << "[" << token.getStart() << "," << token.getEnd() << "]["
          << token.getType() << "][" << token.getValue() << "]";
        if (index < annotations.size() - 1) {
            o << " ";
        }
        index++;
    }
    return o;
}
private:
std::string _sentence;
std::string _originalSentence;
std::list<TokenAnnotation> _tokenAnnotations;
std::vector<INDEX_CHARACTER_TYPE> _codes;
std::vector<TokenAnnotation> _tokens;
};
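
/*
  A rough end-to-end sketch (hedged: in Concordia a TokenizedSentence is
  normally created and annotated by the sentence tokenizer, so the direct
  calls below only illustrate the intended order of operations):

    TokenizedSentence ts("Alice has a cat");
    ts.toLowerCase();
    // annotations are added by the tokenizer through addAnnotations(...)
    std::cout << ts.getTokenizedSentence() << std::endl;  // tokens separated by single spaces
    std::cout << ts << std::endl;  // [start,end][type][value] for each annotation
*/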
#endif