#ifndef TOKENIZED_SENTENCE_HDR #define TOKENIZED_SENTENCE_HDR #include "concordia/common/config.hpp" #include "concordia/token_annotation.hpp" #include "concordia/word_map.hpp" #include #include #include #include /*! A sentence after tokenizing operations. The class holds the current string represenation of the sentence along with the annotations list. The class also allows for generating hash. After that operation the class also holds the list of hashed codes and corresponding tokens. */ class TokenizedSentence { public: /*! Constructor. */ explicit TokenizedSentence(std::string sentence); /*! Destructor. */ virtual ~TokenizedSentence(); /*! Getter for sentence \returns sentence */ std::string getSentence() const { return _sentence; } /*! Getter for all annotations list. This method returns all annotations, including those which are not considered in the hash, i.e. stop words and html tags. \returns annotations list */ std::list getAnnotations() const { return _tokenAnnotations; } /*! Getter for codes list. This data is available after calling the hashGenerator method. \returns codes list */ std::vector getCodes() const { return _codes; } /*! Getter for tokens list. This method returns only those annotations considered in the hash, i.e. words and named entities. \returns tokens list */ std::vector getTokens() const { return _tokens; } /*! Method for generating hash based on annotations. This method takes into account annotations of type word and named entity. These are encoded and added to to code list. Annotations corresponding to these tokens are added to the tokens list. \param wordMap word map to use when encoding tokens \returns tokens list */ void generateHash(boost::shared_ptr wordMap); /*! Transform the sentence to lower case. */ void toLowerCase(); /*! Add new annotations to the existing annotations list. Assumptions: 1. existing _tokenAnnotations vector contains disjoint, sorted intervals; 2. the annotations to be added list also has the above properties. The below algorithm will only add the annotations that do not intersect with any of the existing ones. \param annotations list of annotations to be added */ void addAnnotations(std::vector annotations); private: std::string _sentence; std::list _tokenAnnotations; std::vector _codes; std::vector _tokens; }; #endif