concordia-library/concordia/sentence_tokenizer.hpp

#ifndef SENTENCE_TOKENIZER_HDR
#define SENTENCE_TOKENIZER_HDR

#include <string>
#include <vector>
#include "concordia/common/config.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "concordia/regex_rule.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_exception.hpp"
#include <boost/shared_ptr.hpp>
#include <boost/filesystem.hpp>


/*!
  Tokenizes a sentence before its hash is generated.
  While tokenizing, unnecessary symbols, html tags and - if the option
  is enabled - stop words are ignored in sentences added to the index,
  and named entities are annotated. All of these have to be listed
  in files (see \ref tutorial3).
*/

class SentenceTokenizer {
public:
    /*! Constructor.
      \param config config object, holding paths to necessary files
      \throws ConcordiaException as declared by the exception specification

      NOTE(review): dynamic exception specifications are deprecated since
      C++11 and removed in C++17. Dropping this one has to be done together
      with the definition in the implementation file, so it is kept here.
    */
    explicit SentenceTokenizer(boost::shared_ptr<ConcordiaConfig> config)
        throw(ConcordiaException);

    /*! Destructor.
      Declared virtual, so the class can be deleted through a base pointer
      of a derived type.
    */
    virtual ~SentenceTokenizer();

    /*! Tokenizes the sentence.
      \param sentence input sentence
      \returns tokenized sentence object built on the input sentence
    */
    TokenizedSentence tokenize(const std::string& sentence);

private:
    /*! Helper working on the named entities file.
      Presumably fills _namedEntities - confirm against the implementation.
      \param namedEntitiesPath path of the named entities file
    */
    void _createNeRules(std::string& namedEntitiesPath);

    /*! Helper working on the html tags file.
      Presumably sets up _htmlTags - confirm against the implementation.
      \param htmlTagsPath path of the html tags file
    */
    void _createHtmlTagsRule(std::string& htmlTagsPath);

    /*! Produces one regex rule from the given file.
      How the file content is turned into a rule is defined
      in the implementation file and is not visible here.
      \param filePath path of the file to read
      \param annotationType annotation type handed over for the rule
      \param value value handed over for the rule
      \param wholeWord optional flag, false by default
      \returns shared pointer to the resulting regex rule
    */
    boost::shared_ptr<RegexRule> _getMultipleRegexRule(
        std::string filePath,
        char annotationType,
        std::string value,
        bool wholeWord = false);

    /*! Rules for named entities. */
    std::vector<RegexRule> _namedEntities;

    /*! Rule for html tags. */
    boost::shared_ptr<RegexRule> _htmlTags;

    /*! Flag for the stop words option. */
    bool _stopWordsEnabled;

    /*! Rule for stop words. */
    boost::shared_ptr<RegexRule> _stopWords;
};

#endif
tokenizer in progress 2015-06-25 10:12:51 +02:00			`#ifndef SENTENCE_TOKENIZER_HDR`
			`#define SENTENCE_TOKENIZER_HDR`
sentence anonymizer stub, regex replacement Former-commit-id: edb1247f7b29fd62913114be84d3391507a0890e 2014-04-13 12:21:30 +02:00
			`#include <string>`
std vectors Former-commit-id: 5816e87c856f7edc242cc707851a0e2ad05aeb38 2015-04-15 10:55:26 +02:00			`#include <vector>`
sentence anonymizer stub, regex replacement Former-commit-id: edb1247f7b29fd62913114be84d3391507a0890e 2014-04-13 12:21:30 +02:00			`#include "concordia/common/config.hpp"`
tokenizer in progress 2015-06-25 10:12:51 +02:00			`#include "concordia/tokenized_sentence.hpp"`
character intervals in progress 2015-06-22 13:52:56 +02:00			`#include "concordia/regex_rule.hpp"`
sentence anonymizer stub, regex replacement Former-commit-id: edb1247f7b29fd62913114be84d3391507a0890e 2014-04-13 12:21:30 +02:00			`#include "concordia/concordia_config.hpp"`
			`#include "concordia/concordia_exception.hpp"`
			`#include <boost/shared_ptr.hpp>`
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00			`#include <boost/filesystem.hpp>`
sentence anonymizer stub, regex replacement Former-commit-id: edb1247f7b29fd62913114be84d3391507a0890e 2014-04-13 12:21:30 +02:00

			`/*!`
tokenizer in progress 2015-06-25 10:12:51 +02:00			`Class for tokenizing sentence before generating hash.`
finished original word positions 2015-06-27 12:40:24 +02:00			`Tokenizer ignores unnecessary symbols, html tags and possibly stop words`
			`(if the option is enabled) in sentences added to index`
			`as well as annotates named entities. All these have to be listed in files`
finished documentation 2015-05-01 14:52:53 +02:00			`(see \ref tutorial3).`
sentence anonymizer stub, regex replacement Former-commit-id: edb1247f7b29fd62913114be84d3391507a0890e 2014-04-13 12:21:30 +02:00			`*/`

tokenizer in progress 2015-06-25 10:12:51 +02:00			`class SentenceTokenizer {`
sentence anonymizer stub, regex replacement Former-commit-id: edb1247f7b29fd62913114be84d3391507a0890e 2014-04-13 12:21:30 +02:00			`public:`
finished documentation 2015-05-01 14:52:53 +02:00			`/*! Constructor.`
			`\param config config object, holding paths to necessary files`
			`*/`
tokenizer in progress 2015-06-25 10:12:51 +02:00			`explicit SentenceTokenizer(boost::shared_ptr<ConcordiaConfig> config)`
sentence anonymizer stub, regex replacement Former-commit-id: edb1247f7b29fd62913114be84d3391507a0890e 2014-04-13 12:21:30 +02:00			`throw(ConcordiaException);`

			`/*! Destructor.`
			`*/`
tokenizer in progress 2015-06-25 10:12:51 +02:00			`virtual ~SentenceTokenizer();`
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00
tokenizer in progress 2015-06-25 10:12:51 +02:00			`/*! Tokenizes the sentence.`
finished documentation 2015-05-01 14:52:53 +02:00			`\param sentence input sentence`
finished original word positions 2015-06-27 12:40:24 +02:00			`\returns tokenized sentence object build on the input sentence`
finished documentation 2015-05-01 14:52:53 +02:00			`*/`
adding all tokenized examples 2015-08-19 20:49:26 +02:00			`TokenizedSentence tokenize(const std::string & sentence);`
sentence anonymizer stub, regex replacement Former-commit-id: edb1247f7b29fd62913114be84d3391507a0890e 2014-04-13 12:21:30 +02:00
			`private:`
removed using namespace std Former-commit-id: dbb5129e1f94d83eca887ada0f89d6bb45250f1e 2015-04-15 14:14:10 +02:00			`void _createNeRules(std::string & namedEntitiesPath);`
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00
removed using namespace std Former-commit-id: dbb5129e1f94d83eca887ada0f89d6bb45250f1e 2015-04-15 14:14:10 +02:00			`void _createHtmlTagsRule(std::string & htmlTagsPath);`
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00
tokenizer in progress 2015-06-25 10:12:51 +02:00			`boost::shared_ptr<RegexRule> _getMultipleRegexRule(`
			`std::string filePath,`
			`char annotationType,`
			`std::string value,`
removed using namespace std Former-commit-id: dbb5129e1f94d83eca887ada0f89d6bb45250f1e 2015-04-15 14:14:10 +02:00			`bool wholeWord = false);`
text utils stub Former-commit-id: d4459220f5696839d98848e9c30a61c084763a91 2014-04-24 08:36:48 +02:00
character intervals in progress 2015-06-22 13:52:56 +02:00			`std::vector<RegexRule> _namedEntities;`
text utils stub Former-commit-id: d4459220f5696839d98848e9c30a61c084763a91 2014-04-24 08:36:48 +02:00
character intervals in progress 2015-06-22 13:52:56 +02:00			`boost::shared_ptr<RegexRule> _htmlTags;`
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00
removed stop words - works slower Former-commit-id: 97ce33b0a6ea3c89aaa5a4c69cad248c7b2c8203 2015-04-21 21:33:08 +02:00			`bool _stopWordsEnabled;`

character intervals in progress 2015-06-22 13:52:56 +02:00			`boost::shared_ptr<RegexRule> _stopWords;`
sentence anonymizer stub, regex replacement Former-commit-id: edb1247f7b29fd62913114be84d3391507a0890e 2014-04-13 12:21:30 +02:00			`};`

			`#endif`