concordia-library/concordia/sentence_tokenizer.cpp

#include "concordia/sentence_tokenizer.hpp"
#include "concordia/token_annotation.hpp"

#include <boost/foreach.hpp>
#include <fstream>
#include <sstream>
#include <iostream>
#include <boost/algorithm/string.hpp>

// Builds the tokenizer from the given configuration.
//
// The named-entity rules and the HTML tag rule are always loaded from the
// paths reported by the configuration. The stop-word rule is only loaded
// when the configuration reports stop words as enabled; otherwise
// _stopWords is left unset.
//
// Declared to throw ConcordiaException (raised by the rule loaders).
SentenceTokenizer::SentenceTokenizer(
                        boost::shared_ptr<ConcordiaConfig> config)
                                         throw(ConcordiaException) {
    _createNeRules(config->getNamedEntitiesFilePath());
    _createHtmlTagsRule(config->getHtmlTagsFilePath());

    _stopWordsEnabled = config->isStopWordsEnabled();
    if (!_stopWordsEnabled) {
        // Nothing more to load: the stop-word rule is not needed.
        return;
    }

    // Stop words are matched as whole words (last argument).
    _stopWords = _getMultipleRegexRule(config->getStopWordsFilePath(),
                                       TokenAnnotation::STOP_WORD,
                                       "",
                                       true);
}

// No explicit cleanup is performed by the destructor.
SentenceTokenizer::~SentenceTokenizer() {}

// Tokenizes `sentence` and returns the annotated TokenizedSentence.
//
// byWhitespace == true:
//   the sentence is lower-cased and a single rule with the pattern \S+
//   (annotated as WORD) is applied.
//
// byWhitespace == false:
//   rules are applied in this fixed order:
//     1. HTML tags
//     2. named entities loaded from the configuration
//     3. lower-casing of the sentence
//     4. stop words (only when enabled)
//     5. number-prefixed hyphenated words, e.g. digits-letters (WORD)
//     6. plain numbers, optionally with a '.' or ',' fraction (NE,
//        value "ne_number")
//     7. words of two or more characters (WORD)
//     8. single-letter words (WORD)
//   Note that steps 1 and 2 run before lower-casing.
//
// NOTE(review): the fixed rules are rebuilt on every call; if RegexRule
// construction is expensive, caching them may be worthwhile - confirm.
TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
                                              bool byWhitespace) {
    TokenizedSentence tokenized(sentence);

    if (byWhitespace) {
        tokenized.toLowerCase();

        RegexRule nonWhitespaceRule("\\S+", TokenAnnotation::WORD, "");
        nonWhitespaceRule.apply(tokenized);

        return tokenized;
    }

    // Rules that must see the original letter case.
    _htmlTags->apply(tokenized);
    BOOST_FOREACH(RegexRule & namedEntityRule, _namedEntities) {
        namedEntityRule.apply(tokenized);
    }

    tokenized.toLowerCase();

    if (_stopWordsEnabled) {
        _stopWords->apply(tokenized);
    }

    RegexRule numberPrefixedWordRule("[0-9]+\\-(\\p{L}|[0-9])+",
                                     TokenAnnotation::WORD, "");
    numberPrefixedWordRule.apply(tokenized);

    RegexRule numberRule("\\b[0-9]+([\\.\\,][0-9]+)?\\b",
                         TokenAnnotation::NE, "ne_number");
    numberRule.apply(tokenized);

    RegexRule multiCharWordRule(
                    "(\\p{L}|[0-9])(\\p{L}|[0-9]|'|\\-)*(\\p{L}|[0-9])",
                    TokenAnnotation::WORD, "");
    multiCharWordRule.apply(tokenized);

    RegexRule singleLetterRule("\\p{L}", TokenAnnotation::WORD, "");
    singleLetterRule.apply(tokenized);

    return tokenized;
}

// Loads the named-entity rules from the file at `namedEntitiesPath` and
// appends them to _namedEntities in file order.
//
// Every line must consist of exactly two fields separated by one or more
// spaces: the regular expression, followed by the value attached to the
// resulting NE annotation.
//
// Throws ConcordiaException when:
//   - the file does not exist,
//   - the file cannot be opened,
//   - a line does not split into exactly two fields (the message carries
//     the 1-based line number and the file path).
void SentenceTokenizer::_createNeRules(std::string & namedEntitiesPath) {
    if (!boost::filesystem::exists(namedEntitiesPath)) {
        throw ConcordiaException("No named entities file.");
    }

    std::ifstream neFile(namedEntitiesPath.c_str());
    if (!neFile.is_open()) {
        throw ConcordiaException("Unable to read named entities file.");
    }

    std::string line;
    int lineNumber = 0;
    while (std::getline(neFile, line)) {
        ++lineNumber;

        // getline only strips '\n'. For files with CRLF line endings the
        // remaining '\r' would otherwise become part of the last field,
        // i.e. of the annotation value.
        if (!line.empty() && line[line.size() - 1] == '\r') {
            line.erase(line.size() - 1);
        }

        std::vector<std::string> fields;
        boost::split(fields, line, boost::is_any_of(" "),
                     boost::token_compress_on);

        if (fields.size() != 2) {
            std::stringstream message;
            message << "Invalid line: " << lineNumber
                    << " in NE file: " << namedEntitiesPath;
            throw ConcordiaException(message.str());
        }

        _namedEntities.push_back(RegexRule(fields[0],
                                           TokenAnnotation::NE,
                                           fields[1]));
    }
    // neFile is closed by its destructor, also on the exception path.
}

// Builds the HTML tag rule from the file at `htmlTagsPath` and stores it
// in _htmlTags.
//
// Each non-empty line of the file contributes one alternative; the final
// pattern has the form  <\/?(line1|line2|...).*?>  and is used to create
// a RegexRule annotated as HTML_TAG (last constructor argument: false).
//
// Empty lines are ignored and a trailing '\r' (CRLF files) is removed, so
// neither can end up as an (empty or corrupted) alternative.
//
// Throws ConcordiaException when:
//   - the file does not exist,
//   - the file cannot be opened,
//   - the file contains no usable line. Without this check the pattern
//     would lose its opening parenthesis and be malformed.
void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
    if (!boost::filesystem::exists(htmlTagsPath)) {
        throw ConcordiaException("No html tags file.");
    }

    std::ifstream tagsFile(htmlTagsPath.c_str());
    if (!tagsFile.is_open()) {
        throw ConcordiaException("Unable to read html tags file.");
    }

    std::string alternatives;
    std::string line;
    while (std::getline(tagsFile, line)) {
        // Drop the carriage return left behind by CRLF line endings.
        if (!line.empty() && line[line.size() - 1] == '\r') {
            line.erase(line.size() - 1);
        }
        // An empty alternative would make the group match anything.
        if (line.empty()) {
            continue;
        }
        // Separator goes between alternatives, never after the last one.
        if (!alternatives.empty()) {
            alternatives += "|";
        }
        alternatives += line;
    }

    if (alternatives.empty()) {
        throw ConcordiaException(
                    "No html tags defined in file: " + htmlTagsPath);
    }

    const std::string tagsExpression = "<\\/?(" + alternatives + ").*?>";
    _htmlTags = boost::shared_ptr<RegexRule>(
                        new RegexRule(tagsExpression,
                                TokenAnnotation::HTML_TAG, "", false));
}

// Builds one RegexRule out of all patterns listed in the file `filePath`.
//
// Each non-empty line contributes one alternative of the resulting
// pattern  (alt1|alt2|...). When `wholeWord` is true every alternative is
// wrapped in \b ... \b.
//
// Parameters:
//   filePath       - file with one pattern per line
//   annotationType - annotation type passed on to the RegexRule
//   value          - annotation value passed on to the RegexRule
//   wholeWord      - wrap each alternative in word-boundary assertions
//
// Returns the newly created rule (last constructor argument: false).
//
// Empty lines are ignored and a trailing '\r' (CRLF files) is removed.
// Otherwise an empty line would add an alternative that matches the empty
// string, and the '\r' would sit between the word and its closing \b.
//
// Throws ConcordiaException when:
//   - the file does not exist,
//   - the file cannot be opened,
//   - the file contains no usable line. Without this check the pattern
//     would lose its opening parenthesis and be malformed.
boost::shared_ptr<RegexRule>
        SentenceTokenizer::_getMultipleRegexRule(
            std::string filePath,
            char annotationType,
            std::string value,
            bool wholeWord) {
    if (!boost::filesystem::exists(filePath)) {
        throw ConcordiaException("No "+filePath+" file.");
    }

    std::ifstream ruleFile(filePath.c_str());
    if (!ruleFile.is_open()) {
        throw ConcordiaException("Unable to read file: "+filePath);
    }

    const std::string boundary = wholeWord ? "\\b" : "";

    std::string alternatives;
    std::string line;
    while (std::getline(ruleFile, line)) {
        // Drop the carriage return left behind by CRLF line endings.
        if (!line.empty() && line[line.size() - 1] == '\r') {
            line.erase(line.size() - 1);
        }
        if (line.empty()) {
            continue;
        }
        // Separator goes between alternatives, never after the last one.
        if (!alternatives.empty()) {
            alternatives += "|";
        }
        alternatives += boundary;
        alternatives += line;
        alternatives += boundary;
    }

    if (alternatives.empty()) {
        throw ConcordiaException("No patterns found in file: " + filePath);
    }

    const std::string expression = "(" + alternatives + ")";
    return boost::shared_ptr<RegexRule>(
                      new RegexRule(expression, annotationType, value, false));
}
tokenizer in progress 2015-06-25 10:12:51 +02:00			`#include "concordia/sentence_tokenizer.hpp"`
			`#include "concordia/token_annotation.hpp"`
sentence anonymizer stub, regex replacement Former-commit-id: edb1247f7b29fd62913114be84d3391507a0890e 2014-04-13 12:21:30 +02:00
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00			`#include <boost/foreach.hpp>`
			`#include <fstream>`
			`#include <sstream>`
			`#include <iostream>`
			`#include <boost/algorithm/string.hpp>`

tokenizer in progress 2015-06-25 10:12:51 +02:00			`SentenceTokenizer::SentenceTokenizer(`
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00			`boost::shared_ptr<ConcordiaConfig> config)`
sentence anonymizer stub, regex replacement Former-commit-id: edb1247f7b29fd62913114be84d3391507a0890e 2014-04-13 12:21:30 +02:00			`throw(ConcordiaException) {`
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00			`_createNeRules(config->getNamedEntitiesFilePath());`
			`_createHtmlTagsRule(config->getHtmlTagsFilePath());`
removed stop words - works slower Former-commit-id: 97ce33b0a6ea3c89aaa5a4c69cad248c7b2c8203 2015-04-21 21:33:08 +02:00			`_stopWordsEnabled = config->isStopWordsEnabled();`
100% test in concordia-console. All passed! Former-commit-id: 6e6186a148d637ba5a0d324d6d68c78708f0942d 2015-04-22 16:50:12 +02:00			`if (_stopWordsEnabled) {`
tokenizer in progress 2015-06-25 10:12:51 +02:00			`_stopWords = _getMultipleRegexRule(`
			`config->getStopWordsFilePath(),`
working sentence tokenizer 2015-06-25 20:49:22 +02:00			`TokenAnnotation::STOP_WORD,`
tokenizer in progress 2015-06-25 10:12:51 +02:00			`"", true);`
removed stop words - works slower Former-commit-id: 97ce33b0a6ea3c89aaa5a4c69cad248c7b2c8203 2015-04-21 21:33:08 +02:00			`}`
sentence anonymizer stub, regex replacement Former-commit-id: edb1247f7b29fd62913114be84d3391507a0890e 2014-04-13 12:21:30 +02:00			`}`

tokenizer in progress 2015-06-25 10:12:51 +02:00			`SentenceTokenizer::~SentenceTokenizer() {`
sentence anonymizer stub, regex replacement Former-commit-id: edb1247f7b29fd62913114be84d3391507a0890e 2014-04-13 12:21:30 +02:00			`}`

tokenize by whitespace option 2015-12-27 20:54:40 +01:00			`TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,`
			`bool byWhitespace) {`
adding all tokenized examples 2015-08-19 20:49:26 +02:00			`TokenizedSentence result(sentence);`
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00
added lowercasing when tokenizing by space 2015-12-29 21:44:46 +01:00			`if (byWhitespace) {`
			`result.toLowerCase();`

tokenize by whitespace option 2015-12-27 20:54:40 +01:00			`boost::shared_ptr<RegexRule> whitespaceRule(`
			`new RegexRule("\\S+",`
			`TokenAnnotation::WORD, ""));`
			`whitespaceRule->apply(result);`
			`} else {`
			`_htmlTags->apply(result);`
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00
tokenize by whitespace option 2015-12-27 20:54:40 +01:00			`BOOST_FOREACH(RegexRule & neRule, _namedEntities) {`
			`neRule.apply(result);`
			`}`
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00
tokenize by whitespace option 2015-12-27 20:54:40 +01:00			`result.toLowerCase();`
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00
tokenize by whitespace option 2015-12-27 20:54:40 +01:00			`if (_stopWordsEnabled) {`
			`_stopWords->apply(result);`
			`}`
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00
corrected tokenizer 2017-05-05 12:58:32 +02:00			`boost::shared_ptr<RegexRule> wordsWithNumbersRule(`
			`new RegexRule("[0-9]+\\-(\\p{L}\|[0-9])+",`
			`TokenAnnotation::WORD, ""));`
			`wordsWithNumbersRule->apply(result);`

			`boost::shared_ptr<RegexRule> numbersRule(`
			`new RegexRule("\\b[0-9]+([\\.\\,][0-9]+)?\\b", TokenAnnotation::NE, "ne_number"));`
			`numbersRule->apply(result);`

tokenize by whitespace option 2015-12-27 20:54:40 +01:00			`boost::shared_ptr<RegexRule> wordsRule(`
new tokenizer 2017-04-26 17:02:18 +02:00			`new RegexRule("(\\p{L}\|[0-9])(\\p{L}\|[0-9]\|'\|\\-)*(\\p{L}\|[0-9])",`
tokenize by whitespace option 2015-12-27 20:54:40 +01:00			`TokenAnnotation::WORD, ""));`
			`wordsRule->apply(result);`
corrected tokenizer 2017-05-05 12:58:32 +02:00
tokenize by whitespace option 2015-12-27 20:54:40 +01:00			`boost::shared_ptr<RegexRule> singleLetterWordsRule(`
			`new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));`
			`singleLetterWordsRule->apply(result);`
corrected tokenizer 2017-05-05 12:58:32 +02:00

tokenize by whitespace option 2015-12-27 20:54:40 +01:00			`}`
added lowercasing when tokenizing by space 2015-12-29 21:44:46 +01:00
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00			`return result;`
			`}`

tokenizer in progress 2015-06-25 10:12:51 +02:00			`void SentenceTokenizer::_createNeRules(std::string & namedEntitiesPath) {`
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00			`if (boost::filesystem::exists(namedEntitiesPath)) {`
removed using namespace std Former-commit-id: dbb5129e1f94d83eca887ada0f89d6bb45250f1e 2015-04-15 14:14:10 +02:00			`std::string line;`
			`std::ifstream neFile(namedEntitiesPath.c_str());`
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00			`if (neFile.is_open()) {`
			`int lineCounter = 0;`
			`while (getline(neFile, line)) {`
			`lineCounter++;`
removed using namespace std Former-commit-id: dbb5129e1f94d83eca887ada0f89d6bb45250f1e 2015-04-15 14:14:10 +02:00			`boost::shared_ptr<std::vector<std::string> >`
			`tokenTexts(new std::vector<std::string>());`
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00			`boost::split(*tokenTexts, line, boost::is_any_of(" "),`
			`boost::token_compress_on);`
			`if (tokenTexts->size() != 2) {`
removed using namespace std Former-commit-id: dbb5129e1f94d83eca887ada0f89d6bb45250f1e 2015-04-15 14:14:10 +02:00			`std::stringstream ss;`
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00			`ss << "Invalid line: " << lineCounter`
			`<< " in NE file: " << namedEntitiesPath;`
			`throw ConcordiaException(ss.str());`
			`} else {`
character intervals in progress 2015-06-22 13:52:56 +02:00			`_namedEntities.push_back(RegexRule(`
tokenizer in progress 2015-06-25 10:12:51 +02:00			`tokenTexts->at(0),`
working sentence tokenizer 2015-06-25 20:49:22 +02:00			`TokenAnnotation::NE,`
tokenizer in progress 2015-06-25 10:12:51 +02:00			`tokenTexts->at(1)));`
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00			`}`
			`}`
			`neFile.close();`
			`} else {`
			`throw ConcordiaException("Unable to read named entities file.");`
			`}`
			`} else {`
			`throw ConcordiaException("No named entities file.");`
			`}`
			`}`

tokenizer in progress 2015-06-25 10:12:51 +02:00			`void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {`
removed using namespace std Former-commit-id: dbb5129e1f94d83eca887ada0f89d6bb45250f1e 2015-04-15 14:14:10 +02:00			`std::string tagsExpression = "<\\/?(";`
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00			`if (boost::filesystem::exists(htmlTagsPath)) {`
removed using namespace std Former-commit-id: dbb5129e1f94d83eca887ada0f89d6bb45250f1e 2015-04-15 14:14:10 +02:00			`std::string line;`
			`std::ifstream tagsFile(htmlTagsPath.c_str());`
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00			`if (tagsFile.is_open()) {`
			`while (getline(tagsFile, line)) {`
new tokenizer 2017-04-26 17:02:18 +02:00			`tagsExpression += line +"\|";`
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00			`}`
			`tagsFile.close();`
			`} else {`
			`throw ConcordiaException("Unable to read html tags file.");`
			`}`
			`} else {`
			`throw ConcordiaException("No html tags file.");`
			`}`
			`tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);`
new tokenizer 2017-04-26 17:02:18 +02:00			`tagsExpression += ").*?>";`
character intervals in progress 2015-06-22 13:52:56 +02:00			`_htmlTags = boost::shared_ptr<RegexRule>(`
finished original word positions 2015-06-27 12:40:24 +02:00			`new RegexRule(tagsExpression,`
			`TokenAnnotation::HTML_TAG, "", false));`
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00			`}`
sentence anonymizer stub, regex replacement Former-commit-id: edb1247f7b29fd62913114be84d3391507a0890e 2014-04-13 12:21:30 +02:00
character intervals in progress 2015-06-22 13:52:56 +02:00			`boost::shared_ptr<RegexRule>`
tokenizer in progress 2015-06-25 10:12:51 +02:00			`SentenceTokenizer::_getMultipleRegexRule(`
			`std::string filePath,`
			`char annotationType,`
			`std::string value,`
			`bool wholeWord) {`
removed using namespace std Former-commit-id: dbb5129e1f94d83eca887ada0f89d6bb45250f1e 2015-04-15 14:14:10 +02:00			`std::string expression = "(";`
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00			`if (boost::filesystem::exists(filePath)) {`
removed using namespace std Former-commit-id: dbb5129e1f94d83eca887ada0f89d6bb45250f1e 2015-04-15 14:14:10 +02:00			`std::string line;`
			`std::ifstream ruleFile(filePath.c_str());`
anonymizing sentences Former-commit-id: 5d8bd7e16258fda7c02a7cc0e1da589d73418f0d 2014-04-29 14:46:04 +02:00			`if (ruleFile.is_open()) {`
			`while (getline(ruleFile, line)) {`
			`if (wholeWord) {`
			`expression += "\\b";`
			`}`
			`expression += line;`
			`if (wholeWord) {`
			`expression += "\\b";`
			`}`
			`expression += "\|";`
			`}`
			`ruleFile.close();`
			`} else {`
			`throw ConcordiaException("Unable to read file: "+filePath);`
			`}`
			`} else {`
			`throw ConcordiaException("No "+filePath+" file.");`
			`}`
			`expression = expression.substr(0, expression.size()-1);`
			`expression += ")";`
character intervals in progress 2015-06-22 13:52:56 +02:00			`return boost::shared_ptr<RegexRule>(`
finished original word positions 2015-06-27 12:40:24 +02:00			`new RegexRule(expression, annotationType, value, false));`
sentence anonymizer stub, regex replacement Former-commit-id: edb1247f7b29fd62913114be84d3391507a0890e 2014-04-13 12:21:30 +02:00			`}`