concordia-library/concordia/sentence_tokenizer.cpp

164 lines
5.7 KiB
C++
Raw Normal View History

2015-06-25 10:12:51 +02:00
#include "concordia/sentence_tokenizer.hpp"
#include "concordia/token_annotation.hpp"
#include <boost/foreach.hpp>
#include <fstream>
#include <sstream>
#include <iostream>
#include <boost/algorithm/string.hpp>
2015-06-25 10:12:51 +02:00
/**
 * Builds a tokenizer from the supplied configuration.
 *
 * Loads the named-entity rules and the HTML-tags rule from the files named
 * in the configuration. When stop words are enabled in the configuration,
 * the stop-words rule is additionally loaded from the configured file as a
 * whole-word rule annotated with TokenAnnotation::STOP_WORD.
 *
 * \param config configuration providing the file paths and the stop-words flag
 * \throws ConcordiaException raised by the rule loaders when a rule file is
 *         missing, unreadable or malformed
 */
SentenceTokenizer::SentenceTokenizer(
                                boost::shared_ptr<ConcordiaConfig> config)
                                         throw(ConcordiaException) {
    _createNeRules(config->getNamedEntitiesFilePath());
    _createHtmlTagsRule(config->getHtmlTagsFilePath());

    _stopWordsEnabled = config->isStopWordsEnabled();
    if (!_stopWordsEnabled) {
        // Stop words are switched off: _stopWords is left untouched.
        return;
    }

    _stopWords = _getMultipleRegexRule(config->getStopWordsFilePath(),
                                       TokenAnnotation::STOP_WORD,
                                       "",
                                       true);
}
2015-06-25 10:12:51 +02:00
/** Destructor; performs no explicit cleanup. */
SentenceTokenizer::~SentenceTokenizer() {}
2015-12-27 20:54:40 +01:00
/**
 * Tokenizes a sentence by applying a fixed sequence of regex rules to it.
 *
 * \param sentence     text to tokenize
 * \param byWhitespace when true, the sentence is lower-cased and every run
 *                     of non-whitespace characters is annotated as a WORD;
 *                     no other rule is applied. When false, the full rule
 *                     pipeline described below is used.
 * \returns the TokenizedSentence after all applicable rules were applied
 *
 * Full pipeline, in order: HTML tags, named entities from the NE file,
 * lower-casing, stop words (only if enabled), number-prefixed hyphenated
 * words, numbers (annotated as NE "ne_number"), multi-character words and
 * finally single-letter words.
 */
TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
                                              bool byWhitespace) {
    TokenizedSentence tokenized(sentence);

    if (byWhitespace) {
        tokenized.toLowerCase();

        RegexRule nonWhitespaceRun("\\S+", TokenAnnotation::WORD, "");
        nonWhitespaceRun.apply(tokenized);
        return tokenized;
    }

    // These two rule groups run on the original casing, before lower-casing.
    _htmlTags->apply(tokenized);
    BOOST_FOREACH(RegexRule & namedEntityRule, _namedEntities) {
        namedEntityRule.apply(tokenized);
    }

    tokenized.toLowerCase();

    if (_stopWordsEnabled) {
        _stopWords->apply(tokenized);
    }

    // Digits followed by a hyphen and letters/digits, e.g. "5-letni".
    RegexRule numberPrefixedWords("[0-9]+\\-(\\p{L}|[0-9])+",
                                  TokenAnnotation::WORD, "");
    numberPrefixedWords.apply(tokenized);

    // Integers and decimals using '.' or ',' as the separator.
    RegexRule numbers("\\b[0-9]+([\\.\\,][0-9]+)?\\b",
                      TokenAnnotation::NE, "ne_number");
    numbers.apply(tokenized);

    // Words of at least two characters; apostrophes and hyphens are only
    // allowed in the interior.
    RegexRule multiCharacterWords(
        "(\\p{L}|[0-9])(\\p{L}|[0-9]|'|\\-)*(\\p{L}|[0-9])",
        TokenAnnotation::WORD, "");
    multiCharacterWords.apply(tokenized);

    // The previous pattern needs two characters, so single letters are
    // picked up by a separate rule.
    RegexRule singleLetters("\\p{L}", TokenAnnotation::WORD, "");
    singleLetters.apply(tokenized);

    return tokenized;
}
2015-06-25 10:12:51 +02:00
void SentenceTokenizer::_createNeRules(std::string & namedEntitiesPath) {
if (boost::filesystem::exists(namedEntitiesPath)) {
std::string line;
std::ifstream neFile(namedEntitiesPath.c_str());
if (neFile.is_open()) {
int lineCounter = 0;
while (getline(neFile, line)) {
lineCounter++;
boost::shared_ptr<std::vector<std::string> >
tokenTexts(new std::vector<std::string>());
boost::split(*tokenTexts, line, boost::is_any_of(" "),
boost::token_compress_on);
if (tokenTexts->size() != 2) {
std::stringstream ss;
ss << "Invalid line: " << lineCounter
<< " in NE file: " << namedEntitiesPath;
throw ConcordiaException(ss.str());
} else {
2015-06-22 13:52:56 +02:00
_namedEntities.push_back(RegexRule(
2015-06-25 10:12:51 +02:00
tokenTexts->at(0),
2015-06-25 20:49:22 +02:00
TokenAnnotation::NE,
2015-06-25 10:12:51 +02:00
tokenTexts->at(1)));
}
}
neFile.close();
} else {
throw ConcordiaException("Unable to read named entities file.");
}
} else {
throw ConcordiaException("No named entities file.");
}
}
2015-06-25 10:12:51 +02:00
void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
std::string tagsExpression = "<\\/?(";
if (boost::filesystem::exists(htmlTagsPath)) {
std::string line;
std::ifstream tagsFile(htmlTagsPath.c_str());
if (tagsFile.is_open()) {
while (getline(tagsFile, line)) {
2017-04-26 17:02:18 +02:00
tagsExpression += line +"|";
}
tagsFile.close();
} else {
throw ConcordiaException("Unable to read html tags file.");
}
} else {
throw ConcordiaException("No html tags file.");
}
tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
2017-04-26 17:02:18 +02:00
tagsExpression += ").*?>";
2015-06-22 13:52:56 +02:00
_htmlTags = boost::shared_ptr<RegexRule>(
2015-06-27 12:40:24 +02:00
new RegexRule(tagsExpression,
TokenAnnotation::HTML_TAG, "", false));
}
2015-06-22 13:52:56 +02:00
/**
 * Creates a single rule that matches any of the patterns listed, one per
 * line, in the given file.
 *
 * The patterns are joined into one alternation: (p1|p2|...). When
 * wholeWord is set, every pattern is wrapped in \b ... \b.
 *
 * Blank lines are skipped and a trailing carriage return (CRLF files) is
 * removed from each line. A blank line would otherwise contribute an
 * alternative that matches the empty string, and a stray '\r' would
 * become part of the pattern.
 *
 * \param filePath       path of the file with one pattern per line
 * \param annotationType annotation type passed to the created rule
 * \param value          value passed to the created rule
 * \param wholeWord      whether each pattern is wrapped in word boundaries
 * \returns the newly created rule
 * \throws ConcordiaException if the file does not exist, cannot be opened,
 *         or contains no patterns
 */
boost::shared_ptr<RegexRule>
    SentenceTokenizer::_getMultipleRegexRule(
        std::string filePath,
        char annotationType,
        std::string value,
        bool wholeWord) {
    if (!boost::filesystem::exists(filePath)) {
        throw ConcordiaException("No "+filePath+" file.");
    }

    std::ifstream ruleFile(filePath.c_str());
    if (!ruleFile.is_open()) {
        throw ConcordiaException("Unable to read file: "+filePath);
    }

    std::string alternatives;
    std::string line;
    while (std::getline(ruleFile, line)) {
        // Tolerate Windows line endings.
        if (!line.empty() && line[line.size() - 1] == '\r') {
            line.erase(line.size() - 1);
        }
        if (line.empty()) {
            continue;
        }
        if (!alternatives.empty()) {
            alternatives += "|";
        }
        if (wholeWord) {
            alternatives += "\\b";
        }
        alternatives += line;
        if (wholeWord) {
            alternatives += "\\b";
        }
    }
    ruleFile.close();

    // With no patterns the old code stripped the opening parenthesis and
    // produced the invalid expression ")"; fail with a clear message instead.
    if (alternatives.empty()) {
        throw ConcordiaException("No patterns found in file: "+filePath);
    }

    const std::string expression = "(" + alternatives + ")";
    return boost::shared_ptr<RegexRule>(
        new RegexRule(expression, annotationType, value, false));
}