2015-06-25 10:12:51 +02:00
|
|
|
#include "concordia/sentence_tokenizer.hpp"
|
|
|
|
#include "concordia/token_annotation.hpp"
|
2014-04-13 12:21:30 +02:00
|
|
|
|
2014-04-29 14:46:04 +02:00
|
|
|
#include <boost/foreach.hpp>
|
|
|
|
#include <fstream>
|
|
|
|
#include <sstream>
|
|
|
|
#include <iostream>
|
|
|
|
#include <boost/algorithm/string.hpp>
|
|
|
|
|
2015-06-25 10:12:51 +02:00
|
|
|
// Builds a tokenizer from the supplied configuration.
//
// The named entity rules and the html tags rule are always loaded from the
// files named in the configuration. The stop words rule is loaded only when
// the configuration reports that stop words are enabled.
//
// Throws ConcordiaException when one of the rule loaders fails.
SentenceTokenizer::SentenceTokenizer(
                                boost::shared_ptr<ConcordiaConfig> config)
                                         throw(ConcordiaException) {
    // Mandatory rule sets; named entities are loaded before html tags.
    _createNeRules(config->getNamedEntitiesFilePath());
    _createHtmlTagsRule(config->getHtmlTagsFilePath());

    // Remember the flag: tokenize() consults it on every call.
    _stopWordsEnabled = config->isStopWordsEnabled();
    if (!_stopWordsEnabled) {
        return;
    }

    // Stop words are matched as whole words (last argument) and carry an
    // empty annotation value.
    _stopWords = _getMultipleRegexRule(config->getStopWordsFilePath(),
                                       TokenAnnotation::STOP_WORD,
                                       "",
                                       true);
}
|
|
|
|
|
2015-06-25 10:12:51 +02:00
|
|
|
// Destructor. The body is intentionally empty: no explicit cleanup is
// performed here.
SentenceTokenizer::~SentenceTokenizer() {}
|
|
|
|
|
2015-12-27 20:54:40 +01:00
|
|
|
// Tokenizes a sentence by applying a sequence of regex rules to it.
//
// sentence      - the text to tokenize.
// byWhitespace  - when true, the sentence is lower-cased and a single rule
//                 annotates every run of non-whitespace characters as a WORD;
//                 the html tags, named entities and stop words rules are not
//                 applied. When false, the full rule pipeline is used.
//
// Returns the TokenizedSentence built from `sentence` after all applicable
// rules have been applied to it.
TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
                                              bool byWhitespace) {
    TokenizedSentence result(sentence);

    if (byWhitespace) {
        result.toLowerCase();

        // The rule never leaves this scope, so a plain local object is
        // enough; no heap allocation or shared ownership is needed.
        RegexRule whitespaceRule("\\S+", TokenAnnotation::WORD, "");
        whitespaceRule.apply(result);
        return result;
    }

    // Html tags and named entities are applied to the original-case text,
    // i.e. before the sentence is lower-cased below.
    _htmlTags->apply(result);

    BOOST_FOREACH(RegexRule & neRule, _namedEntities) {
        neRule.apply(result);
    }

    result.toLowerCase();

    // _stopWords is only initialized when stop words are enabled.
    if (_stopWordsEnabled) {
        _stopWords->apply(result);
    }

    // Words of two or more characters: start and end with a letter or digit,
    // may contain apostrophes and hyphens in between.
    RegexRule wordsRule(
        "(\\p{L}|[0-9])(\\p{L}|[0-9]|'|\\-)*(\\p{L}|[0-9])",
        TokenAnnotation::WORD, "");
    wordsRule.apply(result);

    // The pattern above needs at least two characters, so single letters
    // are picked up by a separate rule.
    RegexRule singleLetterWordsRule("\\p{L}", TokenAnnotation::WORD, "");
    singleLetterWordsRule.apply(result);

    return result;
}
|
|
|
|
|
2015-06-25 10:12:51 +02:00
|
|
|
void SentenceTokenizer::_createNeRules(std::string & namedEntitiesPath) {
|
2014-04-29 14:46:04 +02:00
|
|
|
if (boost::filesystem::exists(namedEntitiesPath)) {
|
2015-04-15 14:14:10 +02:00
|
|
|
std::string line;
|
|
|
|
std::ifstream neFile(namedEntitiesPath.c_str());
|
2014-04-29 14:46:04 +02:00
|
|
|
if (neFile.is_open()) {
|
|
|
|
int lineCounter = 0;
|
|
|
|
while (getline(neFile, line)) {
|
|
|
|
lineCounter++;
|
2015-04-15 14:14:10 +02:00
|
|
|
boost::shared_ptr<std::vector<std::string> >
|
|
|
|
tokenTexts(new std::vector<std::string>());
|
2014-04-29 14:46:04 +02:00
|
|
|
boost::split(*tokenTexts, line, boost::is_any_of(" "),
|
|
|
|
boost::token_compress_on);
|
|
|
|
if (tokenTexts->size() != 2) {
|
2015-04-15 14:14:10 +02:00
|
|
|
std::stringstream ss;
|
2014-04-29 14:46:04 +02:00
|
|
|
ss << "Invalid line: " << lineCounter
|
|
|
|
<< " in NE file: " << namedEntitiesPath;
|
|
|
|
throw ConcordiaException(ss.str());
|
|
|
|
} else {
|
2015-06-22 13:52:56 +02:00
|
|
|
_namedEntities.push_back(RegexRule(
|
2015-06-25 10:12:51 +02:00
|
|
|
tokenTexts->at(0),
|
2015-06-25 20:49:22 +02:00
|
|
|
TokenAnnotation::NE,
|
2015-06-25 10:12:51 +02:00
|
|
|
tokenTexts->at(1)));
|
2014-04-29 14:46:04 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
neFile.close();
|
|
|
|
} else {
|
|
|
|
throw ConcordiaException("Unable to read named entities file.");
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
throw ConcordiaException("No named entities file.");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-06-25 10:12:51 +02:00
|
|
|
void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
|
2015-04-15 14:14:10 +02:00
|
|
|
std::string tagsExpression = "<\\/?(";
|
2014-04-29 14:46:04 +02:00
|
|
|
if (boost::filesystem::exists(htmlTagsPath)) {
|
2015-04-15 14:14:10 +02:00
|
|
|
std::string line;
|
|
|
|
std::ifstream tagsFile(htmlTagsPath.c_str());
|
2014-04-29 14:46:04 +02:00
|
|
|
if (tagsFile.is_open()) {
|
|
|
|
while (getline(tagsFile, line)) {
|
2017-04-26 17:02:18 +02:00
|
|
|
tagsExpression += line +"|";
|
2014-04-29 14:46:04 +02:00
|
|
|
}
|
|
|
|
tagsFile.close();
|
|
|
|
} else {
|
|
|
|
throw ConcordiaException("Unable to read html tags file.");
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
throw ConcordiaException("No html tags file.");
|
|
|
|
}
|
|
|
|
tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
|
2017-04-26 17:02:18 +02:00
|
|
|
tagsExpression += ").*?>";
|
2015-06-22 13:52:56 +02:00
|
|
|
_htmlTags = boost::shared_ptr<RegexRule>(
|
2015-06-27 12:40:24 +02:00
|
|
|
new RegexRule(tagsExpression,
|
|
|
|
TokenAnnotation::HTML_TAG, "", false));
|
2014-04-29 14:46:04 +02:00
|
|
|
}
|
2014-04-13 12:21:30 +02:00
|
|
|
|
2015-06-22 13:52:56 +02:00
|
|
|
// Builds a single RegexRule that matches any of the expressions listed, one
// per line, in a text file.
//
// The resulting expression has the form `(e1|e2|...)`. When `wholeWord` is
// true, every listed expression is wrapped in `\b...\b`.
//
// filePath        - path of the file with one expression per line.
// annotationType  - annotation type passed on to the RegexRule.
// value           - value passed on to the RegexRule.
// wholeWord       - whether each expression is wrapped in word boundaries.
//
// Returns a shared pointer to the newly created rule.
//
// Throws ConcordiaException when the file does not exist, cannot be opened,
// or does not contain any expression.
boost::shared_ptr<RegexRule>
    SentenceTokenizer::_getMultipleRegexRule(
        std::string filePath,
        char annotationType,
        std::string value,
        bool wholeWord) {
    if (!boost::filesystem::exists(filePath)) {
        throw ConcordiaException("No "+filePath+" file.");
    }

    // The stream is closed by its destructor on every exit path.
    std::ifstream ruleFile(filePath.c_str());
    if (!ruleFile.is_open()) {
        throw ConcordiaException("Unable to read file: "+filePath);
    }

    std::string alternatives;
    std::string line;
    while (std::getline(ruleFile, line)) {
        // Drop the '\r' left behind by CRLF line endings so that it does not
        // become part of the expression.
        if (!line.empty() && line[line.size() - 1] == '\r') {
            line.erase(line.size() - 1);
        }
        // A blank line would add an alternative that matches the empty
        // string.
        if (line.empty()) {
            continue;
        }
        if (!alternatives.empty()) {
            alternatives += "|";
        }
        if (wholeWord) {
            alternatives += "\\b";
        }
        alternatives += line;
        if (wholeWord) {
            alternatives += "\\b";
        }
    }

    // Without this check an empty file would produce the malformed
    // expression `)`.
    if (alternatives.empty()) {
        throw ConcordiaException("No expressions found in file: "+filePath);
    }

    const std::string expression = "(" + alternatives + ")";

    return boost::shared_ptr<RegexRule>(
        new RegexRule(expression, annotationType, value, false));
}
|