#include "concordia/sentence_tokenizer.hpp"
#include "concordia/token_annotation.hpp"
#include <boost/foreach.hpp>
#include <boost/filesystem.hpp>
#include <boost/algorithm/string.hpp>
#include <fstream>
#include <sstream>

// NOTE(review): all template arguments and #include targets in this file had
// been stripped (most likely by an angle-bracket-eating HTML extraction).
// They are reconstructed here from usage — verify against the project
// headers (concordia/sentence_tokenizer.hpp declares the exact types).

/// @brief Builds the tokenizer from the Concordia configuration: loads the
///        named-entity rules, the HTML tag rule and (optionally) the
///        stop-word rule.
/// @param config shared configuration object providing the rule file paths.
/// @throws ConcordiaException when any of the rule files is missing,
///         unreadable or malformed.
SentenceTokenizer::SentenceTokenizer(
        boost::shared_ptr<ConcordiaConfig> config)
        throw(ConcordiaException) {
    _createNeRules(config->getNamedEntitiesFilePath());
    _createHtmlTagsRule(config->getHtmlTagsFilePath());
    _stopWordsEnabled = config->isStopWordsEnabled();
    if (_stopWordsEnabled) {
        _stopWords = _getMultipleRegexRule(
            config->getStopWordsFilePath(),
            TokenAnnotation::STOP_WORD, "", true);
    }
}

SentenceTokenizer::~SentenceTokenizer() {
}

/// @brief Tokenizes a sentence, annotating words, numbers, named entities,
///        HTML tags and stop words.
/// @param sentence the input sentence (original case).
/// @param byWhitespace when true, only lowercases and marks every maximal
///        non-blank run as a WORD; when false, runs the full rule cascade.
/// @return the annotated TokenizedSentence.
TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
                                              bool byWhitespace) {
    TokenizedSentence result(sentence);

    if (byWhitespace) {
        result.toLowerCase();
        boost::shared_ptr<RegexRule> whitespaceRule(
            new RegexRule("\\S+", TokenAnnotation::WORD, ""));
        whitespaceRule->apply(result);
    } else {
        // HTML tags and named entities are applied BEFORE lowercasing —
        // their patterns may be case-sensitive.
        _htmlTags->apply(result);
        BOOST_FOREACH(RegexRule & neRule, _namedEntities) {
            neRule.apply(result);
        }
        result.toLowerCase();
        if (_stopWordsEnabled) {
            _stopWords->apply(result);
        }
        // Number-word compounds such as "3-letter" count as single words.
        boost::shared_ptr<RegexRule> wordsWithNumbersRule(
            new RegexRule("[0-9]+\\-(\\p{L}|[0-9])+",
                          TokenAnnotation::WORD, ""));
        wordsWithNumbersRule->apply(result);
        // Integers and decimals (dot or comma separator) become number NEs.
        boost::shared_ptr<RegexRule> numbersRule(
            new RegexRule("\\b[0-9]+([\\.\\,][0-9]+)?\\b",
                          TokenAnnotation::NE, "ne_number"));
        numbersRule->apply(result);
        // Multi-character words; apostrophes and hyphens allowed inside
        // but not at either end.
        boost::shared_ptr<RegexRule> wordsRule(
            new RegexRule(
                "(\\p{L}|[0-9])(\\p{L}|[0-9]|'|\\-)*(\\p{L}|[0-9])",
                TokenAnnotation::WORD, ""));
        wordsRule->apply(result);
        // Finally, any remaining single letters.
        boost::shared_ptr<RegexRule> singleLetterWordsRule(
            new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
        singleLetterWordsRule->apply(result);
    }

    return result;
}

/// @brief Loads named-entity rules from a file. Each line must contain
///        exactly two space-separated fields: a regex pattern and the NE
///        annotation value.
/// @param namedEntitiesPath path to the named-entities rule file.
/// @throws ConcordiaException when the file is missing, unreadable, or a
///         line does not have exactly two fields.
void SentenceTokenizer::_createNeRules(std::string & namedEntitiesPath) {
    if (boost::filesystem::exists(namedEntitiesPath)) {
        std::string line;
        std::ifstream neFile(namedEntitiesPath.c_str());
        if (neFile.is_open()) {
            int lineCounter = 0;
            while (getline(neFile, line)) {
                lineCounter++;
                boost::shared_ptr<std::vector<std::string> > tokenTexts(
                    new std::vector<std::string>());
                boost::split(*tokenTexts, line,
                             boost::is_any_of(" "),
                             boost::token_compress_on);
                if (tokenTexts->size() != 2) {
                    std::stringstream ss;
                    ss << "Invalid line: " << lineCounter
                       << " in NE file: " << namedEntitiesPath;
                    throw ConcordiaException(ss.str());
                } else {
                    _namedEntities.push_back(RegexRule(
                        tokenTexts->at(0),
                        TokenAnnotation::NE,
                        tokenTexts->at(1)));
                }
            }
            neFile.close();
        } else {
            throw ConcordiaException("Unable to read named entities file.");
        }
    } else {
        throw ConcordiaException("No named entities file.");
    }
}

/// @brief Builds the single HTML-tag rule from a file listing one tag name
///        per line; the resulting pattern matches opening and closing tags.
/// @param htmlTagsPath path to the HTML tags file.
/// @throws ConcordiaException when the file is missing, unreadable or empty
///         (an empty file would otherwise yield a malformed regex, because
///         the trailing "|" chop would eat the opening parenthesis).
void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
    std::string tagsExpression = "<\\/?(";
    bool hasTags = false;
    if (boost::filesystem::exists(htmlTagsPath)) {
        std::string line;
        std::ifstream tagsFile(htmlTagsPath.c_str());
        if (tagsFile.is_open()) {
            while (getline(tagsFile, line)) {
                tagsExpression += line + "|";
                hasTags = true;
            }
            tagsFile.close();
        } else {
            throw ConcordiaException("Unable to read html tags file.");
        }
    } else {
        throw ConcordiaException("No html tags file.");
    }
    if (!hasTags) {
        throw ConcordiaException("Empty html tags file: " + htmlTagsPath);
    }
    // Drop the trailing "|" left by the loop, then close the alternation.
    tagsExpression = tagsExpression.substr(0, tagsExpression.size() - 1);
    tagsExpression += ").*?>";
    _htmlTags = boost::shared_ptr<RegexRule>(
        new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG, "", false));
}

/// @brief Builds one rule that is the alternation of all patterns listed
///        (one per line) in the given file.
/// @param filePath       path to the pattern file.
/// @param annotationType annotation type assigned by the resulting rule.
/// @param value          annotation value assigned by the resulting rule.
/// @param wholeWord      when true, each pattern is wrapped in \b anchors.
/// @return the combined rule.
/// @throws ConcordiaException when the file is missing, unreadable or empty
///         (an empty file would otherwise yield the malformed regex ")").
boost::shared_ptr<RegexRule> SentenceTokenizer::_getMultipleRegexRule(
        std::string filePath,
        char annotationType,
        std::string value,
        bool wholeWord) {
    std::string expression = "(";
    bool hasPatterns = false;
    if (boost::filesystem::exists(filePath)) {
        std::string line;
        std::ifstream ruleFile(filePath.c_str());
        if (ruleFile.is_open()) {
            while (getline(ruleFile, line)) {
                if (wholeWord) {
                    expression += "\\b";
                }
                expression += line;
                if (wholeWord) {
                    expression += "\\b";
                }
                expression += "|";
                hasPatterns = true;
            }
            ruleFile.close();
        } else {
            throw ConcordiaException("Unable to read file: " + filePath);
        }
    } else {
        throw ConcordiaException("No " + filePath + " file.");
    }
    if (!hasPatterns) {
        throw ConcordiaException("Empty rule file: " + filePath);
    }
    // Drop the trailing "|" left by the loop, then close the alternation.
    expression = expression.substr(0, expression.size() - 1);
    expression += ")";
    return boost::shared_ptr<RegexRule>(
        new RegexRule(expression, annotationType, value, false));
}