#include "concordia/sentence_tokenizer.hpp" #include "concordia/token_annotation.hpp" #include #include #include #include #include SentenceTokenizer::SentenceTokenizer( boost::shared_ptr config) throw(ConcordiaException) { _createNeRules(config->getNamedEntitiesFilePath()); _createHtmlTagsRule(config->getHtmlTagsFilePath()); _stopWordsEnabled = config->isStopWordsEnabled(); if (_stopWordsEnabled) { _stopWords = _getMultipleRegexRule( config->getStopWordsFilePath(), TokenAnnotation::STOP_WORD_TYPE, "", true); } } SentenceTokenizer::~SentenceTokenizer() { } boost::shared_ptr SentenceTokenizer::tokenize(const std::string & sentence) { boost::shared_ptr result(new TokenizedSentence(sentence)); _htmlTags->apply(result); BOOST_FOREACH(RegexRule & neRule, _namedEntities) { neRule.apply(result); } result->toLowerCase(); if (_stopWordsEnabled) { _stopWords->apply(result); } boost::shared_ptr wordsRule( new RegexRule("\\w+", TokenAnnotation::WORD_TYPE, "word")); return result; } void SentenceTokenizer::_createNeRules(std::string & namedEntitiesPath) { if (boost::filesystem::exists(namedEntitiesPath)) { std::string line; std::ifstream neFile(namedEntitiesPath.c_str()); if (neFile.is_open()) { int lineCounter = 0; while (getline(neFile, line)) { lineCounter++; boost::shared_ptr > tokenTexts(new std::vector()); boost::split(*tokenTexts, line, boost::is_any_of(" "), boost::token_compress_on); if (tokenTexts->size() != 2) { std::stringstream ss; ss << "Invalid line: " << lineCounter << " in NE file: " << namedEntitiesPath; throw ConcordiaException(ss.str()); } else { _namedEntities.push_back(RegexRule( tokenTexts->at(0), TokenAnnotation::NE_TYPE, tokenTexts->at(1))); } } neFile.close(); } else { throw ConcordiaException("Unable to read named entities file."); } } else { throw ConcordiaException("No named entities file."); } } void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) { std::string tagsExpression = "<\\/?("; if (boost::filesystem::exists(htmlTagsPath)) { std::string line; std::ifstream tagsFile(htmlTagsPath.c_str()); if (tagsFile.is_open()) { while (getline(tagsFile, line)) { tagsExpression += "|"; } tagsFile.close(); } else { throw ConcordiaException("Unable to read html tags file."); } } else { throw ConcordiaException("No html tags file."); } tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1); tagsExpression += "br).*?>"; _htmlTags = boost::shared_ptr( new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG_TYPE, "", false)); } boost::shared_ptr SentenceTokenizer::_getMultipleRegexRule( std::string filePath, char annotationType, std::string value, bool wholeWord) { std::string expression = "("; if (boost::filesystem::exists(filePath)) { std::string line; std::ifstream ruleFile(filePath.c_str()); if (ruleFile.is_open()) { while (getline(ruleFile, line)) { if (wholeWord) { expression += "\\b"; } expression += line; if (wholeWord) { expression += "\\b"; } expression += "|"; } ruleFile.close(); } else { throw ConcordiaException("Unable to read file: "+filePath); } } else { throw ConcordiaException("No "+filePath+" file."); } expression = expression.substr(0, expression.size()-1); expression += ")"; return boost::shared_ptr( new RegexRule(expression, annotationType, value, false)); }