new tokenizer
This commit is contained in:
parent
a0673df75a
commit
bd73749388
@ -66,7 +66,7 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
|
|||||||
set(BASE_TARGETS concordia)
|
set(BASE_TARGETS concordia)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# ================================================
|
# ================================================
|
||||||
# Third-party libraries
|
# Third-party libraries
|
||||||
# ================================================
|
# ================================================
|
||||||
@ -93,7 +93,7 @@ endif(WITH_PCRE)
|
|||||||
# ----------------------------------------------------
|
# ----------------------------------------------------
|
||||||
set(Boost_USE_STATIC_LIBS OFF)
|
set(Boost_USE_STATIC_LIBS OFF)
|
||||||
set(Boost_USE_STATIC_RUNTIME OFF)
|
set(Boost_USE_STATIC_RUNTIME OFF)
|
||||||
find_package(Boost COMPONENTS
|
find_package(Boost COMPONENTS
|
||||||
serialization unit_test_framework system filesystem program_options iostreams regex locale REQUIRED)
|
serialization unit_test_framework system filesystem program_options iostreams regex locale REQUIRED)
|
||||||
|
|
||||||
# ----------------------------------------------------
|
# ----------------------------------------------------
|
||||||
@ -166,7 +166,7 @@ endif()
|
|||||||
# Concordia: sub-projects
|
# Concordia: sub-projects
|
||||||
# ================================================
|
# ================================================
|
||||||
|
|
||||||
set(ALL_DIRECTORIES concordia concordia-console libdivsufsort utf8 utf8case)
|
set(ALL_DIRECTORIES concordia concordia-console concordia-sentence-tokenizer libdivsufsort utf8 utf8case)
|
||||||
|
|
||||||
include_directories("${concordia_SOURCE_DIR}")
|
include_directories("${concordia_SOURCE_DIR}")
|
||||||
|
|
||||||
@ -179,7 +179,7 @@ foreach(dir ${ALL_DIRECTORIES})
|
|||||||
endforeach(dir)
|
endforeach(dir)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# ================================================
|
# ================================================
|
||||||
# Tests
|
# Tests
|
||||||
# ================================================
|
# ================================================
|
||||||
@ -198,7 +198,7 @@ if(DOXYGEN_FOUND)
|
|||||||
SET(DOXYFILE_LATEX ON)
|
SET(DOXYFILE_LATEX ON)
|
||||||
SET(DOXYGEN_HAVE_DOT NO)
|
SET(DOXYGEN_HAVE_DOT NO)
|
||||||
SET(DOXYGEN_QUIET YES)
|
SET(DOXYGEN_QUIET YES)
|
||||||
|
|
||||||
SET(INPUT_FILES)
|
SET(INPUT_FILES)
|
||||||
SET(INPUT_FILES "${INPUT_FILES} ${CMAKE_CURRENT_SOURCE_DIR}")
|
SET(INPUT_FILES "${INPUT_FILES} ${CMAKE_CURRENT_SOURCE_DIR}")
|
||||||
|
|
||||||
@ -212,6 +212,5 @@ if(DOXYGEN_FOUND)
|
|||||||
add_custom_target(doc ALL ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
|
add_custom_target(doc ALL ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
|
||||||
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
|
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
|
||||||
COMMENT "Generating API documentation with Doxygen" VERBATIM)
|
COMMENT "Generating API documentation with Doxygen" VERBATIM)
|
||||||
|
|
||||||
endif(DOXYGEN_FOUND)
|
|
||||||
|
|
||||||
|
endif(DOXYGEN_FOUND)
|
||||||
|
19
concordia-sentence-tokenizer/CMakeLists.txt
Normal file
19
concordia-sentence-tokenizer/CMakeLists.txt
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
|
||||||
|
# Build the concordia-sentence-tokenizer executable from its single source
# file and link it against the concordia library and its dependencies.
add_executable(concordia-sentence-tokenizer concordia-sentence-tokenizer.cpp)

target_link_libraries(concordia-sentence-tokenizer concordia utf8case ${Boost_LIBRARIES} ${LIBCONFIG_LIB})

# Optional regular expression back-ends. pcrecpp is linked whenever
# WITH_PCRE is set, regardless of WITH_RE2, so the two options are
# handled independently (re2 first, then pcrecpp).
if (WITH_RE2)
    target_link_libraries(concordia-sentence-tokenizer re2)
endif(WITH_RE2)

if (WITH_PCRE)
    target_link_libraries(concordia-sentence-tokenizer pcrecpp)
endif(WITH_PCRE)

# =====================================

install(TARGETS concordia-sentence-tokenizer DESTINATION bin/)
|
@ -0,0 +1,70 @@
|
|||||||
|
#include <iostream>
|
||||||
|
#include <fstream>
|
||||||
|
#include <boost/program_options.hpp>
|
||||||
|
#include <boost/algorithm/string.hpp>
|
||||||
|
#include <boost/shared_ptr.hpp>
|
||||||
|
#include <boost/foreach.hpp>
|
||||||
|
|
||||||
|
#include "concordia/concordia_config.hpp"
|
||||||
|
#include "concordia/sentence_tokenizer.hpp"
|
||||||
|
#include "concordia/tokenized_sentence.hpp"
|
||||||
|
#include "concordia/common/config.hpp"
|
||||||
|
#include "concordia/common/utils.hpp"
|
||||||
|
|
||||||
|
namespace po = boost::program_options;
|
||||||
|
|
||||||
|
int main(int argc, char** argv) {
|
||||||
|
po::options_description desc("Allowed options");
|
||||||
|
|
||||||
|
desc.add_options()
|
||||||
|
("help,h", "Display this message")
|
||||||
|
("config,c", boost::program_options::value<std::string>(),
|
||||||
|
"Concordia configuration file (required)");
|
||||||
|
po::variables_map cli;
|
||||||
|
po::store(po::parse_command_line(argc, argv, desc), cli);
|
||||||
|
po::notify(cli);
|
||||||
|
|
||||||
|
if (cli.count("help")) {
|
||||||
|
std::cerr << desc << std::endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string configFile;
|
||||||
|
if (cli.count("config")) {
|
||||||
|
configFile = cli["config"].as<std::string>();
|
||||||
|
} else {
|
||||||
|
std::cerr << "No Concordia configuration file given. Terminating."
|
||||||
|
<< std::endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
boost::shared_ptr<ConcordiaConfig> config =
|
||||||
|
boost::shared_ptr<ConcordiaConfig> (
|
||||||
|
new ConcordiaConfig(configFile));
|
||||||
|
|
||||||
|
SentenceTokenizer sentenceTokenizer(config);
|
||||||
|
for (std::string line; std::getline(std::cin, line);) {
|
||||||
|
TokenizedSentence ts = sentenceTokenizer.tokenize(line);
|
||||||
|
std::cout << ts.getTokenizedSentence() << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch(ConcordiaException & e) {
|
||||||
|
std::cerr << "ConcordiaException caught with message: "
|
||||||
|
<< std::endl
|
||||||
|
<< e.what()
|
||||||
|
<< std::endl
|
||||||
|
<< "Terminating execution."
|
||||||
|
<< std::endl;
|
||||||
|
return 1;
|
||||||
|
} catch(std::exception & e) {
|
||||||
|
std::cerr << "Unexpected exception caught with message: "
|
||||||
|
<< std::endl
|
||||||
|
<< e.what()
|
||||||
|
<< std::endl
|
||||||
|
<< "Terminating execution."
|
||||||
|
<< std::endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
@ -49,7 +49,7 @@ TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
|
|||||||
}
|
}
|
||||||
|
|
||||||
boost::shared_ptr<RegexRule> wordsRule(
|
boost::shared_ptr<RegexRule> wordsRule(
|
||||||
new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}",
|
new RegexRule("(\\p{L}|[0-9])(\\p{L}|[0-9]|'|\\-)*(\\p{L}|[0-9])",
|
||||||
TokenAnnotation::WORD, ""));
|
TokenAnnotation::WORD, ""));
|
||||||
wordsRule->apply(result);
|
wordsRule->apply(result);
|
||||||
boost::shared_ptr<RegexRule> singleLetterWordsRule(
|
boost::shared_ptr<RegexRule> singleLetterWordsRule(
|
||||||
@ -100,7 +100,7 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
|
|||||||
std::ifstream tagsFile(htmlTagsPath.c_str());
|
std::ifstream tagsFile(htmlTagsPath.c_str());
|
||||||
if (tagsFile.is_open()) {
|
if (tagsFile.is_open()) {
|
||||||
while (getline(tagsFile, line)) {
|
while (getline(tagsFile, line)) {
|
||||||
tagsExpression += "|";
|
tagsExpression += line +"|";
|
||||||
}
|
}
|
||||||
tagsFile.close();
|
tagsFile.close();
|
||||||
} else {
|
} else {
|
||||||
@ -110,7 +110,7 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
|
|||||||
throw ConcordiaException("No html tags file.");
|
throw ConcordiaException("No html tags file.");
|
||||||
}
|
}
|
||||||
tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
|
tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
|
||||||
tagsExpression += "br).*?>";
|
tagsExpression += ").*?>";
|
||||||
_htmlTags = boost::shared_ptr<RegexRule>(
|
_htmlTags = boost::shared_ptr<RegexRule>(
|
||||||
new RegexRule(tagsExpression,
|
new RegexRule(tagsExpression,
|
||||||
TokenAnnotation::HTML_TAG, "", false));
|
TokenAnnotation::HTML_TAG, "", false));
|
||||||
@ -149,4 +149,3 @@ boost::shared_ptr<RegexRule>
|
|||||||
return boost::shared_ptr<RegexRule>(
|
return boost::shared_ptr<RegexRule>(
|
||||||
new RegexRule(expression, annotationType, value, false));
|
new RegexRule(expression, annotationType, value, false));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -17,115 +17,20 @@ BOOST_AUTO_TEST_CASE( NETest )
|
|||||||
{
|
{
|
||||||
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
||||||
SentenceTokenizer tokenizer(config);
|
SentenceTokenizer tokenizer(config);
|
||||||
|
|
||||||
|
|
||||||
std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
|
std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
|
||||||
TokenizedSentence ts = tokenizer.tokenize(sentence);
|
TokenizedSentence ts = tokenizer.tokenize(sentence);
|
||||||
|
BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "date ne_date mail ne_email number ne_number hello3 zażółć gęślą jaźń zażółć gęślą jaźń");
|
||||||
|
|
||||||
std::list<TokenAnnotation> annotations = ts.getAnnotations();
|
std::list<TokenAnnotation> annotations = ts.getAnnotations();
|
||||||
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(14,annotations.size());
|
|
||||||
|
|
||||||
/*
|
|
||||||
0,4 type: 1 value: date
|
|
||||||
6,16 type: 0 value: ne_date
|
|
||||||
18,22 type: 1 value: mail
|
|
||||||
24,40 type: 0 value: ne_email
|
|
||||||
42,48 type: 1 value: number
|
|
||||||
50,54 type: 0 value: ne_number
|
|
||||||
56,61 type: 1 value: hello
|
|
||||||
61,62 type: 0 value: ne_number
|
|
||||||
63,69 type: 1 value: zażółć
|
|
||||||
70,75 type: 1 value: gęślą
|
|
||||||
76,80 type: 1 value: jaźń
|
|
||||||
82,88 type: 1 value: zażółć
|
|
||||||
89,94 type: 1 value: gęślą
|
|
||||||
95,99 type: 1 value: jaźń
|
|
||||||
*/
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),0);
|
BOOST_CHECK_EQUAL(13,annotations.size());
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),4);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "date");
|
|
||||||
++iter;
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),6);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),16);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "ne_date");
|
|
||||||
++iter;
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),18);
|
std::stringstream ss;
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),22);
|
ss << ts;
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL("[0,4][1][date] [6,16][0][ne_date] [18,22][1][mail] [24,40][0][ne_email] [42,48][1][number] [50,54][0][ne_number] [56,62][1][hello3] [63,69][1][zażółć] [70,75][1][gęślą] [76,80][1][jaźń] [82,88][1][zażółć] [89,94][1][gęślą] [95,99][1][jaźń]", ss.str());
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "mail");
|
|
||||||
++iter;
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),24);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),40);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "ne_email");
|
|
||||||
++iter;
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),42);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),48);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "number");
|
|
||||||
++iter;
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),50);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),54);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
|
|
||||||
++iter;
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),56);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),61);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "hello");
|
|
||||||
++iter;
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),61);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),62);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
|
|
||||||
++iter;
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),63);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),69);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
|
|
||||||
++iter;
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),70);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),75);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
|
|
||||||
++iter;
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),76);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),80);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
|
|
||||||
++iter;
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),82);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),88);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
|
|
||||||
++iter;
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),89);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),94);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
|
|
||||||
++iter;
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),95);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),99);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( HtmlTagsTest )
|
BOOST_AUTO_TEST_CASE( HtmlTagsTest )
|
||||||
@ -135,77 +40,18 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
|
|||||||
|
|
||||||
std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
|
std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
|
||||||
TokenizedSentence ts = tokenizer.tokenize(sentence);
|
TokenizedSentence ts = tokenizer.tokenize(sentence);
|
||||||
std::list<TokenAnnotation> annotations = ts.getAnnotations();
|
BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "link and bold and newline");
|
||||||
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
0,23 type: 2 value:
|
|
||||||
23,27 type: 1 value: link
|
|
||||||
27,31 type: 2 value:
|
|
||||||
32,35 type: 1 value: and
|
|
||||||
36,39 type: 2 value:
|
|
||||||
39,43 type: 1 value: bold
|
|
||||||
43,47 type: 2 value:
|
|
||||||
48,51 type: 1 value: and
|
|
||||||
52,59 type: 1 value: newline
|
|
||||||
60,65 type: 2 value:
|
|
||||||
*/
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(10,annotations.size());
|
BOOST_AUTO_TEST_CASE( NormalSentencesTest )
|
||||||
|
{
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),0);
|
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),23);
|
SentenceTokenizer tokenizer(config);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
|
||||||
++iter;
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),23);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),27);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(),"link");
|
|
||||||
++iter;
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),27);
|
std::string sentence = "5.5.3.9 zbiornik balastowy t1500 347m3 ;";
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),31);
|
TokenizedSentence ts = tokenizer.tokenize(sentence);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "ne_bullet zbiornik balastowy t1500 347m3");
|
||||||
++iter;
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),32);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),35);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(),"and");
|
|
||||||
++iter;
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),36);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),39);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
|
||||||
++iter;
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),39);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),43);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(),"bold");
|
|
||||||
++iter;
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),43);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),47);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
|
||||||
++iter;
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),48);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),51);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(),"and");
|
|
||||||
++iter;
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),52);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),59);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getValue(),"newline");
|
|
||||||
++iter;
|
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),60);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),65);
|
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
|
BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
|
||||||
@ -217,7 +63,7 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
|
|||||||
TokenizedSentence ts = tokenizer.tokenize(sentence);
|
TokenizedSentence ts = tokenizer.tokenize(sentence);
|
||||||
std::list<TokenAnnotation> annotations = ts.getAnnotations();
|
std::list<TokenAnnotation> annotations = ts.getAnnotations();
|
||||||
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
||||||
|
|
||||||
/*
|
/*
|
||||||
0,4 type: 1 value: this
|
0,4 type: 1 value: this
|
||||||
5,7 type: 1 value: is
|
5,7 type: 1 value: is
|
||||||
@ -235,7 +81,7 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(13,annotations.size());
|
BOOST_CHECK_EQUAL(13,annotations.size());
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),0);
|
BOOST_CHECK_EQUAL(iter->getStart(),0);
|
||||||
BOOST_CHECK_EQUAL(iter->getEnd(),4);
|
BOOST_CHECK_EQUAL(iter->getEnd(),4);
|
||||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
@ -330,13 +176,12 @@ BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
|
|||||||
{
|
{
|
||||||
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
||||||
SentenceTokenizer tokenizer(config);
|
SentenceTokenizer tokenizer(config);
|
||||||
|
|
||||||
std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
|
std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
|
||||||
TokenizedSentence ts = tokenizer.tokenize(sentence);
|
TokenizedSentence ts = tokenizer.tokenize(sentence);
|
||||||
|
|
||||||
std::list<TokenAnnotation> annotations = ts.getAnnotations();
|
std::list<TokenAnnotation> annotations = ts.getAnnotations();
|
||||||
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
BOOST_CHECK_EQUAL(9, annotations.size());
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(161, annotations.size());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE_END()
|
BOOST_AUTO_TEST_SUITE_END()
|
||||||
|
@ -2,7 +2,9 @@
|
|||||||
#include "concordia/common/text_utils.hpp"
|
#include "concordia/common/text_utils.hpp"
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
#include <sstream>
|
||||||
#include <boost/foreach.hpp>
|
#include <boost/foreach.hpp>
|
||||||
|
#include <boost/algorithm/string.hpp>
|
||||||
|
|
||||||
TokenizedSentence::TokenizedSentence(std::string sentence):
|
TokenizedSentence::TokenizedSentence(std::string sentence):
|
||||||
_sentence(sentence) {
|
_sentence(sentence) {
|
||||||
@ -73,3 +75,15 @@ void TokenizedSentence::generateTokens() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string TokenizedSentence::getTokenizedSentence() const {
|
||||||
|
std::stringstream ss;
|
||||||
|
BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {
|
||||||
|
if (annotation.getType() == TokenAnnotation::WORD ||
|
||||||
|
annotation.getType() == TokenAnnotation::NE) {
|
||||||
|
ss << annotation.getValue() << " ";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::string result = ss.str();
|
||||||
|
boost::trim_right(result);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
@ -9,6 +9,10 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <list>
|
#include <list>
|
||||||
|
#include <iostream>
|
||||||
|
#include <boost/foreach.hpp>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
A sentence after tokenizing operations. The class
|
A sentence after tokenizing operations. The class
|
||||||
@ -31,13 +35,19 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~TokenizedSentence();
|
virtual ~TokenizedSentence();
|
||||||
|
|
||||||
/*! Getter for sentence
|
/*! Getter for the string sentence, which is used for extracting tokens.
|
||||||
\returns sentence
|
\returns sentence
|
||||||
*/
|
*/
|
||||||
std::string getSentence() const {
|
std::string getSentence() const {
|
||||||
return _sentence;
|
return _sentence;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Method for getting tokenized sentence in a string format (
|
||||||
|
tokens separated by single spaces).
|
||||||
|
\returns tokenized sentence
|
||||||
|
*/
|
||||||
|
std::string getTokenizedSentence() const;
|
||||||
|
|
||||||
/*! Getter for all annotations list. This method returns
|
/*! Getter for all annotations list. This method returns
|
||||||
all annotations, including those which are not considered
|
all annotations, including those which are not considered
|
||||||
in the hash, i.e. stop words and html tags.
|
in the hash, i.e. stop words and html tags.
|
||||||
@ -82,12 +92,12 @@ public:
|
|||||||
*/
|
*/
|
||||||
void generateTokens();
|
void generateTokens();
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Transform the sentence to lower case.
|
Transform the sentence to lower case.
|
||||||
*/
|
*/
|
||||||
void toLowerCase();
|
void toLowerCase();
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Add new annotations to the existing annotations list. Assumptions:
|
Add new annotations to the existing annotations list. Assumptions:
|
||||||
1. existing _tokenAnnotations vector contains disjoint, sorted intervals;
|
1. existing _tokenAnnotations vector contains disjoint, sorted intervals;
|
||||||
2. the annotations to be added list also has the above properties.
|
2. the annotations to be added list also has the above properties.
|
||||||
@ -98,6 +108,21 @@ public:
|
|||||||
*/
|
*/
|
||||||
void addAnnotations(std::vector<TokenAnnotation> annotations);
|
void addAnnotations(std::vector<TokenAnnotation> annotations);
|
||||||
|
|
||||||
|
friend std::ostream & operator << (std::ostream & o,
|
||||||
|
const TokenizedSentence & ts) {
|
||||||
|
int index = 0;
|
||||||
|
BOOST_FOREACH(TokenAnnotation token, ts.getAnnotations()) {
|
||||||
|
o << "[" << token.getStart() << "," << token.getEnd() << "]["
|
||||||
|
<< token.getType() << "][" << token.getValue() <<"]";
|
||||||
|
if (index < ts.getAnnotations().size() - 1) {
|
||||||
|
o << " ";
|
||||||
|
}
|
||||||
|
index++;
|
||||||
|
}
|
||||||
|
return o;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::string _sentence;
|
std::string _sentence;
|
||||||
|
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
[0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4} ne_date
|
[0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4} ne_date
|
||||||
[\w\._\d]+@\w+(\.\w+)* ne_email
|
[\w\._\d]+@\w+(\.\w+)* ne_email
|
||||||
[0-9]+([\.\,][0-9]+)? ne_number
|
[0-9]+[\.\)]([0-9]+\.)+ ne_bullet
|
||||||
|
\b[0-9]+([\.\,][0-9]+)?\b ne_number
|
||||||
|
Loading…
Reference in New Issue
Block a user