new tokenizer
This commit is contained in:
parent
a0673df75a
commit
bd73749388
@ -66,7 +66,7 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
|
||||
set(BASE_TARGETS concordia)
|
||||
|
||||
|
||||
|
||||
|
||||
# ================================================
|
||||
# Third-party libraries
|
||||
# ================================================
|
||||
@ -93,7 +93,7 @@ endif(WITH_PCRE)
|
||||
# ----------------------------------------------------
|
||||
set(Boost_USE_STATIC_LIBS OFF)
|
||||
set(Boost_USE_STATIC_RUNTIME OFF)
|
||||
find_package(Boost COMPONENTS
|
||||
find_package(Boost COMPONENTS
|
||||
serialization unit_test_framework system filesystem program_options iostreams regex locale REQUIRED)
|
||||
|
||||
# ----------------------------------------------------
|
||||
@ -166,7 +166,7 @@ endif()
|
||||
# Concordia: sub-projects
|
||||
# ================================================
|
||||
|
||||
set(ALL_DIRECTORIES concordia concordia-console libdivsufsort utf8 utf8case)
|
||||
set(ALL_DIRECTORIES concordia concordia-console concordia-sentence-tokenizer libdivsufsort utf8 utf8case)
|
||||
|
||||
include_directories("${concordia_SOURCE_DIR}")
|
||||
|
||||
@ -179,7 +179,7 @@ foreach(dir ${ALL_DIRECTORIES})
|
||||
endforeach(dir)
|
||||
|
||||
|
||||
|
||||
|
||||
# ================================================
|
||||
# Tests
|
||||
# ================================================
|
||||
@ -198,7 +198,7 @@ if(DOXYGEN_FOUND)
|
||||
SET(DOXYFILE_LATEX ON)
|
||||
SET(DOXYGEN_HAVE_DOT NO)
|
||||
SET(DOXYGEN_QUIET YES)
|
||||
|
||||
|
||||
SET(INPUT_FILES)
|
||||
SET(INPUT_FILES "${INPUT_FILES} ${CMAKE_CURRENT_SOURCE_DIR}")
|
||||
|
||||
@ -212,6 +212,5 @@ if(DOXYGEN_FOUND)
|
||||
add_custom_target(doc ALL ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
|
||||
COMMENT "Generating API documentation with Doxygen" VERBATIM)
|
||||
|
||||
endif(DOXYGEN_FOUND)
|
||||
|
||||
endif(DOXYGEN_FOUND)
|
||||
|
19
concordia-sentence-tokenizer/CMakeLists.txt
Normal file
19
concordia-sentence-tokenizer/CMakeLists.txt
Normal file
@ -0,0 +1,19 @@
|
||||
|
||||
# Command-line tool built from concordia-sentence-tokenizer.cpp and linked
# against the concordia library.
add_executable(concordia-sentence-tokenizer concordia-sentence-tokenizer.cpp)

target_link_libraries(concordia-sentence-tokenizer concordia utf8case ${Boost_LIBRARIES} ${LIBCONFIG_LIB})

# The two regex back-ends are selected independently of each other, so each
# one gets its own conditional. re2 is still added before pcrecpp, which keeps
# the link order of the previous nested form.
if (WITH_RE2)
    target_link_libraries(concordia-sentence-tokenizer re2)
endif(WITH_RE2)

if (WITH_PCRE)
    target_link_libraries(concordia-sentence-tokenizer pcrecpp)
endif(WITH_PCRE)

# =====================================

install(TARGETS concordia-sentence-tokenizer DESTINATION bin/)
|
@ -0,0 +1,70 @@
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <boost/program_options.hpp>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/foreach.hpp>
|
||||
|
||||
#include "concordia/concordia_config.hpp"
|
||||
#include "concordia/sentence_tokenizer.hpp"
|
||||
#include "concordia/tokenized_sentence.hpp"
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/common/utils.hpp"
|
||||
|
||||
namespace po = boost::program_options;
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
po::options_description desc("Allowed options");
|
||||
|
||||
desc.add_options()
|
||||
("help,h", "Display this message")
|
||||
("config,c", boost::program_options::value<std::string>(),
|
||||
"Concordia configuration file (required)");
|
||||
po::variables_map cli;
|
||||
po::store(po::parse_command_line(argc, argv, desc), cli);
|
||||
po::notify(cli);
|
||||
|
||||
if (cli.count("help")) {
|
||||
std::cerr << desc << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::string configFile;
|
||||
if (cli.count("config")) {
|
||||
configFile = cli["config"].as<std::string>();
|
||||
} else {
|
||||
std::cerr << "No Concordia configuration file given. Terminating."
|
||||
<< std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
try {
|
||||
boost::shared_ptr<ConcordiaConfig> config =
|
||||
boost::shared_ptr<ConcordiaConfig> (
|
||||
new ConcordiaConfig(configFile));
|
||||
|
||||
SentenceTokenizer sentenceTokenizer(config);
|
||||
for (std::string line; std::getline(std::cin, line);) {
|
||||
TokenizedSentence ts = sentenceTokenizer.tokenize(line);
|
||||
std::cout << ts.getTokenizedSentence() << std::endl;
|
||||
}
|
||||
|
||||
} catch(ConcordiaException & e) {
|
||||
std::cerr << "ConcordiaException caught with message: "
|
||||
<< std::endl
|
||||
<< e.what()
|
||||
<< std::endl
|
||||
<< "Terminating execution."
|
||||
<< std::endl;
|
||||
return 1;
|
||||
} catch(std::exception & e) {
|
||||
std::cerr << "Unexpected exception caught with message: "
|
||||
<< std::endl
|
||||
<< e.what()
|
||||
<< std::endl
|
||||
<< "Terminating execution."
|
||||
<< std::endl;
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
@ -49,7 +49,7 @@ TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
|
||||
}
|
||||
|
||||
boost::shared_ptr<RegexRule> wordsRule(
|
||||
new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}",
|
||||
new RegexRule("(\\p{L}|[0-9])(\\p{L}|[0-9]|'|\\-)*(\\p{L}|[0-9])",
|
||||
TokenAnnotation::WORD, ""));
|
||||
wordsRule->apply(result);
|
||||
boost::shared_ptr<RegexRule> singleLetterWordsRule(
|
||||
@ -100,7 +100,7 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
|
||||
std::ifstream tagsFile(htmlTagsPath.c_str());
|
||||
if (tagsFile.is_open()) {
|
||||
while (getline(tagsFile, line)) {
|
||||
tagsExpression += "|";
|
||||
tagsExpression += line +"|";
|
||||
}
|
||||
tagsFile.close();
|
||||
} else {
|
||||
@ -110,7 +110,7 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
|
||||
throw ConcordiaException("No html tags file.");
|
||||
}
|
||||
tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
|
||||
tagsExpression += "br).*?>";
|
||||
tagsExpression += ").*?>";
|
||||
_htmlTags = boost::shared_ptr<RegexRule>(
|
||||
new RegexRule(tagsExpression,
|
||||
TokenAnnotation::HTML_TAG, "", false));
|
||||
@ -149,4 +149,3 @@ boost::shared_ptr<RegexRule>
|
||||
return boost::shared_ptr<RegexRule>(
|
||||
new RegexRule(expression, annotationType, value, false));
|
||||
}
|
||||
|
||||
|
@ -17,115 +17,20 @@ BOOST_AUTO_TEST_CASE( NETest )
|
||||
{
|
||||
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
||||
SentenceTokenizer tokenizer(config);
|
||||
|
||||
|
||||
|
||||
|
||||
std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
|
||||
TokenizedSentence ts = tokenizer.tokenize(sentence);
|
||||
BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "date ne_date mail ne_email number ne_number hello3 zażółć gęślą jaźń zażółć gęślą jaźń");
|
||||
|
||||
std::list<TokenAnnotation> annotations = ts.getAnnotations();
|
||||
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
||||
|
||||
BOOST_CHECK_EQUAL(14,annotations.size());
|
||||
|
||||
/*
|
||||
0,4 type: 1 value: date
|
||||
6,16 type: 0 value: ne_date
|
||||
18,22 type: 1 value: mail
|
||||
24,40 type: 0 value: ne_email
|
||||
42,48 type: 1 value: number
|
||||
50,54 type: 0 value: ne_number
|
||||
56,61 type: 1 value: hello
|
||||
61,62 type: 0 value: ne_number
|
||||
63,69 type: 1 value: zażółć
|
||||
70,75 type: 1 value: gęślą
|
||||
76,80 type: 1 value: jaźń
|
||||
82,88 type: 1 value: zażółć
|
||||
89,94 type: 1 value: gęślą
|
||||
95,99 type: 1 value: jaźń
|
||||
*/
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),0);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),4);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "date");
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),6);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),16);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "ne_date");
|
||||
++iter;
|
||||
BOOST_CHECK_EQUAL(13,annotations.size());
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),18);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),22);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "mail");
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),24);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),40);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "ne_email");
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),42);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),48);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "number");
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),50);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),54);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),56);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),61);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "hello");
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),61);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),62);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),63);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),69);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),70);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),75);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),76);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),80);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),82);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),88);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),89);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),94);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),95);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),99);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
|
||||
|
||||
std::stringstream ss;
|
||||
ss << ts;
|
||||
BOOST_CHECK_EQUAL("[0,4][1][date] [6,16][0][ne_date] [18,22][1][mail] [24,40][0][ne_email] [42,48][1][number] [50,54][0][ne_number] [56,62][1][hello3] [63,69][1][zażółć] [70,75][1][gęślą] [76,80][1][jaźń] [82,88][1][zażółć] [89,94][1][gęślą] [95,99][1][jaźń]", ss.str());
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( HtmlTagsTest )
|
||||
@ -135,77 +40,18 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
|
||||
|
||||
std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
|
||||
TokenizedSentence ts = tokenizer.tokenize(sentence);
|
||||
std::list<TokenAnnotation> annotations = ts.getAnnotations();
|
||||
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
||||
|
||||
/*
|
||||
0,23 type: 2 value:
|
||||
23,27 type: 1 value: link
|
||||
27,31 type: 2 value:
|
||||
32,35 type: 1 value: and
|
||||
36,39 type: 2 value:
|
||||
39,43 type: 1 value: bold
|
||||
43,47 type: 2 value:
|
||||
48,51 type: 1 value: and
|
||||
52,59 type: 1 value: newline
|
||||
60,65 type: 2 value:
|
||||
*/
|
||||
BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "link and bold and newline");
|
||||
}
|
||||
|
||||
BOOST_CHECK_EQUAL(10,annotations.size());
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),0);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),23);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),23);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),27);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(),"link");
|
||||
++iter;
|
||||
BOOST_AUTO_TEST_CASE( NormalSentencesTest )
|
||||
{
|
||||
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
||||
SentenceTokenizer tokenizer(config);
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),27);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),31);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
||||
++iter;
|
||||
std::string sentence = "5.5.3.9 zbiornik balastowy t1500 347m3 ;";
|
||||
TokenizedSentence ts = tokenizer.tokenize(sentence);
|
||||
BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "ne_bullet zbiornik balastowy t1500 347m3");
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),32);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),35);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(),"and");
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),36);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),39);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),39);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),43);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(),"bold");
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),43);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),47);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),48);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),51);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(),"and");
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),52);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),59);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
BOOST_CHECK_EQUAL(iter->getValue(),"newline");
|
||||
++iter;
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),60);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),65);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
|
||||
@ -217,7 +63,7 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
|
||||
TokenizedSentence ts = tokenizer.tokenize(sentence);
|
||||
std::list<TokenAnnotation> annotations = ts.getAnnotations();
|
||||
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
||||
|
||||
|
||||
/*
|
||||
0,4 type: 1 value: this
|
||||
5,7 type: 1 value: is
|
||||
@ -235,7 +81,7 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
|
||||
*/
|
||||
|
||||
BOOST_CHECK_EQUAL(13,annotations.size());
|
||||
|
||||
|
||||
BOOST_CHECK_EQUAL(iter->getStart(),0);
|
||||
BOOST_CHECK_EQUAL(iter->getEnd(),4);
|
||||
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||
@ -330,13 +176,12 @@ BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
|
||||
{
|
||||
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
||||
SentenceTokenizer tokenizer(config);
|
||||
|
||||
|
||||
std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
|
||||
TokenizedSentence ts = tokenizer.tokenize(sentence);
|
||||
|
||||
std::list<TokenAnnotation> annotations = ts.getAnnotations();
|
||||
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
||||
|
||||
BOOST_CHECK_EQUAL(161, annotations.size());
|
||||
BOOST_CHECK_EQUAL(9, annotations.size());
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
|
@ -2,7 +2,9 @@
|
||||
#include "concordia/common/text_utils.hpp"
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <boost/foreach.hpp>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
TokenizedSentence::TokenizedSentence(std::string sentence):
|
||||
_sentence(sentence) {
|
||||
@ -73,3 +75,15 @@ void TokenizedSentence::generateTokens() {
|
||||
}
|
||||
}
|
||||
|
||||
// Builds the string form of the tokenized sentence: the values of all
// annotations of type WORD or NE, in list order, each followed by a single
// space, with trailing whitespace removed from the final result.
std::string TokenizedSentence::getTokenizedSentence() const {
    std::stringstream joined;

    BOOST_FOREACH(TokenAnnotation token, _tokenAnnotations) {
        // Annotations of any other type are left out of the output.
        if (token.getType() != TokenAnnotation::WORD &&
            token.getType() != TokenAnnotation::NE) {
            continue;
        }
        joined << token.getValue() << " ";
    }

    std::string text = joined.str();
    // Drops the separator written after the last emitted value.
    boost::trim_right(text);
    return text;
}
|
||||
|
@ -9,6 +9,10 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <list>
|
||||
#include <iostream>
|
||||
#include <boost/foreach.hpp>
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
A sentence after tokenizing operations. The class
|
||||
@ -31,13 +35,19 @@ public:
|
||||
*/
|
||||
virtual ~TokenizedSentence();
|
||||
|
||||
/*! Getter for sentence
|
||||
/*! Getter for the string sentence, which is used for extracting tokens.
|
||||
\returns sentence
|
||||
*/
|
||||
std::string getSentence() const {
|
||||
return _sentence;
|
||||
}
|
||||
|
||||
/*! Method for getting tokenized sentence in a string format (
|
||||
tokens separated by single spaces).
|
||||
\returns tokenized sentence
|
||||
*/
|
||||
std::string getTokenizedSentence() const;
|
||||
|
||||
/*! Getter for all annotations list. This method returns
|
||||
all annotations, including those which are not considered
|
||||
in the hash, i.e. stop words and html tags.
|
||||
@ -82,12 +92,12 @@ public:
|
||||
*/
|
||||
void generateTokens();
|
||||
|
||||
/*!
|
||||
/*!
|
||||
Transform the sentence to lower case.
|
||||
*/
|
||||
void toLowerCase();
|
||||
|
||||
/*!
|
||||
/*!
|
||||
Add new annotations to the existing annotations list. Assumptions:
|
||||
1. existing _tokenAnnotations vector contains disjoint, sorted intervals;
|
||||
2. the annotations to be added list also has the above properties.
|
||||
@ -98,6 +108,21 @@ public:
|
||||
*/
|
||||
void addAnnotations(std::vector<TokenAnnotation> annotations);
|
||||
|
||||
friend std::ostream & operator << (std::ostream & o,
|
||||
const TokenizedSentence & ts) {
|
||||
int index = 0;
|
||||
BOOST_FOREACH(TokenAnnotation token, ts.getAnnotations()) {
|
||||
o << "[" << token.getStart() << "," << token.getEnd() << "]["
|
||||
<< token.getType() << "][" << token.getValue() <<"]";
|
||||
if (index < ts.getAnnotations().size() - 1) {
|
||||
o << " ";
|
||||
}
|
||||
index++;
|
||||
}
|
||||
return o;
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
std::string _sentence;
|
||||
|
||||
|
@ -1,3 +1,4 @@
|
||||
[0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4} ne_date
|
||||
[\w\._\d]+@\w+(\.\w+)* ne_email
|
||||
[0-9]+([\.\,][0-9]+)? ne_number
|
||||
[0-9]+[\.\)]([0-9]+\.)+ ne_bullet
|
||||
\b[0-9]+([\.\,][0-9]+)?\b ne_number
|
||||
|
Loading…
Reference in New Issue
Block a user