new tokenizer

This commit is contained in:
rjawor 2017-04-26 17:02:18 +02:00
parent a0673df75a
commit bd73749388
8 changed files with 164 additions and 192 deletions

View File

@ -66,7 +66,7 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
set(BASE_TARGETS concordia) set(BASE_TARGETS concordia)
# ================================================ # ================================================
# Third-party libraries # Third-party libraries
# ================================================ # ================================================
@ -93,7 +93,7 @@ endif(WITH_PCRE)
# ---------------------------------------------------- # ----------------------------------------------------
set(Boost_USE_STATIC_LIBS OFF) set(Boost_USE_STATIC_LIBS OFF)
set(Boost_USE_STATIC_RUNTIME OFF) set(Boost_USE_STATIC_RUNTIME OFF)
find_package(Boost COMPONENTS find_package(Boost COMPONENTS
serialization unit_test_framework system filesystem program_options iostreams regex locale REQUIRED) serialization unit_test_framework system filesystem program_options iostreams regex locale REQUIRED)
# ---------------------------------------------------- # ----------------------------------------------------
@ -166,7 +166,7 @@ endif()
# Concordia: sub-projects # Concordia: sub-projects
# ================================================ # ================================================
set(ALL_DIRECTORIES concordia concordia-console libdivsufsort utf8 utf8case) set(ALL_DIRECTORIES concordia concordia-console concordia-sentence-tokenizer libdivsufsort utf8 utf8case)
include_directories("${concordia_SOURCE_DIR}") include_directories("${concordia_SOURCE_DIR}")
@ -179,7 +179,7 @@ foreach(dir ${ALL_DIRECTORIES})
endforeach(dir) endforeach(dir)
# ================================================ # ================================================
# Tests # Tests
# ================================================ # ================================================
@ -198,7 +198,7 @@ if(DOXYGEN_FOUND)
SET(DOXYFILE_LATEX ON) SET(DOXYFILE_LATEX ON)
SET(DOXYGEN_HAVE_DOT NO) SET(DOXYGEN_HAVE_DOT NO)
SET(DOXYGEN_QUIET YES) SET(DOXYGEN_QUIET YES)
SET(INPUT_FILES) SET(INPUT_FILES)
SET(INPUT_FILES "${INPUT_FILES} ${CMAKE_CURRENT_SOURCE_DIR}") SET(INPUT_FILES "${INPUT_FILES} ${CMAKE_CURRENT_SOURCE_DIR}")
@ -212,6 +212,5 @@ if(DOXYGEN_FOUND)
add_custom_target(doc ALL ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile add_custom_target(doc ALL ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
COMMENT "Generating API documentation with Doxygen" VERBATIM) COMMENT "Generating API documentation with Doxygen" VERBATIM)
endif(DOXYGEN_FOUND)
endif(DOXYGEN_FOUND)

View File

@ -0,0 +1,19 @@
add_executable(concordia-sentence-tokenizer concordia-sentence-tokenizer.cpp)
target_link_libraries(concordia-sentence-tokenizer concordia utf8case ${Boost_LIBRARIES} ${LIBCONFIG_LIB})

# Optional regex back-ends. RE2 and PCRE are independent options, so link
# each one on its own instead of nesting WITH_PCRE inside both branches of
# an if/else on WITH_RE2 (the PCRE linkage was identical in both arms).
if(WITH_RE2)
    target_link_libraries(concordia-sentence-tokenizer re2)
endif(WITH_RE2)
if(WITH_PCRE)
    target_link_libraries(concordia-sentence-tokenizer pcrecpp)
endif(WITH_PCRE)
# =====================================
install(TARGETS concordia-sentence-tokenizer DESTINATION bin/)

View File

@ -0,0 +1,70 @@
#include <iostream>
#include <fstream>
#include <boost/program_options.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp>
#include "concordia/concordia_config.hpp"
#include "concordia/sentence_tokenizer.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "concordia/common/config.hpp"
#include "concordia/common/utils.hpp"
namespace po = boost::program_options;

/*! Command-line filter: reads sentences from standard input, one per line,
    tokenizes each with SentenceTokenizer and writes the space-separated
    token string to standard output. Requires a Concordia configuration
    file passed via --config. Returns 0 on success, 1 on any error.
*/
int main(int argc, char** argv) {
    po::options_description desc("Allowed options");
    desc.add_options()
        ("help,h", "Display this message")
        ("config,c", boost::program_options::value<std::string>(),
                     "Concordia configuration file (required)");
    po::variables_map cli;
    po::store(po::parse_command_line(argc, argv, desc), cli);
    po::notify(cli);
    if (cli.count("help")) {
        std::cerr << desc << std::endl;
        return 1;
    }
    std::string configFile;
    if (cli.count("config")) {
        configFile = cli["config"].as<std::string>();
    } else {
        std::cerr << "No Concordia configuration file given. Terminating."
                  << std::endl;
        return 1;
    }
    try {
        boost::shared_ptr<ConcordiaConfig> config =
            boost::shared_ptr<ConcordiaConfig>(
                new ConcordiaConfig(configFile));
        SentenceTokenizer sentenceTokenizer(config);
        // This is a bulk line-by-line stdin filter and does not interleave
        // with C stdio, so drop the C/C++ stream synchronization overhead.
        std::ios_base::sync_with_stdio(false);
        for (std::string line; std::getline(std::cin, line);) {
            TokenizedSentence ts = sentenceTokenizer.tokenize(line);
            std::cout << ts.getTokenizedSentence() << std::endl;
        }
    } catch(const ConcordiaException & e) {
        // Known, project-level failure (e.g. bad configuration file).
        std::cerr << "ConcordiaException caught with message: "
                  << std::endl
                  << e.what()
                  << std::endl
                  << "Terminating execution."
                  << std::endl;
        return 1;
    } catch(const std::exception & e) {
        // Anything else unexpected: report and exit with an error code.
        std::cerr << "Unexpected exception caught with message: "
                  << std::endl
                  << e.what()
                  << std::endl
                  << "Terminating execution."
                  << std::endl;
        return 1;
    }
    return 0;
}

View File

@ -49,7 +49,7 @@ TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
} }
boost::shared_ptr<RegexRule> wordsRule( boost::shared_ptr<RegexRule> wordsRule(
new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}", new RegexRule("(\\p{L}|[0-9])(\\p{L}|[0-9]|'|\\-)*(\\p{L}|[0-9])",
TokenAnnotation::WORD, "")); TokenAnnotation::WORD, ""));
wordsRule->apply(result); wordsRule->apply(result);
boost::shared_ptr<RegexRule> singleLetterWordsRule( boost::shared_ptr<RegexRule> singleLetterWordsRule(
@ -100,7 +100,7 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
std::ifstream tagsFile(htmlTagsPath.c_str()); std::ifstream tagsFile(htmlTagsPath.c_str());
if (tagsFile.is_open()) { if (tagsFile.is_open()) {
while (getline(tagsFile, line)) { while (getline(tagsFile, line)) {
tagsExpression += "|"; tagsExpression += line +"|";
} }
tagsFile.close(); tagsFile.close();
} else { } else {
@ -110,7 +110,7 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
throw ConcordiaException("No html tags file."); throw ConcordiaException("No html tags file.");
} }
tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1); tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
tagsExpression += "br).*?>"; tagsExpression += ").*?>";
_htmlTags = boost::shared_ptr<RegexRule>( _htmlTags = boost::shared_ptr<RegexRule>(
new RegexRule(tagsExpression, new RegexRule(tagsExpression,
TokenAnnotation::HTML_TAG, "", false)); TokenAnnotation::HTML_TAG, "", false));
@ -149,4 +149,3 @@ boost::shared_ptr<RegexRule>
return boost::shared_ptr<RegexRule>( return boost::shared_ptr<RegexRule>(
new RegexRule(expression, annotationType, value, false)); new RegexRule(expression, annotationType, value, false));
} }

View File

@ -17,115 +17,20 @@ BOOST_AUTO_TEST_CASE( NETest )
{ {
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceTokenizer tokenizer(config); SentenceTokenizer tokenizer(config);
std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ"; std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
TokenizedSentence ts = tokenizer.tokenize(sentence); TokenizedSentence ts = tokenizer.tokenize(sentence);
BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "date ne_date mail ne_email number ne_number hello3 zażółć gęślą jaźń zażółć gęślą jaźń");
std::list<TokenAnnotation> annotations = ts.getAnnotations(); std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin(); std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(14,annotations.size());
/*
0,4 type: 1 value: date
6,16 type: 0 value: ne_date
18,22 type: 1 value: mail
24,40 type: 0 value: ne_email
42,48 type: 1 value: number
50,54 type: 0 value: ne_number
56,61 type: 1 value: hello
61,62 type: 0 value: ne_number
63,69 type: 1 value: zażółć
70,75 type: 1 value: gęślą
76,80 type: 1 value: jaźń
82,88 type: 1 value: zażółć
89,94 type: 1 value: gęślą
95,99 type: 1 value: jaźń
*/
BOOST_CHECK_EQUAL(iter->getStart(),0); BOOST_CHECK_EQUAL(13,annotations.size());
BOOST_CHECK_EQUAL(iter->getEnd(),4);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "date");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),6);
BOOST_CHECK_EQUAL(iter->getEnd(),16);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_date");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),18); std::stringstream ss;
BOOST_CHECK_EQUAL(iter->getEnd(),22); ss << ts;
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL("[0,4][1][date] [6,16][0][ne_date] [18,22][1][mail] [24,40][0][ne_email] [42,48][1][number] [50,54][0][ne_number] [56,62][1][hello3] [63,69][1][zażółć] [70,75][1][gęślą] [76,80][1][jaźń] [82,88][1][zażółć] [89,94][1][gęślą] [95,99][1][jaźń]", ss.str());
BOOST_CHECK_EQUAL(iter->getValue(), "mail");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),24);
BOOST_CHECK_EQUAL(iter->getEnd(),40);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_email");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),42);
BOOST_CHECK_EQUAL(iter->getEnd(),48);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "number");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),50);
BOOST_CHECK_EQUAL(iter->getEnd(),54);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),56);
BOOST_CHECK_EQUAL(iter->getEnd(),61);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "hello");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),61);
BOOST_CHECK_EQUAL(iter->getEnd(),62);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),63);
BOOST_CHECK_EQUAL(iter->getEnd(),69);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),70);
BOOST_CHECK_EQUAL(iter->getEnd(),75);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),76);
BOOST_CHECK_EQUAL(iter->getEnd(),80);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),82);
BOOST_CHECK_EQUAL(iter->getEnd(),88);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),89);
BOOST_CHECK_EQUAL(iter->getEnd(),94);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),95);
BOOST_CHECK_EQUAL(iter->getEnd(),99);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
} }
BOOST_AUTO_TEST_CASE( HtmlTagsTest ) BOOST_AUTO_TEST_CASE( HtmlTagsTest )
@ -135,77 +40,18 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>"; std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
TokenizedSentence ts = tokenizer.tokenize(sentence); TokenizedSentence ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts.getAnnotations(); BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "link and bold and newline");
std::list<TokenAnnotation>::iterator iter = annotations.begin(); }
/*
0,23 type: 2 value:
23,27 type: 1 value: link
27,31 type: 2 value:
32,35 type: 1 value: and
36,39 type: 2 value:
39,43 type: 1 value: bold
43,47 type: 2 value:
48,51 type: 1 value: and
52,59 type: 1 value: newline
60,65 type: 2 value:
*/
BOOST_CHECK_EQUAL(10,annotations.size()); BOOST_AUTO_TEST_CASE( NormalSentencesTest )
{
BOOST_CHECK_EQUAL(iter->getStart(),0); boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
BOOST_CHECK_EQUAL(iter->getEnd(),23); SentenceTokenizer tokenizer(config);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),23);
BOOST_CHECK_EQUAL(iter->getEnd(),27);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"link");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),27); std::string sentence = "5.5.3.9 zbiornik balastowy t1500 347m3 ;";
BOOST_CHECK_EQUAL(iter->getEnd(),31); TokenizedSentence ts = tokenizer.tokenize(sentence);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG); BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "ne_bullet zbiornik balastowy t1500 347m3");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),32);
BOOST_CHECK_EQUAL(iter->getEnd(),35);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"and");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),36);
BOOST_CHECK_EQUAL(iter->getEnd(),39);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),39);
BOOST_CHECK_EQUAL(iter->getEnd(),43);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"bold");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),43);
BOOST_CHECK_EQUAL(iter->getEnd(),47);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),48);
BOOST_CHECK_EQUAL(iter->getEnd(),51);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"and");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),52);
BOOST_CHECK_EQUAL(iter->getEnd(),59);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"newline");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),60);
BOOST_CHECK_EQUAL(iter->getEnd(),65);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
} }
BOOST_AUTO_TEST_CASE( InWordSymbolsTest ) BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
@ -217,7 +63,7 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
TokenizedSentence ts = tokenizer.tokenize(sentence); TokenizedSentence ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts.getAnnotations(); std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin(); std::list<TokenAnnotation>::iterator iter = annotations.begin();
/* /*
0,4 type: 1 value: this 0,4 type: 1 value: this
5,7 type: 1 value: is 5,7 type: 1 value: is
@ -235,7 +81,7 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
*/ */
BOOST_CHECK_EQUAL(13,annotations.size()); BOOST_CHECK_EQUAL(13,annotations.size());
BOOST_CHECK_EQUAL(iter->getStart(),0); BOOST_CHECK_EQUAL(iter->getStart(),0);
BOOST_CHECK_EQUAL(iter->getEnd(),4); BOOST_CHECK_EQUAL(iter->getEnd(),4);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
@ -330,13 +176,12 @@ BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
{ {
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceTokenizer tokenizer(config); SentenceTokenizer tokenizer(config);
std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |"; std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
TokenizedSentence ts = tokenizer.tokenize(sentence); TokenizedSentence ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts.getAnnotations(); std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin(); BOOST_CHECK_EQUAL(9, annotations.size());
BOOST_CHECK_EQUAL(161, annotations.size());
} }
BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE_END()

View File

@ -2,7 +2,9 @@
#include "concordia/common/text_utils.hpp" #include "concordia/common/text_utils.hpp"
#include <iostream> #include <iostream>
#include <sstream>
#include <boost/foreach.hpp> #include <boost/foreach.hpp>
#include <boost/algorithm/string.hpp>
TokenizedSentence::TokenizedSentence(std::string sentence): TokenizedSentence::TokenizedSentence(std::string sentence):
_sentence(sentence) { _sentence(sentence) {
@ -73,3 +75,15 @@ void TokenizedSentence::generateTokens() {
} }
} }
std::string TokenizedSentence::getTokenizedSentence() const {
std::stringstream ss;
BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {
if (annotation.getType() == TokenAnnotation::WORD ||
annotation.getType() == TokenAnnotation::NE) {
ss << annotation.getValue() << " ";
}
}
std::string result = ss.str();
boost::trim_right(result);
return result;
}

View File

@ -9,6 +9,10 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include <list> #include <list>
#include <iostream>
#include <boost/foreach.hpp>
/*! /*!
A sentence after tokenizing operations. The class A sentence after tokenizing operations. The class
@ -31,13 +35,19 @@ public:
*/ */
virtual ~TokenizedSentence(); virtual ~TokenizedSentence();
/*! Getter for sentence /*! Getter for the string sentence, which is used for extracting tokens.
\returns sentence \returns sentence
*/ */
std::string getSentence() const { std::string getSentence() const {
return _sentence; return _sentence;
} }
/*! Method for getting tokenized sentence in a string format (
tokens separated by single spaces.
\returns tokenized sentence
*/
std::string getTokenizedSentence() const;
/*! Getter for all annotations list. This method returns /*! Getter for all annotations list. This method returns
all annotations, including those which are not considered all annotations, including those which are not considered
in the hash, i.e. stop words and html tags. in the hash, i.e. stop words and html tags.
@ -82,12 +92,12 @@ public:
*/ */
void generateTokens(); void generateTokens();
/*! /*!
Transform the sentence to lower case. Transform the sentence to lower case.
*/ */
void toLowerCase(); void toLowerCase();
/*! /*!
Add new annotations to the existing annotations list. Assumptions: Add new annotations to the existing annotations list. Assumptions:
1. existing _tokenAnnotations vector contains disjoint, sorted intervals; 1. existing _tokenAnnotations vector contains disjoint, sorted intervals;
2. the annotations to be added list also has the above properties. 2. the annotations to be added list also has the above properties.
@ -98,6 +108,21 @@ public:
*/ */
void addAnnotations(std::vector<TokenAnnotation> annotations); void addAnnotations(std::vector<TokenAnnotation> annotations);
friend std::ostream & operator << (std::ostream & o,
const TokenizedSentence & ts) {
int index = 0;
BOOST_FOREACH(TokenAnnotation token, ts.getAnnotations()) {
o << "[" << token.getStart() << "," << token.getEnd() << "]["
<< token.getType() << "][" << token.getValue() <<"]";
if (index < ts.getAnnotations().size() - 1) {
o << " ";
}
index++;
}
return o;
}
private: private:
std::string _sentence; std::string _sentence;

View File

@ -1,3 +1,4 @@
[0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4} ne_date [0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4} ne_date
[\w\._\d]+@\w+(\.\w+)* ne_email [\w\._\d]+@\w+(\.\w+)* ne_email
[0-9]+([\.\,][0-9]+)? ne_number [0-9]+[\.\)]([0-9]+\.)+ ne_bullet
\b[0-9]+([\.\,][0-9]+)?\b ne_number