new tokenizer

This commit is contained in:
rjawor 2017-04-26 17:02:18 +02:00
parent a0673df75a
commit bd73749388
8 changed files with 164 additions and 192 deletions

View File

@ -66,7 +66,7 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
set(BASE_TARGETS concordia)
# ================================================
# Third-party libraries
# ================================================
@ -93,7 +93,7 @@ endif(WITH_PCRE)
# ----------------------------------------------------
set(Boost_USE_STATIC_LIBS OFF)
set(Boost_USE_STATIC_RUNTIME OFF)
find_package(Boost COMPONENTS
find_package(Boost COMPONENTS
serialization unit_test_framework system filesystem program_options iostreams regex locale REQUIRED)
# ----------------------------------------------------
@ -166,7 +166,7 @@ endif()
# Concordia: sub-projects
# ================================================
set(ALL_DIRECTORIES concordia concordia-console libdivsufsort utf8 utf8case)
set(ALL_DIRECTORIES concordia concordia-console concordia-sentence-tokenizer libdivsufsort utf8 utf8case)
include_directories("${concordia_SOURCE_DIR}")
@ -179,7 +179,7 @@ foreach(dir ${ALL_DIRECTORIES})
endforeach(dir)
# ================================================
# Tests
# ================================================
@ -198,7 +198,7 @@ if(DOXYGEN_FOUND)
SET(DOXYFILE_LATEX ON)
SET(DOXYGEN_HAVE_DOT NO)
SET(DOXYGEN_QUIET YES)
SET(INPUT_FILES)
SET(INPUT_FILES "${INPUT_FILES} ${CMAKE_CURRENT_SOURCE_DIR}")
@ -212,6 +212,5 @@ if(DOXYGEN_FOUND)
add_custom_target(doc ALL ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
COMMENT "Generating API documentation with Doxygen" VERBATIM)
endif(DOXYGEN_FOUND)
endif(DOXYGEN_FOUND)

View File

@ -0,0 +1,19 @@
# Build the standalone sentence-tokenizer command-line tool.
add_executable(concordia-sentence-tokenizer concordia-sentence-tokenizer.cpp)
target_link_libraries(concordia-sentence-tokenizer concordia utf8case ${Boost_LIBRARIES} ${LIBCONFIG_LIB})

# RE2 and PCRE linking are independent choices: the original nested
# if/else linked pcrecpp identically in both branches, so the two
# conditions are flattened here with the same effective link line.
if (WITH_RE2)
    target_link_libraries(concordia-sentence-tokenizer re2)
endif(WITH_RE2)
if (WITH_PCRE)
    target_link_libraries(concordia-sentence-tokenizer pcrecpp)
endif(WITH_PCRE)

# =====================================

install(TARGETS concordia-sentence-tokenizer DESTINATION bin/)

View File

@ -0,0 +1,70 @@
#include <iostream>
#include <fstream>
#include <boost/program_options.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp>
#include "concordia/concordia_config.hpp"
#include "concordia/sentence_tokenizer.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "concordia/common/config.hpp"
#include "concordia/common/utils.hpp"
namespace po = boost::program_options;
/// Command-line entry point: reads sentences from stdin, tokenizes each
/// with Concordia's SentenceTokenizer (configured via --config), and
/// writes the space-separated token string to stdout, one per line.
/// Returns 0 on success, 1 on usage error or any caught exception.
int main(int argc, char** argv) {
    po::options_description desc("Allowed options");
    desc.add_options()
        ("help,h", "Display this message")
        ("config,c", boost::program_options::value<std::string>(),
                     "Concordia configuration file (required)");

    try {
        // Parsing must happen inside the try block: an unknown option or
        // a bad value makes parse_command_line/notify throw
        // boost::program_options::error (a std::exception), which the
        // original code let escape main and terminate the process.
        po::variables_map cli;
        po::store(po::parse_command_line(argc, argv, desc), cli);
        po::notify(cli);

        if (cli.count("help")) {
            std::cerr << desc << std::endl;
            return 1;
        }

        std::string configFile;
        if (cli.count("config")) {
            configFile = cli["config"].as<std::string>();
        } else {
            std::cerr << "No Concordia configuration file given. Terminating."
                      << std::endl;
            return 1;
        }

        boost::shared_ptr<ConcordiaConfig> config =
            boost::shared_ptr<ConcordiaConfig> (
                new ConcordiaConfig(configFile));
        SentenceTokenizer sentenceTokenizer(config);

        // Filter mode: one input line -> one tokenized output line.
        for (std::string line; std::getline(std::cin, line);) {
            TokenizedSentence ts = sentenceTokenizer.tokenize(line);
            std::cout << ts.getTokenizedSentence() << std::endl;
        }
    } catch(ConcordiaException & e) {
        std::cerr << "ConcordiaException caught with message: "
                  << std::endl
                  << e.what()
                  << std::endl
                  << "Terminating execution."
                  << std::endl;
        return 1;
    } catch(std::exception & e) {
        std::cerr << "Unexpected exception caught with message: "
                  << std::endl
                  << e.what()
                  << std::endl
                  << "Terminating execution."
                  << std::endl;
        return 1;
    }

    return 0;
}

View File

@ -49,7 +49,7 @@ TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
}
boost::shared_ptr<RegexRule> wordsRule(
new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}",
new RegexRule("(\\p{L}|[0-9])(\\p{L}|[0-9]|'|\\-)*(\\p{L}|[0-9])",
TokenAnnotation::WORD, ""));
wordsRule->apply(result);
boost::shared_ptr<RegexRule> singleLetterWordsRule(
@ -100,7 +100,7 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
std::ifstream tagsFile(htmlTagsPath.c_str());
if (tagsFile.is_open()) {
while (getline(tagsFile, line)) {
tagsExpression += "|";
tagsExpression += line +"|";
}
tagsFile.close();
} else {
@ -110,7 +110,7 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
throw ConcordiaException("No html tags file.");
}
tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
tagsExpression += "br).*?>";
tagsExpression += ").*?>";
_htmlTags = boost::shared_ptr<RegexRule>(
new RegexRule(tagsExpression,
TokenAnnotation::HTML_TAG, "", false));
@ -149,4 +149,3 @@ boost::shared_ptr<RegexRule>
return boost::shared_ptr<RegexRule>(
new RegexRule(expression, annotationType, value, false));
}

View File

@ -17,115 +17,20 @@ BOOST_AUTO_TEST_CASE( NETest )
{
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceTokenizer tokenizer(config);
std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
TokenizedSentence ts = tokenizer.tokenize(sentence);
BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "date ne_date mail ne_email number ne_number hello3 zażółć gęślą jaźń zażółć gęślą jaźń");
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(14,annotations.size());
/*
0,4 type: 1 value: date
6,16 type: 0 value: ne_date
18,22 type: 1 value: mail
24,40 type: 0 value: ne_email
42,48 type: 1 value: number
50,54 type: 0 value: ne_number
56,61 type: 1 value: hello
61,62 type: 0 value: ne_number
63,69 type: 1 value: zażółć
70,75 type: 1 value: gęślą
76,80 type: 1 value: jaźń
82,88 type: 1 value: zażółć
89,94 type: 1 value: gęślą
95,99 type: 1 value: jaźń
*/
BOOST_CHECK_EQUAL(iter->getStart(),0);
BOOST_CHECK_EQUAL(iter->getEnd(),4);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "date");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),6);
BOOST_CHECK_EQUAL(iter->getEnd(),16);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_date");
++iter;
BOOST_CHECK_EQUAL(13,annotations.size());
BOOST_CHECK_EQUAL(iter->getStart(),18);
BOOST_CHECK_EQUAL(iter->getEnd(),22);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "mail");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),24);
BOOST_CHECK_EQUAL(iter->getEnd(),40);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_email");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),42);
BOOST_CHECK_EQUAL(iter->getEnd(),48);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "number");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),50);
BOOST_CHECK_EQUAL(iter->getEnd(),54);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),56);
BOOST_CHECK_EQUAL(iter->getEnd(),61);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "hello");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),61);
BOOST_CHECK_EQUAL(iter->getEnd(),62);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),63);
BOOST_CHECK_EQUAL(iter->getEnd(),69);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),70);
BOOST_CHECK_EQUAL(iter->getEnd(),75);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),76);
BOOST_CHECK_EQUAL(iter->getEnd(),80);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),82);
BOOST_CHECK_EQUAL(iter->getEnd(),88);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),89);
BOOST_CHECK_EQUAL(iter->getEnd(),94);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),95);
BOOST_CHECK_EQUAL(iter->getEnd(),99);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
std::stringstream ss;
ss << ts;
BOOST_CHECK_EQUAL("[0,4][1][date] [6,16][0][ne_date] [18,22][1][mail] [24,40][0][ne_email] [42,48][1][number] [50,54][0][ne_number] [56,62][1][hello3] [63,69][1][zażółć] [70,75][1][gęślą] [76,80][1][jaźń] [82,88][1][zażółć] [89,94][1][gęślą] [95,99][1][jaźń]", ss.str());
}
BOOST_AUTO_TEST_CASE( HtmlTagsTest )
@ -135,77 +40,18 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
TokenizedSentence ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
/*
0,23 type: 2 value:
23,27 type: 1 value: link
27,31 type: 2 value:
32,35 type: 1 value: and
36,39 type: 2 value:
39,43 type: 1 value: bold
43,47 type: 2 value:
48,51 type: 1 value: and
52,59 type: 1 value: newline
60,65 type: 2 value:
*/
BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "link and bold and newline");
}
BOOST_CHECK_EQUAL(10,annotations.size());
BOOST_CHECK_EQUAL(iter->getStart(),0);
BOOST_CHECK_EQUAL(iter->getEnd(),23);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),23);
BOOST_CHECK_EQUAL(iter->getEnd(),27);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"link");
++iter;
BOOST_AUTO_TEST_CASE( NormalSentencesTest )
{
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceTokenizer tokenizer(config);
BOOST_CHECK_EQUAL(iter->getStart(),27);
BOOST_CHECK_EQUAL(iter->getEnd(),31);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
++iter;
std::string sentence = "5.5.3.9 zbiornik balastowy t1500 347m3 ;";
TokenizedSentence ts = tokenizer.tokenize(sentence);
BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "ne_bullet zbiornik balastowy t1500 347m3");
BOOST_CHECK_EQUAL(iter->getStart(),32);
BOOST_CHECK_EQUAL(iter->getEnd(),35);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"and");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),36);
BOOST_CHECK_EQUAL(iter->getEnd(),39);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),39);
BOOST_CHECK_EQUAL(iter->getEnd(),43);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"bold");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),43);
BOOST_CHECK_EQUAL(iter->getEnd(),47);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),48);
BOOST_CHECK_EQUAL(iter->getEnd(),51);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"and");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),52);
BOOST_CHECK_EQUAL(iter->getEnd(),59);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"newline");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),60);
BOOST_CHECK_EQUAL(iter->getEnd(),65);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
}
BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
@ -217,7 +63,7 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
TokenizedSentence ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
/*
0,4 type: 1 value: this
5,7 type: 1 value: is
@ -235,7 +81,7 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
*/
BOOST_CHECK_EQUAL(13,annotations.size());
BOOST_CHECK_EQUAL(iter->getStart(),0);
BOOST_CHECK_EQUAL(iter->getEnd(),4);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
@ -330,13 +176,12 @@ BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
{
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceTokenizer tokenizer(config);
std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
TokenizedSentence ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(161, annotations.size());
BOOST_CHECK_EQUAL(9, annotations.size());
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -2,7 +2,9 @@
#include "concordia/common/text_utils.hpp"
#include <iostream>
#include <sstream>
#include <boost/foreach.hpp>
#include <boost/algorithm/string.hpp>
TokenizedSentence::TokenizedSentence(std::string sentence):
_sentence(sentence) {
@ -73,3 +75,15 @@ void TokenizedSentence::generateTokens() {
}
}
std::string TokenizedSentence::getTokenizedSentence() const {
    // Join the values of WORD and NE annotations with single spaces.
    // Appending the separator before every token except the first yields
    // the same result as the original stringstream + trim_right approach,
    // without the final trim pass.
    std::string joined;
    BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {
        if (annotation.getType() == TokenAnnotation::WORD ||
            annotation.getType() == TokenAnnotation::NE) {
            if (!joined.empty()) {
                joined += " ";
            }
            joined += annotation.getValue();
        }
    }
    return joined;
}

View File

@ -9,6 +9,10 @@
#include <string>
#include <vector>
#include <list>
#include <iostream>
#include <boost/foreach.hpp>
/*!
A sentence after tokenizing operations. The class
@ -31,13 +35,19 @@ public:
*/
virtual ~TokenizedSentence();
/*! Getter for sentence
/*! Getter for the string sentence, which is used for extracting tokens.
\returns sentence
*/
std::string getSentence() const {
return _sentence;
}
/*! Method for getting tokenized sentence in a string format (
tokens separated by single spaces.
\returns tokenized sentence
*/
std::string getTokenizedSentence() const;
/*! Getter for all annotations list. This method returns
all annotations, including those which are not considered
in the hash, i.e. stop words and html tags.
@ -82,12 +92,12 @@ public:
*/
void generateTokens();
/*!
/*!
Transform the sentence to lower case.
*/
void toLowerCase();
/*!
/*!
Add new annotations to the existing annotations list. Assumptions:
1. existing _tokenAnnotations vector contains disjoint, sorted intervals;
2. the annotations to be added list also has the above properties.
@ -98,6 +108,21 @@ public:
*/
void addAnnotations(std::vector<TokenAnnotation> annotations);
friend std::ostream & operator << (std::ostream & o,
const TokenizedSentence & ts) {
int index = 0;
BOOST_FOREACH(TokenAnnotation token, ts.getAnnotations()) {
o << "[" << token.getStart() << "," << token.getEnd() << "]["
<< token.getType() << "][" << token.getValue() <<"]";
if (index < ts.getAnnotations().size() - 1) {
o << " ";
}
index++;
}
return o;
}
private:
std::string _sentence;

View File

@ -1,3 +1,4 @@
[0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4} ne_date
[\w\._\d]+@\w+(\.\w+)* ne_email
[0-9]+([\.\,][0-9]+)? ne_number
[0-9]+[\.\)]([0-9]+\.)+ ne_bullet
\b[0-9]+([\.\,][0-9]+)?\b ne_number