new tokenizer

This commit is contained in:
rjawor 2017-04-26 17:02:18 +02:00
parent a0673df75a
commit bd73749388
8 changed files with 164 additions and 192 deletions

View File

@ -166,7 +166,7 @@ endif()
# Concordia: sub-projects # Concordia: sub-projects
# ================================================ # ================================================
set(ALL_DIRECTORIES concordia concordia-console libdivsufsort utf8 utf8case) set(ALL_DIRECTORIES concordia concordia-console concordia-sentence-tokenizer libdivsufsort utf8 utf8case)
include_directories("${concordia_SOURCE_DIR}") include_directories("${concordia_SOURCE_DIR}")
@ -214,4 +214,3 @@ if(DOXYGEN_FOUND)
COMMENT "Generating API documentation with Doxygen" VERBATIM) COMMENT "Generating API documentation with Doxygen" VERBATIM)
endif(DOXYGEN_FOUND) endif(DOXYGEN_FOUND)

View File

@ -0,0 +1,19 @@
# Build the concordia-sentence-tokenizer command-line tool and link it
# against the core concordia library, the utf8case helper library and
# the required third-party libraries (Boost, libconfig).
add_executable(concordia-sentence-tokenizer concordia-sentence-tokenizer.cpp)
target_link_libraries(concordia-sentence-tokenizer concordia utf8case ${Boost_LIBRARIES} ${LIBCONFIG_LIB})
# Regex backend selection: when RE2 is enabled it is always linked, and
# PCRE may additionally be linked; without RE2, PCRE alone is linked if
# enabled. Mirrors the backend options used by the concordia library.
if (WITH_RE2)
target_link_libraries(concordia-sentence-tokenizer re2)
if (WITH_PCRE)
target_link_libraries(concordia-sentence-tokenizer pcrecpp)
endif(WITH_PCRE)
else(WITH_RE2)
if (WITH_PCRE)
target_link_libraries(concordia-sentence-tokenizer pcrecpp)
endif(WITH_PCRE)
endif(WITH_RE2)
# =====================================
# Install the tool alongside the other concordia binaries.
install(TARGETS concordia-sentence-tokenizer DESTINATION bin/)

View File

@ -0,0 +1,70 @@
#include <iostream>
#include <fstream>
#include <boost/program_options.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp>
#include "concordia/concordia_config.hpp"
#include "concordia/sentence_tokenizer.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "concordia/common/config.hpp"
#include "concordia/common/utils.hpp"
namespace po = boost::program_options;
int main(int argc, char** argv) {
po::options_description desc("Allowed options");
desc.add_options()
("help,h", "Display this message")
("config,c", boost::program_options::value<std::string>(),
"Concordia configuration file (required)");
po::variables_map cli;
po::store(po::parse_command_line(argc, argv, desc), cli);
po::notify(cli);
if (cli.count("help")) {
std::cerr << desc << std::endl;
return 1;
}
std::string configFile;
if (cli.count("config")) {
configFile = cli["config"].as<std::string>();
} else {
std::cerr << "No Concordia configuration file given. Terminating."
<< std::endl;
return 1;
}
try {
boost::shared_ptr<ConcordiaConfig> config =
boost::shared_ptr<ConcordiaConfig> (
new ConcordiaConfig(configFile));
SentenceTokenizer sentenceTokenizer(config);
for (std::string line; std::getline(std::cin, line);) {
TokenizedSentence ts = sentenceTokenizer.tokenize(line);
std::cout << ts.getTokenizedSentence() << std::endl;
}
} catch(ConcordiaException & e) {
std::cerr << "ConcordiaException caught with message: "
<< std::endl
<< e.what()
<< std::endl
<< "Terminating execution."
<< std::endl;
return 1;
} catch(std::exception & e) {
std::cerr << "Unexpected exception caught with message: "
<< std::endl
<< e.what()
<< std::endl
<< "Terminating execution."
<< std::endl;
return 1;
}
return 0;
}

View File

@ -49,7 +49,7 @@ TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
} }
boost::shared_ptr<RegexRule> wordsRule( boost::shared_ptr<RegexRule> wordsRule(
new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}", new RegexRule("(\\p{L}|[0-9])(\\p{L}|[0-9]|'|\\-)*(\\p{L}|[0-9])",
TokenAnnotation::WORD, "")); TokenAnnotation::WORD, ""));
wordsRule->apply(result); wordsRule->apply(result);
boost::shared_ptr<RegexRule> singleLetterWordsRule( boost::shared_ptr<RegexRule> singleLetterWordsRule(
@ -100,7 +100,7 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
std::ifstream tagsFile(htmlTagsPath.c_str()); std::ifstream tagsFile(htmlTagsPath.c_str());
if (tagsFile.is_open()) { if (tagsFile.is_open()) {
while (getline(tagsFile, line)) { while (getline(tagsFile, line)) {
tagsExpression += "|"; tagsExpression += line +"|";
} }
tagsFile.close(); tagsFile.close();
} else { } else {
@ -110,7 +110,7 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
throw ConcordiaException("No html tags file."); throw ConcordiaException("No html tags file.");
} }
tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1); tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
tagsExpression += "br).*?>"; tagsExpression += ").*?>";
_htmlTags = boost::shared_ptr<RegexRule>( _htmlTags = boost::shared_ptr<RegexRule>(
new RegexRule(tagsExpression, new RegexRule(tagsExpression,
TokenAnnotation::HTML_TAG, "", false)); TokenAnnotation::HTML_TAG, "", false));
@ -149,4 +149,3 @@ boost::shared_ptr<RegexRule>
return boost::shared_ptr<RegexRule>( return boost::shared_ptr<RegexRule>(
new RegexRule(expression, annotationType, value, false)); new RegexRule(expression, annotationType, value, false));
} }

View File

@ -21,111 +21,16 @@ BOOST_AUTO_TEST_CASE( NETest )
std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ"; std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
TokenizedSentence ts = tokenizer.tokenize(sentence); TokenizedSentence ts = tokenizer.tokenize(sentence);
BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "date ne_date mail ne_email number ne_number hello3 zażółć gęślą jaźń zażółć gęślą jaźń");
std::list<TokenAnnotation> annotations = ts.getAnnotations(); std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin(); std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(14,annotations.size()); BOOST_CHECK_EQUAL(13,annotations.size());
/*
0,4 type: 1 value: date
6,16 type: 0 value: ne_date
18,22 type: 1 value: mail
24,40 type: 0 value: ne_email
42,48 type: 1 value: number
50,54 type: 0 value: ne_number
56,61 type: 1 value: hello
61,62 type: 0 value: ne_number
63,69 type: 1 value: zażółć
70,75 type: 1 value: gęślą
76,80 type: 1 value: jaźń
82,88 type: 1 value: zażółć
89,94 type: 1 value: gęślą
95,99 type: 1 value: jaźń
*/
BOOST_CHECK_EQUAL(iter->getStart(),0);
BOOST_CHECK_EQUAL(iter->getEnd(),4);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "date");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),6);
BOOST_CHECK_EQUAL(iter->getEnd(),16);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_date");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),18);
BOOST_CHECK_EQUAL(iter->getEnd(),22);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "mail");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),24);
BOOST_CHECK_EQUAL(iter->getEnd(),40);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_email");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),42);
BOOST_CHECK_EQUAL(iter->getEnd(),48);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "number");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),50);
BOOST_CHECK_EQUAL(iter->getEnd(),54);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),56);
BOOST_CHECK_EQUAL(iter->getEnd(),61);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "hello");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),61);
BOOST_CHECK_EQUAL(iter->getEnd(),62);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),63);
BOOST_CHECK_EQUAL(iter->getEnd(),69);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),70);
BOOST_CHECK_EQUAL(iter->getEnd(),75);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),76);
BOOST_CHECK_EQUAL(iter->getEnd(),80);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),82);
BOOST_CHECK_EQUAL(iter->getEnd(),88);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),89);
BOOST_CHECK_EQUAL(iter->getEnd(),94);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),95);
BOOST_CHECK_EQUAL(iter->getEnd(),99);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
std::stringstream ss;
ss << ts;
BOOST_CHECK_EQUAL("[0,4][1][date] [6,16][0][ne_date] [18,22][1][mail] [24,40][0][ne_email] [42,48][1][number] [50,54][0][ne_number] [56,62][1][hello3] [63,69][1][zażółć] [70,75][1][gęślą] [76,80][1][jaźń] [82,88][1][zażółć] [89,94][1][gęślą] [95,99][1][jaźń]", ss.str());
} }
BOOST_AUTO_TEST_CASE( HtmlTagsTest ) BOOST_AUTO_TEST_CASE( HtmlTagsTest )
@ -135,77 +40,18 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>"; std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
TokenizedSentence ts = tokenizer.tokenize(sentence); TokenizedSentence ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts.getAnnotations(); BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "link and bold and newline");
std::list<TokenAnnotation>::iterator iter = annotations.begin(); }
/* BOOST_AUTO_TEST_CASE( NormalSentencesTest )
0,23 type: 2 value: {
23,27 type: 1 value: link boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
27,31 type: 2 value: SentenceTokenizer tokenizer(config);
32,35 type: 1 value: and
36,39 type: 2 value:
39,43 type: 1 value: bold
43,47 type: 2 value:
48,51 type: 1 value: and
52,59 type: 1 value: newline
60,65 type: 2 value:
*/
BOOST_CHECK_EQUAL(10,annotations.size()); std::string sentence = "5.5.3.9 zbiornik balastowy t1500 347m3 ;";
TokenizedSentence ts = tokenizer.tokenize(sentence);
BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "ne_bullet zbiornik balastowy t1500 347m3");
BOOST_CHECK_EQUAL(iter->getStart(),0);
BOOST_CHECK_EQUAL(iter->getEnd(),23);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),23);
BOOST_CHECK_EQUAL(iter->getEnd(),27);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"link");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),27);
BOOST_CHECK_EQUAL(iter->getEnd(),31);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),32);
BOOST_CHECK_EQUAL(iter->getEnd(),35);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"and");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),36);
BOOST_CHECK_EQUAL(iter->getEnd(),39);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),39);
BOOST_CHECK_EQUAL(iter->getEnd(),43);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"bold");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),43);
BOOST_CHECK_EQUAL(iter->getEnd(),47);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),48);
BOOST_CHECK_EQUAL(iter->getEnd(),51);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"and");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),52);
BOOST_CHECK_EQUAL(iter->getEnd(),59);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"newline");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),60);
BOOST_CHECK_EQUAL(iter->getEnd(),65);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
} }
BOOST_AUTO_TEST_CASE( InWordSymbolsTest ) BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
@ -333,10 +179,9 @@ BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |"; std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
TokenizedSentence ts = tokenizer.tokenize(sentence); TokenizedSentence ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(161, annotations.size()); std::list<TokenAnnotation> annotations = ts.getAnnotations();
BOOST_CHECK_EQUAL(9, annotations.size());
} }
BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE_END()

View File

@ -2,7 +2,9 @@
#include "concordia/common/text_utils.hpp" #include "concordia/common/text_utils.hpp"
#include <iostream> #include <iostream>
#include <sstream>
#include <boost/foreach.hpp> #include <boost/foreach.hpp>
#include <boost/algorithm/string.hpp>
TokenizedSentence::TokenizedSentence(std::string sentence): TokenizedSentence::TokenizedSentence(std::string sentence):
_sentence(sentence) { _sentence(sentence) {
@ -73,3 +75,15 @@ void TokenizedSentence::generateTokens() {
} }
} }
std::string TokenizedSentence::getTokenizedSentence() const {
std::stringstream ss;
BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {
if (annotation.getType() == TokenAnnotation::WORD ||
annotation.getType() == TokenAnnotation::NE) {
ss << annotation.getValue() << " ";
}
}
std::string result = ss.str();
boost::trim_right(result);
return result;
}

View File

@ -9,6 +9,10 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include <list> #include <list>
#include <iostream>
#include <boost/foreach.hpp>
/*! /*!
A sentence after tokenizing operations. The class A sentence after tokenizing operations. The class
@ -31,13 +35,19 @@ public:
*/ */
virtual ~TokenizedSentence(); virtual ~TokenizedSentence();
/*! Getter for sentence /*! Getter for the string sentence, which is used for extracting tokens.
\returns sentence \returns sentence
*/ */
std::string getSentence() const { std::string getSentence() const {
return _sentence; return _sentence;
} }
/*! Method for getting tokenized sentence in a string format (
tokens separated by single spaces.
\returns tokenized sentence
*/
std::string getTokenizedSentence() const;
/*! Getter for all annotations list. This method returns /*! Getter for all annotations list. This method returns
all annotations, including those which are not considered all annotations, including those which are not considered
in the hash, i.e. stop words and html tags. in the hash, i.e. stop words and html tags.
@ -98,6 +108,21 @@ public:
*/ */
void addAnnotations(std::vector<TokenAnnotation> annotations); void addAnnotations(std::vector<TokenAnnotation> annotations);
friend std::ostream & operator << (std::ostream & o,
const TokenizedSentence & ts) {
int index = 0;
BOOST_FOREACH(TokenAnnotation token, ts.getAnnotations()) {
o << "[" << token.getStart() << "," << token.getEnd() << "]["
<< token.getType() << "][" << token.getValue() <<"]";
if (index < ts.getAnnotations().size() - 1) {
o << " ";
}
index++;
}
return o;
}
private: private:
std::string _sentence; std::string _sentence;

View File

@ -1,3 +1,4 @@
[0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4} ne_date [0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4} ne_date
[\w\._\d]+@\w+(\.\w+)* ne_email [\w\._\d]+@\w+(\.\w+)* ne_email
[0-9]+([\.\,][0-9]+)? ne_number [0-9]+[\.\)]([0-9]+\.)+ ne_bullet
\b[0-9]+([\.\,][0-9]+)?\b ne_number