new tokenizer

This commit is contained in:
rjawor 2017-04-26 17:02:18 +02:00
parent a0673df75a
commit bd73749388
8 changed files with 164 additions and 192 deletions

View File

@ -166,7 +166,7 @@ endif()
# Concordia: sub-projects # Concordia: sub-projects
# ================================================ # ================================================
set(ALL_DIRECTORIES concordia concordia-console libdivsufsort utf8 utf8case) set(ALL_DIRECTORIES concordia concordia-console concordia-sentence-tokenizer libdivsufsort utf8 utf8case)
include_directories("${concordia_SOURCE_DIR}") include_directories("${concordia_SOURCE_DIR}")
@ -214,4 +214,3 @@ if(DOXYGEN_FOUND)
COMMENT "Generating API documentation with Doxygen" VERBATIM) COMMENT "Generating API documentation with Doxygen" VERBATIM)
endif(DOXYGEN_FOUND) endif(DOXYGEN_FOUND)

View File

@ -0,0 +1,19 @@
# Build the concordia-sentence-tokenizer command-line tool and link it
# against the core concordia library, the utf8case helper library and
# the required third-party libraries (Boost, libconfig).
add_executable(concordia-sentence-tokenizer concordia-sentence-tokenizer.cpp)
target_link_libraries(concordia-sentence-tokenizer concordia utf8case ${Boost_LIBRARIES} ${LIBCONFIG_LIB})
# Regex backend selection: when RE2 is enabled it is always linked, and
# PCRE may additionally be linked; without RE2, PCRE alone is linked if
# enabled. Mirrors the backend options used by the concordia library.
if (WITH_RE2)
target_link_libraries(concordia-sentence-tokenizer re2)
if (WITH_PCRE)
target_link_libraries(concordia-sentence-tokenizer pcrecpp)
endif(WITH_PCRE)
else(WITH_RE2)
if (WITH_PCRE)
target_link_libraries(concordia-sentence-tokenizer pcrecpp)
endif(WITH_PCRE)
endif(WITH_RE2)
# =====================================
# Install the tool alongside the other concordia binaries.
install(TARGETS concordia-sentence-tokenizer DESTINATION bin/)

View File

@ -0,0 +1,70 @@
#include <iostream>
#include <fstream>
#include <boost/program_options.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp>
#include "concordia/concordia_config.hpp"
#include "concordia/sentence_tokenizer.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "concordia/common/config.hpp"
#include "concordia/common/utils.hpp"
namespace po = boost::program_options;
int main(int argc, char** argv) {
po::options_description desc("Allowed options");
desc.add_options()
("help,h", "Display this message")
("config,c", boost::program_options::value<std::string>(),
"Concordia configuration file (required)");
po::variables_map cli;
po::store(po::parse_command_line(argc, argv, desc), cli);
po::notify(cli);
if (cli.count("help")) {
std::cerr << desc << std::endl;
return 1;
}
std::string configFile;
if (cli.count("config")) {
configFile = cli["config"].as<std::string>();
} else {
std::cerr << "No Concordia configuration file given. Terminating."
<< std::endl;
return 1;
}
try {
boost::shared_ptr<ConcordiaConfig> config =
boost::shared_ptr<ConcordiaConfig> (
new ConcordiaConfig(configFile));
SentenceTokenizer sentenceTokenizer(config);
for (std::string line; std::getline(std::cin, line);) {
TokenizedSentence ts = sentenceTokenizer.tokenize(line);
std::cout << ts.getTokenizedSentence() << std::endl;
}
} catch(ConcordiaException & e) {
std::cerr << "ConcordiaException caught with message: "
<< std::endl
<< e.what()
<< std::endl
<< "Terminating execution."
<< std::endl;
return 1;
} catch(std::exception & e) {
std::cerr << "Unexpected exception caught with message: "
<< std::endl
<< e.what()
<< std::endl
<< "Terminating execution."
<< std::endl;
return 1;
}
return 0;
}

View File

@ -49,7 +49,7 @@ TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
} }
boost::shared_ptr<RegexRule> wordsRule( boost::shared_ptr<RegexRule> wordsRule(
new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}", new RegexRule("(\\p{L}|[0-9])(\\p{L}|[0-9]|'|\\-)*(\\p{L}|[0-9])",
TokenAnnotation::WORD, "")); TokenAnnotation::WORD, ""));
wordsRule->apply(result); wordsRule->apply(result);
boost::shared_ptr<RegexRule> singleLetterWordsRule( boost::shared_ptr<RegexRule> singleLetterWordsRule(
@ -100,7 +100,7 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
std::ifstream tagsFile(htmlTagsPath.c_str()); std::ifstream tagsFile(htmlTagsPath.c_str());
if (tagsFile.is_open()) { if (tagsFile.is_open()) {
while (getline(tagsFile, line)) { while (getline(tagsFile, line)) {
tagsExpression += "|"; tagsExpression += line +"|";
} }
tagsFile.close(); tagsFile.close();
} else { } else {
@ -110,7 +110,7 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
throw ConcordiaException("No html tags file."); throw ConcordiaException("No html tags file.");
} }
tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1); tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
tagsExpression += "br).*?>"; tagsExpression += ").*?>";
_htmlTags = boost::shared_ptr<RegexRule>( _htmlTags = boost::shared_ptr<RegexRule>(
new RegexRule(tagsExpression, new RegexRule(tagsExpression,
TokenAnnotation::HTML_TAG, "", false)); TokenAnnotation::HTML_TAG, "", false));
@ -149,4 +149,3 @@ boost::shared_ptr<RegexRule>
return boost::shared_ptr<RegexRule>( return boost::shared_ptr<RegexRule>(
new RegexRule(expression, annotationType, value, false)); new RegexRule(expression, annotationType, value, false));
} }

View File

@ -21,111 +21,16 @@ BOOST_AUTO_TEST_CASE( NETest )
std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ"; std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
TokenizedSentence ts = tokenizer.tokenize(sentence); TokenizedSentence ts = tokenizer.tokenize(sentence);
BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "date ne_date mail ne_email number ne_number hello3 zażółć gęślą jaźń zażółć gęślą jaźń");
std::list<TokenAnnotation> annotations = ts.getAnnotations(); std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin(); std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(14,annotations.size()); BOOST_CHECK_EQUAL(13,annotations.size());
/*
0,4 type: 1 value: date
6,16 type: 0 value: ne_date
18,22 type: 1 value: mail
24,40 type: 0 value: ne_email
42,48 type: 1 value: number
50,54 type: 0 value: ne_number
56,61 type: 1 value: hello
61,62 type: 0 value: ne_number
63,69 type: 1 value: zażółć
70,75 type: 1 value: gęślą
76,80 type: 1 value: jaźń
82,88 type: 1 value: zażółć
89,94 type: 1 value: gęślą
95,99 type: 1 value: jaźń
*/
BOOST_CHECK_EQUAL(iter->getStart(),0);
BOOST_CHECK_EQUAL(iter->getEnd(),4);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "date");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),6);
BOOST_CHECK_EQUAL(iter->getEnd(),16);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_date");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),18);
BOOST_CHECK_EQUAL(iter->getEnd(),22);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "mail");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),24);
BOOST_CHECK_EQUAL(iter->getEnd(),40);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_email");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),42);
BOOST_CHECK_EQUAL(iter->getEnd(),48);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "number");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),50);
BOOST_CHECK_EQUAL(iter->getEnd(),54);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),56);
BOOST_CHECK_EQUAL(iter->getEnd(),61);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "hello");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),61);
BOOST_CHECK_EQUAL(iter->getEnd(),62);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),63);
BOOST_CHECK_EQUAL(iter->getEnd(),69);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),70);
BOOST_CHECK_EQUAL(iter->getEnd(),75);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),76);
BOOST_CHECK_EQUAL(iter->getEnd(),80);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),82);
BOOST_CHECK_EQUAL(iter->getEnd(),88);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),89);
BOOST_CHECK_EQUAL(iter->getEnd(),94);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),95);
BOOST_CHECK_EQUAL(iter->getEnd(),99);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
std::stringstream ss;
ss << ts;
BOOST_CHECK_EQUAL("[0,4][1][date] [6,16][0][ne_date] [18,22][1][mail] [24,40][0][ne_email] [42,48][1][number] [50,54][0][ne_number] [56,62][1][hello3] [63,69][1][zażółć] [70,75][1][gęślą] [76,80][1][jaźń] [82,88][1][zażółć] [89,94][1][gęślą] [95,99][1][jaźń]", ss.str());
} }
BOOST_AUTO_TEST_CASE( HtmlTagsTest ) BOOST_AUTO_TEST_CASE( HtmlTagsTest )
@ -135,77 +40,18 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>"; std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
TokenizedSentence ts = tokenizer.tokenize(sentence); TokenizedSentence ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts.getAnnotations(); BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "link and bold and newline");
std::list<TokenAnnotation>::iterator iter = annotations.begin(); }
/* BOOST_AUTO_TEST_CASE( NormalSentencesTest )
0,23 type: 2 value: {
23,27 type: 1 value: link boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
27,31 type: 2 value: SentenceTokenizer tokenizer(config);
32,35 type: 1 value: and
36,39 type: 2 value:
39,43 type: 1 value: bold
43,47 type: 2 value:
48,51 type: 1 value: and
52,59 type: 1 value: newline
60,65 type: 2 value:
*/
BOOST_CHECK_EQUAL(10,annotations.size()); std::string sentence = "5.5.3.9 zbiornik balastowy t1500 347m3 ;";
TokenizedSentence ts = tokenizer.tokenize(sentence);
BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "ne_bullet zbiornik balastowy t1500 347m3");
BOOST_CHECK_EQUAL(iter->getStart(),0);
BOOST_CHECK_EQUAL(iter->getEnd(),23);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),23);
BOOST_CHECK_EQUAL(iter->getEnd(),27);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"link");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),27);
BOOST_CHECK_EQUAL(iter->getEnd(),31);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),32);
BOOST_CHECK_EQUAL(iter->getEnd(),35);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"and");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),36);
BOOST_CHECK_EQUAL(iter->getEnd(),39);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),39);
BOOST_CHECK_EQUAL(iter->getEnd(),43);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"bold");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),43);
BOOST_CHECK_EQUAL(iter->getEnd(),47);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),48);
BOOST_CHECK_EQUAL(iter->getEnd(),51);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"and");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),52);
BOOST_CHECK_EQUAL(iter->getEnd(),59);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"newline");
++iter;
BOOST_CHECK_EQUAL(iter->getStart(),60);
BOOST_CHECK_EQUAL(iter->getEnd(),65);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
} }
BOOST_AUTO_TEST_CASE( InWordSymbolsTest ) BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
@ -333,10 +179,9 @@ BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |"; std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
TokenizedSentence ts = tokenizer.tokenize(sentence); TokenizedSentence ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(161, annotations.size()); std::list<TokenAnnotation> annotations = ts.getAnnotations();
BOOST_CHECK_EQUAL(9, annotations.size());
} }
BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE_END()

View File

@ -2,7 +2,9 @@
#include "concordia/common/text_utils.hpp" #include "concordia/common/text_utils.hpp"
#include <iostream> #include <iostream>
#include <sstream>
#include <boost/foreach.hpp> #include <boost/foreach.hpp>
#include <boost/algorithm/string.hpp>
TokenizedSentence::TokenizedSentence(std::string sentence): TokenizedSentence::TokenizedSentence(std::string sentence):
_sentence(sentence) { _sentence(sentence) {
@ -73,3 +75,15 @@ void TokenizedSentence::generateTokens() {
} }
} }
std::string TokenizedSentence::getTokenizedSentence() const {
std::stringstream ss;
BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {
if (annotation.getType() == TokenAnnotation::WORD ||
annotation.getType() == TokenAnnotation::NE) {
ss << annotation.getValue() << " ";
}
}
std::string result = ss.str();
boost::trim_right(result);
return result;
}

View File

@ -9,6 +9,10 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include <list> #include <list>
#include <iostream>
#include <boost/foreach.hpp>
/*! /*!
A sentence after tokenizing operations. The class A sentence after tokenizing operations. The class
@ -31,13 +35,19 @@ public:
*/ */
virtual ~TokenizedSentence(); virtual ~TokenizedSentence();
/*! Getter for sentence /*! Getter for the string sentence, which is used for extracting tokens.
\returns sentence \returns sentence
*/ */
std::string getSentence() const { std::string getSentence() const {
return _sentence; return _sentence;
} }
/*! Method for getting tokenized sentence in a string format (
tokens separated by single spaces.
\returns tokenized sentence
*/
std::string getTokenizedSentence() const;
/*! Getter for all annotations list. This method returns /*! Getter for all annotations list. This method returns
all annotations, including those which are not considered all annotations, including those which are not considered
in the hash, i.e. stop words and html tags. in the hash, i.e. stop words and html tags.
@ -98,6 +108,21 @@ public:
*/ */
void addAnnotations(std::vector<TokenAnnotation> annotations); void addAnnotations(std::vector<TokenAnnotation> annotations);
friend std::ostream & operator << (std::ostream & o,
const TokenizedSentence & ts) {
int index = 0;
BOOST_FOREACH(TokenAnnotation token, ts.getAnnotations()) {
o << "[" << token.getStart() << "," << token.getEnd() << "]["
<< token.getType() << "][" << token.getValue() <<"]";
if (index < ts.getAnnotations().size() - 1) {
o << " ";
}
index++;
}
return o;
}
private: private:
std::string _sentence; std::string _sentence;

View File

@ -1,3 +1,4 @@
[0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4} ne_date [0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4} ne_date
[\w\._\d]+@\w+(\.\w+)* ne_email [\w\._\d]+@\w+(\.\w+)* ne_email
[0-9]+([\.\,][0-9]+)? ne_number [0-9]+[\.\)]([0-9]+\.)+ ne_bullet
\b[0-9]+([\.\,][0-9]+)?\b ne_number