new tokenizer

2017-04-26 17:02:18 +02:00 · 2017-04-26 17:02:18 +02:00 · bd73749388
commit bd73749388
parent a0673df75a
8 changed files with 164 additions and 192 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -166,7 +166,7 @@ endif()
 # Concordia: sub-projects
 # ================================================

-set(ALL_DIRECTORIES concordia concordia-console libdivsufsort utf8 utf8case)
+set(ALL_DIRECTORIES concordia concordia-console concordia-sentence-tokenizer libdivsufsort utf8 utf8case)

 include_directories("${concordia_SOURCE_DIR}")

@ -214,4 +214,3 @@ if(DOXYGEN_FOUND)
    COMMENT "Generating API documentation with Doxygen" VERBATIM)

 endif(DOXYGEN_FOUND)
-
--- a/concordia-sentence-tokenizer/CMakeLists.txt
+++ b/concordia-sentence-tokenizer/CMakeLists.txt
@ -0,0 +1,19 @@
+
+# Build the concordia-sentence-tokenizer executable from its single source file.
+add_executable(concordia-sentence-tokenizer concordia-sentence-tokenizer.cpp)
+
+target_link_libraries(concordia-sentence-tokenizer concordia utf8case ${Boost_LIBRARIES} ${LIBCONFIG_LIB})
+
+# Optional regular-expression back-ends. Each one is linked only when its
+# switch is enabled; when both are enabled, re2 is linked before pcrecpp.
+if (WITH_RE2)
+  target_link_libraries(concordia-sentence-tokenizer re2)
+endif(WITH_RE2)
+
+if (WITH_PCRE)
+  target_link_libraries(concordia-sentence-tokenizer pcrecpp)
+endif(WITH_PCRE)
+
+# =====================================
+
+install(TARGETS concordia-sentence-tokenizer DESTINATION bin/)
--- a/concordia-sentence-tokenizer/concordia-sentence-tokenizer.cpp
+++ b/concordia-sentence-tokenizer/concordia-sentence-tokenizer.cpp
@ -0,0 +1,70 @@
+#include <iostream>
+#include <fstream>
+#include <boost/program_options.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/shared_ptr.hpp>
+#include <boost/foreach.hpp>
+
+#include "concordia/concordia_config.hpp"
+#include "concordia/sentence_tokenizer.hpp"
+#include "concordia/tokenized_sentence.hpp"
+#include "concordia/common/config.hpp"
+#include "concordia/common/utils.hpp"
+
+namespace po = boost::program_options;
+
+/*!
+  Entry point of the sentence tokenizer tool. Reads the Concordia
+  configuration file given with --config, then tokenizes every line
+  read from standard input and prints the tokenized sentence to
+  standard output, one per line.
+
+  \param argc number of command line arguments
+  \param argv command line arguments
+  \returns 0 on success; 1 when help was requested, when the command
+           line is invalid, when no configuration file was given or
+           when an exception was caught
+*/
+int main(int argc, char** argv) {
+    po::options_description desc("Allowed options");
+
+    desc.add_options()
+        ("help,h", "Display this message")
+        ("config,c", po::value<std::string>(),
+                                 "Concordia configuration file (required)");
+
+    try {
+        // Parsing happens inside the try block: program_options reports
+        // unknown options and missing values by throwing po::error, which
+        // would otherwise escape main and terminate the program.
+        po::variables_map cli;
+        po::store(po::parse_command_line(argc, argv, desc), cli);
+        po::notify(cli);
+
+        if (cli.count("help")) {
+            std::cerr << desc << std::endl;
+            return 1;
+        }
+
+        if (!cli.count("config")) {
+            std::cerr << "No Concordia configuration file given. Terminating."
+                                                                << std::endl;
+            return 1;
+        }
+        const std::string configFile = cli["config"].as<std::string>();
+
+        boost::shared_ptr<ConcordiaConfig> config(
+                                    new ConcordiaConfig(configFile));
+
+        SentenceTokenizer sentenceTokenizer(config);
+        // std::endl is kept on purpose so that every tokenized line is
+        // flushed as soon as it is produced.
+        for (std::string line; std::getline(std::cin, line);) {
+            TokenizedSentence ts = sentenceTokenizer.tokenize(line);
+            std::cout << ts.getTokenizedSentence() << std::endl;
+        }
+
+    } catch(const po::error & e) {
+        std::cerr << "Invalid command line: "
+                  << e.what()
+                  << std::endl
+                  << desc
+                  << std::endl;
+        return 1;
+    } catch(ConcordiaException & e) {
+        std::cerr << "ConcordiaException caught with message: "
+                  << std::endl
+                  << e.what()
+                  << std::endl
+                  << "Terminating execution."
+                  << std::endl;
+        return 1;
+    } catch(const std::exception & e) {
+        std::cerr << "Unexpected exception caught with message: "
+                  << std::endl
+                  << e.what()
+                  << std::endl
+                  << "Terminating execution."
+                  << std::endl;
+        return 1;
+    }
+    return 0;
+}
--- a/concordia/sentence_tokenizer.cpp
+++ b/concordia/sentence_tokenizer.cpp
@ -49,7 +49,7 @@ TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
        }

        boost::shared_ptr<RegexRule> wordsRule(
-                            new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}",
+                            new RegexRule("(\\p{L}|[0-9])(\\p{L}|[0-9]|'|\\-)*(\\p{L}|[0-9])",
                                          TokenAnnotation::WORD, ""));
        wordsRule->apply(result);
        boost::shared_ptr<RegexRule> singleLetterWordsRule(
@ -100,7 +100,7 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
        std::ifstream tagsFile(htmlTagsPath.c_str());
        if (tagsFile.is_open()) {
            while (getline(tagsFile, line)) {
-                tagsExpression += "|";
+                tagsExpression += line +"|";
            }
           tagsFile.close();
        } else {
@ -110,7 +110,7 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
        throw ConcordiaException("No html tags file.");
    }
    tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
-    tagsExpression += "br).*?>";
+    tagsExpression += ").*?>";
    _htmlTags = boost::shared_ptr<RegexRule>(
                        new RegexRule(tagsExpression,
                                TokenAnnotation::HTML_TAG, "", false));
@ -149,4 +149,3 @@ boost::shared_ptr<RegexRule>
    return boost::shared_ptr<RegexRule>(
                      new RegexRule(expression, annotationType, value, false));
 }
-
--- a/concordia/t/test_sentence_tokenizer.cpp
+++ b/concordia/t/test_sentence_tokenizer.cpp
@ -21,111 +21,16 @@ BOOST_AUTO_TEST_CASE( NETest )

    std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
    TokenizedSentence ts = tokenizer.tokenize(sentence);
+    BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "date ne_date mail ne_email number ne_number hello3 zażółć gęślą jaźń zażółć gęślą jaźń");
+
    std::list<TokenAnnotation> annotations = ts.getAnnotations();
    std::list<TokenAnnotation>::iterator iter = annotations.begin();

-    BOOST_CHECK_EQUAL(14,annotations.size());
-    
-    /*
-    0,4 type: 1 value: date
-    6,16 type: 0 value: ne_date
-    18,22 type: 1 value: mail
-    24,40 type: 0 value: ne_email
-    42,48 type: 1 value: number
-    50,54 type: 0 value: ne_number
-    56,61 type: 1 value: hello
-    61,62 type: 0 value: ne_number
-    63,69 type: 1 value: zażółć
-    70,75 type: 1 value: gęślą
-    76,80 type: 1 value: jaźń
-    82,88 type: 1 value: zażółć
-    89,94 type: 1 value: gęślą
-    95,99 type: 1 value: jaźń
-    */
-
-    BOOST_CHECK_EQUAL(iter->getStart(),0);
-    BOOST_CHECK_EQUAL(iter->getEnd(),4);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
-    BOOST_CHECK_EQUAL(iter->getValue(), "date");
-    ++iter;
-    
-    BOOST_CHECK_EQUAL(iter->getStart(),6);
-    BOOST_CHECK_EQUAL(iter->getEnd(),16);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
-    BOOST_CHECK_EQUAL(iter->getValue(), "ne_date");
-    ++iter;
-
-    BOOST_CHECK_EQUAL(iter->getStart(),18);
-    BOOST_CHECK_EQUAL(iter->getEnd(),22);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
-    BOOST_CHECK_EQUAL(iter->getValue(), "mail");
-    ++iter;
-
-    BOOST_CHECK_EQUAL(iter->getStart(),24);
-    BOOST_CHECK_EQUAL(iter->getEnd(),40);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
-    BOOST_CHECK_EQUAL(iter->getValue(), "ne_email");
-    ++iter;
-
-    BOOST_CHECK_EQUAL(iter->getStart(),42);
-    BOOST_CHECK_EQUAL(iter->getEnd(),48);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
-    BOOST_CHECK_EQUAL(iter->getValue(), "number");
-    ++iter;
-
-    BOOST_CHECK_EQUAL(iter->getStart(),50);
-    BOOST_CHECK_EQUAL(iter->getEnd(),54);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
-    BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
-    ++iter;
-    
-    BOOST_CHECK_EQUAL(iter->getStart(),56);
-    BOOST_CHECK_EQUAL(iter->getEnd(),61);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
-    BOOST_CHECK_EQUAL(iter->getValue(), "hello");
-    ++iter;
-
-    BOOST_CHECK_EQUAL(iter->getStart(),61);
-    BOOST_CHECK_EQUAL(iter->getEnd(),62);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
-    BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
-    ++iter;
-
-    BOOST_CHECK_EQUAL(iter->getStart(),63);
-    BOOST_CHECK_EQUAL(iter->getEnd(),69);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
-    BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
-    ++iter;
-
-    BOOST_CHECK_EQUAL(iter->getStart(),70);
-    BOOST_CHECK_EQUAL(iter->getEnd(),75);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
-    BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
-    ++iter;
-
-    BOOST_CHECK_EQUAL(iter->getStart(),76);
-    BOOST_CHECK_EQUAL(iter->getEnd(),80);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
-    BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
-    ++iter;
-
-    BOOST_CHECK_EQUAL(iter->getStart(),82);
-    BOOST_CHECK_EQUAL(iter->getEnd(),88);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
-    BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
-    ++iter;
-
-    BOOST_CHECK_EQUAL(iter->getStart(),89);
-    BOOST_CHECK_EQUAL(iter->getEnd(),94);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
-    BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
-    ++iter;
-
-    BOOST_CHECK_EQUAL(iter->getStart(),95);
-    BOOST_CHECK_EQUAL(iter->getEnd(),99);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
-    BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
+    BOOST_CHECK_EQUAL(13,annotations.size());

+    std::stringstream ss;
+    ss << ts;
+    BOOST_CHECK_EQUAL("[0,4][1][date] [6,16][0][ne_date] [18,22][1][mail] [24,40][0][ne_email] [42,48][1][number] [50,54][0][ne_number] [56,62][1][hello3] [63,69][1][zażółć] [70,75][1][gęślą] [76,80][1][jaźń] [82,88][1][zażółć] [89,94][1][gęślą] [95,99][1][jaźń]", ss.str());
 }

 BOOST_AUTO_TEST_CASE( HtmlTagsTest )
@ -135,77 +40,18 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )

    std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
    TokenizedSentence ts = tokenizer.tokenize(sentence);
-    std::list<TokenAnnotation> annotations = ts.getAnnotations();
-    std::list<TokenAnnotation>::iterator iter = annotations.begin();
+    BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "link and bold and newline");
+}

-    /*
-    0,23 type: 2 value: 
-    23,27 type: 1 value: link
-    27,31 type: 2 value: 
-    32,35 type: 1 value: and
-    36,39 type: 2 value: 
-    39,43 type: 1 value: bold
-    43,47 type: 2 value: 
-    48,51 type: 1 value: and
-    52,59 type: 1 value: newline
-    60,65 type: 2 value:
-    */
+BOOST_AUTO_TEST_CASE( NormalSentencesTest )
+{
+    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
+    SentenceTokenizer tokenizer(config);

-    BOOST_CHECK_EQUAL(10,annotations.size());
+    std::string sentence = "5.5.3.9 zbiornik balastowy t1500 347m3 ;";
+    TokenizedSentence ts = tokenizer.tokenize(sentence);
+    BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "ne_bullet zbiornik balastowy t1500 347m3");

-    BOOST_CHECK_EQUAL(iter->getStart(),0);
-    BOOST_CHECK_EQUAL(iter->getEnd(),23);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
-    ++iter;
-    
-    BOOST_CHECK_EQUAL(iter->getStart(),23);
-    BOOST_CHECK_EQUAL(iter->getEnd(),27);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
-    BOOST_CHECK_EQUAL(iter->getValue(),"link");
-    ++iter;
-
-    BOOST_CHECK_EQUAL(iter->getStart(),27);
-    BOOST_CHECK_EQUAL(iter->getEnd(),31);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
-    ++iter;
-
-    BOOST_CHECK_EQUAL(iter->getStart(),32);
-    BOOST_CHECK_EQUAL(iter->getEnd(),35);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
-    BOOST_CHECK_EQUAL(iter->getValue(),"and");
-    ++iter;
-
-    BOOST_CHECK_EQUAL(iter->getStart(),36);
-    BOOST_CHECK_EQUAL(iter->getEnd(),39);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
-    ++iter;
-
-    BOOST_CHECK_EQUAL(iter->getStart(),39);
-    BOOST_CHECK_EQUAL(iter->getEnd(),43);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
-    BOOST_CHECK_EQUAL(iter->getValue(),"bold");
-    ++iter;
-
-    BOOST_CHECK_EQUAL(iter->getStart(),43);
-    BOOST_CHECK_EQUAL(iter->getEnd(),47);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
-    ++iter;
-
-    BOOST_CHECK_EQUAL(iter->getStart(),48);
-    BOOST_CHECK_EQUAL(iter->getEnd(),51);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
-    BOOST_CHECK_EQUAL(iter->getValue(),"and");
-    ++iter;
-
-    BOOST_CHECK_EQUAL(iter->getStart(),52);
-    BOOST_CHECK_EQUAL(iter->getEnd(),59);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
-    BOOST_CHECK_EQUAL(iter->getValue(),"newline");
-    ++iter;
-
-    BOOST_CHECK_EQUAL(iter->getStart(),60);
-    BOOST_CHECK_EQUAL(iter->getEnd(),65);
-    BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
 }

 BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
@ -333,10 +179,9 @@ BOOST_AUTO_TEST_CASE( WeirdSentenceTest )

    std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
    TokenizedSentence ts = tokenizer.tokenize(sentence);
-    std::list<TokenAnnotation> annotations = ts.getAnnotations();
-    std::list<TokenAnnotation>::iterator iter = annotations.begin();

-    BOOST_CHECK_EQUAL(161, annotations.size());
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
+    BOOST_CHECK_EQUAL(9, annotations.size());
 }

 BOOST_AUTO_TEST_SUITE_END()
--- a/concordia/tokenized_sentence.cpp
+++ b/concordia/tokenized_sentence.cpp
@ -2,7 +2,9 @@
 #include "concordia/common/text_utils.hpp"

 #include <iostream>
+#include <sstream>
 #include <boost/foreach.hpp>
+#include <boost/algorithm/string.hpp>

 TokenizedSentence::TokenizedSentence(std::string sentence):
                                         _sentence(sentence) {
@ -73,3 +75,15 @@ void TokenizedSentence::generateTokens() {
    }
 }

+/*! Builds the string form of the tokenized sentence: the values of all
+    annotations of type WORD or NE, in the order in which they are stored,
+    each followed by a single space. Trailing whitespace is trimmed from
+    the result before it is returned.
+  \returns tokenized sentence
+*/
+std::string TokenizedSentence::getTokenizedSentence() const {
+    std::stringstream buffer;
+    BOOST_FOREACH(TokenAnnotation token, _tokenAnnotations) {
+        // Skip everything that is neither a word nor a named entity.
+        if (token.getType() != TokenAnnotation::WORD &&
+               token.getType() != TokenAnnotation::NE) {
+            continue;
+        }
+        buffer << token.getValue() << " ";
+    }
+    std::string joined = buffer.str();
+    boost::trim_right(joined);
+    return joined;
+}
--- a/concordia/tokenized_sentence.hpp
+++ b/concordia/tokenized_sentence.hpp
@ -9,6 +9,10 @@
 #include <string>
 #include <vector>
 #include <list>
+#include <iostream>
+#include <boost/foreach.hpp>
+
+

 /*!
  A sentence after tokenizing operations. The class
@ -31,13 +35,19 @@ public:
    */
    virtual ~TokenizedSentence();

-    /*! Getter for sentence
+    /*! Getter for the string sentence, which is used for extracting tokens.
      \returns sentence
    */
    std::string getSentence() const {
        return _sentence;
    }

+    /*! Method for getting tokenized sentence in a string format
+        (tokens separated by single spaces). Only the values of
+        annotations of type WORD and NE are included.
+      \returns tokenized sentence
+    */
+    std::string getTokenizedSentence() const;
+
    /*! Getter for all annotations list. This method returns
        all annotations, including those which are not considered
        in the hash, i.e. stop words and html tags.
@ -98,6 +108,21 @@ public:
    */
    void addAnnotations(std::vector<TokenAnnotation> annotations);

+    /*! Writes all annotations of the sentence to the stream. Each
+        annotation is printed as "[start,end][type][value]" and
+        consecutive annotations are separated by a single space.
+      \param o output stream
+      \param ts tokenized sentence to be printed
+      \returns the output stream o
+    */
+    friend std::ostream & operator << (std::ostream & o,
+                          const TokenizedSentence & ts) {
+        // Fetch the annotations once instead of on every iteration.
+        // Binding to a const reference is valid whether getAnnotations()
+        // returns the list by value or by reference.
+        const std::list<TokenAnnotation> & annotations = ts.getAnnotations();
+        // A "first element" flag replaces the comparison of a signed
+        // index against the unsigned expression size() - 1.
+        bool first = true;
+        BOOST_FOREACH(TokenAnnotation token, annotations) {
+            if (!first) {
+                o << " ";
+            }
+            first = false;
+            o << "[" << token.getStart() << "," << token.getEnd() << "]["
+              << token.getType() << "][" << token.getValue() <<"]";
+        }
+        return o;
+    }
+
+
 private:
    std::string _sentence;

--- a/tests/resources/tokenizer/named_entities.txt
+++ b/tests/resources/tokenizer/named_entities.txt
@ -1,3 +1,4 @@
 [0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4}  ne_date
 [\w\._\d]+@\w+(\.\w+)*  ne_email
-[0-9]+([\.\,][0-9]+)? ne_number
+[0-9]+[\.\)]([0-9]+\.)+ ne_bullet
+\b[0-9]+([\.\,][0-9]+)?\b ne_number