tokenize by whitespace option

rjawor committed 2015-12-27 20:54:40 +01:00
parent 873d7c300c
commit 0a8d2fdd39
7 changed files with 70 additions and 29 deletions

View File

@@ -49,20 +49,22 @@ std::string _createLibraryVersion() {
 }
 
 TokenizedSentence
-Concordia::tokenize(const std::string & sentence)
+Concordia::tokenize(const std::string & sentence,
+                    bool byWhitespace)
                         throw(ConcordiaException) {
     TokenizedSentence result =
-        _hashGenerator->generateHash(sentence);
+        _hashGenerator->generateHash(sentence, byWhitespace);
     _hashGenerator->serializeWordMap();
     return result;
 }
 
 std::vector<TokenizedSentence> Concordia::tokenizeAll(
-    const std::vector<std::string> & sentences)
+    const std::vector<std::string> & sentences,
+    bool byWhitespace)
                         throw(ConcordiaException) {
     std::vector<TokenizedSentence> result;
     BOOST_FOREACH(std::string sentence, sentences) {
-        result.push_back(_hashGenerator->generateHash(sentence));
+        result.push_back(_hashGenerator->generateHash(sentence, byWhitespace));
     }
     _hashGenerator->serializeWordMap();
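
For context, a minimal usage sketch of the changed API. This is hypothetical driver code, not part of the commit: the include path and constructor arguments are placeholders, and since byWhitespace defaults to false (see the header below), existing callers keep their current behavior.

#include <iostream>
#include <string>
#include "concordia/concordia.hpp"

int main() {
    // Placeholder paths: Concordia needs a working directory and a config file.
    Concordia concordia("/tmp/concordia", "concordia.cfg");

    // Default: the full rule-based pipeline (HTML tags, named entities,
    // lower-casing, optional stop words, word rules).
    TokenizedSentence full = concordia.tokenize("Alice has a cat");

    // New option: split on runs of non-whitespace only.
    TokenizedSentence byWs = concordia.tokenize("Alice has a cat", true);

    std::cout << byWs.getTokens().size() << std::endl;  // prints 4
    return 0;
}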

View File

@@ -60,20 +60,24 @@ public:
     /*! Tokenizes the given sentence.
       \param sentence sentence to be tokenized
+      \param byWhitespace whether to tokenize the sentence by whitespace
       \returns tokenized sentence object,
                containing information about original word positions
       \throws ConcordiaException
     */
-    TokenizedSentence tokenize(const std::string & sentence)
+    TokenizedSentence tokenize(const std::string & sentence,
+                               bool byWhitespace = false)
                                   throw(ConcordiaException);
 
     /*! Tokenizes all the given sentences.
       \param sentences vector of sentences to be tokenized
+      \param byWhitespace whether to tokenize the sentences by whitespace
       \returns vector of tokenized sentence objects
       \throws ConcordiaException
     */
     std::vector<TokenizedSentence> tokenizeAll(
-        const std::vector<std::string> & sentences)
+        const std::vector<std::string> & sentences,
+        bool byWhitespace = false)
                                   throw(ConcordiaException);
 
     /*! Adds an Example to the index.
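
Because byWhitespace defaults to false in these declarations, the change is source-compatible. A hedged sketch of the batch variant (helper name and include paths are illustrative, not from the commit):

#include <string>
#include <vector>
#include "concordia/concordia.hpp"

// Illustrative helper: whitespace-tokenize a batch of sentences.
// tokenizeAll serializes the word map once for the whole batch,
// as the implementation above shows.
std::vector<TokenizedSentence> tokenizeBatch(Concordia & concordia) {
    std::vector<std::string> sentences;
    sentences.push_back("Alice has a cat");
    sentences.push_back("a cat has Alice");
    return concordia.tokenizeAll(sentences, true);
}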

View File

@@ -29,8 +29,9 @@ HashGenerator::~HashGenerator() {
 }
 
 TokenizedSentence HashGenerator::generateHash(
-        const std::string & sentence) throw(ConcordiaException) {
-    TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence);
+        const std::string & sentence,
+        bool byWhitespace) throw(ConcordiaException) {
+    TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence, byWhitespace);
     ts.generateHash(_wordMap);
     if (ts.getTokens().size() > Utils::maxSentenceSize) {

View File

@@ -44,9 +44,11 @@ public:
     /*!
      Generates hash of a sentence.
      \param sentence sentence to generate hash from
+     \param byWhitespace whether to tokenize the sentence by whitespace
      \returns tokenized sentence, containing the hash
    */
-    TokenizedSentence generateHash(const std::string & sentence)
+    TokenizedSentence generateHash(const std::string & sentence,
+                                   bool byWhitespace = false)
                                   throw(ConcordiaException);
 
     /*!

View File

@@ -24,29 +24,37 @@ SentenceTokenizer::SentenceTokenizer(
 SentenceTokenizer::~SentenceTokenizer() {
 }
 
-TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence) {
+TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
+                                              bool byWhitespace) {
     TokenizedSentence result(sentence);
-    _htmlTags->apply(result);
-    BOOST_FOREACH(RegexRule & neRule, _namedEntities) {
-        neRule.apply(result);
-    }
-    result.toLowerCase();
-    if (_stopWordsEnabled) {
-        _stopWords->apply(result);
-    }
-    boost::shared_ptr<RegexRule> wordsRule(
-        new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}",
-            TokenAnnotation::WORD, ""));
-    wordsRule->apply(result);
-    boost::shared_ptr<RegexRule> singleLetterWordsRule(
-        new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
-    singleLetterWordsRule->apply(result);
+    if (byWhitespace) {
+        boost::shared_ptr<RegexRule> whitespaceRule(
+            new RegexRule("\\S+",
+                TokenAnnotation::WORD, ""));
+        whitespaceRule->apply(result);
+    } else {
+        _htmlTags->apply(result);
+        BOOST_FOREACH(RegexRule & neRule, _namedEntities) {
+            neRule.apply(result);
+        }
+        result.toLowerCase();
+        if (_stopWordsEnabled) {
+            _stopWords->apply(result);
+        }
+        boost::shared_ptr<RegexRule> wordsRule(
+            new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}",
+                TokenAnnotation::WORD, ""));
+        wordsRule->apply(result);
+        boost::shared_ptr<RegexRule> singleLetterWordsRule(
+            new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
+        singleLetterWordsRule->apply(result);
+    }
     return result;
 }
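
The whitespace branch reduces the whole pipeline to a single \S+ rule: each maximal run of non-whitespace characters becomes a WORD token, with no lower-casing, stop-word removal, or named-entity handling. A standalone sketch of what that pattern matches, using plain Boost.Regex rather than Concordia's RegexRule (the offsets mirror the getStart()/getEnd() values checked in the test case below):

#include <iostream>
#include <string>
#include <boost/regex.hpp>

int main() {
    std::string sentence = "Ala 23 --- ..//,./ '''8902347 posiada kota";
    boost::regex whitespaceTokens("\\S+");  // maximal runs of non-whitespace

    boost::sregex_iterator it(sentence.begin(), sentence.end(), whitespaceTokens);
    boost::sregex_iterator end;
    for (; it != end; ++it) {
        // Prints e.g. "4-6: 23" and "11-18: ..//,./" for this sentence.
        std::cout << it->position() << "-"
                  << it->position() + it->length() << ": "
                  << it->str() << std::endl;
    }
    return 0;
}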

View File

@@ -34,9 +34,11 @@ public:
     /*! Tokenizes the sentence.
      \param sentence input sentence
+     \param byWhitespace whether to tokenize the sentence by whitespace
      \returns tokenized sentence object built on the input sentence
    */
-    TokenizedSentence tokenize(const std::string & sentence);
+    TokenizedSentence tokenize(const std::string & sentence,
+                               bool byWhitespace = false);
 
 private:
    void _createNeRules(std::string & namedEntitiesPath);

View File

@@ -435,4 +435,26 @@ BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences )
 }
 
+BOOST_AUTO_TEST_CASE( TokenizeWhitespace )
+{
+    Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
+        TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
+    TokenizedSentence ts = concordia.tokenize(
+        "Ala 23 --- ..//,./ '''8902347 posiada kota", true);
+    BOOST_CHECK_EQUAL(ts.getTokens().size(), 7);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 4);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23");
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./");
+    concordia.clearIndex();
+}
+
 BOOST_AUTO_TEST_SUITE_END()
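
The assertions above pin down the contract: token offsets index into the original string as half-open spans ("23" spans [4,6) and "..//,./" spans [11,18)), and getType() == 1 is the value the whitespace rule annotates as TokenAnnotation::WORD. A sketch of inspecting those spans directly, with the same placeholder paths as earlier:

#include <iostream>
#include <string>
#include "concordia/concordia.hpp"

int main() {
    Concordia concordia("/tmp/concordia", "concordia.cfg");  // placeholders
    TokenizedSentence ts = concordia.tokenize(
        "Ala 23 --- ..//,./ '''8902347 posiada kota", true);

    // Print each token's original character span and surface form.
    for (size_t i = 0; i < ts.getTokens().size(); ++i) {
        std::cout << ts.getTokens().at(i).getStart() << "-"
                  << ts.getTokens().at(i).getEnd() << ": "
                  << ts.getTokens().at(i).getValue() << std::endl;
    }
    return 0;
}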