From 0a8d2fdd393f7a936496152ee6812a18d49b589f Mon Sep 17 00:00:00 2001 From: rjawor Date: Sun, 27 Dec 2015 20:54:40 +0100 Subject: [PATCH] tokenize by whitespace option --- concordia/concordia.cpp | 10 ++++--- concordia/concordia.hpp | 8 ++++-- concordia/hash_generator.cpp | 5 ++-- concordia/hash_generator.hpp | 4 ++- concordia/sentence_tokenizer.cpp | 46 +++++++++++++++++++------------- concordia/sentence_tokenizer.hpp | 4 ++- concordia/t/test_concordia.cpp | 22 +++++++++++++++ 7 files changed, 70 insertions(+), 29 deletions(-) diff --git a/concordia/concordia.cpp b/concordia/concordia.cpp index dcebd23..4ff2f5e 100644 --- a/concordia/concordia.cpp +++ b/concordia/concordia.cpp @@ -49,20 +49,22 @@ std::string _createLibraryVersion() { } TokenizedSentence - Concordia::tokenize(const std::string & sentence) + Concordia::tokenize(const std::string & sentence, + bool byWhitespace) throw(ConcordiaException) { TokenizedSentence result = - _hashGenerator->generateHash(sentence); + _hashGenerator->generateHash(sentence, byWhitespace); _hashGenerator->serializeWordMap(); return result; } std::vector<TokenizedSentence> Concordia::tokenizeAll( - const std::vector<std::string> & sentences) + const std::vector<std::string> & sentences, + bool byWhitespace) throw(ConcordiaException) { std::vector<TokenizedSentence> result; BOOST_FOREACH(std::string sentence, sentences) { - result.push_back(_hashGenerator->generateHash(sentence)); + result.push_back(_hashGenerator->generateHash(sentence, byWhitespace)); } _hashGenerator->serializeWordMap(); diff --git a/concordia/concordia.hpp b/concordia/concordia.hpp index 2ecc725..f3fdaed 100644 --- a/concordia/concordia.hpp +++ b/concordia/concordia.hpp @@ -60,20 +60,24 @@ public: /*! Tokenizes the given sentence. 
\param sentence sentence to be tokenized + \param byWhitespace whether to tokenize the sentence by whitespace \returns tokenized sentence object, containing information about original word positions \throws ConcordiaException */ - TokenizedSentence tokenize(const std::string & sentence) + TokenizedSentence tokenize(const std::string & sentence, + bool byWhitespace = false) throw(ConcordiaException); /*! Tokenizes all the given sentences. \param sentences vector of sentences to be tokenized + \param byWhitespace whether to tokenize the sentence by whitespace \returns vector of tokenized sentence objects \throws ConcordiaException */ std::vector<TokenizedSentence> tokenizeAll( - const std::vector<std::string> & sentences) + const std::vector<std::string> & sentences, + bool byWhitespace = false) throw(ConcordiaException); /*! Adds an Example to the index. diff --git a/concordia/hash_generator.cpp b/concordia/hash_generator.cpp index 04c7f3c..9a2a605 100644 --- a/concordia/hash_generator.cpp +++ b/concordia/hash_generator.cpp @@ -29,8 +29,9 @@ HashGenerator::~HashGenerator() { } TokenizedSentence HashGenerator::generateHash( - const std::string & sentence) throw(ConcordiaException) { - TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence); + const std::string & sentence, + bool byWhitespace) throw(ConcordiaException) { + TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence, byWhitespace); ts.generateHash(_wordMap); if (ts.getTokens().size() > Utils::maxSentenceSize) { diff --git a/concordia/hash_generator.hpp b/concordia/hash_generator.hpp index 8c308c1..adf4df2 100644 --- a/concordia/hash_generator.hpp +++ b/concordia/hash_generator.hpp @@ -44,9 +44,11 @@ public: /*! Generates hash of a sentence. 
\param sentence sentence to generate hash from + \param byWhitespace whether to tokenize the sentence by whitespace \returns tokenized sentence, containing the hash */ - TokenizedSentence generateHash(const std::string & sentence) + TokenizedSentence generateHash(const std::string & sentence, + bool byWhitespace = false) throw(ConcordiaException); /*! diff --git a/concordia/sentence_tokenizer.cpp b/concordia/sentence_tokenizer.cpp index 0666a5d..9c7b3b6 100644 --- a/concordia/sentence_tokenizer.cpp +++ b/concordia/sentence_tokenizer.cpp @@ -24,29 +24,37 @@ SentenceTokenizer::SentenceTokenizer( SentenceTokenizer::~SentenceTokenizer() { } -TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence) { +TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence, + bool byWhitespace) { TokenizedSentence result(sentence); - _htmlTags->apply(result); + if(byWhitespace) { + boost::shared_ptr<RegexRule> whitespaceRule( + new RegexRule("\\S+", + TokenAnnotation::WORD, "")); + whitespaceRule->apply(result); + } else { + _htmlTags->apply(result); - BOOST_FOREACH(RegexRule & neRule, _namedEntities) { - neRule.apply(result); + BOOST_FOREACH(RegexRule & neRule, _namedEntities) { + neRule.apply(result); + } + + result.toLowerCase(); + + if (_stopWordsEnabled) { + _stopWords->apply(result); + } + + boost::shared_ptr<RegexRule> wordsRule( + new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}", + TokenAnnotation::WORD, "")); + wordsRule->apply(result); + boost::shared_ptr<RegexRule> singleLetterWordsRule( + new RegexRule("\\p{L}", TokenAnnotation::WORD, "")); + singleLetterWordsRule->apply(result); } - - result.toLowerCase(); - - if (_stopWordsEnabled) { - _stopWords->apply(result); - } - - boost::shared_ptr<RegexRule> wordsRule( - new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}", - TokenAnnotation::WORD, "")); - wordsRule->apply(result); - boost::shared_ptr<RegexRule> singleLetterWordsRule( - new RegexRule("\\p{L}", TokenAnnotation::WORD, "")); - singleLetterWordsRule->apply(result); - + return result; } diff --git 
a/concordia/sentence_tokenizer.hpp b/concordia/sentence_tokenizer.hpp index 6d92f1c..31e0de0 100644 --- a/concordia/sentence_tokenizer.hpp +++ b/concordia/sentence_tokenizer.hpp @@ -34,9 +34,11 @@ public: /*! Tokenizes the sentence. \param sentence input sentence + \param byWhitespace whether to tokenize the sentence by whitespace \returns tokenized sentence object build on the input sentence */ - TokenizedSentence tokenize(const std::string & sentence); + TokenizedSentence tokenize(const std::string & sentence, + bool byWhitespace = false); private: void _createNeRules(std::string & namedEntitiesPath); diff --git a/concordia/t/test_concordia.cpp b/concordia/t/test_concordia.cpp index 178f22c..9c9675b 100644 --- a/concordia/t/test_concordia.cpp +++ b/concordia/t/test_concordia.cpp @@ -435,4 +435,26 @@ BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences ) } +BOOST_AUTO_TEST_CASE( TokenizeWhitespace ) +{ + Concordia concordia = Concordia(TestResourcesManager::getTempPath(), + TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); + TokenizedSentence ts = concordia.tokenize("Ala 23 --- ..//,./ '''8902347 posiada kota", true); + + BOOST_CHECK_EQUAL(ts.getTokens().size(), 7); + BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 4); + BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6); + BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1); + BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23"); + + BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11); + BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18); + BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1); + BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./"); + + concordia.clearIndex(); + +} + + BOOST_AUTO_TEST_SUITE_END()