From b3d7c993aa69d73e3982fa129d6f246fc49cbeb7 Mon Sep 17 00:00:00 2001
From: rjawor
Date: Fri, 1 Jan 2016 20:45:07 +0100
Subject: [PATCH] tokenize only option - no word map

---
 concordia/concordia.cpp          | 33 ++++++++++++++++++++++----------
 concordia/concordia.hpp          |  8 ++++++--
 concordia/hash_generator.cpp     | 13 +++++++++++++
 concordia/hash_generator.hpp     | 11 +++++++++++
 concordia/t/test_concordia.cpp   | 24 +++++++++++++++++++++++
 concordia/tokenized_sentence.cpp |  9 +++++++++
 concordia/tokenized_sentence.hpp | 12 ++++++++++--
 7 files changed, 96 insertions(+), 14 deletions(-)

diff --git a/concordia/concordia.cpp b/concordia/concordia.cpp
index 4ff2f5e..6c1949f 100644
--- a/concordia/concordia.cpp
+++ b/concordia/concordia.cpp
@@ -50,24 +50,37 @@ std::string _createLibraryVersion() {
 
 TokenizedSentence Concordia::tokenize(const std::string & sentence,
-                                      bool byWhitespace)
+                                      bool byWhitespace,
+                                      bool generateCodes)
     throw(ConcordiaException) {
-    TokenizedSentence result =
-        _hashGenerator->generateHash(sentence, byWhitespace);
-    _hashGenerator->serializeWordMap();
-    return result;
+    if (generateCodes) {
+        TokenizedSentence result =
+            _hashGenerator->generateHash(sentence, byWhitespace);
+        _hashGenerator->serializeWordMap();
+        return result;
+    } else {
+        return _hashGenerator->generateTokens(sentence, byWhitespace);
+    }
 }
 
 std::vector<TokenizedSentence> Concordia::tokenizeAll(
     const std::vector<std::string> & sentences,
-    bool byWhitespace)
+    bool byWhitespace,
+    bool generateCodes)
     throw(ConcordiaException) {
     std::vector<TokenizedSentence> result;
-    BOOST_FOREACH(std::string sentence, sentences) {
-        result.push_back(_hashGenerator->generateHash(sentence, byWhitespace));
-    }
+
+    if (generateCodes) {
+        BOOST_FOREACH(std::string sentence, sentences) {
+            result.push_back(_hashGenerator->generateHash(sentence, byWhitespace));
+        }
 
-    _hashGenerator->serializeWordMap();
+        _hashGenerator->serializeWordMap();
+    } else {
+        BOOST_FOREACH(std::string sentence, sentences) {
+            result.push_back(_hashGenerator->generateTokens(sentence, byWhitespace));
+        }
+    }
 
     return result;
 }
diff --git a/concordia/concordia.hpp b/concordia/concordia.hpp
index f3fdaed..5b0a8b3 100644
--- a/concordia/concordia.hpp
+++ b/concordia/concordia.hpp
@@ -61,23 +61,27 @@ public:
     /*! Tokenizes the given sentence.
       \param sentence sentence to be tokenized
       \param byWhitespace whether to tokenize the sentence by whitespace
+      \param generateCodes whether to generate codes for tokens using WordMap
       \returns tokenized sentence object,
        containing information about original word positions
       \throws ConcordiaException
     */
     TokenizedSentence tokenize(const std::string & sentence,
-                               bool byWhitespace = false)
+                               bool byWhitespace = false,
+                               bool generateCodes = true)
                                      throw(ConcordiaException);
 
     /*! Tokenizes all the given sentences.
       \param sentences vector of sentences to be tokenized
       \param byWhitespace whether to tokenize the sentence by whitespace
+      \param generateCodes whether to generate codes for tokens using WordMap
       \returns vector of tokenized sentence objects
       \throws ConcordiaException
     */
     std::vector<TokenizedSentence> tokenizeAll(
                                const std::vector<std::string> & sentences,
-                               bool byWhitespace = false)
+                               bool byWhitespace = false,
+                               bool generateCodes = true)
                                      throw(ConcordiaException);
 
     /*! Adds an Example to the index.
diff --git a/concordia/hash_generator.cpp b/concordia/hash_generator.cpp
index 9a2a605..7ba9f16 100644
--- a/concordia/hash_generator.cpp
+++ b/concordia/hash_generator.cpp
@@ -41,6 +41,19 @@ TokenizedSentence HashGenerator::generateHash(
     return ts;
 }
 
+TokenizedSentence HashGenerator::generateTokens(
+    const std::string & sentence,
+    bool byWhitespace) throw(ConcordiaException) {
+    TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence, byWhitespace);
+    ts.generateTokens();
+
+    if (ts.getTokens().size() > Utils::maxSentenceSize) {
+        throw ConcordiaException("Trying to add too long sentence.");
+    }
+
+    return ts;
+}
+
 void HashGenerator::serializeWordMap() {
     std::ofstream ofs(_wordMapFilePath.c_str(), std::ios::binary);
     boost::archive::binary_oarchive oa(ofs);
diff --git a/concordia/hash_generator.hpp b/concordia/hash_generator.hpp
index adf4df2..8d7be36 100644
--- a/concordia/hash_generator.hpp
+++ b/concordia/hash_generator.hpp
@@ -51,6 +51,17 @@ public:
                                  bool byWhitespace = false)
                                      throw(ConcordiaException);
 
+    /*!
+      This method acts like generateHash, but only performs tokenization.
+      The resulting TokenizedSentence does not contain token code information.
+      \param sentence sentence to tokenize
+      \param byWhitespace whether to tokenize the sentence by whitespace
+      \returns tokenized sentence, containing the tokens
+    */
+    TokenizedSentence generateTokens(const std::string & sentence,
+                                     bool byWhitespace = false)
+                                         throw(ConcordiaException);
+
     /*!
       Saves the contents of current WordMap to HDD.
     */
diff --git a/concordia/t/test_concordia.cpp b/concordia/t/test_concordia.cpp
index 9c9675b..b4fe963 100644
--- a/concordia/t/test_concordia.cpp
+++ b/concordia/t/test_concordia.cpp
@@ -452,9 +452,33 @@ BOOST_AUTO_TEST_CASE( TokenizeWhitespace )
     BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
     BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./");
 
+    BOOST_CHECK_EQUAL(ts.getCodes().size(), 7);
+
     concordia.clearIndex();
 }
 
+BOOST_AUTO_TEST_CASE( TokenizeOnly )
+{
+    Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
+        TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
+    TokenizedSentence ts = concordia.tokenize(
+        "Ala 23 --- ..//,./ '''8902347 posiada kota", true, false);
+
+    BOOST_CHECK_EQUAL(ts.getTokens().size(), 7);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 4);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23");
+
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./");
+
+    BOOST_CHECK_EQUAL(ts.getCodes().size(), 0);  // there should be no codes, only tokens
+
+    concordia.clearIndex();
+}
 
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/concordia/tokenized_sentence.cpp b/concordia/tokenized_sentence.cpp
index 964e5e3..a75622c 100644
--- a/concordia/tokenized_sentence.cpp
+++ b/concordia/tokenized_sentence.cpp
@@ -64,3 +64,12 @@ void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
     }
 }
 
+void TokenizedSentence::generateTokens() {
+    BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {
+        if (annotation.getType() == TokenAnnotation::WORD ||
+            annotation.getType() == TokenAnnotation::NE) {
+            _tokens.push_back(annotation);
+        }
+    }
+}
+
diff --git a/concordia/tokenized_sentence.hpp b/concordia/tokenized_sentence.hpp
index 345c5fb..d68e26c 100644
--- a/concordia/tokenized_sentence.hpp
+++ b/concordia/tokenized_sentence.hpp
@@ -67,13 +67,21 @@ public:
     /*! Method for generating hash based on annotations.
       This method takes into account annotations of type
       word and named entity. These are encoded and added
-      to to code list. Annotations corresponding to these
+      to the code list. Annotations corresponding to these
       tokens are added to the tokens list.
       \param wordMap word map to use when encoding tokens
-      \returns tokens list
     */
     void generateHash(boost::shared_ptr<WordMap> wordMap);
 
+    /*! Method for generating tokens based on annotations.
+      This method takes into account annotations of type
+      word and named entity. Unlike in generateHash,
+      these are not encoded or added to the code list.
+      Annotations corresponding to these
+      tokens are added to the tokens list.
+    */
+    void generateTokens();
+
     /*! Transform the sentence to lower case.
     */
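
-- 
Usage note: a minimal sketch of the new generateCodes flag, assuming a
valid storage directory and Concordia config file at the placeholder
paths below (both are illustrative, not part of the patch). tokenize()
may throw ConcordiaException; error handling is omitted for brevity.

    #include <iostream>
    #include "concordia/concordia.hpp"

    int main() {
        // Placeholder paths: point these at a real index directory
        // and concordia.cfg before running.
        Concordia concordia("/tmp/concordia", "/tmp/concordia.cfg");

        // Default behaviour: tokenize and assign word-map codes;
        // the word map is serialized to disk as a side effect.
        TokenizedSentence coded = concordia.tokenize("Ala posiada kota");

        // New option: tokenize only (byWhitespace = false,
        // generateCodes = false); the word map is left untouched.
        TokenizedSentence plain =
            concordia.tokenize("Ala posiada kota", false, false);

        std::cout << plain.getTokens().size() << " tokens, "
                  << plain.getCodes().size() << " codes" << std::endl;
        return 0;
    }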