tokenize only option - no word map

rjawor 2016-01-01 20:45:07 +01:00
parent bbf3853d2a
commit b3d7c993aa
7 changed files with 96 additions and 14 deletions

View File

@@ -50,24 +50,37 @@ std::string _createLibraryVersion() {
 TokenizedSentence
 Concordia::tokenize(const std::string & sentence,
-                    bool byWhitespace)
+                    bool byWhitespace,
+                    bool generateCodes)
     throw(ConcordiaException) {
+    if (generateCodes) {
+        TokenizedSentence result =
+            _hashGenerator->generateHash(sentence, byWhitespace);
+        _hashGenerator->serializeWordMap();
+        return result;
+    } else {
+        return _hashGenerator->generateTokens(sentence, byWhitespace);
+    }
 }
 std::vector<TokenizedSentence> Concordia::tokenizeAll(
     const std::vector<std::string> & sentences,
-    bool byWhitespace)
+    bool byWhitespace,
+    bool generateCodes)
     throw(ConcordiaException) {
     std::vector<TokenizedSentence> result;
+    if (generateCodes) {
+        BOOST_FOREACH(std::string sentence, sentences) {
+            result.push_back(_hashGenerator->generateHash(sentence, byWhitespace));
+        }
+        _hashGenerator->serializeWordMap();
+    } else {
+        BOOST_FOREACH(std::string sentence, sentences) {
+            result.push_back(_hashGenerator->generateTokens(sentence, byWhitespace));
+        }
+    }
     return result;
 }
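For orientation, here is a minimal caller-side sketch of the two modes introduced above. The header path, resource paths and sample sentence are placeholders, not taken from this commit; the constructor and accessors follow the TokenizeOnly test case later in this commit.

#include <concordia/concordia.hpp>  // include path is a guess

#include <iostream>

int main() {
    // Placeholder paths; the tests build these via TestResourcesManager.
    Concordia concordia("/tmp/concordia", "/tmp/concordia.cfg");

    // generateCodes = true (the default): tokens are coded through the
    // WordMap, which is also serialized to disk as a side effect.
    TokenizedSentence coded = concordia.tokenize("Ala posiada kota", true, true);

    // generateCodes = false: tokenization only; no codes, WordMap untouched.
    TokenizedSentence plain = concordia.tokenize("Ala posiada kota", true, false);

    std::cout << coded.getCodes().size() << " codes with hashing, "
              << plain.getCodes().size() << " codes in tokenize-only mode"
              << std::endl;
    return 0;
}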

View File

@@ -61,23 +61,27 @@ public:
     /*! Tokenizes the given sentence.
       \param sentence sentence to be tokenized
       \param byWhitespace whether to tokenize the sentence by whitespace
+      \param generateCodes whether to generate codes for tokens using WordMap
       \returns tokenized sentence object,
               containing information about original word positions
       \throws ConcordiaException
     */
     TokenizedSentence tokenize(const std::string & sentence,
-                               bool byWhitespace = false)
+                               bool byWhitespace = false,
+                               bool generateCodes = true)
                                throw(ConcordiaException);
 
     /*! Tokenizes all the given sentences.
       \param sentences vector of sentences to be tokenized
       \param byWhitespace whether to tokenize the sentence by whitespace
+      \param generateCodes whether to generate codes for tokens using WordMap
       \returns vector of tokenized sentence objects
       \throws ConcordiaException
     */
     std::vector<TokenizedSentence> tokenizeAll(
         const std::vector<std::string> & sentences,
-        bool byWhitespace = false)
+        bool byWhitespace = false,
+        bool generateCodes = true)
                                throw(ConcordiaException);
 
     /*! Adds an Example to the index.
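Returning to the tokenize and tokenizeAll declarations above, a short caller-side sketch of the defaulted parameters (it assumes a constructed Concordia instance named concordia, as in the tests):

std::vector<std::string> sentences;
sentences.push_back("Ala posiada kota");
sentences.push_back("kot posiada Ale");

// Defaults apply: byWhitespace = false, generateCodes = true.
std::vector<TokenizedSentence> coded = concordia.tokenizeAll(sentences);

// Whitespace tokenization only, without generating codes or touching the WordMap.
std::vector<TokenizedSentence> plain = concordia.tokenizeAll(sentences, true, false);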

View File

@@ -41,6 +41,19 @@ TokenizedSentence HashGenerator::generateHash(
     return ts;
 }
 
+TokenizedSentence HashGenerator::generateTokens(
+    const std::string & sentence,
+    bool byWhitespace) throw(ConcordiaException) {
+    TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence, byWhitespace);
+    ts.generateTokens();
+
+    if (ts.getTokens().size() > Utils::maxSentenceSize) {
+        throw ConcordiaException("Trying to add too long sentence.");
+    }
+
+    return ts;
+}
+
 void HashGenerator::serializeWordMap() {
     std::ofstream ofs(_wordMapFilePath.c_str(), std::ios::binary);
     boost::archive::binary_oarchive oa(ofs);
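serializeWordMap() relies on Boost.Serialization's binary archive, as the two lines above show. A self-contained sketch of the same write/read pattern on a plain std::map follows; the real WordMap is a dedicated class with its own serialization, so this only approximates the mechanism.

#include <boost/archive/binary_iarchive.hpp>
#include <boost/archive/binary_oarchive.hpp>
#include <boost/serialization/map.hpp>
#include <boost/serialization/string.hpp>

#include <fstream>
#include <map>
#include <string>

int main() {
    std::map<std::string, int> wordMap;
    wordMap["Ala"] = 0;
    wordMap["posiada"] = 1;

    {   // Write the map to disk, mirroring HashGenerator::serializeWordMap().
        std::ofstream ofs("word_map.bin", std::ios::binary);
        boost::archive::binary_oarchive oa(ofs);
        oa << wordMap;
    }

    {   // Read it back, as the generator would when restoring its state.
        std::map<std::string, int> restored;
        std::ifstream ifs("word_map.bin", std::ios::binary);
        boost::archive::binary_iarchive ia(ifs);
        ia >> restored;
    }
    return 0;
}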

View File

@@ -51,6 +51,17 @@ public:
                                      bool byWhitespace = false)
                                      throw(ConcordiaException);
 
+    /*!
+      This method acts like generateHash, but only performs tokenization.
+      The resulting TokenizedSentence carries no token code information.
+      \param sentence sentence to tokenize
+      \param byWhitespace whether to tokenize the sentence by whitespace
+      \returns tokenized sentence, containing the tokens
+    */
+    TokenizedSentence generateTokens(const std::string & sentence,
+                                     bool byWhitespace = false)
+                                     throw(ConcordiaException);
+
     /*!
       Saves the contents of current WordMap to HDD.
     */
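A small sketch contrasting generateHash and generateTokens from the caller's perspective. It assumes access to a constructed HashGenerator; in the library this object is held as the _hashGenerator member of Concordia, so ordinary callers go through Concordia::tokenize instead.

TokenizedSentence coded = hashGenerator.generateHash("Ala posiada kota", true);
// coded.getTokens() and coded.getCodes() are both populated,
// and new words have been entered into the WordMap.

TokenizedSentence raw = hashGenerator.generateTokens("Ala posiada kota", true);
// raw.getTokens() is populated, raw.getCodes() stays empty,
// and the WordMap is not consulted or modified.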

View File

@@ -452,9 +452,33 @@ BOOST_AUTO_TEST_CASE( TokenizeWhitespace )
     BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
     BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./");
     BOOST_CHECK_EQUAL(ts.getCodes().size(), 7);
     concordia.clearIndex();
 }
 
+BOOST_AUTO_TEST_CASE( TokenizeOnly )
+{
+    Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
+            TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
+    TokenizedSentence ts = concordia.tokenize("Ala 23 --- ..//,./ '''8902347 posiada kota", true, false);
+
+    BOOST_CHECK_EQUAL(ts.getTokens().size(), 7);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 4);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23");
+
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./");
+
+    BOOST_CHECK_EQUAL(ts.getCodes().size(), 0);  // there should be no codes, only tokens
+
+    concordia.clearIndex();
+}
+
 BOOST_AUTO_TEST_SUITE_END()
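The start and end values checked in TokenizeOnly appear to be character offsets into the original sentence, with end exclusive: in the test sentence, "23" occupies characters 4 and 5, hence start 4 and end 6. A fragment that prints all token spans, reusing only accessors that appear in the tests (it assumes the same test setup plus an extra <iostream> include):

TokenizedSentence ts = concordia.tokenize(
    "Ala 23 --- ..//,./ '''8902347 posiada kota", true, false);
for (size_t i = 0; i < ts.getTokens().size(); ++i) {
    std::cout << ts.getTokens().at(i).getValue() << " ["
              << ts.getTokens().at(i).getStart() << ", "
              << ts.getTokens().at(i).getEnd() << ")" << std::endl;
}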

View File

@@ -64,3 +64,12 @@ void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
     }
 }
 
+void TokenizedSentence::generateTokens() {
+    BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {
+        if (annotation.getType() == TokenAnnotation::WORD ||
+            annotation.getType() == TokenAnnotation::NE) {
+            _tokens.push_back(annotation);
+        }
+    }
+}

View File

@@ -67,13 +67,21 @@ public:
     /*! Method for generating hash based on annotations.
       This method takes into account annotations of type
       word and named entity. These are encoded and added
-      to to code list. Annotations corresponding to these
+      to the code list. Annotations corresponding to these
       tokens are added to the tokens list.
       \param wordMap word map to use when encoding tokens
       \returns tokens list
     */
     void generateHash(boost::shared_ptr<WordMap> wordMap);
 
+    /*! Method for generating tokens based on annotations.
+      This method takes into account annotations of type
+      word and named entity. Unlike in generateHash, these are
+      not encoded or added to the code list. Annotations
+      corresponding to these tokens are added to the tokens list.
+    */
+    void generateTokens();
+
     /*!
       Transform the sentence to lower case.
     */
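The WORD / NE filtering rule documented for generateTokens above is small enough to show in isolation. The sketch below uses a hypothetical Annotation struct instead of Concordia's TokenAnnotation, purely to illustrate the selection logic:

#include <vector>

// Hypothetical stand-in for TokenAnnotation; the enum values beyond
// WORD and NE are made up for illustration.
struct Annotation {
    enum Type { WORD, NE, STOP_WORD, PUNCTUATION };
    Type type;
};

// Keep only word and named-entity annotations, mirroring
// TokenizedSentence::generateTokens().
std::vector<Annotation> keepTokenAnnotations(
        const std::vector<Annotation> & annotations) {
    std::vector<Annotation> tokens;
    for (std::vector<Annotation>::const_iterator it = annotations.begin();
         it != annotations.end(); ++it) {
        if (it->type == Annotation::WORD || it->type == Annotation::NE) {
            tokens.push_back(*it);
        }
    }
    return tokens;
}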