tokenize only option - no word map

This commit is contained in:
rjawor 2016-01-01 20:45:07 +01:00
parent bbf3853d2a
commit b3d7c993aa
7 changed files with 96 additions and 14 deletions

View File

@@ -50,24 +50,37 @@ std::string _createLibraryVersion() {
TokenizedSentence TokenizedSentence
Concordia::tokenize(const std::string & sentence, Concordia::tokenize(const std::string & sentence,
bool byWhitespace) bool byWhitespace,
bool generateCodes)
throw(ConcordiaException) { throw(ConcordiaException) {
if (generateCodes) {
TokenizedSentence result = TokenizedSentence result =
_hashGenerator->generateHash(sentence, byWhitespace); _hashGenerator->generateHash(sentence, byWhitespace);
_hashGenerator->serializeWordMap(); _hashGenerator->serializeWordMap();
return result; return result;
} else {
return _hashGenerator->generateTokens(sentence, byWhitespace);
}
} }
std::vector<TokenizedSentence> Concordia::tokenizeAll( std::vector<TokenizedSentence> Concordia::tokenizeAll(
const std::vector<std::string> & sentences, const std::vector<std::string> & sentences,
bool byWhitespace) bool byWhitespace,
bool generateCodes)
throw(ConcordiaException) { throw(ConcordiaException) {
std::vector<TokenizedSentence> result; std::vector<TokenizedSentence> result;
if (generateCodes) {
BOOST_FOREACH(std::string sentence, sentences) { BOOST_FOREACH(std::string sentence, sentences) {
result.push_back(_hashGenerator->generateHash(sentence, byWhitespace)); result.push_back(_hashGenerator->generateHash(sentence, byWhitespace));
} }
_hashGenerator->serializeWordMap(); _hashGenerator->serializeWordMap();
} else {
BOOST_FOREACH(std::string sentence, sentences) {
result.push_back(_hashGenerator->generateTokens(sentence, byWhitespace));
}
}
return result; return result;
} }

View File

@@ -61,23 +61,27 @@ public:
/*! Tokenizes the given sentence. /*! Tokenizes the given sentence.
\param sentence sentence to be tokenized \param sentence sentence to be tokenized
\param byWhitespace whether to tokenize the sentence by whitespace \param byWhitespace whether to tokenize the sentence by whitespace
\param generateCodes whether to generate codes for tokens using WordMap
\returns tokenized sentence object, \returns tokenized sentence object,
containing information about original word positions containing information about original word positions
\throws ConcordiaException \throws ConcordiaException
*/ */
TokenizedSentence tokenize(const std::string & sentence, TokenizedSentence tokenize(const std::string & sentence,
bool byWhitespace = false) bool byWhitespace = false,
bool generateCodes = true)
throw(ConcordiaException); throw(ConcordiaException);
/*! Tokenizes all the given sentences. /*! Tokenizes all the given sentences.
\param sentences vector of sentences to be tokenized \param sentences vector of sentences to be tokenized
\param byWhitespace whether to tokenize the sentence by whitespace \param byWhitespace whether to tokenize the sentence by whitespace
\param generateCodes whether to generate codes for tokens using WordMap
\returns vector of tokenized sentence objects \returns vector of tokenized sentence objects
\throws ConcordiaException \throws ConcordiaException
*/ */
std::vector<TokenizedSentence> tokenizeAll( std::vector<TokenizedSentence> tokenizeAll(
const std::vector<std::string> & sentences, const std::vector<std::string> & sentences,
bool byWhitespace = false) bool byWhitespace = false,
bool generateCodes = true)
throw(ConcordiaException); throw(ConcordiaException);
/*! Adds an Example to the index. /*! Adds an Example to the index.

View File

@@ -41,6 +41,19 @@ TokenizedSentence HashGenerator::generateHash(
return ts; return ts;
} }
TokenizedSentence HashGenerator::generateTokens(
const std::string & sentence,
bool byWhitespace) throw(ConcordiaException) {
TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence, byWhitespace);
ts.generateTokens();
if (ts.getTokens().size() > Utils::maxSentenceSize) {
throw ConcordiaException("Trying to add too long sentence.");
}
return ts;
}
void HashGenerator::serializeWordMap() { void HashGenerator::serializeWordMap() {
std::ofstream ofs(_wordMapFilePath.c_str(), std::ios::binary); std::ofstream ofs(_wordMapFilePath.c_str(), std::ios::binary);
boost::archive::binary_oarchive oa(ofs); boost::archive::binary_oarchive oa(ofs);

View File

@@ -51,6 +51,17 @@ public:
bool byWhitespace = false) bool byWhitespace = false)
throw(ConcordiaException); throw(ConcordiaException);
/*!
This method acts like generateHash, but only performs tokenization.
Resulting TokenizedSentence does not have token codes information.
\param sentence sentence to tokenize
\param byWhitespace whether to tokenize the sentence by whitespace
\returns tokenized sentence, containing the tokens
*/
TokenizedSentence generateTokens(const std::string & sentence,
bool byWhitespace = false)
throw(ConcordiaException);
/*! /*!
Saves the contents of current WordMap to HDD. Saves the contents of current WordMap to HDD.
*/ */

View File

@@ -452,9 +452,33 @@ BOOST_AUTO_TEST_CASE( TokenizeWhitespace )
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1); BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./"); BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./");
BOOST_CHECK_EQUAL(ts.getCodes().size(), 7);
concordia.clearIndex(); concordia.clearIndex();
} }
BOOST_AUTO_TEST_CASE( TokenizeOnly )
{
Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
TokenizedSentence ts = concordia.tokenize("Ala 23 --- ..//,./ '''8902347 posiada kota", true, false);
BOOST_CHECK_EQUAL(ts.getTokens().size(), 7);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 4);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23");
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11);
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18);
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./");
BOOST_CHECK_EQUAL(ts.getCodes().size(), 0); //there should be no codes, only tokens
concordia.clearIndex();
}
BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE_END()

View File

@@ -64,3 +64,12 @@ void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
} }
} }
void TokenizedSentence::generateTokens() {
BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {
if (annotation.getType() == TokenAnnotation::WORD ||
annotation.getType() == TokenAnnotation::NE) {
_tokens.push_back(annotation);
}
}
}

View File

@@ -67,13 +67,21 @@ public:
/*! Method for generating hash based on annotations. /*! Method for generating hash based on annotations.
This method takes into account annotations of type This method takes into account annotations of type
word and named entity. These are encoded and added word and named entity. These are encoded and added
to to code list. Annotations corresponding to these to code list. Annotations corresponding to these
tokens are added to the tokens list. tokens are added to the tokens list.
\param wordMap word map to use when encoding tokens \param wordMap word map to use when encoding tokens
\returns tokens list
*/ */
void generateHash(boost::shared_ptr<WordMap> wordMap); void generateHash(boost::shared_ptr<WordMap> wordMap);
/*! Method for generating tokens based on annotations.
This method takes into account annotations of type
word and named entity. Unlike in generateHash,
these are not encoded or added to code list.
Annotations corresponding to these
tokens are added to the tokens list.
*/
void generateTokens();
/*! /*!
Transform the sentence to lower case. Transform the sentence to lower case.
*/ */