tokenize by whitespace option

rjawor 2015-12-27 20:54:40 +01:00
parent 873d7c300c
commit 0a8d2fdd39
7 changed files with 70 additions and 29 deletions


@@ -49,20 +49,22 @@ std::string _createLibraryVersion() {
 }
 
 TokenizedSentence
-Concordia::tokenize(const std::string & sentence)
+Concordia::tokenize(const std::string & sentence,
+                    bool byWhitespace)
     throw(ConcordiaException) {
     TokenizedSentence result =
-        _hashGenerator->generateHash(sentence);
+        _hashGenerator->generateHash(sentence, byWhitespace);
     _hashGenerator->serializeWordMap();
     return result;
 }
 
 std::vector<TokenizedSentence> Concordia::tokenizeAll(
-    const std::vector<std::string> & sentences)
+    const std::vector<std::string> & sentences,
+    bool byWhitespace)
     throw(ConcordiaException) {
     std::vector<TokenizedSentence> result;
 
     BOOST_FOREACH(std::string sentence, sentences) {
-        result.push_back(_hashGenerator->generateHash(sentence));
+        result.push_back(_hashGenerator->generateHash(sentence, byWhitespace));
     }
     _hashGenerator->serializeWordMap();
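
Both public methods now thread the flag down to the hash generator. A minimal usage sketch (the paths and the main() harness are illustrative assumptions; only the tokenize/tokenizeAll signatures and the two-argument Concordia constructor are taken from this commit and its test below):

#include <concordia/concordia.hpp>  // header path is an assumption

#include <iostream>
#include <string>
#include <vector>

int main() {
    // Hypothetical resource paths; the constructor takes a working
    // directory and a config file path, as in the new test case below.
    Concordia concordia("/tmp/concordia", "/path/to/concordia.cfg");

    // Default (byWhitespace = false): the full rule-based pipeline.
    TokenizedSentence rules = concordia.tokenize("Alice has a cat");

    // Opt-in whitespace mode: split on whitespace only.
    TokenizedSentence plain = concordia.tokenize("Alice has a cat", true);

    // tokenizeAll applies the same flag to every sentence in the batch.
    std::vector<std::string> sentences;
    sentences.push_back("first sentence");
    sentences.push_back("second sentence");
    std::vector<TokenizedSentence> all = concordia.tokenizeAll(sentences, true);

    std::cout << plain.getTokens().size() << " tokens" << std::endl;
    return 0;
}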


@@ -60,20 +60,24 @@ public:
     /*! Tokenizes the given sentence.
       \param sentence sentence to be tokenized
+      \param byWhitespace whether to tokenize the sentence by whitespace
       \returns tokenized sentence object,
           containing information about original word positions
       \throws ConcordiaException
     */
-    TokenizedSentence tokenize(const std::string & sentence)
+    TokenizedSentence tokenize(const std::string & sentence,
+                               bool byWhitespace = false)
                     throw(ConcordiaException);
 
     /*! Tokenizes all the given sentences.
       \param sentences vector of sentences to be tokenized
+      \param byWhitespace whether to tokenize the sentence by whitespace
       \returns vector of tokenized sentence objects
       \throws ConcordiaException
     */
     std::vector<TokenizedSentence> tokenizeAll(
-        const std::vector<std::string> & sentences)
+        const std::vector<std::string> & sentences,
+        bool byWhitespace = false)
                     throw(ConcordiaException);
 
     /*! Adds an Example to the index.
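
Because byWhitespace defaults to false in both declarations, the change is backward compatible: existing call sites compile unchanged and keep the rule-based behaviour.

    concordia.tokenize(sentence);        // as before: rule cascade
    concordia.tokenize(sentence, true);  // new: whitespace splitting only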


@@ -29,8 +29,9 @@ HashGenerator::~HashGenerator() {
 }
 
 TokenizedSentence HashGenerator::generateHash(
-    const std::string & sentence) throw(ConcordiaException) {
-    TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence);
+    const std::string & sentence,
+    bool byWhitespace) throw(ConcordiaException) {
+    TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence, byWhitespace);
     ts.generateHash(_wordMap);
 
     if (ts.getTokens().size() > Utils::maxSentenceSize) {


@@ -44,9 +44,11 @@ public:
     /*!
      Generates hash of a sentence.
      \param sentence sentence to generate hash from
+     \param byWhitespace whether to tokenize the sentence by whitespace
      \returns tokenized sentence, containing the hash
    */
-    TokenizedSentence generateHash(const std::string & sentence)
+    TokenizedSentence generateHash(const std::string & sentence,
+                                   bool byWhitespace = false)
                     throw(ConcordiaException);
 
     /*!


@@ -24,9 +24,16 @@ SentenceTokenizer::SentenceTokenizer(
 SentenceTokenizer::~SentenceTokenizer() {
 }
 
-TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence) {
+TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
+                                              bool byWhitespace) {
     TokenizedSentence result(sentence);
 
+    if(byWhitespace) {
+        boost::shared_ptr<RegexRule> whitespaceRule(
+            new RegexRule("\\S+",
+                          TokenAnnotation::WORD, ""));
+        whitespaceRule->apply(result);
+    } else {
     _htmlTags->apply(result);
 
     BOOST_FOREACH(RegexRule & neRule, _namedEntities) {
@@ -46,6 +53,7 @@ TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
     boost::shared_ptr<RegexRule> singleLetterWordsRule(
         new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
     singleLetterWordsRule->apply(result);
+    }
 
     return result;
 }
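
In whitespace mode the entire rule cascade (HTML tags, named entities, single-letter words) is bypassed in favour of one rule: every maximal run of non-whitespace characters (\S+) becomes a WORD token. A self-contained sketch of the same idea, using std::regex for illustration rather than Concordia's Boost-based RegexRule:

#include <iostream>
#include <regex>
#include <string>

int main() {
    std::string sentence = "Ala 23 --- ..//,./ '''8902347 posiada kota";
    std::regex nonWhitespace("\\S+");

    // Each match becomes one token; position()/length() mirror the
    // start/end offsets that TokenizedSentence records per token.
    for (std::sregex_iterator it(sentence.begin(), sentence.end(), nonWhitespace), end;
         it != end; ++it) {
        std::cout << it->position() << "-" << it->position() + it->length()
                  << ": " << it->str() << std::endl;
    }
    return 0;  // prints 7 tokens, e.g. "4-6: 23" and "11-18: ..//,./"
}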


@@ -34,9 +34,11 @@ public:
     /*! Tokenizes the sentence.
      \param sentence input sentence
+     \param byWhitespace whether to tokenize the sentence by whitespace
      \returns tokenized sentence object built on the input sentence
    */
-    TokenizedSentence tokenize(const std::string & sentence);
+    TokenizedSentence tokenize(const std::string & sentence,
+                               bool byWhitespace = false);
 
 private:
     void _createNeRules(std::string & namedEntitiesPath);


@@ -435,4 +435,26 @@ BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences )
 }
 
+BOOST_AUTO_TEST_CASE( TokenizeWhitespace )
+{
+    Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
+        TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
+    TokenizedSentence ts = concordia.tokenize("Ala 23 --- ..//,./ '''8902347 posiada kota", true);
+    BOOST_CHECK_EQUAL(ts.getTokens().size(), 7);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 4);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23");
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./");
+    concordia.clearIndex();
+}
+
 BOOST_AUTO_TEST_SUITE_END()
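
Worked through by hand, the fixture string splits into exactly seven whitespace-delimited tokens; counting characters from zero, token 1 ("23") spans [4, 6) and token 3 ("..//,./") spans [11, 18), which is precisely what the getStart()/getEnd() assertions encode. The expected getType() of 1 presumably corresponds to TokenAnnotation::WORD, the annotation assigned by the whitespace rule.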