From 0a8d2fdd393f7a936496152ee6812a18d49b589f Mon Sep 17 00:00:00 2001 From: rjawor Date: Sun, 27 Dec 2015 20:54:40 +0100 Subject: [PATCH] tokenize by whitespace option --- concordia/concordia.cpp | 10 ++++--- concordia/concordia.hpp | 8 ++++-- concordia/hash_generator.cpp | 5 ++-- concordia/hash_generator.hpp | 4 ++- concordia/sentence_tokenizer.cpp | 46 +++++++++++++++++++------------- concordia/sentence_tokenizer.hpp | 4 ++- concordia/t/test_concordia.cpp | 22 +++++++++++++++ 7 files changed, 70 insertions(+), 29 deletions(-) diff --git a/concordia/concordia.cpp b/concordia/concordia.cpp index dcebd23..4ff2f5e 100644 --- a/concordia/concordia.cpp +++ b/concordia/concordia.cpp @@ -49,20 +49,22 @@ std::string _createLibraryVersion() { } TokenizedSentence - Concordia::tokenize(const std::string & sentence) + Concordia::tokenize(const std::string & sentence, + bool byWhitespace) throw(ConcordiaException) { TokenizedSentence result = - _hashGenerator->generateHash(sentence); + _hashGenerator->generateHash(sentence, byWhitespace); _hashGenerator->serializeWordMap(); return result; } std::vector<TokenizedSentence> Concordia::tokenizeAll( - const std::vector<std::string> & sentences) + const std::vector<std::string> & sentences, + bool byWhitespace) throw(ConcordiaException) { std::vector<TokenizedSentence> result; BOOST_FOREACH(std::string sentence, sentences) { - result.push_back(_hashGenerator->generateHash(sentence)); + result.push_back(_hashGenerator->generateHash(sentence, byWhitespace)); } _hashGenerator->serializeWordMap(); diff --git a/concordia/concordia.hpp b/concordia/concordia.hpp index 2ecc725..f3fdaed 100644 --- a/concordia/concordia.hpp +++ b/concordia/concordia.hpp @@ -60,20 +60,24 @@ public: /*! Tokenizes the given sentence. 
\param sentence sentence to be tokenized + \param byWhitespace whether to tokenize the sentence by whitespace \returns tokenized sentence object, containing information about original word positions \throws ConcordiaException */ - TokenizedSentence tokenize(const std::string & sentence) + TokenizedSentence tokenize(const std::string & sentence, + bool byWhitespace = false) throw(ConcordiaException); /*! Tokenizes all the given sentences. \param sentences vector of sentences to be tokenized + \param byWhitespace whether to tokenize the sentence by whitespace \returns vector of tokenized sentence objects \throws ConcordiaException */ std::vector<TokenizedSentence> tokenizeAll( - const std::vector<std::string> & sentences) + const std::vector<std::string> & sentences, + bool byWhitespace = false) throw(ConcordiaException); /*! Adds an Example to the index. diff --git a/concordia/hash_generator.cpp b/concordia/hash_generator.cpp index 04c7f3c..9a2a605 100644 --- a/concordia/hash_generator.cpp +++ b/concordia/hash_generator.cpp @@ -29,8 +29,9 @@ HashGenerator::~HashGenerator() { } TokenizedSentence HashGenerator::generateHash( - const std::string & sentence) throw(ConcordiaException) { - TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence); + const std::string & sentence, + bool byWhitespace) throw(ConcordiaException) { + TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence, byWhitespace); ts.generateHash(_wordMap); if (ts.getTokens().size() > Utils::maxSentenceSize) { diff --git a/concordia/hash_generator.hpp b/concordia/hash_generator.hpp index 8c308c1..adf4df2 100644 --- a/concordia/hash_generator.hpp +++ b/concordia/hash_generator.hpp @@ -44,9 +44,11 @@ public: /*! Generates hash of a sentence. 
\param sentence sentence to generate hash from + \param byWhitespace whether to tokenize the sentence by whitespace \returns tokenized sentence, containing the hash */ - TokenizedSentence generateHash(const std::string & sentence) + TokenizedSentence generateHash(const std::string & sentence, + bool byWhitespace = false) throw(ConcordiaException); /*! diff --git a/concordia/sentence_tokenizer.cpp b/concordia/sentence_tokenizer.cpp index 0666a5d..9c7b3b6 100644 --- a/concordia/sentence_tokenizer.cpp +++ b/concordia/sentence_tokenizer.cpp @@ -24,29 +24,37 @@ SentenceTokenizer::SentenceTokenizer( SentenceTokenizer::~SentenceTokenizer() { } -TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence) { +TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence, + bool byWhitespace) { TokenizedSentence result(sentence); - _htmlTags->apply(result); + if(byWhitespace) { + boost::shared_ptr<RegexRule> whitespaceRule( + new RegexRule("\\S+", + TokenAnnotation::WORD, "")); + whitespaceRule->apply(result); + } else { + _htmlTags->apply(result); - BOOST_FOREACH(RegexRule & neRule, _namedEntities) { - neRule.apply(result); + BOOST_FOREACH(RegexRule & neRule, _namedEntities) { + neRule.apply(result); + } + + result.toLowerCase(); + + if (_stopWordsEnabled) { + _stopWords->apply(result); + } + + boost::shared_ptr<RegexRule> wordsRule( + new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}", + TokenAnnotation::WORD, "")); + wordsRule->apply(result); + boost::shared_ptr<RegexRule> singleLetterWordsRule( + new RegexRule("\\p{L}", TokenAnnotation::WORD, "")); + singleLetterWordsRule->apply(result); } - - result.toLowerCase(); - - if (_stopWordsEnabled) { - _stopWords->apply(result); - } - - boost::shared_ptr<RegexRule> wordsRule( - new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}", - TokenAnnotation::WORD, "")); - wordsRule->apply(result); - boost::shared_ptr<RegexRule> singleLetterWordsRule( - new RegexRule("\\p{L}", TokenAnnotation::WORD, "")); - singleLetterWordsRule->apply(result); - + return result; } diff --git 
a/concordia/sentence_tokenizer.hpp b/concordia/sentence_tokenizer.hpp index 6d92f1c..31e0de0 100644 --- a/concordia/sentence_tokenizer.hpp +++ b/concordia/sentence_tokenizer.hpp @@ -34,9 +34,11 @@ public: /*! Tokenizes the sentence. \param sentence input sentence + \param byWhitespace whether to tokenize the sentence by whitespace \returns tokenized sentence object build on the input sentence */ - TokenizedSentence tokenize(const std::string & sentence); + TokenizedSentence tokenize(const std::string & sentence, + bool byWhitespace = false); private: void _createNeRules(std::string & namedEntitiesPath); diff --git a/concordia/t/test_concordia.cpp b/concordia/t/test_concordia.cpp index 178f22c..9c9675b 100644 --- a/concordia/t/test_concordia.cpp +++ b/concordia/t/test_concordia.cpp @@ -435,4 +435,26 @@ BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences ) } +BOOST_AUTO_TEST_CASE( TokenizeWhitespace ) +{ + Concordia concordia = Concordia(TestResourcesManager::getTempPath(), + TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); + TokenizedSentence ts = concordia.tokenize("Ala 23 --- ..//,./ '''8902347 posiada kota", true); + + BOOST_CHECK_EQUAL(ts.getTokens().size(), 7); + BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 4); + BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6); + BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1); + BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23"); + + BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11); + BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18); + BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1); + BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./"); + + concordia.clearIndex(); + +} + + BOOST_AUTO_TEST_SUITE_END()