From b3d7c993aa69d73e3982fa129d6f246fc49cbeb7 Mon Sep 17 00:00:00 2001
From: rjawor
Date: Fri, 1 Jan 2016 20:45:07 +0100
Subject: [PATCH] tokenize only option - no word map

---
 concordia/concordia.cpp          | 33 ++++++++++++++++++++++----------
 concordia/concordia.hpp          |  8 ++++++--
 concordia/hash_generator.cpp     | 13 +++++++++++++
 concordia/hash_generator.hpp     | 11 +++++++++++
 concordia/t/test_concordia.cpp   | 24 +++++++++++++++++++++++
 concordia/tokenized_sentence.cpp |  9 +++++++++
 concordia/tokenized_sentence.hpp | 12 ++++++++++--
 7 files changed, 96 insertions(+), 14 deletions(-)

diff --git a/concordia/concordia.cpp b/concordia/concordia.cpp
index 4ff2f5e..6c1949f 100644
--- a/concordia/concordia.cpp
+++ b/concordia/concordia.cpp
@@ -50,24 +50,37 @@ std::string _createLibraryVersion() {
 
 TokenizedSentence Concordia::tokenize(const std::string & sentence,
-                                      bool byWhitespace)
+                                      bool byWhitespace,
+                                      bool generateCodes)
     throw(ConcordiaException) {
-    TokenizedSentence result =
-        _hashGenerator->generateHash(sentence, byWhitespace);
-    _hashGenerator->serializeWordMap();
-    return result;
+    if (generateCodes) {
+        TokenizedSentence result =
+            _hashGenerator->generateHash(sentence, byWhitespace);
+        _hashGenerator->serializeWordMap();
+        return result;
+    } else {
+        return _hashGenerator->generateTokens(sentence, byWhitespace);
+    }
 }
 
 std::vector<TokenizedSentence> Concordia::tokenizeAll(
     const std::vector<std::string> & sentences,
-    bool byWhitespace)
+    bool byWhitespace,
+    bool generateCodes)
     throw(ConcordiaException) {
     std::vector<TokenizedSentence> result;
-    BOOST_FOREACH(std::string sentence, sentences) {
-        result.push_back(_hashGenerator->generateHash(sentence, byWhitespace));
-    }
+
+    if (generateCodes) {
+        BOOST_FOREACH(std::string sentence, sentences) {
+            result.push_back(_hashGenerator->generateHash(sentence, byWhitespace));
+        }
 
-    _hashGenerator->serializeWordMap();
+        _hashGenerator->serializeWordMap();
+    } else {
+        BOOST_FOREACH(std::string sentence, sentences) {
+            result.push_back(_hashGenerator->generateTokens(sentence, byWhitespace));
+        }
+    }
 
     return result;
 }
diff --git a/concordia/concordia.hpp b/concordia/concordia.hpp
index f3fdaed..5b0a8b3 100644
--- a/concordia/concordia.hpp
+++ b/concordia/concordia.hpp
@@ -61,23 +61,27 @@ public:
     /*! Tokenizes the given sentence.
       \param sentence sentence to be tokenized
       \param byWhitespace whether to tokenize the sentence by whitespace
+      \param generateCodes whether to generate codes for tokens using WordMap
       \returns tokenized sentence object,
        containing information about original word positions
       \throws ConcordiaException
     */
     TokenizedSentence tokenize(const std::string & sentence,
-                               bool byWhitespace = false)
+                               bool byWhitespace = false,
+                               bool generateCodes = true)
                                      throw(ConcordiaException);
 
     /*! Tokenizes all the given sentences.
       \param sentences vector of sentences to be tokenized
       \param byWhitespace whether to tokenize the sentence by whitespace
+      \param generateCodes whether to generate codes for tokens using WordMap
       \returns vector of tokenized sentence objects
       \throws ConcordiaException
     */
     std::vector<TokenizedSentence> tokenizeAll(
                                const std::vector<std::string> & sentences,
-                               bool byWhitespace = false)
+                               bool byWhitespace = false,
+                               bool generateCodes = true)
                                      throw(ConcordiaException);
 
     /*! Adds an Example to the index.
diff --git a/concordia/hash_generator.cpp b/concordia/hash_generator.cpp
index 9a2a605..7ba9f16 100644
--- a/concordia/hash_generator.cpp
+++ b/concordia/hash_generator.cpp
@@ -41,6 +41,19 @@ TokenizedSentence HashGenerator::generateHash(
     return ts;
 }
 
+TokenizedSentence HashGenerator::generateTokens(
+    const std::string & sentence,
+    bool byWhitespace) throw(ConcordiaException) {
+    TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence, byWhitespace);
+    ts.generateTokens();
+
+    if (ts.getTokens().size() > Utils::maxSentenceSize) {
+        throw ConcordiaException("Trying to add too long sentence.");
+    }
+
+    return ts;
+}
+
 void HashGenerator::serializeWordMap() {
     std::ofstream ofs(_wordMapFilePath.c_str(), std::ios::binary);
     boost::archive::binary_oarchive oa(ofs);
diff --git a/concordia/hash_generator.hpp b/concordia/hash_generator.hpp
index adf4df2..8d7be36 100644
--- a/concordia/hash_generator.hpp
+++ b/concordia/hash_generator.hpp
@@ -51,6 +51,17 @@ public:
                                  bool byWhitespace = false)
                                      throw(ConcordiaException);
 
+    /*!
+      This method acts like generateHash, but only performs tokenization.
+      The resulting TokenizedSentence does not contain token code information.
+      \param sentence sentence to tokenize
+      \param byWhitespace whether to tokenize the sentence by whitespace
+      \returns tokenized sentence, containing the tokens
+    */
+    TokenizedSentence generateTokens(const std::string & sentence,
+                                     bool byWhitespace = false)
+                                         throw(ConcordiaException);
+
     /*!
       Saves the contents of current WordMap to HDD.
     */
diff --git a/concordia/t/test_concordia.cpp b/concordia/t/test_concordia.cpp
index 9c9675b..b4fe963 100644
--- a/concordia/t/test_concordia.cpp
+++ b/concordia/t/test_concordia.cpp
@@ -452,9 +452,33 @@ BOOST_AUTO_TEST_CASE( TokenizeWhitespace )
     BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
     BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./");
 
+    BOOST_CHECK_EQUAL(ts.getCodes().size(), 7);
+
     concordia.clearIndex();
 }
 
+BOOST_AUTO_TEST_CASE( TokenizeOnly )
+{
+    Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
+        TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
+    TokenizedSentence ts = concordia.tokenize(
+        "Ala 23 --- ..//,./ '''8902347 posiada kota", true, false);
+
+    BOOST_CHECK_EQUAL(ts.getTokens().size(), 7);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 4);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23");
+
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./");
+
+    BOOST_CHECK_EQUAL(ts.getCodes().size(), 0);  // there should be no codes, only tokens
+
+    concordia.clearIndex();
+}
 
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/concordia/tokenized_sentence.cpp b/concordia/tokenized_sentence.cpp
index 964e5e3..a75622c 100644
--- a/concordia/tokenized_sentence.cpp
+++ b/concordia/tokenized_sentence.cpp
@@ -64,3 +64,12 @@ void TokenizedSentence::generateHash(boost::shared_ptr<WordMap> wordMap) {
     }
 }
 
+void TokenizedSentence::generateTokens() {
+    BOOST_FOREACH(TokenAnnotation annotation, _tokenAnnotations) {
+        if (annotation.getType() == TokenAnnotation::WORD ||
+            annotation.getType() == TokenAnnotation::NE) {
+            _tokens.push_back(annotation);
+        }
+    }
+}
+
diff --git a/concordia/tokenized_sentence.hpp b/concordia/tokenized_sentence.hpp
index 345c5fb..d68e26c 100644
--- a/concordia/tokenized_sentence.hpp
+++ b/concordia/tokenized_sentence.hpp
@@ -67,13 +67,21 @@ public:
     /*! Method for generating hash based on annotations.
       This method takes into account annotations of type
       word and named entity. These are encoded and added
-      to to code list. Annotations corresponding to these
+      to the code list. Annotations corresponding to these
       tokens are added to the tokens list.
       \param wordMap word map to use when encoding tokens
-      \returns tokens list
     */
     void generateHash(boost::shared_ptr<WordMap> wordMap);
 
+    /*! Method for generating tokens based on annotations.
+      This method takes into account annotations of type
+      word and named entity. Unlike in generateHash,
+      these are not encoded or added to the code list.
+      Annotations corresponding to these
+      tokens are added to the tokens list.
+    */
+    void generateTokens();
+
     /*! Transform the sentence to lower case.
     */
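
-- 
Usage note: a minimal sketch of the new generateCodes flag, assuming a
valid storage directory and Concordia config file at the placeholder
paths below (both are illustrative, not part of the patch). tokenize()
may throw ConcordiaException; error handling is omitted for brevity.

    #include <iostream>
    #include "concordia/concordia.hpp"

    int main() {
        // Placeholder paths: point these at a real index directory
        // and concordia.cfg before running.
        Concordia concordia("/tmp/concordia", "/tmp/concordia.cfg");

        // Default behaviour: tokenize and assign word-map codes;
        // the word map is serialized to disk as a side effect.
        TokenizedSentence coded = concordia.tokenize("Ala posiada kota");

        // New option: tokenize only (byWhitespace = false,
        // generateCodes = false); the word map is left untouched.
        TokenizedSentence plain =
            concordia.tokenize("Ala posiada kota", false, false);

        std::cout << plain.getTokens().size() << " tokens, "
                  << plain.getCodes().size() << " codes" << std::endl;
        return 0;
    }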