From 68fecaddf8374b29810ebcf0ac32bcc4d0232d85 Mon Sep 17 00:00:00 2001 From: rjawor Date: Wed, 19 Aug 2015 20:49:26 +0200 Subject: [PATCH] adding all tokenized examples --- TODO.txt | 1 + concordia-console/concordia-console.cpp | 4 +- concordia/concordia.cpp | 34 ++++++++++--- concordia/concordia.hpp | 31 +++++++++--- concordia/concordia_index.cpp | 47 +++++++++++++----- concordia/concordia_index.hpp | 33 ++++++++++--- concordia/concordia_search_result.cpp | 8 ++-- concordia/concordia_search_result.hpp | 7 ++- concordia/hash_generator.cpp | 9 ++-- concordia/hash_generator.hpp | 5 +- concordia/index_searcher.cpp | 9 ++-- concordia/regex_rule.cpp | 8 ++-- concordia/regex_rule.hpp | 2 +- concordia/sentence_tokenizer.cpp | 8 ++-- concordia/sentence_tokenizer.hpp | 3 +- concordia/t/test_concordia.cpp | 64 +++++++++++++++++-------- concordia/t/test_concordia_searcher.cpp | 2 +- concordia/t/test_hash_generator.cpp | 10 ++-- concordia/t/test_regex_rule.cpp | 36 +++++++------- concordia/t/test_sentence_tokenizer.cpp | 18 +++---- 20 files changed, 220 insertions(+), 119 deletions(-) diff --git a/TODO.txt b/TODO.txt index 810d6cd..14b9f38 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,4 +1,5 @@ ---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) ----------------------------- +DONE - change the arguments of addExample* functions to const reference to TokenizedSentence (not boost::shared_ptr) - multiple indexes based on different hashes. One can be word-net base forms, other - pos-tags and so on. Develop a method of combining results. IN PROGRESS - document the code (classes, cfg files) and update tutorial - wiele pamięci tłumaczeń: można je przechowywać w jednym indeksie, ale trzeba dodać tm_id jako metadane zdania (np. zamiast example length). Przy wyszukiwaniu należy filtrować wyniki, aby pochodziły z odpowiedniej pamięci tłumaczeń. 
diff --git a/concordia-console/concordia-console.cpp b/concordia-console/concordia-console.cpp index 8dcadd4..1702262 100644 --- a/concordia-console/concordia-console.cpp +++ b/concordia-console/concordia-console.cpp @@ -29,7 +29,7 @@ void checkConcordiaResults( long lineIndex = 1; BOOST_FOREACH(ConcordiaSearchResult result, results) { SUFFIX_MARKER_TYPE patternSize = - result.getTokenizedPattern()->getTokens().size(); + result.getTokenizedPattern().getTokens().size(); if (patternSize > 0) { if (result.getBestOverlay().size() != 1) { reportError(baseLineCount + lineIndex, @@ -203,7 +203,7 @@ int main(int argc, char** argv) { std::cout << "\tPattern used: " << std::endl << "\t\t"; BOOST_FOREACH(TokenAnnotation annotation, - result->getTokenizedPattern()->getTokens()) { + result->getTokenizedPattern().getTokens()) { std::cout << annotation.getValue() << " "; } std::cout << std::endl; diff --git a/concordia/concordia.cpp b/concordia/concordia.cpp index af1a3c3..0972333 100644 --- a/concordia/concordia.cpp +++ b/concordia/concordia.cpp @@ -1,4 +1,5 @@ #include +#include #include "concordia/concordia.hpp" #include "concordia/common/config.hpp" @@ -42,19 +43,31 @@ std::string _createLibraryVersion() { return version.str(); } -boost::shared_ptr +TokenizedSentence Concordia::tokenize(const std::string & sentence) throw(ConcordiaException) { - boost::shared_ptr result = + TokenizedSentence result = _hashGenerator->generateHash(sentence); _hashGenerator->serializeWordMap(); return result; } +std::vector Concordia::tokenizeAll( + const std::vector & sentences) + throw(ConcordiaException) { + std::vector result; + BOOST_FOREACH(std::string sentence, sentences) { + result.push_back(_hashGenerator->generateHash(sentence)); + } + + _hashGenerator->serializeWordMap(); + return result; +} + // Sentences are written to disk and added to T. // SA is generated on command by other methods. 
-boost::shared_ptr Concordia::addExample( +TokenizedSentence Concordia::addExample( const Example & example) throw(ConcordiaException) { return _index->addExample(_hashGenerator, _T, _markers, example); @@ -63,13 +76,21 @@ boost::shared_ptr Concordia::addExample( // Sentences are written to disk and added to T. // SA is generated on command by other methods. void Concordia::addTokenizedExample( - boost::shared_ptr tokenizedSentence, - SUFFIX_MARKER_TYPE id) + const TokenizedSentence & tokenizedSentence, + const SUFFIX_MARKER_TYPE id) throw(ConcordiaException) { _index->addTokenizedExample(_hashGenerator, _T, _markers, tokenizedSentence, id); } +void Concordia::addAllTokenizedExamples( + const std::vector & tokenizedSentences, + const std::vector & ids) + throw(ConcordiaException) { + _index->addAllTokenizedExamples(_hashGenerator, _T, + _markers, tokenizedSentences, ids); +} + // Sentences are written to disk and added to T. // SA is generated on command by other methods. @@ -188,8 +209,7 @@ boost::shared_ptr Concordia::concordiaSearch( } else { std::string empty; return boost::shared_ptr( - new ConcordiaSearchResult(boost::shared_ptr( - new TokenizedSentence(empty)))); + new ConcordiaSearchResult(TokenizedSentence(empty))); } } diff --git a/concordia/concordia.hpp b/concordia/concordia.hpp index 33628e8..3deb2b1 100644 --- a/concordia/concordia.hpp +++ b/concordia/concordia.hpp @@ -58,7 +58,16 @@ public: containing information about original word positions \throws ConcordiaException */ - boost::shared_ptr tokenize(const std::string & sentence) + TokenizedSentence tokenize(const std::string & sentence) + throw(ConcordiaException); + + /*! Tokenizes all the given sentences. + \param sentences vector of sentences to be tokenized + \returns vector of tokenized sentence objects + \throws ConcordiaException + */ + std::vector tokenizeAll( + const std::vector & sentences) throw(ConcordiaException); /*! Adds an Example to the index. 
@@ -67,17 +76,27 @@ public: containing information about original word positions \throws ConcordiaException */ - boost::shared_ptr addExample(const Example & example) - throw(ConcordiaException); + TokenizedSentence addExample(const Example & example) + throw(ConcordiaException); /*! Adds a tokenized example to the index. \param tokenizedSentence tokenized sentence to be added - \param id of the sentence to be added + \param id id of the sentence to be added \throws ConcordiaException */ void addTokenizedExample( - boost::shared_ptr tokenizedSentence, - SUFFIX_MARKER_TYPE id) + const TokenizedSentence & tokenizedSentence, + const SUFFIX_MARKER_TYPE id) + throw(ConcordiaException); + + /*! Adds multiple tokenized examples to the index. + \param tokenizedSentences vector of tokenized sentences to be added + \param ids vector of ids of the sentences to be added + \throws ConcordiaException + */ + void addAllTokenizedExamples( + const std::vector & tokenizedSentences, + const std::vector & ids) throw(ConcordiaException); /*! Adds multiple examples to the index. 
diff --git a/concordia/concordia_index.cpp b/concordia/concordia_index.cpp index fc7493e..3eb98d7 100644 --- a/concordia/concordia_index.cpp +++ b/concordia/concordia_index.cpp @@ -4,6 +4,8 @@ #include "concordia/common/config.hpp" #include #include +#include + #include #include @@ -48,10 +50,10 @@ std::vector ConcordiaIndex::addAllExamples( std::vector hashedPatterns; BOOST_FOREACH(Example example, examples) { - boost::shared_ptr hashedPattern = + TokenizedSentence hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator, T, markers, example); - hashedPatterns.push_back(*hashedPattern); + hashedPatterns.push_back(hashedPattern); } hashedIndexFile.close(); @@ -61,7 +63,7 @@ std::vector ConcordiaIndex::addAllExamples( return hashedPatterns; } -boost::shared_ptr ConcordiaIndex::addExample( +TokenizedSentence ConcordiaIndex::addExample( boost::shared_ptr hashGenerator, boost::shared_ptr > T, boost::shared_ptr > markers, @@ -72,7 +74,7 @@ boost::shared_ptr ConcordiaIndex::addExample( std::ofstream markersFile; markersFile.open(_markersFilePath.c_str(), std::ios::out| std::ios::app|std::ios::binary); - boost::shared_ptr hashedPattern = + TokenizedSentence hashedPattern = _addSingleExample(hashedIndexFile, markersFile, hashGenerator, T, markers, example); hashedIndexFile.close(); @@ -86,8 +88,8 @@ void ConcordiaIndex::addTokenizedExample( boost::shared_ptr hashGenerator, boost::shared_ptr > T, boost::shared_ptr > markers, - boost::shared_ptr tokenizedSentence, - SUFFIX_MARKER_TYPE id) { + const TokenizedSentence & tokenizedSentence, + const SUFFIX_MARKER_TYPE id) { std::ofstream hashedIndexFile; hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out| std::ios::app|std::ios::binary); @@ -100,15 +102,38 @@ void ConcordiaIndex::addTokenizedExample( markersFile.close(); } +void ConcordiaIndex::addAllTokenizedExamples( + boost::shared_ptr hashGenerator, + boost::shared_ptr > T, + boost::shared_ptr > markers, + const std::vector & 
tokenizedSentences, + const std::vector & ids) { + std::ofstream hashedIndexFile; + hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out| + std::ios::app|std::ios::binary); + std::ofstream markersFile; + markersFile.open(_markersFilePath.c_str(), std::ios::out| + std::ios::app|std::ios::binary); + + int index = 0; + BOOST_FOREACH(TokenizedSentence tokenizedSentence, tokenizedSentences) { + _addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator, + T, markers, tokenizedSentence, ids.at(index)); + index++; + } + hashedIndexFile.close(); + markersFile.close(); +} + void ConcordiaIndex::_addSingleTokenizedExample( std::ofstream & hashedIndexFile, std::ofstream & markersFile, boost::shared_ptr hashGenerator, boost::shared_ptr > T, boost::shared_ptr > markers, - boost::shared_ptr tokenizedSentence, - SUFFIX_MARKER_TYPE id) { - std::vector hash = tokenizedSentence->getCodes(); + const TokenizedSentence & tokenizedSentence, + const SUFFIX_MARKER_TYPE id) { + std::vector hash = tokenizedSentence.getCodes(); int offset = 0; for (std::vector::iterator it = hash.begin(); @@ -139,14 +164,14 @@ void ConcordiaIndex::_addSingleTokenizedExample( markers->push_back(sentenceBoundaryMA); } -boost::shared_ptr ConcordiaIndex::_addSingleExample( +TokenizedSentence ConcordiaIndex::_addSingleExample( std::ofstream & hashedIndexFile, std::ofstream & markersFile, boost::shared_ptr hashGenerator, boost::shared_ptr > T, boost::shared_ptr > markers, const Example & example) { - boost::shared_ptr hashedPattern = + TokenizedSentence hashedPattern = hashGenerator->generateHash(example.getSentence()); _addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator, T, markers, hashedPattern, example.getId()); diff --git a/concordia/concordia_index.hpp b/concordia/concordia_index.hpp index f59469b..c3dd27a 100644 --- a/concordia/concordia_index.hpp +++ b/concordia/concordia_index.hpp @@ -53,7 +53,7 @@ public: \returns tokenized example \throws ConcordiaException */ 
- boost::shared_ptr addExample( + TokenizedSentence addExample( boost::shared_ptr hashGenerator, boost::shared_ptr > T, boost::shared_ptr > markers, @@ -63,7 +63,6 @@ public: and markers array are appended with the example. At the same time, HDD versions of these two data structures are also appended with the same example. - The method returns a tokenized version of the example. \param hashGenerator hash generator to be used to prepare the hash of the example \param T RAM-based hash index to be appended to @@ -77,8 +76,28 @@ public: boost::shared_ptr hashGenerator, boost::shared_ptr > T, boost::shared_ptr > markers, - boost::shared_ptr tokenizedSentence, - SUFFIX_MARKER_TYPE id); + const TokenizedSentence & tokenizedSentence, + const SUFFIX_MARKER_TYPE id); + + /*! Adds multiple tokenized examples to the index. Hashed index + and markers array are appended with the examples. + At the same time, HDD versions of these + two data structures are also appended with the same examples. + \param hashGenerator hash generator to be used to prepare the hash + of the example + \param T RAM-based hash index to be appended to + \param markers RAM-based markers array to be appended to + \param example example to be added to index + \param tokenizedSentences vector of tokenized sentences to be added + \param ids vector of ids of the sentences to be added + \throws ConcordiaException + */ + void addAllTokenizedExamples( + boost::shared_ptr hashGenerator, + boost::shared_ptr > T, + boost::shared_ptr > markers, + const std::vector & tokenizedSentences, + const std::vector & ids); /*! Adds multiple examples to the index. Examples are first hashed using the hash generator passed to this method. 
Then, hashed index @@ -114,10 +133,10 @@ private: boost::shared_ptr hashGenerator, boost::shared_ptr > T, boost::shared_ptr > markers, - boost::shared_ptr tokenizedSentence, - SUFFIX_MARKER_TYPE id); + const TokenizedSentence & tokenizedSentence, + const SUFFIX_MARKER_TYPE id); - boost::shared_ptr _addSingleExample( + TokenizedSentence _addSingleExample( std::ofstream & hashedIndexFile, std::ofstream & markersFile, boost::shared_ptr hashGenerator, diff --git a/concordia/concordia_search_result.cpp b/concordia/concordia_search_result.cpp index 410ba7c..dbd3bc3 100644 --- a/concordia/concordia_search_result.cpp +++ b/concordia/concordia_search_result.cpp @@ -4,9 +4,9 @@ #include ConcordiaSearchResult::ConcordiaSearchResult( - boost::shared_ptr tokenizedPattern): - _tokenizedPattern(tokenizedPattern), - _bestOverlayScore(0) { + TokenizedSentence tokenizedPattern): + _tokenizedPattern(tokenizedPattern), + _bestOverlayScore(0) { } ConcordiaSearchResult::~ConcordiaSearchResult() { @@ -27,7 +27,7 @@ void ConcordiaSearchResult::computeBestOverlay() { // the fragments are already sorted by their ends, ascending _checkPossibleOverlays(std::vector(), -1, - _tokenizedPattern->getTokens().size()); + _tokenizedPattern.getTokens().size()); } void ConcordiaSearchResult::_checkPossibleOverlays( diff --git a/concordia/concordia_search_result.hpp b/concordia/concordia_search_result.hpp index 41fa7e4..da4c751 100644 --- a/concordia/concordia_search_result.hpp +++ b/concordia/concordia_search_result.hpp @@ -26,8 +26,7 @@ public: /*! Constructor. \param tokenVector tokenized pattern which was used for searching */ - explicit ConcordiaSearchResult( - boost::shared_ptr tokenizedPattern); + explicit ConcordiaSearchResult(TokenizedSentence tokenizedPattern); /*! Destructor. */ @@ -51,7 +50,7 @@ public: /*! Getter for tokenized pattern. 
\returns tokenized search pattern */ - boost::shared_ptr getTokenizedPattern() const { + TokenizedSentence getTokenizedPattern() const { return _tokenizedPattern; } @@ -82,7 +81,7 @@ private: SUFFIX_MARKER_TYPE lastAddedPos, SUFFIX_MARKER_TYPE patternSize); - boost::shared_ptr _tokenizedPattern; + TokenizedSentence _tokenizedPattern; std::vector _matchedPatternFragments; diff --git a/concordia/hash_generator.cpp b/concordia/hash_generator.cpp index 8b93ce4..89d5997 100644 --- a/concordia/hash_generator.cpp +++ b/concordia/hash_generator.cpp @@ -27,13 +27,12 @@ HashGenerator::HashGenerator(boost::shared_ptr config) HashGenerator::~HashGenerator() { } -boost::shared_ptr HashGenerator::generateHash( +TokenizedSentence HashGenerator::generateHash( const std::string & sentence) throw(ConcordiaException) { - boost::shared_ptr ts = - _sentenceTokenizer->tokenize(sentence); - ts->generateHash(_wordMap); + TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence); + ts.generateHash(_wordMap); - if (ts->getTokens().size() > Utils::maxSentenceSize) { + if (ts.getTokens().size() > Utils::maxSentenceSize) { throw ConcordiaException("Trying to add too long sentence."); } diff --git a/concordia/hash_generator.hpp b/concordia/hash_generator.hpp index 6528dcf..e94f8d6 100644 --- a/concordia/hash_generator.hpp +++ b/concordia/hash_generator.hpp @@ -44,9 +44,8 @@ public: \param sentence sentence to generate hash from \returns tokenized sentence, containing the hash */ - boost::shared_ptr generateHash( - const std::string & sentence) - throw(ConcordiaException); + TokenizedSentence generateHash(const std::string & sentence) + throw(ConcordiaException); /*! Saves the contents of current WordMap to HDD. 
diff --git a/concordia/index_searcher.cpp b/concordia/index_searcher.cpp index 6012ba1..79d5b48 100644 --- a/concordia/index_searcher.cpp +++ b/concordia/index_searcher.cpp @@ -23,7 +23,7 @@ std::vector IndexSearcher::simpleSearch( int left; std::vector hash = - hashGenerator->generateHash(pattern)->getCodes(); + hashGenerator->generateHash(pattern).getCodes(); saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE); sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash); @@ -60,7 +60,7 @@ std::vector IndexSearcher::anubisSearch( boost::shared_ptr > SA, const std::string & pattern) throw(ConcordiaException) { std::vector hash = - hashGenerator->generateHash(pattern)->getCodes(); + hashGenerator->generateHash(pattern).getCodes(); return _concordiaSearcher->anubisSearch(config, T, markers, SA, hash); } @@ -70,13 +70,12 @@ boost::shared_ptr IndexSearcher::concordiaSearch( boost::shared_ptr > markers, boost::shared_ptr > SA, const std::string & pattern) throw(ConcordiaException) { - boost::shared_ptr hashedPattern = - hashGenerator->generateHash(pattern); + TokenizedSentence hashedPattern = hashGenerator->generateHash(pattern); boost::shared_ptr result = boost::shared_ptr( new ConcordiaSearchResult(hashedPattern)); _concordiaSearcher->concordiaSearch(result, T, markers, - SA, hashedPattern->getCodes()); + SA, hashedPattern.getCodes()); return result; } diff --git a/concordia/regex_rule.cpp b/concordia/regex_rule.cpp index 04bb825..aad5e84 100644 --- a/concordia/regex_rule.cpp +++ b/concordia/regex_rule.cpp @@ -36,9 +36,9 @@ RegexRule::RegexRule(std::string patternString, RegexRule::~RegexRule() { } -void RegexRule::apply(boost::shared_ptr sentence) { +void RegexRule::apply(TokenizedSentence & sentence) { try { - UnicodeString s(sentence->getSentence().c_str()); + UnicodeString s(sentence.getSentence().c_str()); boost::u32regex_iterator begin( boost::make_u32regex_iterator(s, _pattern)); boost::u32regex_iterator end; @@ -58,12 +58,12 @@ void 
RegexRule::apply(boost::shared_ptr sentence) { _annotationType, value); annotations.push_back(annotation); } - sentence->addAnnotations(annotations); + sentence.addAnnotations(annotations); } catch(const std::exception & e) { std::stringstream ss; ss << "Exception while applying regex rule: " << _annotationType << " to text: " - << sentence->getSentence(); + << sentence.getSentence(); ss << ", message: " << e.what(); throw ConcordiaException(ss.str()); } diff --git a/concordia/regex_rule.hpp b/concordia/regex_rule.hpp index ce62fd1..878b088 100644 --- a/concordia/regex_rule.hpp +++ b/concordia/regex_rule.hpp @@ -42,7 +42,7 @@ public: /*! Applies regex annotation on tokenized sentence. \param sentence the input sentence */ - void apply(boost::shared_ptr sentence); + void apply(TokenizedSentence & sentence); private: int _annotationType; diff --git a/concordia/sentence_tokenizer.cpp b/concordia/sentence_tokenizer.cpp index 9ffe173..0666a5d 100644 --- a/concordia/sentence_tokenizer.cpp +++ b/concordia/sentence_tokenizer.cpp @@ -24,10 +24,8 @@ SentenceTokenizer::SentenceTokenizer( SentenceTokenizer::~SentenceTokenizer() { } -boost::shared_ptr - SentenceTokenizer::tokenize(const std::string & sentence) { - boost::shared_ptr - result(new TokenizedSentence(sentence)); +TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence) { + TokenizedSentence result(sentence); _htmlTags->apply(result); @@ -35,7 +33,7 @@ boost::shared_ptr neRule.apply(result); } - result->toLowerCase(); + result.toLowerCase(); if (_stopWordsEnabled) { _stopWords->apply(result); diff --git a/concordia/sentence_tokenizer.hpp b/concordia/sentence_tokenizer.hpp index 7e354eb..6d92f1c 100644 --- a/concordia/sentence_tokenizer.hpp +++ b/concordia/sentence_tokenizer.hpp @@ -36,8 +36,7 @@ public: \param sentence input sentence \returns tokenized sentence object build on the input sentence */ - boost::shared_ptr - tokenize(const std::string & sentence); + TokenizedSentence tokenize(const 
std::string & sentence); private: void _createNeRules(std::string & namedEntitiesPath); diff --git a/concordia/t/test_concordia.cpp b/concordia/t/test_concordia.cpp index d5e5907..21548a0 100644 --- a/concordia/t/test_concordia.cpp +++ b/concordia/t/test_concordia.cpp @@ -27,17 +27,17 @@ BOOST_AUTO_TEST_CASE( ConcordiaVersion ) BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 ) { Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); - boost::shared_ptr ts = concordia.addExample(Example("Ala posiada kota",14)); + TokenizedSentence ts = concordia.addExample(Example("Ala posiada kota",14)); /* 0,3 type: 1 value: ala 4,11 type: 1 value: posiada 12,16 type: 1 value: kota */ - BOOST_CHECK_EQUAL(ts->getTokens().size(), 3); - BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 4); - BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 11); - BOOST_CHECK_EQUAL(ts->getTokens().at(1).getType(), 1); - BOOST_CHECK_EQUAL(ts->getTokens().at(1).getValue(), "posiada"); + BOOST_CHECK_EQUAL(ts.getTokens().size(), 3); + BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 4); + BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 11); + BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1); + BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada"); concordia.addExample(Example("Ala posiada rysia",51)); concordia.addExample(Example("Marysia posiada rysia",123)); @@ -293,24 +293,36 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch1 ) BOOST_AUTO_TEST_CASE( ConcordiaSearch2 ) { Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); + /* concordia.addExample(Example("Alice has a cat", 56)); concordia.addExample(Example("Alice has a dog", 23)); concordia.addExample(Example("New test product has a mistake", 321)); - boost::shared_ptr ts = concordia.tokenize("This is just testing and it has nothing to do with the above"); + */ + std::vector sentences; + std::vector ids; + sentences.push_back("Alice has a cat"); 
+ ids.push_back(56); + sentences.push_back("Alice has a dog"); + ids.push_back(23); + sentences.push_back("New test product has a mistake"); + ids.push_back(321); + std::vector tokenizedSentences = concordia.tokenizeAll(sentences); + concordia.addAllTokenizedExamples(tokenizedSentences, ids); + + TokenizedSentence ts = concordia.tokenize("This is just testing and it has nothing to do with the above"); concordia.addTokenizedExample(ts, 14); + concordia.refreshSAfromRAM(); boost::shared_ptr searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers"); // best overlay: - /* BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().size(), 2); - BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.695, 0.1); - BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getStart(), 0); - BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getEnd(), 2); - BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getStart(), 2); - BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 3); - */ + BOOST_CHECK_CLOSE(searchResult1->getBestOverlayScore(), 0.537, 0.1); + BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getStart(), 1); + BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(0).getEnd(), 5); + BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getStart(), 5); + BOOST_CHECK_EQUAL(searchResult1->getBestOverlay().at(1).getEnd(), 9); BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 8); @@ -338,7 +350,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 ) BOOST_AUTO_TEST_CASE( Tokenize ) { Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); - boost::shared_ptr ts = concordia.tokenize(" Ala posiada kota"); + TokenizedSentence ts = concordia.tokenize(" Ala posiada kota"); /* 0,3 type: 1 value: ala 4,11 type: 1 value: posiada @@ -347,10 +359,22 @@ BOOST_AUTO_TEST_CASE( Tokenize ) concordia.clearIndex(); - BOOST_CHECK_EQUAL(ts->getTokens().size(), 3); - 
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 9); - BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 16); - BOOST_CHECK_EQUAL(ts->getTokens().at(1).getType(), 1); - BOOST_CHECK_EQUAL(ts->getTokens().at(1).getValue(), "posiada"); + BOOST_CHECK_EQUAL(ts.getTokens().size(), 3); + BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 9); + BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 16); + BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1); + BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "posiada"); + + std::vector sentences; + sentences.push_back("Marysia, ma rysia;"); + sentences.push_back("Testing complete;"); + sentences.push_back("This, is (a) weird;! sentence <>"); + std::vector tokenizedSentences = concordia.tokenizeAll(sentences); + + BOOST_CHECK_EQUAL(tokenizedSentences.size(), 3); + BOOST_CHECK_EQUAL(tokenizedSentences.at(0).getTokens().size(), 3); + BOOST_CHECK_EQUAL(tokenizedSentences.at(1).getTokens().size(), 2); + BOOST_CHECK_EQUAL(tokenizedSentences.at(2).getTokens().size(), 5); + } BOOST_AUTO_TEST_SUITE_END() diff --git a/concordia/t/test_concordia_searcher.cpp b/concordia/t/test_concordia_searcher.cpp index 5e2d0b2..f8ab3fa 100644 --- a/concordia/t/test_concordia_searcher.cpp +++ b/concordia/t/test_concordia_searcher.cpp @@ -373,7 +373,7 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest ) // searching for pattern "Ola posiada rysia Marysia" (5 1 3 4) - std::vector pattern = hashGenerator->generateHash("Ola posiada rysia Marysia")->getCodes(); + std::vector pattern = hashGenerator->generateHash("Ola posiada rysia Marysia").getCodes(); boost::shared_ptr tmMatchesMap = searcher.getTmMatches(T, markers, SA, pattern); diff --git a/concordia/t/test_hash_generator.cpp b/concordia/t/test_hash_generator.cpp index 61e1cfe..c1fd782 100644 --- a/concordia/t/test_hash_generator.cpp +++ b/concordia/t/test_hash_generator.cpp @@ -23,7 +23,7 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest ) HashGenerator hashGenerator = HashGenerator(config); - std::vector hash = 
hashGenerator.generateHash("Ala posiada kota")->getCodes(); + std::vector hash = hashGenerator.generateHash("Ala posiada kota").getCodes(); std::vector expected; expected.push_back(0); expected.push_back(1); @@ -76,7 +76,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest ) HashGenerator hashGenerator1 = HashGenerator(config); - std::vector hash1 = hashGenerator1.generateHash("Ala posiada kota")->getCodes(); + std::vector hash1 = hashGenerator1.generateHash("Ala posiada kota").getCodes(); std::vector expected1; expected1.push_back(0); expected1.push_back(1); @@ -86,7 +86,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest ) hashGenerator1.serializeWordMap(); HashGenerator hashGenerator2 = HashGenerator(config); - std::vector hash2 = hashGenerator2.generateHash("Ala posiada psa")->getCodes(); + std::vector hash2 = hashGenerator2.generateHash("Ala posiada psa").getCodes(); std::vector expected2; expected2.push_back(0); expected2.push_back(1); @@ -106,9 +106,9 @@ BOOST_AUTO_TEST_CASE( TokenVectorTest ) HashGenerator hashGenerator = HashGenerator(config); - boost::shared_ptr tokenizedSentence = hashGenerator.generateHash("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że kierowca zaparkował samochód."); + TokenizedSentence tokenizedSentence = hashGenerator.generateHash("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że kierowca zaparkował samochód."); - std::vector tokens = tokenizedSentence->getTokens(); + std::vector tokens = tokenizedSentence.getTokens(); /* BOOST_FOREACH(TokenAnnotation annotation, tokens) { diff --git a/concordia/t/test_regex_rule.cpp b/concordia/t/test_regex_rule.cpp index ada81eb..4634f74 100644 --- a/concordia/t/test_regex_rule.cpp +++ b/concordia/t/test_regex_rule.cpp @@ -13,10 +13,10 @@ BOOST_AUTO_TEST_SUITE(regex_rule) BOOST_AUTO_TEST_CASE( SimpleAnnotation ) { RegexRule rr("a", TokenAnnotation::WORD, "b"); - boost::shared_ptr ts(new 
TokenizedSentence("xxxxxxxaxxxaxxaxaxa")); + TokenizedSentence ts("xxxxxxxaxxxaxxaxaxa"); rr.apply(ts); - BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5); - std::list annotations = ts->getAnnotations(); + BOOST_CHECK_EQUAL(ts.getAnnotations().size(),5); + std::list annotations = ts.getAnnotations(); std::list::iterator iter = annotations.begin(); BOOST_CHECK_EQUAL(iter->getStart(),7); @@ -56,10 +56,10 @@ BOOST_AUTO_TEST_CASE( BadRegex ) BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation ) { RegexRule rr("['\"\\\\.]", TokenAnnotation::WORD, ""); - boost::shared_ptr ts(new TokenizedSentence("Don't stop believin' \\ Hold on to the feelin'.")); + TokenizedSentence ts("Don't stop believin' \\ Hold on to the feelin'."); rr.apply(ts); - BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5); - std::list annotations = ts->getAnnotations(); + BOOST_CHECK_EQUAL(ts.getAnnotations().size(),5); + std::list annotations = ts.getAnnotations(); std::list::iterator iter = annotations.begin(); BOOST_CHECK_EQUAL(iter->getStart(),3); @@ -86,10 +86,10 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation ) BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation ) { RegexRule rr("abc", TokenAnnotation::WORD, "xxx", false); - boost::shared_ptr ts(new TokenizedSentence("This is AbC and ABC and abc and aBC.")); + TokenizedSentence ts("This is AbC and ABC and abc and aBC."); rr.apply(ts); - BOOST_CHECK_EQUAL(ts->getAnnotations().size(),4); - std::list annotations = ts->getAnnotations(); + BOOST_CHECK_EQUAL(ts.getAnnotations().size(),4); + std::list annotations = ts.getAnnotations(); std::list::iterator iter = annotations.begin(); BOOST_CHECK_EQUAL(iter->getStart(),8); @@ -111,10 +111,10 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation ) BOOST_AUTO_TEST_CASE( UnicodeAnnotation ) { RegexRule rr("ą", TokenAnnotation::WORD, "x"); - boost::shared_ptr ts(new TokenizedSentence("zażółć gęślą jaźń")); + TokenizedSentence ts("zażółć gęślą jaźń"); rr.apply(ts); - BOOST_CHECK_EQUAL(ts->getAnnotations().size(),1); - std::list 
annotations = ts->getAnnotations(); + BOOST_CHECK_EQUAL(ts.getAnnotations().size(),1); + std::list annotations = ts.getAnnotations(); std::list::iterator iter = annotations.begin(); BOOST_CHECK_EQUAL(iter->getStart(),11); @@ -124,10 +124,10 @@ BOOST_AUTO_TEST_CASE( UnicodeAnnotation ) BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation ) { RegexRule rr("ą", TokenAnnotation::WORD, "x", false); - boost::shared_ptr ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ")); + TokenizedSentence ts("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"); rr.apply(ts); - BOOST_CHECK_EQUAL(ts->getAnnotations().size(),2); - std::list annotations = ts->getAnnotations(); + BOOST_CHECK_EQUAL(ts.getAnnotations().size(),2); + std::list annotations = ts.getAnnotations(); std::list::iterator iter = annotations.begin(); BOOST_CHECK_EQUAL(iter->getStart(),11); @@ -141,10 +141,10 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation ) BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement ) { RegexRule rr("[ąćęłńóśżź]", TokenAnnotation::WORD, "x", false); - boost::shared_ptr ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ")); + TokenizedSentence ts("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"); rr.apply(ts); - BOOST_CHECK_EQUAL(ts->getAnnotations().size(),18); - std::list annotations = ts->getAnnotations(); + BOOST_CHECK_EQUAL(ts.getAnnotations().size(),18); + std::list annotations = ts.getAnnotations(); std::list::iterator iter = annotations.begin(); BOOST_CHECK_EQUAL(iter->getStart(),2); diff --git a/concordia/t/test_sentence_tokenizer.cpp b/concordia/t/test_sentence_tokenizer.cpp index 626fdc9..8c5580f 100644 --- a/concordia/t/test_sentence_tokenizer.cpp +++ b/concordia/t/test_sentence_tokenizer.cpp @@ -20,8 +20,8 @@ BOOST_AUTO_TEST_CASE( NETest ) std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ"; - boost::shared_ptr ts = tokenizer.tokenize(sentence); - std::list annotations = 
ts->getAnnotations(); + TokenizedSentence ts = tokenizer.tokenize(sentence); + std::list annotations = ts.getAnnotations(); std::list::iterator iter = annotations.begin(); BOOST_CHECK_EQUAL(14,annotations.size()); @@ -134,8 +134,8 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest ) SentenceTokenizer tokenizer(config); std::string sentence = "link and bold and newline
"; - boost::shared_ptr ts = tokenizer.tokenize(sentence); - std::list annotations = ts->getAnnotations(); + TokenizedSentence ts = tokenizer.tokenize(sentence); + std::list annotations = ts.getAnnotations(); std::list::iterator iter = annotations.begin(); /* @@ -214,8 +214,8 @@ BOOST_AUTO_TEST_CASE( InWordSymbolsTest ) SentenceTokenizer tokenizer(config); std::string sentence = "This is a sentence, don't over-analyze it. zażółć' gęś'lą -jaźń ZAŻ-ÓŁĆ GĘŚLĄ JAŹ'Ń"; - boost::shared_ptr ts = tokenizer.tokenize(sentence); - std::list annotations = ts->getAnnotations(); + TokenizedSentence ts = tokenizer.tokenize(sentence); + std::list annotations = ts.getAnnotations(); std::list::iterator iter = annotations.begin(); /* @@ -322,7 +322,7 @@ BOOST_AUTO_TEST_CASE( StopWordsTest ) if (config->isStopWordsEnabled()) { SentenceTokenizer tokenizer(config); std::string sentence = "Aczkolwiek nie wiem, czy to konieczne"; - BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence()," wiem konieczne"); + BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence).getSentence()," wiem konieczne"); } } @@ -332,8 +332,8 @@ BOOST_AUTO_TEST_CASE( WeirdSentenceTest ) SentenceTokenizer tokenizer(config); std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |"; - boost::shared_ptr ts = tokenizer.tokenize(sentence); - std::list annotations = 
ts->getAnnotations(); + TokenizedSentence ts = tokenizer.tokenize(sentence); + std::list annotations = ts.getAnnotations(); std::list::iterator iter = annotations.begin(); BOOST_CHECK_EQUAL(161, annotations.size());