diff --git a/concordia-console/concordia-console.cpp b/concordia-console/concordia-console.cpp index 8f69b48..8dcadd4 100644 --- a/concordia-console/concordia-console.cpp +++ b/concordia-console/concordia-console.cpp @@ -7,7 +7,7 @@ #include #include "concordia/concordia.hpp" -#include "concordia/substring_occurence.hpp" +#include "concordia/matched_pattern_fragment.hpp" #include "concordia/token_annotation.hpp" #include "concordia/common/config.hpp" #include "concordia/common/utils.hpp" @@ -158,7 +158,7 @@ int main(int argc, char** argv) { std::cout << "\tSearching for pattern: \"" << pattern << "\"" << std::endl; time_start = boost::posix_time::microsec_clock::local_time(); - std::vector result = + std::vector result = concordia.simpleSearch(pattern); time_end = boost::posix_time::microsec_clock::local_time(); msdiff = time_end - time_start; @@ -166,9 +166,9 @@ int main(int argc, char** argv) { << "Search took: " << msdiff.total_milliseconds() << "ms." << std::endl; if (!cli.count("silent")) { - BOOST_FOREACH(SubstringOccurence occurence, result) { + BOOST_FOREACH(MatchedPatternFragment occurence, result) { std::cout << "\t\tfound match in sentence number: " - << occurence.getId() << std::endl; + << occurence.getExampleId() << std::endl; } } } else if (cli.count("anubis-search")) { diff --git a/concordia/concordia.cpp b/concordia/concordia.cpp index d92b167..af1a3c3 100644 --- a/concordia/concordia.cpp +++ b/concordia/concordia.cpp @@ -49,7 +49,6 @@ boost::shared_ptr _hashGenerator->generateHash(sentence); _hashGenerator->serializeWordMap(); return result; - } @@ -156,14 +155,14 @@ void Concordia::_initializeIndex() throw(ConcordiaException) { } } -std::vector Concordia::simpleSearch( +std::vector Concordia::simpleSearch( const std::string & pattern) throw(ConcordiaException) { if (_T->size() > 0) { return _searcher->simpleSearch(_hashGenerator, _T, _markers, _SA, pattern); } else { - std::vector result; + std::vector result; return result; } } diff --git 
a/concordia/concordia.hpp b/concordia/concordia.hpp index 901a893..33628e8 100644 --- a/concordia/concordia.hpp +++ b/concordia/concordia.hpp @@ -8,7 +8,7 @@ #include "concordia/common/config.hpp" #include "concordia/example.hpp" -#include "concordia/substring_occurence.hpp" +#include "concordia/matched_pattern_fragment.hpp" #include "concordia/concordia_config.hpp" #include "concordia/concordia_index.hpp" #include "concordia/index_searcher.hpp" @@ -96,7 +96,8 @@ public: \returns vector of matched results \throws ConcordiaException */ - std::vector simpleSearch(const std::string & pattern) + std::vector simpleSearch( + const std::string & pattern) throw(ConcordiaException); /*! \deprecated diff --git a/concordia/concordia_index.cpp b/concordia/concordia_index.cpp index 43a4d81..fc7493e 100644 --- a/concordia/concordia_index.cpp +++ b/concordia/concordia_index.cpp @@ -150,7 +150,7 @@ boost::shared_ptr ConcordiaIndex::_addSingleExample( hashGenerator->generateHash(example.getSentence()); _addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator, T, markers, hashedPattern, example.getId()); - + return hashedPattern; } diff --git a/concordia/index_searcher.cpp b/concordia/index_searcher.cpp index c2c119e..6012ba1 100644 --- a/concordia/index_searcher.cpp +++ b/concordia/index_searcher.cpp @@ -13,13 +13,13 @@ IndexSearcher::IndexSearcher() { IndexSearcher::~IndexSearcher() { } -std::vector IndexSearcher::simpleSearch( +std::vector IndexSearcher::simpleSearch( boost::shared_ptr hashGenerator, boost::shared_ptr > T, boost::shared_ptr > markers, boost::shared_ptr > SA, const std::string & pattern) throw(ConcordiaException) { - std::vector result; + std::vector result; int left; std::vector hash = @@ -40,8 +40,11 @@ std::vector IndexSearcher::simpleSearch( // removes these accidental results. 
saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE); SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos); - - result.push_back(SubstringOccurence(marker)); + result.push_back(MatchedPatternFragment( + Utils::getIdFromMarker(marker), + Utils::getOffsetFromMarker(marker), + 0, + hash.size())); } } diff --git a/concordia/index_searcher.hpp b/concordia/index_searcher.hpp index f2186b1..7393fee 100644 --- a/concordia/index_searcher.hpp +++ b/concordia/index_searcher.hpp @@ -7,7 +7,7 @@ #include #include "concordia/common/config.hpp" -#include "concordia/substring_occurence.hpp" +#include "concordia/matched_pattern_fragment.hpp" #include "concordia/hash_generator.hpp" #include "concordia/concordia_exception.hpp" #include "concordia/concordia_searcher.hpp" @@ -45,7 +45,7 @@ public: \returns vector of occurences of the pattern in the index \throws ConcordiaException */ - std::vector simpleSearch( + std::vector simpleSearch( boost::shared_ptr hashGenerator, boost::shared_ptr > T, boost::shared_ptr > markers, diff --git a/concordia/t/test_concordia.cpp b/concordia/t/test_concordia.cpp index 96fbebc..d5e5907 100644 --- a/concordia/t/test_concordia.cpp +++ b/concordia/t/test_concordia.cpp @@ -3,6 +3,7 @@ #include "concordia/anubis_search_result.hpp" #include "concordia/tokenized_sentence.hpp" #include "concordia/token_annotation.hpp" +#include "concordia/matched_pattern_fragment.hpp" #include "tests/common/test_resources_manager.hpp" #include "concordia/common/config.hpp" @@ -64,16 +65,18 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 ) */ - std::vector searchResult1 = concordia.simpleSearch("posiada rysia"); - std::vector searchResult2 = concordia.simpleSearch("posiada kota Ala"); + std::vector searchResult1 = concordia.simpleSearch("posiada rysia"); + std::vector searchResult2 = concordia.simpleSearch("posiada kota Ala"); concordia.clearIndex(); BOOST_CHECK_EQUAL(searchResult1.size(), 2); - BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 123); - 
BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 1); - BOOST_CHECK_EQUAL(searchResult1.at(1).getId(), 51); - BOOST_CHECK_EQUAL(searchResult1.at(1).getOffset(), 1); + BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 123); + BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 1); + BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 2); + BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleId(), 51); + BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleOffset(), 1); + BOOST_CHECK_EQUAL(searchResult1.at(1).getMatchedLength(), 2); // Checking pattern spanning over 2 segments BOOST_CHECK_EQUAL(searchResult2.size(), 0); @@ -126,24 +129,29 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 ) */ Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); - std::vector searchResult1 = concordia2.simpleSearch("xto xjest"); - std::vector searchResult2 = concordia2.simpleSearch("xjest okno"); + std::vector searchResult1 = concordia2.simpleSearch("xto xjest"); + std::vector searchResult2 = concordia2.simpleSearch("xjest okno"); concordia2.clearIndex(); BOOST_CHECK_EQUAL(searchResult1.size(), 3); - BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 312); - BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 0); - BOOST_CHECK_EQUAL(searchResult1.at(1).getId(), 45); - BOOST_CHECK_EQUAL(searchResult1.at(1).getOffset(), 1); - BOOST_CHECK_EQUAL(searchResult1.at(2).getId(), 29); - BOOST_CHECK_EQUAL(searchResult1.at(2).getOffset(), 0); + BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 312); + BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 0); + BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 2); + BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleId(), 45); + BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleOffset(), 1); + BOOST_CHECK_EQUAL(searchResult1.at(1).getMatchedLength(), 2); + BOOST_CHECK_EQUAL(searchResult1.at(2).getExampleId(), 29); + BOOST_CHECK_EQUAL(searchResult1.at(2).getExampleOffset(), 0); + 
BOOST_CHECK_EQUAL(searchResult1.at(2).getMatchedLength(), 2); BOOST_CHECK_EQUAL(searchResult2.size(), 2); - BOOST_CHECK_EQUAL(searchResult2.at(0).getId(), 202); - BOOST_CHECK_EQUAL(searchResult2.at(0).getOffset(), 1); - BOOST_CHECK_EQUAL(searchResult2.at(1).getId(), 312); - BOOST_CHECK_EQUAL(searchResult2.at(1).getOffset(), 1); + BOOST_CHECK_EQUAL(searchResult2.at(0).getExampleId(), 202); + BOOST_CHECK_EQUAL(searchResult2.at(0).getExampleOffset(), 1); + BOOST_CHECK_EQUAL(searchResult2.at(0).getMatchedLength(), 2); + BOOST_CHECK_EQUAL(searchResult2.at(1).getExampleId(), 312); + BOOST_CHECK_EQUAL(searchResult2.at(1).getExampleOffset(), 1); + BOOST_CHECK_EQUAL(searchResult2.at(1).getMatchedLength(), 2); } BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 ) @@ -155,13 +163,14 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 ) concordia.addAllExamples(testExamples); Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); - std::vector searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia"); + std::vector searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia"); concordia2.clearIndex(); BOOST_CHECK_EQUAL(searchResult1.size(), 1); - BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 312); - BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 2); + BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 312); + BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 2); + BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 6); } BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 ) @@ -335,6 +344,9 @@ BOOST_AUTO_TEST_CASE( Tokenize ) 4,11 type: 1 value: posiada 12,16 type: 1 value: kota */ + + concordia.clearIndex(); + BOOST_CHECK_EQUAL(ts->getTokens().size(), 3); BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 9); BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 16); diff --git a/concordia/tutorial.dox b/concordia/tutorial.dox index 00d65dc..3525f02 100644 --- a/concordia/tutorial.dox 
+++ b/concordia/tutorial.dox @@ -41,7 +41,7 @@ This code snippet shows the basic Concordia functionality - simple substring loo File simple_search.cpp: \verbatim #include -#include +#include #include #include "config.hpp" @@ -65,12 +65,12 @@ int main() { // searching cout << "Searching for pattern: has a" << endl; - vector result = concordia.simpleSearch("has a"); + vector result = concordia.simpleSearch("has a"); // printing results - for(vector::iterator it = result.begin(); + for(vector::iterator it = result.begin(); it != result.end(); ++it) { - cout << "Found substring in sentence: " << it->getId() << " at offset: " << it->getOffset() << endl; + cout << "Found substring in sentence: " << it->getExampleId() << " at offset: " << it->getExampleOffset() << endl; } // clearing index @@ -82,7 +82,7 @@ First, sentences are added to the index along with their integer IDs. The pair ( After adding the examples, index needs to be generated using the method refreshSAfromRAM. Details of this operation are covered in \ref tutorial2. -The search returns a vector of SubstringOccurence objects, which is then printed out. Each occurence represents a single match of the pattern. The pattern has to be matched within a single sentence. Information about the match consists of two integer values: ID of the sentence where the match occured and word-level, 0-based offset of the matched pattern in the sentence. The above code should return the following results: +The search returns a vector of MatchedPatternFragment objects, which is then printed out. Each matched fragment represents a single match of the pattern. The pattern has to be matched within a single sentence. Information about the match consists of two integer values: ID of the sentence where the match occurred and word-level, 0-based offset of the matched pattern in the sentence. 
The above code should return the following results: \verbatim Found substring in sentence: 56 at offset: 1 diff --git a/examples/simple_search.cpp b/examples/simple_search.cpp index dbac325..9c02b55 100644 --- a/examples/simple_search.cpp +++ b/examples/simple_search.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include #include "config.hpp" @@ -23,12 +23,12 @@ int main() { // searching cout << "Searching for pattern: has a" << endl; - vector result = concordia.simpleSearch("has a"); + vector result = concordia.simpleSearch("has a"); // printing results - for(vector::iterator it = result.begin(); + for(vector::iterator it = result.begin(); it != result.end(); ++it) { - cout << "Found substring in sentence: " << it->getId() << " at offset: " << it->getOffset() << endl; + cout << "Found substring in sentence: " << it->getExampleId() << " at offset: " << it->getExampleOffset() << endl; } // clearing index