simple search returns matched pattern fragments

This commit is contained in:
rjawor 2015-08-07 12:54:57 +02:00
parent 28704c2f43
commit a765443a01
9 changed files with 61 additions and 46 deletions

View File

@ -7,7 +7,7 @@
#include <boost/foreach.hpp> #include <boost/foreach.hpp>
#include "concordia/concordia.hpp" #include "concordia/concordia.hpp"
#include "concordia/substring_occurence.hpp" #include "concordia/matched_pattern_fragment.hpp"
#include "concordia/token_annotation.hpp" #include "concordia/token_annotation.hpp"
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include "concordia/common/utils.hpp" #include "concordia/common/utils.hpp"
@ -158,7 +158,7 @@ int main(int argc, char** argv) {
std::cout << "\tSearching for pattern: \"" << pattern << std::cout << "\tSearching for pattern: \"" << pattern <<
"\"" << std::endl; "\"" << std::endl;
time_start = boost::posix_time::microsec_clock::local_time(); time_start = boost::posix_time::microsec_clock::local_time();
std::vector<SubstringOccurence> result = std::vector<MatchedPatternFragment> result =
concordia.simpleSearch(pattern); concordia.simpleSearch(pattern);
time_end = boost::posix_time::microsec_clock::local_time(); time_end = boost::posix_time::microsec_clock::local_time();
msdiff = time_end - time_start; msdiff = time_end - time_start;
@ -166,9 +166,9 @@ int main(int argc, char** argv) {
<< "Search took: " << << "Search took: " <<
msdiff.total_milliseconds() << "ms." << std::endl; msdiff.total_milliseconds() << "ms." << std::endl;
if (!cli.count("silent")) { if (!cli.count("silent")) {
BOOST_FOREACH(SubstringOccurence occurence, result) { BOOST_FOREACH(MatchedPatternFragment occurence, result) {
std::cout << "\t\tfound match in sentence number: " std::cout << "\t\tfound match in sentence number: "
<< occurence.getId() << std::endl; << occurence.getExampleId() << std::endl;
} }
} }
} else if (cli.count("anubis-search")) { } else if (cli.count("anubis-search")) {

View File

@ -49,7 +49,6 @@ boost::shared_ptr<TokenizedSentence>
_hashGenerator->generateHash(sentence); _hashGenerator->generateHash(sentence);
_hashGenerator->serializeWordMap(); _hashGenerator->serializeWordMap();
return result; return result;
} }
@ -156,14 +155,14 @@ void Concordia::_initializeIndex() throw(ConcordiaException) {
} }
} }
std::vector<SubstringOccurence> Concordia::simpleSearch( std::vector<MatchedPatternFragment> Concordia::simpleSearch(
const std::string & pattern) const std::string & pattern)
throw(ConcordiaException) { throw(ConcordiaException) {
if (_T->size() > 0) { if (_T->size() > 0) {
return _searcher->simpleSearch(_hashGenerator, _T, return _searcher->simpleSearch(_hashGenerator, _T,
_markers, _SA, pattern); _markers, _SA, pattern);
} else { } else {
std::vector<SubstringOccurence> result; std::vector<MatchedPatternFragment> result;
return result; return result;
} }
} }

View File

@ -8,7 +8,7 @@
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include "concordia/example.hpp" #include "concordia/example.hpp"
#include "concordia/substring_occurence.hpp" #include "concordia/matched_pattern_fragment.hpp"
#include "concordia/concordia_config.hpp" #include "concordia/concordia_config.hpp"
#include "concordia/concordia_index.hpp" #include "concordia/concordia_index.hpp"
#include "concordia/index_searcher.hpp" #include "concordia/index_searcher.hpp"
@ -96,7 +96,8 @@ public:
\returns vector of matched results \returns vector of matched results
\throws ConcordiaException \throws ConcordiaException
*/ */
std::vector<SubstringOccurence> simpleSearch(const std::string & pattern) std::vector<MatchedPatternFragment> simpleSearch(
const std::string & pattern)
throw(ConcordiaException); throw(ConcordiaException);
/*! \deprecated /*! \deprecated

View File

@ -13,13 +13,13 @@ IndexSearcher::IndexSearcher() {
IndexSearcher::~IndexSearcher() { IndexSearcher::~IndexSearcher() {
} }
std::vector<SubstringOccurence> IndexSearcher::simpleSearch( std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA, boost::shared_ptr<std::vector<saidx_t> > SA,
const std::string & pattern) throw(ConcordiaException) { const std::string & pattern) throw(ConcordiaException) {
std::vector<SubstringOccurence> result; std::vector<MatchedPatternFragment> result;
int left; int left;
std::vector<INDEX_CHARACTER_TYPE> hash = std::vector<INDEX_CHARACTER_TYPE> hash =
@ -40,8 +40,11 @@ std::vector<SubstringOccurence> IndexSearcher::simpleSearch(
// removes these accidental results. // removes these accidental results.
saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE); saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos); SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);
result.push_back(MatchedPatternFragment(
result.push_back(SubstringOccurence(marker)); Utils::getIdFromMarker(marker),
Utils::getOffsetFromMarker(marker),
0,
hash.size()));
} }
} }

View File

@ -7,7 +7,7 @@
#include <vector> #include <vector>
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include "concordia/substring_occurence.hpp" #include "concordia/matched_pattern_fragment.hpp"
#include "concordia/hash_generator.hpp" #include "concordia/hash_generator.hpp"
#include "concordia/concordia_exception.hpp" #include "concordia/concordia_exception.hpp"
#include "concordia/concordia_searcher.hpp" #include "concordia/concordia_searcher.hpp"
@ -45,7 +45,7 @@ public:
\returns vector of occurrences of the pattern in the index \returns vector of occurrences of the pattern in the index
\throws ConcordiaException \throws ConcordiaException
*/ */
std::vector<SubstringOccurence> simpleSearch( std::vector<MatchedPatternFragment> simpleSearch(
boost::shared_ptr<HashGenerator> hashGenerator, boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,

View File

@ -3,6 +3,7 @@
#include "concordia/anubis_search_result.hpp" #include "concordia/anubis_search_result.hpp"
#include "concordia/tokenized_sentence.hpp" #include "concordia/tokenized_sentence.hpp"
#include "concordia/token_annotation.hpp" #include "concordia/token_annotation.hpp"
#include "concordia/matched_pattern_fragment.hpp"
#include "tests/common/test_resources_manager.hpp" #include "tests/common/test_resources_manager.hpp"
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
@ -64,16 +65,18 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
*/ */
std::vector<SubstringOccurence> searchResult1 = concordia.simpleSearch("posiada rysia"); std::vector<MatchedPatternFragment> searchResult1 = concordia.simpleSearch("posiada rysia");
std::vector<SubstringOccurence> searchResult2 = concordia.simpleSearch("posiada kota Ala"); std::vector<MatchedPatternFragment> searchResult2 = concordia.simpleSearch("posiada kota Ala");
concordia.clearIndex(); concordia.clearIndex();
BOOST_CHECK_EQUAL(searchResult1.size(), 2); BOOST_CHECK_EQUAL(searchResult1.size(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 123); BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 123);
BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 1); BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(1).getId(), 51); BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(1).getOffset(), 1); BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleId(), 51);
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(1).getMatchedLength(), 2);
// Checking pattern spanning over 2 segments // Checking pattern spanning over 2 segments
BOOST_CHECK_EQUAL(searchResult2.size(), 0); BOOST_CHECK_EQUAL(searchResult2.size(), 0);
@ -126,24 +129,29 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
*/ */
Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
std::vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("xto xjest"); std::vector<MatchedPatternFragment> searchResult1 = concordia2.simpleSearch("xto xjest");
std::vector<SubstringOccurence> searchResult2 = concordia2.simpleSearch("xjest okno"); std::vector<MatchedPatternFragment> searchResult2 = concordia2.simpleSearch("xjest okno");
concordia2.clearIndex(); concordia2.clearIndex();
BOOST_CHECK_EQUAL(searchResult1.size(), 3); BOOST_CHECK_EQUAL(searchResult1.size(), 3);
BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 312); BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 312);
BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 0); BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1.at(1).getId(), 45); BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(1).getOffset(), 1); BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleId(), 45);
BOOST_CHECK_EQUAL(searchResult1.at(2).getId(), 29); BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(2).getOffset(), 0); BOOST_CHECK_EQUAL(searchResult1.at(1).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(2).getExampleId(), 29);
BOOST_CHECK_EQUAL(searchResult1.at(2).getExampleOffset(), 0);
BOOST_CHECK_EQUAL(searchResult1.at(2).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult2.size(), 2); BOOST_CHECK_EQUAL(searchResult2.size(), 2);
BOOST_CHECK_EQUAL(searchResult2.at(0).getId(), 202); BOOST_CHECK_EQUAL(searchResult2.at(0).getExampleId(), 202);
BOOST_CHECK_EQUAL(searchResult2.at(0).getOffset(), 1); BOOST_CHECK_EQUAL(searchResult2.at(0).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult2.at(1).getId(), 312); BOOST_CHECK_EQUAL(searchResult2.at(0).getMatchedLength(), 2);
BOOST_CHECK_EQUAL(searchResult2.at(1).getOffset(), 1); BOOST_CHECK_EQUAL(searchResult2.at(1).getExampleId(), 312);
BOOST_CHECK_EQUAL(searchResult2.at(1).getExampleOffset(), 1);
BOOST_CHECK_EQUAL(searchResult2.at(1).getMatchedLength(), 2);
} }
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 ) BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
@ -155,13 +163,14 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
concordia.addAllExamples(testExamples); concordia.addAllExamples(testExamples);
Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")); Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
std::vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia"); std::vector<MatchedPatternFragment> searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia");
concordia2.clearIndex(); concordia2.clearIndex();
BOOST_CHECK_EQUAL(searchResult1.size(), 1); BOOST_CHECK_EQUAL(searchResult1.size(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 312); BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 312);
BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 2); BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 6);
} }
BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 ) BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
@ -335,6 +344,9 @@ BOOST_AUTO_TEST_CASE( Tokenize )
4,11 type: 1 value: posiada 4,11 type: 1 value: posiada
12,16 type: 1 value: kota 12,16 type: 1 value: kota
*/ */
concordia.clearIndex();
BOOST_CHECK_EQUAL(ts->getTokens().size(), 3); BOOST_CHECK_EQUAL(ts->getTokens().size(), 3);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 9); BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 9);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 16); BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 16);

View File

@ -41,7 +41,7 @@ This code snippet shows the basic Concordia functionality - simple substring loo
File simple_search.cpp: File simple_search.cpp:
\verbatim \verbatim
#include <concordia/concordia.hpp> #include <concordia/concordia.hpp>
#include <concordia/substring_occurence.hpp> #include <concordia/matched_pattern_fragment.hpp>
#include <concordia/example.hpp> #include <concordia/example.hpp>
#include "config.hpp" #include "config.hpp"
@ -65,12 +65,12 @@ int main() {
// searching // searching
cout << "Searching for pattern: has a" << endl; cout << "Searching for pattern: has a" << endl;
vector<SubstringOccurence> result = concordia.simpleSearch("has a"); vector<MatchedPatternFragment> result = concordia.simpleSearch("has a");
// printing results // printing results
for(vector<SubstringOccurence>::iterator it = result.begin(); for(vector<MatchedPatternFragment>::iterator it = result.begin();
it != result.end(); ++it) { it != result.end(); ++it) {
cout << "Found substring in sentence: " << it->getId() << " at offset: " << it->getOffset() << endl; cout << "Found substring in sentence: " << it->getExampleId() << " at offset: " << it->getExampleOffset() << endl;
} }
// clearing index // clearing index
@ -82,7 +82,7 @@ First, sentences are added to the index along with their integer IDs. The pair (
After adding the examples, the index needs to be generated using the method refreshSAfromRAM. Details of this operation are covered in \ref tutorial2. After adding the examples, the index needs to be generated using the method refreshSAfromRAM. Details of this operation are covered in \ref tutorial2.
The search returns a vector of SubstringOccurence objects, which is then printed out. Each occurrence represents a single match of the pattern. The pattern has to be matched within a single sentence. Information about the match consists of two integer values: ID of the sentence where the match occurred and word-level, 0-based offset of the matched pattern in the sentence. The above code should return the following results: The search returns a vector of MatchedPatternFragment objects, which is then printed out. Each matched fragment represents a single match of the pattern. The pattern has to be matched within a single sentence. Information about the match consists of two integer values: ID of the sentence where the match occurred and word-level, 0-based offset of the matched pattern in the sentence. The above code should return the following results:
\verbatim \verbatim
Found substring in sentence: 56 at offset: 1 Found substring in sentence: 56 at offset: 1

View File

@ -1,5 +1,5 @@
#include <concordia/concordia.hpp> #include <concordia/concordia.hpp>
#include <concordia/substring_occurence.hpp> #include <concordia/matched_pattern_fragment.hpp>
#include <concordia/example.hpp> #include <concordia/example.hpp>
#include "config.hpp" #include "config.hpp"
@ -23,12 +23,12 @@ int main() {
// searching // searching
cout << "Searching for pattern: has a" << endl; cout << "Searching for pattern: has a" << endl;
vector<SubstringOccurence> result = concordia.simpleSearch("has a"); vector<MatchedPatternFragment> result = concordia.simpleSearch("has a");
// printing results // printing results
for(vector<SubstringOccurence>::iterator it = result.begin(); for(vector<MatchedPatternFragment>::iterator it = result.begin();
it != result.end(); ++it) { it != result.end(); ++it) {
cout << "Found substring in sentence: " << it->getId() << " at offset: " << it->getOffset() << endl; cout << "Found substring in sentence: " << it->getExampleId() << " at offset: " << it->getExampleOffset() << endl;
} }
// clearing index // clearing index