simple search returns matched pattern fragments
This commit is contained in:
parent
28704c2f43
commit
a765443a01
@ -7,7 +7,7 @@
|
|||||||
#include <boost/foreach.hpp>
|
#include <boost/foreach.hpp>
|
||||||
|
|
||||||
#include "concordia/concordia.hpp"
|
#include "concordia/concordia.hpp"
|
||||||
#include "concordia/substring_occurence.hpp"
|
#include "concordia/matched_pattern_fragment.hpp"
|
||||||
#include "concordia/token_annotation.hpp"
|
#include "concordia/token_annotation.hpp"
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
#include "concordia/common/utils.hpp"
|
#include "concordia/common/utils.hpp"
|
||||||
@ -158,7 +158,7 @@ int main(int argc, char** argv) {
|
|||||||
std::cout << "\tSearching for pattern: \"" << pattern <<
|
std::cout << "\tSearching for pattern: \"" << pattern <<
|
||||||
"\"" << std::endl;
|
"\"" << std::endl;
|
||||||
time_start = boost::posix_time::microsec_clock::local_time();
|
time_start = boost::posix_time::microsec_clock::local_time();
|
||||||
std::vector<SubstringOccurence> result =
|
std::vector<MatchedPatternFragment> result =
|
||||||
concordia.simpleSearch(pattern);
|
concordia.simpleSearch(pattern);
|
||||||
time_end = boost::posix_time::microsec_clock::local_time();
|
time_end = boost::posix_time::microsec_clock::local_time();
|
||||||
msdiff = time_end - time_start;
|
msdiff = time_end - time_start;
|
||||||
@ -166,9 +166,9 @@ int main(int argc, char** argv) {
|
|||||||
<< "Search took: " <<
|
<< "Search took: " <<
|
||||||
msdiff.total_milliseconds() << "ms." << std::endl;
|
msdiff.total_milliseconds() << "ms." << std::endl;
|
||||||
if (!cli.count("silent")) {
|
if (!cli.count("silent")) {
|
||||||
BOOST_FOREACH(SubstringOccurence occurence, result) {
|
BOOST_FOREACH(MatchedPatternFragment occurence, result) {
|
||||||
std::cout << "\t\tfound match in sentence number: "
|
std::cout << "\t\tfound match in sentence number: "
|
||||||
<< occurence.getId() << std::endl;
|
<< occurence.getExampleId() << std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (cli.count("anubis-search")) {
|
} else if (cli.count("anubis-search")) {
|
||||||
|
@ -49,7 +49,6 @@ boost::shared_ptr<TokenizedSentence>
|
|||||||
_hashGenerator->generateHash(sentence);
|
_hashGenerator->generateHash(sentence);
|
||||||
_hashGenerator->serializeWordMap();
|
_hashGenerator->serializeWordMap();
|
||||||
return result;
|
return result;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -156,14 +155,14 @@ void Concordia::_initializeIndex() throw(ConcordiaException) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<SubstringOccurence> Concordia::simpleSearch(
|
std::vector<MatchedPatternFragment> Concordia::simpleSearch(
|
||||||
const std::string & pattern)
|
const std::string & pattern)
|
||||||
throw(ConcordiaException) {
|
throw(ConcordiaException) {
|
||||||
if (_T->size() > 0) {
|
if (_T->size() > 0) {
|
||||||
return _searcher->simpleSearch(_hashGenerator, _T,
|
return _searcher->simpleSearch(_hashGenerator, _T,
|
||||||
_markers, _SA, pattern);
|
_markers, _SA, pattern);
|
||||||
} else {
|
} else {
|
||||||
std::vector<SubstringOccurence> result;
|
std::vector<MatchedPatternFragment> result;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -8,7 +8,7 @@
|
|||||||
|
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
#include "concordia/example.hpp"
|
#include "concordia/example.hpp"
|
||||||
#include "concordia/substring_occurence.hpp"
|
#include "concordia/matched_pattern_fragment.hpp"
|
||||||
#include "concordia/concordia_config.hpp"
|
#include "concordia/concordia_config.hpp"
|
||||||
#include "concordia/concordia_index.hpp"
|
#include "concordia/concordia_index.hpp"
|
||||||
#include "concordia/index_searcher.hpp"
|
#include "concordia/index_searcher.hpp"
|
||||||
@ -96,7 +96,8 @@ public:
|
|||||||
\returns vector of matched results
|
\returns vector of matched results
|
||||||
\throws ConcordiaException
|
\throws ConcordiaException
|
||||||
*/
|
*/
|
||||||
std::vector<SubstringOccurence> simpleSearch(const std::string & pattern)
|
std::vector<MatchedPatternFragment> simpleSearch(
|
||||||
|
const std::string & pattern)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
/*! \deprecated
|
/*! \deprecated
|
||||||
|
@ -13,13 +13,13 @@ IndexSearcher::IndexSearcher() {
|
|||||||
IndexSearcher::~IndexSearcher() {
|
IndexSearcher::~IndexSearcher() {
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<SubstringOccurence> IndexSearcher::simpleSearch(
|
std::vector<MatchedPatternFragment> IndexSearcher::simpleSearch(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
const std::string & pattern) throw(ConcordiaException) {
|
const std::string & pattern) throw(ConcordiaException) {
|
||||||
std::vector<SubstringOccurence> result;
|
std::vector<MatchedPatternFragment> result;
|
||||||
|
|
||||||
int left;
|
int left;
|
||||||
std::vector<INDEX_CHARACTER_TYPE> hash =
|
std::vector<INDEX_CHARACTER_TYPE> hash =
|
||||||
@ -40,8 +40,11 @@ std::vector<SubstringOccurence> IndexSearcher::simpleSearch(
|
|||||||
// removes these accidental results.
|
// removes these accidental results.
|
||||||
saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
|
saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
|
||||||
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);
|
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);
|
||||||
|
result.push_back(MatchedPatternFragment(
|
||||||
result.push_back(SubstringOccurence(marker));
|
Utils::getIdFromMarker(marker),
|
||||||
|
Utils::getOffsetFromMarker(marker),
|
||||||
|
0,
|
||||||
|
hash.size()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7,7 +7,7 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
#include "concordia/substring_occurence.hpp"
|
#include "concordia/matched_pattern_fragment.hpp"
|
||||||
#include "concordia/hash_generator.hpp"
|
#include "concordia/hash_generator.hpp"
|
||||||
#include "concordia/concordia_exception.hpp"
|
#include "concordia/concordia_exception.hpp"
|
||||||
#include "concordia/concordia_searcher.hpp"
|
#include "concordia/concordia_searcher.hpp"
|
||||||
@ -45,7 +45,7 @@ public:
|
|||||||
\returns vector of occurences of the pattern in the index
|
\returns vector of occurences of the pattern in the index
|
||||||
\throws ConcordiaException
|
\throws ConcordiaException
|
||||||
*/
|
*/
|
||||||
std::vector<SubstringOccurence> simpleSearch(
|
std::vector<MatchedPatternFragment> simpleSearch(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
#include "concordia/anubis_search_result.hpp"
|
#include "concordia/anubis_search_result.hpp"
|
||||||
#include "concordia/tokenized_sentence.hpp"
|
#include "concordia/tokenized_sentence.hpp"
|
||||||
#include "concordia/token_annotation.hpp"
|
#include "concordia/token_annotation.hpp"
|
||||||
|
#include "concordia/matched_pattern_fragment.hpp"
|
||||||
#include "tests/common/test_resources_manager.hpp"
|
#include "tests/common/test_resources_manager.hpp"
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
|
|
||||||
@ -64,16 +65,18 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
|
|||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
std::vector<SubstringOccurence> searchResult1 = concordia.simpleSearch("posiada rysia");
|
std::vector<MatchedPatternFragment> searchResult1 = concordia.simpleSearch("posiada rysia");
|
||||||
std::vector<SubstringOccurence> searchResult2 = concordia.simpleSearch("posiada kota Ala");
|
std::vector<MatchedPatternFragment> searchResult2 = concordia.simpleSearch("posiada kota Ala");
|
||||||
|
|
||||||
concordia.clearIndex();
|
concordia.clearIndex();
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1.size(), 2);
|
BOOST_CHECK_EQUAL(searchResult1.size(), 2);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 123);
|
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 123);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 1);
|
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 1);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(1).getId(), 51);
|
BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 2);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(1).getOffset(), 1);
|
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleId(), 51);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleOffset(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1.at(1).getMatchedLength(), 2);
|
||||||
|
|
||||||
// Checking pattern spanning over 2 segments
|
// Checking pattern spanning over 2 segments
|
||||||
BOOST_CHECK_EQUAL(searchResult2.size(), 0);
|
BOOST_CHECK_EQUAL(searchResult2.size(), 0);
|
||||||
@ -126,24 +129,29 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||||
std::vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("xto xjest");
|
std::vector<MatchedPatternFragment> searchResult1 = concordia2.simpleSearch("xto xjest");
|
||||||
std::vector<SubstringOccurence> searchResult2 = concordia2.simpleSearch("xjest okno");
|
std::vector<MatchedPatternFragment> searchResult2 = concordia2.simpleSearch("xjest okno");
|
||||||
|
|
||||||
concordia2.clearIndex();
|
concordia2.clearIndex();
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1.size(), 3);
|
BOOST_CHECK_EQUAL(searchResult1.size(), 3);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 312);
|
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 312);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 0);
|
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 0);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(1).getId(), 45);
|
BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 2);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(1).getOffset(), 1);
|
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleId(), 45);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(2).getId(), 29);
|
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleOffset(), 1);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(2).getOffset(), 0);
|
BOOST_CHECK_EQUAL(searchResult1.at(1).getMatchedLength(), 2);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1.at(2).getExampleId(), 29);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1.at(2).getExampleOffset(), 0);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1.at(2).getMatchedLength(), 2);
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult2.size(), 2);
|
BOOST_CHECK_EQUAL(searchResult2.size(), 2);
|
||||||
BOOST_CHECK_EQUAL(searchResult2.at(0).getId(), 202);
|
BOOST_CHECK_EQUAL(searchResult2.at(0).getExampleId(), 202);
|
||||||
BOOST_CHECK_EQUAL(searchResult2.at(0).getOffset(), 1);
|
BOOST_CHECK_EQUAL(searchResult2.at(0).getExampleOffset(), 1);
|
||||||
BOOST_CHECK_EQUAL(searchResult2.at(1).getId(), 312);
|
BOOST_CHECK_EQUAL(searchResult2.at(0).getMatchedLength(), 2);
|
||||||
BOOST_CHECK_EQUAL(searchResult2.at(1).getOffset(), 1);
|
BOOST_CHECK_EQUAL(searchResult2.at(1).getExampleId(), 312);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult2.at(1).getExampleOffset(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult2.at(1).getMatchedLength(), 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
|
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
|
||||||
@ -155,13 +163,14 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
|
|||||||
concordia.addAllExamples(testExamples);
|
concordia.addAllExamples(testExamples);
|
||||||
|
|
||||||
Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||||
std::vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia");
|
std::vector<MatchedPatternFragment> searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia");
|
||||||
|
|
||||||
concordia2.clearIndex();
|
concordia2.clearIndex();
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(searchResult1.size(), 1);
|
BOOST_CHECK_EQUAL(searchResult1.size(), 1);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 312);
|
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 312);
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 2);
|
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleOffset(), 2);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1.at(0).getMatchedLength(), 6);
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
|
BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
|
||||||
@ -335,6 +344,9 @@ BOOST_AUTO_TEST_CASE( Tokenize )
|
|||||||
4,11 type: 1 value: posiada
|
4,11 type: 1 value: posiada
|
||||||
12,16 type: 1 value: kota
|
12,16 type: 1 value: kota
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
concordia.clearIndex();
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(ts->getTokens().size(), 3);
|
BOOST_CHECK_EQUAL(ts->getTokens().size(), 3);
|
||||||
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 9);
|
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 9);
|
||||||
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 16);
|
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 16);
|
||||||
|
@ -41,7 +41,7 @@ This code snippet shows the basic Concordia functionality - simple substring loo
|
|||||||
File simple_search.cpp:
|
File simple_search.cpp:
|
||||||
\verbatim
|
\verbatim
|
||||||
#include <concordia/concordia.hpp>
|
#include <concordia/concordia.hpp>
|
||||||
#include <concordia/substring_occurence.hpp>
|
#include <concordia/matched_pattern_fragment.hpp>
|
||||||
#include <concordia/example.hpp>
|
#include <concordia/example.hpp>
|
||||||
|
|
||||||
#include "config.hpp"
|
#include "config.hpp"
|
||||||
@ -65,12 +65,12 @@ int main() {
|
|||||||
|
|
||||||
// searching
|
// searching
|
||||||
cout << "Searching for pattern: has a" << endl;
|
cout << "Searching for pattern: has a" << endl;
|
||||||
vector<SubstringOccurence> result = concordia.simpleSearch("has a");
|
vector<MatchedPatternFragment> result = concordia.simpleSearch("has a");
|
||||||
|
|
||||||
// printing results
|
// printing results
|
||||||
for(vector<SubstringOccurence>::iterator it = result.begin();
|
for(vector<MatchedPatternFragment>::iterator it = result.begin();
|
||||||
it != result.end(); ++it) {
|
it != result.end(); ++it) {
|
||||||
cout << "Found substring in sentence: " << it->getId() << " at offset: " << it->getOffset() << endl;
|
cout << "Found substring in sentence: " << it->getExampleId() << " at offset: " << it->getExampleOffset() << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
// clearing index
|
// clearing index
|
||||||
@ -82,7 +82,7 @@ First, sentences are added to the index along with their integer IDs. The pair (
|
|||||||
|
|
||||||
After adding the examples, index needs to be generated using the method refreshSAfromRAM. Details of this operation are covered in \ref tutorial2.
|
After adding the examples, index needs to be generated using the method refreshSAfromRAM. Details of this operation are covered in \ref tutorial2.
|
||||||
|
|
||||||
The search returns a vector of SubstringOccurence objects, which is then printed out. Each occurrence represents a single match of the pattern. The pattern has to be matched within a single sentence. Information about the match consists of two integer values: ID of the sentence where the match occurred and word-level, 0-based offset of the matched pattern in the sentence. The above code should return the following results:
|
The search returns a vector of MatchedPatternFragment objects, which is then printed out. Each matched fragment represents a single match of the pattern. The pattern has to be matched within a single sentence. Information about the match consists of two integer values: ID of the sentence where the match occurred and word-level, 0-based offset of the matched pattern in the sentence. The above code should return the following results:
|
||||||
|
|
||||||
\verbatim
|
\verbatim
|
||||||
Found substring in sentence: 56 at offset: 1
|
Found substring in sentence: 56 at offset: 1
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#include <concordia/concordia.hpp>
|
#include <concordia/concordia.hpp>
|
||||||
#include <concordia/substring_occurence.hpp>
|
#include <concordia/matched_pattern_fragment.hpp>
|
||||||
#include <concordia/example.hpp>
|
#include <concordia/example.hpp>
|
||||||
|
|
||||||
#include "config.hpp"
|
#include "config.hpp"
|
||||||
@ -23,12 +23,12 @@ int main() {
|
|||||||
|
|
||||||
// searching
|
// searching
|
||||||
cout << "Searching for pattern: has a" << endl;
|
cout << "Searching for pattern: has a" << endl;
|
||||||
vector<SubstringOccurence> result = concordia.simpleSearch("has a");
|
vector<MatchedPatternFragment> result = concordia.simpleSearch("has a");
|
||||||
|
|
||||||
// printing results
|
// printing results
|
||||||
for(vector<SubstringOccurence>::iterator it = result.begin();
|
for(vector<MatchedPatternFragment>::iterator it = result.begin();
|
||||||
it != result.end(); ++it) {
|
it != result.end(); ++it) {
|
||||||
cout << "Found substring in sentence: " << it->getId() << " at offset: " << it->getOffset() << endl;
|
cout << "Found substring in sentence: " << it->getExampleId() << " at offset: " << it->getExampleOffset() << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
// clearing index
|
// clearing index
|
||||||
|
Loading…
Reference in New Issue
Block a user