Added an extremely important improvement to the Concordia search algorithm: cut-off of gapped overlays

This commit is contained in:
rjawor 2015-08-24 13:10:06 +02:00
parent 209e374226
commit 0a3fd8a04e
2 changed files with 71 additions and 7 deletions

View File

@ -60,7 +60,8 @@ void checkConcordiaResults(
void performSearch(Concordia & concordia,
std::vector<std::string> & buffer,
long lineCount) {
long lineCount,
int checkingScheme) {
long baseLineCount = 0;
if (lineCount % READ_BUFFER_LENGTH == 0) {
baseLineCount = lineCount - READ_BUFFER_LENGTH;
@ -93,9 +94,11 @@ void performSearch(Concordia & concordia,
sentencesSearched << " sentences in "
<< timeElapsed << "ms. The speed: " <<
speed << " sentences per second" << std::endl;
std::cout << "\tChecking this portion..." << std::endl;
checkConcordiaResults(results, baseLineCount);
std::cout << "\tno errors." << std::endl;
if (checkingScheme == 1) { // 100% correctness
std::cout << "\tChecking this portion..." << std::endl;
checkConcordiaResults(results, baseLineCount);
std::cout << "\tno errors." << std::endl;
}
std::cout << "\tTotal search progress: " <<
lineCount << " sentences searched." << std::endl;
}
@ -115,6 +118,9 @@ int main(int argc, char** argv) {
"Pattern to be searched by anubis search in the index")
("concordia-search,x", boost::program_options::value<std::string>(),
"Pattern to be searched by concordia search in the index")
("concordia-search-all,f",
boost::program_options::value<std::string>(),
"File with pattern to be searched by concordia search in the index")
("read-file,r", boost::program_options::value<std::string>(),
"File to be read and added to index")
("test,t", boost::program_options::value<std::string>(),
@ -241,6 +247,36 @@ int main(int argc, char** argv) {
<< std::endl;
}
}
} else if (cli.count("concordia-search-all")) {
std::string filePath =
cli["concordia-search-all"].as<std::string>();
std::cout << "\tConcordia searching from file: " << filePath <<
std::endl;
std::ifstream test_file(filePath.c_str());
std::string line;
if (test_file.is_open()) {
long lineCount = 0;
std::vector<std::string> buffer;
while (getline(test_file, line)) {
lineCount++;
buffer.push_back(line);
if (lineCount % READ_BUFFER_LENGTH == 0) {
performSearch(concordia, buffer, lineCount, 2);
}
}
if (buffer.size() > 0) {
performSearch(concordia, buffer, lineCount, 2);
}
test_file.close();
std::cout << "\tSearching finished. No errors reported."
<< std::endl;
} else {
std::cerr << "Unable to open file: "<< filePath;
return 1;
}
} else if (cli.count("read-file")) {
std::string filePath = cli["read-file"].as<std::string>();
std::cout << "\tReading sentences from file: " << filePath <<
@ -377,11 +413,11 @@ int main(int argc, char** argv) {
lineCount++;
buffer.push_back(line);
if (lineCount % READ_BUFFER_LENGTH == 0) {
performSearch(concordia, buffer, lineCount);
performSearch(concordia, buffer, lineCount, 1);
}
}
if (buffer.size() > 0) {
performSearch(concordia, buffer, lineCount);
performSearch(concordia, buffer, lineCount, 1);
}
test_file.close();
std::cout << "\tSearching finished. No errors reported."
@ -394,7 +430,8 @@ int main(int argc, char** argv) {
} else {
std::cerr << "One of the options: simple-search, anubis-search, "
<< "concordia-search or read-file must be provided."
<< "concordia-search, concordia-search-file, test-file "
<< "or read-file must be provided."
<< "See the options specification: "
<< std::endl << desc << std::endl;
return 1;

View File

@ -35,13 +35,40 @@ void ConcordiaSearchResult::_checkPossibleOverlays(
SUFFIX_MARKER_TYPE lastAddedPos,
SUFFIX_MARKER_TYPE patternSize) {
bool allTerminal = true;
// startedFragments is the list of fragments that have already started
// a recursive check within this invocation of _checkPossibleOverlays.
// If this list is non-empty, there is no point in starting a new
// recursive check for a fragment that fails to intersect at least one
// fragment in this list. Doing so would add to the currentOverlay
// a fragment that could have been preceded by a non-intersecting
// started fragment, i.e. it would leave a gap that could be filled.
std::vector<MatchedPatternFragment> startedFragments;
for (int i = lastAddedPos + 1; i < _matchedPatternFragments.size(); i++) {
MatchedPatternFragment fragment = _matchedPatternFragments.at(i);
bool nonFilledGap = false;
if (startedFragments.size() > 0) {
BOOST_FOREACH(MatchedPatternFragment & startedFragment,
startedFragments) {
if (!fragment.intersects(startedFragment)) {
nonFilledGap = true;
}
}
}
if (nonFilledGap) {
// If the new fragment fails to intersect at least one of the
// started fragments, break out of the loop.
// There is no point in checking a currentOverlay with a gap
// that could have been filled by that started fragment.
break;
}
// if fragment does not intersect currentOverlay
if (currentOverlay.size() == 0 ||
!currentOverlay.at(currentOverlay.size()-1).intersects(fragment)) {
currentOverlay.push_back(fragment);
startedFragments.push_back(fragment);
_checkPossibleOverlays(currentOverlay, i, patternSize);
allTerminal = false;
}