added an extremely important improvement to the concordia search algorithm - gapped overlays cut-off
This commit is contained in:
parent
209e374226
commit
0a3fd8a04e
@ -60,7 +60,8 @@ void checkConcordiaResults(
|
||||
|
||||
void performSearch(Concordia & concordia,
|
||||
std::vector<std::string> & buffer,
|
||||
long lineCount) {
|
||||
long lineCount,
|
||||
int checkingScheme) {
|
||||
long baseLineCount = 0;
|
||||
if (lineCount % READ_BUFFER_LENGTH == 0) {
|
||||
baseLineCount = lineCount - READ_BUFFER_LENGTH;
|
||||
@ -93,9 +94,11 @@ void performSearch(Concordia & concordia,
|
||||
sentencesSearched << " sentences in "
|
||||
<< timeElapsed << "ms. The speed: " <<
|
||||
speed << " sentences per second" << std::endl;
|
||||
std::cout << "\tChecking this portion..." << std::endl;
|
||||
checkConcordiaResults(results, baseLineCount);
|
||||
std::cout << "\tno errors." << std::endl;
|
||||
if (checkingScheme == 1) { // 100% correctness
|
||||
std::cout << "\tChecking this portion..." << std::endl;
|
||||
checkConcordiaResults(results, baseLineCount);
|
||||
std::cout << "\tno errors." << std::endl;
|
||||
}
|
||||
std::cout << "\tTotal search progress: " <<
|
||||
lineCount << " sentences searched." << std::endl;
|
||||
}
|
||||
@ -115,6 +118,9 @@ int main(int argc, char** argv) {
|
||||
"Pattern to be searched by anubis search in the index")
|
||||
("concordia-search,x", boost::program_options::value<std::string>(),
|
||||
"Pattern to be searched by concordia search in the index")
|
||||
("concordia-search-all,f",
|
||||
boost::program_options::value<std::string>(),
|
||||
"File with pattern to be searched by concordia search in the index")
|
||||
("read-file,r", boost::program_options::value<std::string>(),
|
||||
"File to be read and added to index")
|
||||
("test,t", boost::program_options::value<std::string>(),
|
||||
@ -241,6 +247,36 @@ int main(int argc, char** argv) {
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
} else if (cli.count("concordia-search-all")) {
|
||||
std::string filePath =
|
||||
cli["concordia-search-all"].as<std::string>();
|
||||
std::cout << "\tConcordia searching from file: " << filePath <<
|
||||
std::endl;
|
||||
|
||||
std::ifstream test_file(filePath.c_str());
|
||||
std::string line;
|
||||
if (test_file.is_open()) {
|
||||
long lineCount = 0;
|
||||
std::vector<std::string> buffer;
|
||||
while (getline(test_file, line)) {
|
||||
lineCount++;
|
||||
buffer.push_back(line);
|
||||
if (lineCount % READ_BUFFER_LENGTH == 0) {
|
||||
performSearch(concordia, buffer, lineCount, 2);
|
||||
}
|
||||
}
|
||||
if (buffer.size() > 0) {
|
||||
performSearch(concordia, buffer, lineCount, 2);
|
||||
}
|
||||
test_file.close();
|
||||
std::cout << "\tSearching finished. No errors reported."
|
||||
<< std::endl;
|
||||
|
||||
} else {
|
||||
std::cerr << "Unable to open file: "<< filePath;
|
||||
return 1;
|
||||
}
|
||||
|
||||
} else if (cli.count("read-file")) {
|
||||
std::string filePath = cli["read-file"].as<std::string>();
|
||||
std::cout << "\tReading sentences from file: " << filePath <<
|
||||
@ -377,11 +413,11 @@ int main(int argc, char** argv) {
|
||||
lineCount++;
|
||||
buffer.push_back(line);
|
||||
if (lineCount % READ_BUFFER_LENGTH == 0) {
|
||||
performSearch(concordia, buffer, lineCount);
|
||||
performSearch(concordia, buffer, lineCount, 1);
|
||||
}
|
||||
}
|
||||
if (buffer.size() > 0) {
|
||||
performSearch(concordia, buffer, lineCount);
|
||||
performSearch(concordia, buffer, lineCount, 1);
|
||||
}
|
||||
test_file.close();
|
||||
std::cout << "\tSearching finished. No errors reported."
|
||||
@ -394,7 +430,8 @@ int main(int argc, char** argv) {
|
||||
|
||||
} else {
|
||||
std::cerr << "One of the options: simple-search, anubis-search, "
|
||||
<< "concordia-search or read-file must be provided."
|
||||
<< "concordia-search, concordia-search-file, test-file "
|
||||
<< "or read-file must be provided."
|
||||
<< "See the options specification: "
|
||||
<< std::endl << desc << std::endl;
|
||||
return 1;
|
||||
|
@ -35,13 +35,40 @@ void ConcordiaSearchResult::_checkPossibleOverlays(
|
||||
SUFFIX_MARKER_TYPE lastAddedPos,
|
||||
SUFFIX_MARKER_TYPE patternSize) {
|
||||
bool allTerminal = true;
|
||||
|
||||
// startedFragments is a list of fragments which started the recursive
|
||||
// check in this _checkPossibleOverlays method invocation.
|
||||
// If this list is non-empty, there is no point in starting
|
||||
// new invocations for fragments that do not intersect any in this list.
|
||||
// In this case we would choose to add a fragment to the currentOverlay
|
||||
// that could be preceded by a non-intersecting fragment.
|
||||
|
||||
std::vector<MatchedPatternFragment> startedFragments;
|
||||
for (int i = lastAddedPos + 1; i < _matchedPatternFragments.size(); i++) {
|
||||
MatchedPatternFragment fragment = _matchedPatternFragments.at(i);
|
||||
|
||||
bool nonFilledGap = false;
|
||||
if (startedFragments.size() > 0) {
|
||||
BOOST_FOREACH(MatchedPatternFragment & startedFragment,
|
||||
startedFragments) {
|
||||
if (!fragment.intersects(startedFragment)) {
|
||||
nonFilledGap = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (nonFilledGap) {
|
||||
// If the new fragment does not intersect with any of the
|
||||
// started fragments, break out of the loop.
|
||||
// There is no point in checking a currentOverlay with a gap
|
||||
// that could have been filled.
|
||||
break;
|
||||
}
|
||||
|
||||
// if fragment does not intersect currentOverlay
|
||||
if (currentOverlay.size() == 0 ||
|
||||
!currentOverlay.at(currentOverlay.size()-1).intersects(fragment)) {
|
||||
currentOverlay.push_back(fragment);
|
||||
startedFragments.push_back(fragment);
|
||||
_checkPossibleOverlays(currentOverlay, i, patternSize);
|
||||
allTerminal = false;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user