Added an extremely important improvement to the concordia search algorithm: gapped overlays cut-off.
This commit is contained in:
parent
209e374226
commit
0a3fd8a04e
@ -60,7 +60,8 @@ void checkConcordiaResults(
|
|||||||
|
|
||||||
void performSearch(Concordia & concordia,
|
void performSearch(Concordia & concordia,
|
||||||
std::vector<std::string> & buffer,
|
std::vector<std::string> & buffer,
|
||||||
long lineCount) {
|
long lineCount,
|
||||||
|
int checkingScheme) {
|
||||||
long baseLineCount = 0;
|
long baseLineCount = 0;
|
||||||
if (lineCount % READ_BUFFER_LENGTH == 0) {
|
if (lineCount % READ_BUFFER_LENGTH == 0) {
|
||||||
baseLineCount = lineCount - READ_BUFFER_LENGTH;
|
baseLineCount = lineCount - READ_BUFFER_LENGTH;
|
||||||
@ -93,9 +94,11 @@ void performSearch(Concordia & concordia,
|
|||||||
sentencesSearched << " sentences in "
|
sentencesSearched << " sentences in "
|
||||||
<< timeElapsed << "ms. The speed: " <<
|
<< timeElapsed << "ms. The speed: " <<
|
||||||
speed << " sentences per second" << std::endl;
|
speed << " sentences per second" << std::endl;
|
||||||
std::cout << "\tChecking this portion..." << std::endl;
|
if (checkingScheme == 1) { // 100% correctness
|
||||||
checkConcordiaResults(results, baseLineCount);
|
std::cout << "\tChecking this portion..." << std::endl;
|
||||||
std::cout << "\tno errors." << std::endl;
|
checkConcordiaResults(results, baseLineCount);
|
||||||
|
std::cout << "\tno errors." << std::endl;
|
||||||
|
}
|
||||||
std::cout << "\tTotal search progress: " <<
|
std::cout << "\tTotal search progress: " <<
|
||||||
lineCount << " sentences searched." << std::endl;
|
lineCount << " sentences searched." << std::endl;
|
||||||
}
|
}
|
||||||
@ -115,6 +118,9 @@ int main(int argc, char** argv) {
|
|||||||
"Pattern to be searched by anubis search in the index")
|
"Pattern to be searched by anubis search in the index")
|
||||||
("concordia-search,x", boost::program_options::value<std::string>(),
|
("concordia-search,x", boost::program_options::value<std::string>(),
|
||||||
"Pattern to be searched by concordia search in the index")
|
"Pattern to be searched by concordia search in the index")
|
||||||
|
("concordia-search-all,f",
|
||||||
|
boost::program_options::value<std::string>(),
|
||||||
|
"File with pattern to be searched by concordia search in the index")
|
||||||
("read-file,r", boost::program_options::value<std::string>(),
|
("read-file,r", boost::program_options::value<std::string>(),
|
||||||
"File to be read and added to index")
|
"File to be read and added to index")
|
||||||
("test,t", boost::program_options::value<std::string>(),
|
("test,t", boost::program_options::value<std::string>(),
|
||||||
@ -241,6 +247,36 @@ int main(int argc, char** argv) {
|
|||||||
<< std::endl;
|
<< std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else if (cli.count("concordia-search-all")) {
|
||||||
|
std::string filePath =
|
||||||
|
cli["concordia-search-all"].as<std::string>();
|
||||||
|
std::cout << "\tConcordia searching from file: " << filePath <<
|
||||||
|
std::endl;
|
||||||
|
|
||||||
|
std::ifstream test_file(filePath.c_str());
|
||||||
|
std::string line;
|
||||||
|
if (test_file.is_open()) {
|
||||||
|
long lineCount = 0;
|
||||||
|
std::vector<std::string> buffer;
|
||||||
|
while (getline(test_file, line)) {
|
||||||
|
lineCount++;
|
||||||
|
buffer.push_back(line);
|
||||||
|
if (lineCount % READ_BUFFER_LENGTH == 0) {
|
||||||
|
performSearch(concordia, buffer, lineCount, 2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (buffer.size() > 0) {
|
||||||
|
performSearch(concordia, buffer, lineCount, 2);
|
||||||
|
}
|
||||||
|
test_file.close();
|
||||||
|
std::cout << "\tSearching finished. No errors reported."
|
||||||
|
<< std::endl;
|
||||||
|
|
||||||
|
} else {
|
||||||
|
std::cerr << "Unable to open file: "<< filePath;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
} else if (cli.count("read-file")) {
|
} else if (cli.count("read-file")) {
|
||||||
std::string filePath = cli["read-file"].as<std::string>();
|
std::string filePath = cli["read-file"].as<std::string>();
|
||||||
std::cout << "\tReading sentences from file: " << filePath <<
|
std::cout << "\tReading sentences from file: " << filePath <<
|
||||||
@ -377,11 +413,11 @@ int main(int argc, char** argv) {
|
|||||||
lineCount++;
|
lineCount++;
|
||||||
buffer.push_back(line);
|
buffer.push_back(line);
|
||||||
if (lineCount % READ_BUFFER_LENGTH == 0) {
|
if (lineCount % READ_BUFFER_LENGTH == 0) {
|
||||||
performSearch(concordia, buffer, lineCount);
|
performSearch(concordia, buffer, lineCount, 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (buffer.size() > 0) {
|
if (buffer.size() > 0) {
|
||||||
performSearch(concordia, buffer, lineCount);
|
performSearch(concordia, buffer, lineCount, 1);
|
||||||
}
|
}
|
||||||
test_file.close();
|
test_file.close();
|
||||||
std::cout << "\tSearching finished. No errors reported."
|
std::cout << "\tSearching finished. No errors reported."
|
||||||
@ -394,7 +430,8 @@ int main(int argc, char** argv) {
|
|||||||
|
|
||||||
} else {
|
} else {
|
||||||
std::cerr << "One of the options: simple-search, anubis-search, "
|
std::cerr << "One of the options: simple-search, anubis-search, "
|
||||||
<< "concordia-search or read-file must be provided."
|
<< "concordia-search, concordia-search-file, test-file "
|
||||||
|
<< "or read-file must be provided."
|
||||||
<< "See the options specification: "
|
<< "See the options specification: "
|
||||||
<< std::endl << desc << std::endl;
|
<< std::endl << desc << std::endl;
|
||||||
return 1;
|
return 1;
|
||||||
|
@ -35,13 +35,40 @@ void ConcordiaSearchResult::_checkPossibleOverlays(
|
|||||||
SUFFIX_MARKER_TYPE lastAddedPos,
|
SUFFIX_MARKER_TYPE lastAddedPos,
|
||||||
SUFFIX_MARKER_TYPE patternSize) {
|
SUFFIX_MARKER_TYPE patternSize) {
|
||||||
bool allTerminal = true;
|
bool allTerminal = true;
|
||||||
|
|
||||||
|
// startedFragments is a list of fragments which started the recursive
|
||||||
|
// check in this _checkPossibleOverlays method invocation.
|
||||||
|
// If this list is non-empty, there is no point in starting
|
||||||
|
// new invocations for fragments that do not intersect any in this list.
|
||||||
|
// Doing so would add a fragment to the currentOverlay
|
||||||
|
// that could have been preceded by a non-intersecting fragment.
|
||||||
|
|
||||||
|
std::vector<MatchedPatternFragment> startedFragments;
|
||||||
for (int i = lastAddedPos + 1; i < _matchedPatternFragments.size(); i++) {
|
for (int i = lastAddedPos + 1; i < _matchedPatternFragments.size(); i++) {
|
||||||
MatchedPatternFragment fragment = _matchedPatternFragments.at(i);
|
MatchedPatternFragment fragment = _matchedPatternFragments.at(i);
|
||||||
|
|
||||||
|
bool nonFilledGap = false;
|
||||||
|
if (startedFragments.size() > 0) {
|
||||||
|
BOOST_FOREACH(MatchedPatternFragment & startedFragment,
|
||||||
|
startedFragments) {
|
||||||
|
if (!fragment.intersects(startedFragment)) {
|
||||||
|
nonFilledGap = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (nonFilledGap) {
|
||||||
|
// If the new fragment does not intersect with any of the
|
||||||
|
// started fragments, break out of the loop.
|
||||||
|
// There is no point in checking a currentOverlay with a gap
|
||||||
|
// that could have been filled.
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
// if fragment does not intersect currentOverlay
|
// if fragment does not intersect currentOverlay
|
||||||
if (currentOverlay.size() == 0 ||
|
if (currentOverlay.size() == 0 ||
|
||||||
!currentOverlay.at(currentOverlay.size()-1).intersects(fragment)) {
|
!currentOverlay.at(currentOverlay.size()-1).intersects(fragment)) {
|
||||||
currentOverlay.push_back(fragment);
|
currentOverlay.push_back(fragment);
|
||||||
|
startedFragments.push_back(fragment);
|
||||||
_checkPossibleOverlays(currentOverlay, i, patternSize);
|
_checkPossibleOverlays(currentOverlay, i, patternSize);
|
||||||
allTerminal = false;
|
allTerminal = false;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user