From 0a3fd8a04e51069bbe8095b8d20602bdd86351d0 Mon Sep 17 00:00:00 2001 From: rjawor Date: Mon, 24 Aug 2015 13:10:06 +0200 Subject: [PATCH] added an extremely important improvement to the concordia search algorithm - gapped overlays cut-off --- concordia-console/concordia-console.cpp | 51 +++++++++++++++++++++---- concordia/concordia_search_result.cpp | 27 +++++++++++++ 2 files changed, 71 insertions(+), 7 deletions(-) diff --git a/concordia-console/concordia-console.cpp b/concordia-console/concordia-console.cpp index 1702262..23951e1 100644 --- a/concordia-console/concordia-console.cpp +++ b/concordia-console/concordia-console.cpp @@ -60,7 +60,8 @@ void checkConcordiaResults( void performSearch(Concordia & concordia, std::vector & buffer, - long lineCount) { + long lineCount, + int checkingScheme) { long baseLineCount = 0; if (lineCount % READ_BUFFER_LENGTH == 0) { baseLineCount = lineCount - READ_BUFFER_LENGTH; @@ -93,9 +94,11 @@ void performSearch(Concordia & concordia, sentencesSearched << " sentences in " << timeElapsed << "ms. The speed: " << speed << " sentences per second" << std::endl; - std::cout << "\tChecking this portion..." << std::endl; - checkConcordiaResults(results, baseLineCount); - std::cout << "\tno errors." << std::endl; + if (checkingScheme == 1) { // 100% correctness + std::cout << "\tChecking this portion..." << std::endl; + checkConcordiaResults(results, baseLineCount); + std::cout << "\tno errors." << std::endl; + } std::cout << "\tTotal search progress: " << lineCount << " sentences searched." 
<< std::endl; } @@ -115,6 +118,9 @@ int main(int argc, char** argv) { "Pattern to be searched by anubis search in the index") ("concordia-search,x", boost::program_options::value(), "Pattern to be searched by concordia search in the index") + ("concordia-search-all,f", + boost::program_options::value(), + "File with pattern to be searched by concordia search in the index") ("read-file,r", boost::program_options::value(), "File to be read and added to index") ("test,t", boost::program_options::value(), @@ -241,6 +247,36 @@ int main(int argc, char** argv) { << std::endl; } } + } else if (cli.count("concordia-search-all")) { + std::string filePath = + cli["concordia-search-all"].as(); + std::cout << "\tConcordia searching from file: " << filePath << + std::endl; + + std::ifstream test_file(filePath.c_str()); + std::string line; + if (test_file.is_open()) { + long lineCount = 0; + std::vector buffer; + while (getline(test_file, line)) { + lineCount++; + buffer.push_back(line); + if (lineCount % READ_BUFFER_LENGTH == 0) { + performSearch(concordia, buffer, lineCount, 2); + } + } + if (buffer.size() > 0) { + performSearch(concordia, buffer, lineCount, 2); + } + test_file.close(); + std::cout << "\tSearching finished. No errors reported." + << std::endl; + + } else { + std::cerr << "Unable to open file: "<< filePath; + return 1; + } + } else if (cli.count("read-file")) { std::string filePath = cli["read-file"].as(); std::cout << "\tReading sentences from file: " << filePath << @@ -377,11 +413,11 @@ int main(int argc, char** argv) { lineCount++; buffer.push_back(line); if (lineCount % READ_BUFFER_LENGTH == 0) { - performSearch(concordia, buffer, lineCount); + performSearch(concordia, buffer, lineCount, 1); } } if (buffer.size() > 0) { - performSearch(concordia, buffer, lineCount); + performSearch(concordia, buffer, lineCount, 1); } test_file.close(); std::cout << "\tSearching finished. No errors reported." 
@@ -394,7 +430,8 @@ int main(int argc, char** argv) { } else { std::cerr << "One of the options: simple-search, anubis-search, " - << "concordia-search or read-file must be provided." + << "concordia-search, concordia-search-file, test-file " + << "or read-file must be provided." << "See the options specification: " << std::endl << desc << std::endl; return 1; diff --git a/concordia/concordia_search_result.cpp b/concordia/concordia_search_result.cpp index dbd3bc3..11e2284 100644 --- a/concordia/concordia_search_result.cpp +++ b/concordia/concordia_search_result.cpp @@ -35,13 +35,40 @@ void ConcordiaSearchResult::_checkPossibleOverlays( SUFFIX_MARKER_TYPE lastAddedPos, SUFFIX_MARKER_TYPE patternSize) { bool allTerminal = true; + + // startedFragments is a list of fragments which started the recursive + // check in this _checkPossibleOverlays method invocation. + // If this list is non-empty, there is no point in starting a new + // invocation for a fragment that fails to intersect one in this list. + // In this case we would choose to add a fragment to the currentOverlay + // that could be preceded by a non-intersecting fragment. + + std::vector startedFragments; for (int i = lastAddedPos + 1; i < _matchedPatternFragments.size(); i++) { MatchedPatternFragment fragment = _matchedPatternFragments.at(i); + bool nonFilledGap = false; + if (startedFragments.size() > 0) { + BOOST_FOREACH(MatchedPatternFragment & startedFragment, + startedFragments) { + if (!fragment.intersects(startedFragment)) { + nonFilledGap = true; + } + } + } + if (nonFilledGap) { + // If the new fragment fails to intersect at least one of the + // started fragments, break out of the loop. + // There is no point in checking a currentOverlay with a gap + // that could have been filled. 
+ break; + } + // if fragment does not intersect currentOverlay if (currentOverlay.size() == 0 || !currentOverlay.at(currentOverlay.size()-1).intersects(fragment)) { currentOverlay.push_back(fragment); + startedFragments.push_back(fragment); _checkPossibleOverlays(currentOverlay, i, patternSize); allTerminal = false; }