diff --git a/TODO.txt b/TODO.txt index 24624fe..f1fc806 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,13 +1,14 @@ IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? - testy zużycia pamięci - Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła. -- Multi-threading? -- concordia-server (zastanowić się, czy nie napisać CAT-a) -- puścić 100% search test na jrc +- Multi-threading? (przy concordia search jak najbardziej. Tylko wtedy trzeba by zastosować sortowanie po końcach przedziału przed liczeniem best overlay, co, nawiasem mówiąc, jest gotowe). +- concordia-server (zastanowić się, czy nie napisać CAT-a oraz nad tym, czy nie oddzielić projektu concordia-server). ---------------------------- Archive ----------------------------- +DONE - puścić 100% search test na jrc + REJECTED - zastanowić się nad empty hash examples (rozwiązanie: w ogóle nie szukać fraz o pustym hashu, rzucać wyjątek). 
DONE - wyłączyć stopWords
diff --git a/concordia-console/concordia-console.cpp b/concordia-console/concordia-console.cpp
index 5ce5500..0b7555f 100644
--- a/concordia-console/concordia-console.cpp
+++ b/concordia-console/concordia-console.cpp
@@ -16,6 +16,97 @@
 
 namespace po = boost::program_options;
 
+// Aborts the current test run by throwing a ConcordiaException whose
+// message contains the given input line number and error description.
+void reportError(long lineNumber, const std::string & error) {
+    std::stringstream ss;
+    ss << "Search error in line " << lineNumber << ": " << error;
+    throw ConcordiaException(ss.str());
+}
+
+// Verifies one portion of search results. Every pattern is expected to be
+// matched as a whole: a single best-overlay fragment of the pattern's
+// length, overlay score 1, and a first fragment covering the pattern.
+// results[i] is reported as input line baseLineCount + i + 1; the first
+// violation is reported via reportError (which throws).
+void checkConcordiaResults(
+    const std::vector<ConcordiaSearchResult> & results,
+    long baseLineCount) {
+    long lineIndex = 1;
+    BOOST_FOREACH(ConcordiaSearchResult result, results) {
+        SUFFIX_MARKER_TYPE patternSize = result.getTokenVector().size();
+
+        if (result.getBestOverlay().size() != 1) {
+            reportError(baseLineCount + lineIndex,
+                "best overlay has more than one fragment.");
+        }
+        if (result.getBestOverlay().at(0).getMatchedLength()
+                != patternSize) {
+            reportError(baseLineCount + lineIndex,
+                "best overlay fragment has different size than pattern.");
+        }
+        if (result.getBestOverlayScore() != 1) {
+            reportError(baseLineCount + lineIndex,
+                "best overlay score is not 1.");
+        }
+        if (result.getFragments().size() == 0) {
+            reportError(baseLineCount + lineIndex,
+                "there are no matched fragments.");
+        }
+        if (result.getFragments().at(0).getMatchedLength()
+                != patternSize) {
+            reportError(baseLineCount + lineIndex,
+                "the first fragment does not cover the whole pattern.");
+        }
+        // advance to the next input line so errors name the right sentence
+        ++lineIndex;
+    }
+}
+
+// Searches for every pattern in buffer, prints timing for the portion,
+// checks the results and empties buffer. lineCount is the number of input
+// lines read so far, including the ones in buffer.
+void performSearch(Concordia & concordia,
+    std::vector<std::string> & buffer,
+    long lineCount) {
+    // number of input lines that precede this portion
+    long baseLineCount = 0;
+    if (lineCount % READ_BUFFER_LENGTH == 0) {
+        baseLineCount = lineCount - READ_BUFFER_LENGTH;
+    } else {
+        baseLineCount = (lineCount / READ_BUFFER_LENGTH)
+            * READ_BUFFER_LENGTH;
+    }
+    std::vector<ConcordiaSearchResult> results;
+    boost::posix_time::ptime timeStart =
+        boost::posix_time::microsec_clock::local_time();
+    // perform searching
+    BOOST_FOREACH(std::string pattern, buffer) {
+        results.push_back(*concordia.concordiaSearch(pattern));
+    }
+    boost::posix_time::ptime timeEnd =
+        boost::posix_time::microsec_clock::local_time();
+    boost::posix_time::time_duration msdiff =
+        timeEnd - timeStart;
+
+    int sentencesSearched = buffer.size();
+    buffer.clear();
+
+    long timeElapsed = msdiff.total_milliseconds();
+    // guard against a 0 ms measurement and avoid integer truncation
+    double speed = timeElapsed > 0 ?
+        1000.0 * sentencesSearched / timeElapsed : 0.0;
+    std::cout << "\tSearched a portion of " <<
+        sentencesSearched << " sentences in "
+        << timeElapsed << "ms. The speed: " <<
+        speed << " sentences per second" << std::endl;
+    std::cout << "\tChecking this portion..." << std::endl;
+    checkConcordiaResults(results, baseLineCount);
+    std::cout << "\tno errors." << std::endl;
+    std::cout << "\tTotal search progress: " <<
+        lineCount << " sentences searched." << std::endl;
+}
+
 int main(int argc, char** argv) {
     po::options_description desc("Allowed options");
 
@@ -32,7 +123,9 @@ int main(int argc, char** argv) {
         ("concordia-search,x", boost::program_options::value<std::string>(),
                 "Pattern to be searched by concordia search in the index")
         ("read-file,r", boost::program_options::value<std::string>(),
-                "File to be read and added to index");
+                "File to be read and added to index")
+        ("test,t", boost::program_options::value<std::string>(),
+                "Run performance and correctness tests on file");
 
     po::variables_map cli;
     po::store(po::parse_command_line(argc, argv, desc), cli);
@@ -203,10 +296,98 @@ int main(int argc, char** argv) {
             std::cerr << "Unable to open file: "<< filePath;
             return 1;
         }
+    } else if (cli.count("test")) {
+        std::string filePath = cli["test"].as<std::string>();
+        std::cout << "\tTesting on file: " << filePath <<
+            std::endl;
+        // adding to index
+        std::ifstream text_file(filePath.c_str());
+        std::string line;
+        if (text_file.is_open()) {
+            long lineCount = 0;
+            std::vector<Example> buffer;
+            boost::posix_time::ptime timeStart =
+                boost::posix_time::microsec_clock::local_time();
+            while (getline(text_file, line)) {
+                lineCount++;
+                buffer.push_back(Example(line, lineCount));
+                if (lineCount % READ_BUFFER_LENGTH == 0) {
+                    concordia.addAllExamples(buffer);
+                    buffer.clear();
+                    boost::posix_time::ptime timeEnd =
+                        boost::posix_time::microsec_clock::local_time();
+                    boost::posix_time::time_duration msdiff =
+                        timeEnd - timeStart;
+                    long timeElapsed = msdiff.total_milliseconds();
+                    double speed = timeElapsed > 0 ?
+                        1000.0 * lineCount / timeElapsed : 0.0;
+                    std::cout << "\tRead and added to index " <<
+                        lineCount << " sentences in " << timeElapsed
+                        << "ms. Current speed: " << speed <<
+                        " sentences per second" << std::endl;
+                }
+            }
+            if (buffer.size() > 0) {
+                concordia.addAllExamples(buffer);
+            }
+            text_file.close();
+            boost::posix_time::ptime timeTotalEnd =
+                boost::posix_time::microsec_clock::local_time();
+            boost::posix_time::time_duration totalMsdiff =
+                timeTotalEnd - timeStart;
+            long totalTimeElapsed = totalMsdiff.total_milliseconds();
+            double totalSpeed = totalTimeElapsed > 0 ?
+                1000.0 * lineCount / totalTimeElapsed : 0.0;
+            std::cout << "\tReading finished. Read and added to index "
+                << lineCount << " sentences in " << totalTimeElapsed <<
+                "ms. Overall speed: " << totalSpeed <<
+                " sentences per second" << std::endl;
+        } else {
+            std::cerr << "Unable to open file: "<< filePath;
+            return 1;
+        }
+
+        // generating SA
+        std::cout << "\tGenerating SA from RAM..." << std::endl;
+        boost::posix_time::ptime SAStart =
+            boost::posix_time::microsec_clock::local_time();
+        concordia.refreshSAfromRAM();
+        boost::posix_time::ptime SAEnd =
+            boost::posix_time::microsec_clock::local_time();
+        boost::posix_time::time_duration SAdiff =
+            SAEnd - SAStart;
+        long SAtimeElapsed = SAdiff.total_milliseconds();
+        std::cout << "\tSA generated in " << SAtimeElapsed
+            << "ms." << std::endl;
+
+        // searching
+        std::ifstream test_file(filePath.c_str());
+        if (test_file.is_open()) {
+            long lineCount = 0;
+            std::vector<std::string> buffer;
+            while (getline(test_file, line)) {
+                lineCount++;
+                buffer.push_back(line);
+                if (lineCount % READ_BUFFER_LENGTH == 0) {
+                    performSearch(concordia, buffer, lineCount);
+                }
+            }
+            if (buffer.size() > 0) {
+                performSearch(concordia, buffer, lineCount);
+            }
+            test_file.close();
+            std::cout << "\tSearching finished. No errors reported."
+                << std::endl;
+
+        } else {
+            std::cerr << "Unable to open file: "<< filePath;
+            return 1;
+        }
+
     } else {
-        std::cerr << "One of the options: generate-index, simple-search, "
-                  << "read-file must be provided. See the "
-                     "options specification: "
+        std::cerr << "One of the options: simple-search, anubis-search, "
+                  << "concordia-search, read-file or test must be provided. "
+                  << "See the options specification: "
                   << std::endl << desc << std::endl;
         return 1;
     }
diff --git a/concordia-test-jrc.sh b/concordia-test-jrc.sh
new file mode 100755
index 0000000..083c29d
--- /dev/null
+++ b/concordia-test-jrc.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+echo "CONCORDIA RUNNER: Decompressing test file"
+
+xz -dk prod/resources/text-files/jrc_smaller.txt.xz
+
+echo "CONCORDIA RUNNER: Running Concordia"
+
+rm prod/resources/temp/*
+
+echo "CONCORDIA RUNNER: testing"
+./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -t prod/resources/text-files/jrc_smaller.txt
+
+#rm prod/resources/text-files/jrc_smaller.txt
diff --git a/concordia-test-medium.sh b/concordia-test-medium.sh
new file mode 100755
index 0000000..8ae78a7
--- /dev/null
+++ b/concordia-test-medium.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+echo "CONCORDIA RUNNER: Running Concordia"
+
+rm prod/resources/temp/*
+
+echo "CONCORDIA RUNNER: testing"
+./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -t prod/resources/text-files/medium.txt
+
diff --git a/concordia/sentence_anonymizer.cpp
b/concordia/sentence_anonymizer.cpp index 33894cc..85598d5 100644 --- a/concordia/sentence_anonymizer.cpp +++ b/concordia/sentence_anonymizer.cpp @@ -13,7 +13,7 @@ SentenceAnonymizer::SentenceAnonymizer( _createNeRules(config->getNamedEntitiesFilePath()); _createHtmlTagsRule(config->getHtmlTagsFilePath()); _stopWordsEnabled = config->isStopWordsEnabled(); - if (_stopWordsEnabled) { + if (_stopWordsEnabled) { _stopWords = _getMultipleReplacementRule( config->getStopWordsFilePath(), "", true); }