100% test in concordia-console. All passed!

Former-commit-id: 6e6186a148d637ba5a0d324d6d68c78708f0942d
This commit is contained in:
rjawor 2015-04-22 16:50:12 +02:00
parent d9112e209a
commit 04df67c6f0
5 changed files with 199 additions and 8 deletions

View File

@ -1,13 +1,14 @@
IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie?
- testy zużycia pamięci - testy zużycia pamięci
- Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła. - Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła.
- Multi-threading? - Multi-threading? (przy concordia search jak najbardziej. Tylko wtedy trzeba by zastosować sortowanie po końcach przedziału przed liczeniem best overlay, co nawiasem mówiąc jest gotowe).
- concordia-server (zastanowić się, czy nie napisać CAT-a) - concordia-server (zastanowić się, czy nie napisać CAT-a oraz nad tym, czy nie oddzielić projektu concordia-server).
- puścić 100% search test na jrc
---------------------------- Archive ----------------------------- ---------------------------- Archive -----------------------------
DONE - puścić 100% search test na jrc
REJECTED - zastanowić się nad empty hash examples (rozwiązanie: w ogóle nie szukać fraz o pustym hashu, rzucać wyjątek). REJECTED - zastanowić się nad empty hash examples (rozwiązanie: w ogóle nie szukać fraz o pustym hashu, rzucać wyjątek).
DONE - wyłączyć stopWords DONE - wyłączyć stopWords

View File

@ -16,6 +16,83 @@
namespace po = boost::program_options; namespace po = boost::program_options;
/**
 * Aborts the test run by throwing a ConcordiaException whose message
 * names the offending line and the reason for the failure.
 *
 * \param lineNumber number of the line the error refers to
 * \param error description of what went wrong
 * \throws ConcordiaException always
 */
void reportError(long lineNumber, const std::string & error) {
    std::stringstream message;
    message << "Search error in line " << lineNumber;
    message << ": " << error;
    throw ConcordiaException(message.str());
}
/**
 * Checks that every search result of a batch is a perfect match.
 *
 * For each result the following must hold, otherwise reportError() is
 * called (which throws ConcordiaException):
 *  - the best overlay consists of exactly one fragment,
 *  - that fragment's matched length equals the number of pattern tokens,
 *  - the best overlay score equals 1,
 *  - there is at least one matched fragment,
 *  - the first matched fragment's matched length equals the number of
 *    pattern tokens.
 *
 * \param results search results, in the order the patterns were searched
 * \param baseLineCount value added to the 1-based position of a result
 *        within the batch to obtain the line number used in error messages
 * \throws ConcordiaException on the first result that fails a check
 */
void checkConcordiaResults(
    const std::vector<ConcordiaSearchResult> & results,
    long baseLineCount) {
    long lineIndex = 1;
    // NOTE(review): the result is taken by value because the const-ness of
    // the accessors is not visible from here; switch to a const reference
    // if they are const member functions.
    BOOST_FOREACH(ConcordiaSearchResult result, results) {
        const long lineNumber = baseLineCount + lineIndex;
        SUFFIX_MARKER_TYPE patternSize = result.getTokenVector().size();

        if (result.getBestOverlay().size() != 1) {
            reportError(lineNumber,
                "best overlay has more than one fragment.");
        }
        // at(0) is safe here: the check above guarantees exactly one element.
        if (result.getBestOverlay().at(0).getMatchedLength()
                                                   != patternSize) {
            reportError(lineNumber,
                "best overlay fragment has different size than pattern.");
        }
        if (result.getBestOverlayScore() != 1) {
            reportError(lineNumber,
                "best overlay score is not 1.");
        }
        if (result.getFragments().size() == 0) {
            reportError(lineNumber,
                "there are no matched fragments.");
        }
        if (result.getFragments().at(0).getMatchedLength()
                                                   != patternSize) {
            reportError(lineNumber,
                "the first fragment does not cover the whole pattern.");
        }

        // Advance to the next line; without this every error would be
        // reported for the first line of the batch.
        ++lineIndex;
    }
}
/**
 * Runs concordia search for every pattern in the buffer, prints timing
 * statistics to standard output and verifies the results with
 * checkConcordiaResults().
 *
 * \param concordia the Concordia instance used for searching
 * \param buffer patterns to search for; cleared by this function once the
 *        searches are done
 * \param lineCount total number of lines read so far, including the ones
 *        currently held in the buffer
 * \throws ConcordiaException if any result fails verification
 */
void performSearch(Concordia & concordia,
                   std::vector<std::string> & buffer,
                   long lineCount) {
    // Number of lines that preceded this batch: a full batch ends exactly
    // on a multiple of READ_BUFFER_LENGTH, a trailing partial batch starts
    // right after the last such multiple.
    long baseLineCount = 0;
    if (lineCount % READ_BUFFER_LENGTH == 0) {
        baseLineCount = lineCount - READ_BUFFER_LENGTH;
    } else {
        baseLineCount = (lineCount / READ_BUFFER_LENGTH)
                        * READ_BUFFER_LENGTH;
    }

    std::vector<ConcordiaSearchResult> results;
    results.reserve(buffer.size());

    boost::posix_time::ptime timeStart =
        boost::posix_time::microsec_clock::local_time();
    // perform searching
    BOOST_FOREACH(std::string pattern, buffer) {
        results.push_back(*concordia.concordiaSearch(pattern));
    }
    boost::posix_time::ptime timeEnd =
        boost::posix_time::microsec_clock::local_time();
    boost::posix_time::time_duration msdiff =
        timeEnd - timeStart;

    const std::size_t sentencesSearched = buffer.size();
    buffer.clear();

    long timeElapsed = msdiff.total_milliseconds();
    // A small batch can finish in under one millisecond. Dividing by zero
    // must be avoided, so the speed is then computed as if the batch took
    // 1 ms, which yields a lower bound on the real speed.
    const long speedDivisor = (timeElapsed > 0) ? timeElapsed : 1;
    // Divide in floating point so that the result is not truncated.
    double speed = 1000.0 * static_cast<double>(sentencesSearched)
                   / static_cast<double>(speedDivisor);

    std::cout << "\tSearched a portion of " <<
                 sentencesSearched << " sentences in "
                 << timeElapsed << "ms. The speed: " <<
                 speed << " sentences per second" << std::endl;

    std::cout << "\tChecking this portion..." << std::endl;
    checkConcordiaResults(results, baseLineCount);
    std::cout << "\tno errors." << std::endl;
    std::cout << "\tTotal search progress: " <<
                 lineCount << " sentences searched." << std::endl;
}
int main(int argc, char** argv) { int main(int argc, char** argv) {
po::options_description desc("Allowed options"); po::options_description desc("Allowed options");
@ -32,7 +109,9 @@ int main(int argc, char** argv) {
("concordia-search,x", boost::program_options::value<std::string>(), ("concordia-search,x", boost::program_options::value<std::string>(),
"Pattern to be searched by concordia search in the index") "Pattern to be searched by concordia search in the index")
("read-file,r", boost::program_options::value<std::string>(), ("read-file,r", boost::program_options::value<std::string>(),
"File to be read and added to index"); "File to be read and added to index")
("test,t", boost::program_options::value<std::string>(),
"Run performance and correctness tests on file");
po::variables_map cli; po::variables_map cli;
po::store(po::parse_command_line(argc, argv, desc), cli); po::store(po::parse_command_line(argc, argv, desc), cli);
@ -203,10 +282,98 @@ int main(int argc, char** argv) {
std::cerr << "Unable to open file: "<< filePath; std::cerr << "Unable to open file: "<< filePath;
return 1; return 1;
} }
} else if (cli.count("test")) {
std::string filePath = cli["test"].as<std::string>();
std::cout << "\tTesting on file: " << filePath <<
std::endl;
// adding to index
std::ifstream text_file(filePath.c_str());
std::string line;
if (text_file.is_open()) {
long lineCount = 0;
std::vector<Example> buffer;
boost::posix_time::ptime timeStart =
boost::posix_time::microsec_clock::local_time();
while (getline(text_file, line)) {
lineCount++;
buffer.push_back(Example(line, lineCount));
if (lineCount % READ_BUFFER_LENGTH == 0) {
concordia.addAllExamples(buffer);
buffer.clear();
boost::posix_time::ptime timeEnd =
boost::posix_time::microsec_clock::local_time();
boost::posix_time::time_duration msdiff =
timeEnd - timeStart;
long timeElapsed = msdiff.total_milliseconds();
double speed = static_cast<double>(
1000 * lineCount / timeElapsed);
std::cout << "\tRead and added to index " <<
lineCount << " sentences in " << timeElapsed
<< "ms. Current speed: " << speed <<
" sentences per second" << std::endl;
}
}
if (buffer.size() > 0) {
concordia.addAllExamples(buffer);
}
text_file.close();
boost::posix_time::ptime timeTotalEnd =
boost::posix_time::microsec_clock::local_time();
boost::posix_time::time_duration totalMsdiff =
timeTotalEnd - timeStart;
long totalTimeElapsed = totalMsdiff.total_milliseconds();
double totalSpeed =
static_cast<double>(1000 * lineCount / totalTimeElapsed);
std::cout << "\tReading finished. Read and added to index "
<< lineCount << " sentences in " << totalTimeElapsed <<
"ms. Overall speed: " << totalSpeed <<
" sentences per second" << std::endl;
} else {
std::cerr << "Unable to open file: "<< filePath;
return 1;
}
// generating SA
std::cout << "\tGenerating SA from RAM..." << std::endl;
boost::posix_time::ptime SAStart =
boost::posix_time::microsec_clock::local_time();
concordia.refreshSAfromRAM();
boost::posix_time::ptime SAEnd =
boost::posix_time::microsec_clock::local_time();
boost::posix_time::time_duration SAdiff =
SAEnd - SAStart;
long SAtimeElapsed = SAdiff.total_milliseconds();
std::cout << "\tSA generated in " << SAtimeElapsed
<< "ms." << std::endl;
// searching
std::ifstream test_file(filePath.c_str());
if (test_file.is_open()) {
long lineCount = 0;
std::vector<std::string> buffer;
while (getline(test_file, line)) {
lineCount++;
buffer.push_back(line);
if (lineCount % READ_BUFFER_LENGTH == 0) {
performSearch(concordia, buffer, lineCount);
}
}
if (buffer.size() > 0) {
performSearch(concordia, buffer, lineCount);
}
test_file.close();
std::cout << "\tSearching finished. No errors reported."
<< std::endl;
} else {
std::cerr << "Unable to open file: "<< filePath;
return 1;
}
} else { } else {
std::cerr << "One of the options: generate-index, simple-search, " std::cerr << "One of the options: simple-search, anubis-search, "
<< "read-file must be provided. See the " << "concordia-search or read-file must be provided."
"options specification: " << "See the options specification: "
<< std::endl << desc << std::endl; << std::endl << desc << std::endl;
return 1; return 1;
} }

14
concordia-test-jrc.sh Executable file
View File

@ -0,0 +1,14 @@
#!/bin/sh
# Decompresses the JRC sample and runs the concordia-console test mode (-t) on it.

echo "CONCORDIA RUNNER: Decompressing test file"
# -k keeps the .xz archive. -f overwrites a decompressed copy left behind by
# an earlier run (the cleanup at the bottom is disabled); without it xz
# refuses to decompress when the output file already exists.
xz -dkf prod/resources/text-files/jrc_smaller.txt.xz

echo "CONCORDIA RUNNER: Running Concordia"
# -f keeps rm quiet when the temp directory is already empty.
rm -f prod/resources/temp/*

echo "CONCORDIA RUNNER: testing"
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -t prod/resources/text-files/jrc_smaller.txt

#rm prod/resources/text-files/jrc_smaller.txt

9
concordia-test-medium.sh Executable file
View File

@ -0,0 +1,9 @@
#!/bin/sh
# Runs the concordia-console test mode (-t) on the medium-sized text file.

echo "CONCORDIA RUNNER: Running Concordia"
# -f keeps rm quiet when the temp directory is already empty.
rm -f prod/resources/temp/*

echo "CONCORDIA RUNNER: testing"
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -t prod/resources/text-files/medium.txt