100% test in concordia-console. All passed!
Former-commit-id: 6e6186a148d637ba5a0d324d6d68c78708f0942d
This commit is contained in:
parent
d9112e209a
commit
04df67c6f0
7
TODO.txt
7
TODO.txt
@ -1,13 +1,14 @@
|
||||
IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie?
|
||||
- testy zużycia pamięci
|
||||
- Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła.
|
||||
- Multi-threading?
|
||||
- concordia-server (zastanowić się, czy nie napisać CAT-a)
|
||||
- puścić 100% search test na jrc
|
||||
- Multi-threading? (przy concordia search jak najbardziej. Tylko wtedy trzeba by zastosować sortowanie po końcach przedziału przed liczeniem best overlay, co nawiasem mówiąc jest gotowe).
|
||||
- concordia-server (zastanowić się, czy nie napisać CAT-a oraz nad tym, czy nie oddzielić projektu concordia-server).
|
||||
|
||||
|
||||
|
||||
---------------------------- Archive -----------------------------
|
||||
DONE - puścić 100% search test na jrc
|
||||
|
||||
REJECTED - zastanowić się nad empty hash examples (rozwiązanie: w ogóle nie szukać fraz o pustym hashu, rzucać wyjątek).
|
||||
|
||||
DONE - wyłączyć stopWords
|
||||
|
@ -16,6 +16,83 @@
|
||||
|
||||
namespace po = boost::program_options;
|
||||
|
||||
void reportError(long lineNumber, const std::string & error) {
|
||||
std::stringstream ss;
|
||||
ss << "Search error in line " << lineNumber << ": " << error;
|
||||
throw ConcordiaException(ss.str());
|
||||
}
|
||||
|
||||
void checkConcordiaResults(
|
||||
const std::vector<ConcordiaSearchResult> & results,
|
||||
long baseLineCount) {
|
||||
long lineIndex = 1;
|
||||
BOOST_FOREACH(ConcordiaSearchResult result, results) {
|
||||
SUFFIX_MARKER_TYPE patternSize = result.getTokenVector().size();
|
||||
|
||||
if (result.getBestOverlay().size() != 1) {
|
||||
reportError(baseLineCount + lineIndex,
|
||||
"best overlay has more than one fragment.");
|
||||
}
|
||||
if (result.getBestOverlay().at(0).getMatchedLength()
|
||||
!= patternSize) {
|
||||
reportError(baseLineCount + lineIndex,
|
||||
"best overlay fragment has different size than pattern.");
|
||||
}
|
||||
if (result.getBestOverlayScore() != 1) {
|
||||
reportError(baseLineCount + lineIndex,
|
||||
"best overlay score is not 1.");
|
||||
}
|
||||
if (result.getFragments().size() == 0) {
|
||||
reportError(baseLineCount + lineIndex,
|
||||
"there are no matched fragments.");
|
||||
}
|
||||
if (result.getFragments().at(0).getMatchedLength()
|
||||
!= patternSize) {
|
||||
reportError(baseLineCount + lineIndex,
|
||||
"the first fragment does not cover the whole pattern.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Runs concordia search for every pattern in the buffer, prints timing
// statistics to standard output and validates the results with
// checkConcordiaResults() (which throws on the first incorrect result).
//
// concordia - index to search in
// buffer    - patterns (lines) to search for; cleared before returning
// lineCount - total number of lines read so far, including this buffer
void performSearch(Concordia & concordia,
                   std::vector<std::string> & buffer,
                   long lineCount) {
    // Number of lines that preceded this portion: lineCount rounded down
    // to a multiple of READ_BUFFER_LENGTH, or one whole buffer less when
    // lineCount is itself a multiple (presumably the caller then passes
    // a full buffer - verify against the call sites).
    long baseLineCount = 0;
    if (lineCount % READ_BUFFER_LENGTH == 0) {
        baseLineCount = lineCount - READ_BUFFER_LENGTH;
    } else {
        baseLineCount = (lineCount / READ_BUFFER_LENGTH)
                         * READ_BUFFER_LENGTH;
    }

    std::vector<ConcordiaSearchResult> results;
    results.reserve(buffer.size());

    boost::posix_time::ptime timeStart =
        boost::posix_time::microsec_clock::local_time();
    // perform searching
    BOOST_FOREACH(std::string pattern, buffer) {
        results.push_back(*concordia.concordiaSearch(pattern));
    }
    boost::posix_time::ptime timeEnd =
        boost::posix_time::microsec_clock::local_time();
    boost::posix_time::time_duration msdiff =
        timeEnd - timeStart;

    const long sentencesSearched = static_cast<long>(buffer.size());
    buffer.clear();

    long timeElapsed = msdiff.total_milliseconds();
    // A portion that finishes in under a millisecond would make the
    // divisor zero; treat it as 1 ms, so the printed speed is a lower
    // bound. The division is done in floating point so that the
    // fractional part is not truncated before the conversion to double.
    const long speedDivisor = (timeElapsed > 0) ? timeElapsed : 1;
    double speed = 1000.0 * static_cast<double>(sentencesSearched)
                   / static_cast<double>(speedDivisor);

    std::cout << "\tSearched a portion of " <<
        sentencesSearched << " sentences in "
        << timeElapsed << "ms. The speed: " <<
        speed << " sentences per second" << std::endl;
    std::cout << "\tChecking this portion..." << std::endl;
    checkConcordiaResults(results, baseLineCount);
    std::cout << "\tno errors." << std::endl;
    std::cout << "\tTotal search progress: " <<
        lineCount << " sentences searched." << std::endl;
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
po::options_description desc("Allowed options");
|
||||
|
||||
@ -32,7 +109,9 @@ int main(int argc, char** argv) {
|
||||
("concordia-search,x", boost::program_options::value<std::string>(),
|
||||
"Pattern to be searched by concordia search in the index")
|
||||
("read-file,r", boost::program_options::value<std::string>(),
|
||||
"File to be read and added to index");
|
||||
"File to be read and added to index")
|
||||
("test,t", boost::program_options::value<std::string>(),
|
||||
"Run performance and correctness tests on file");
|
||||
|
||||
po::variables_map cli;
|
||||
po::store(po::parse_command_line(argc, argv, desc), cli);
|
||||
@ -203,10 +282,98 @@ int main(int argc, char** argv) {
|
||||
std::cerr << "Unable to open file: "<< filePath;
|
||||
return 1;
|
||||
}
|
||||
} else if (cli.count("test")) {
|
||||
std::string filePath = cli["test"].as<std::string>();
|
||||
std::cout << "\tTesting on file: " << filePath <<
|
||||
std::endl;
|
||||
// adding to index
|
||||
std::ifstream text_file(filePath.c_str());
|
||||
std::string line;
|
||||
if (text_file.is_open()) {
|
||||
long lineCount = 0;
|
||||
std::vector<Example> buffer;
|
||||
boost::posix_time::ptime timeStart =
|
||||
boost::posix_time::microsec_clock::local_time();
|
||||
while (getline(text_file, line)) {
|
||||
lineCount++;
|
||||
buffer.push_back(Example(line, lineCount));
|
||||
if (lineCount % READ_BUFFER_LENGTH == 0) {
|
||||
concordia.addAllExamples(buffer);
|
||||
buffer.clear();
|
||||
boost::posix_time::ptime timeEnd =
|
||||
boost::posix_time::microsec_clock::local_time();
|
||||
boost::posix_time::time_duration msdiff =
|
||||
timeEnd - timeStart;
|
||||
long timeElapsed = msdiff.total_milliseconds();
|
||||
double speed = static_cast<double>(
|
||||
1000 * lineCount / timeElapsed);
|
||||
std::cout << "\tRead and added to index " <<
|
||||
lineCount << " sentences in " << timeElapsed
|
||||
<< "ms. Current speed: " << speed <<
|
||||
" sentences per second" << std::endl;
|
||||
}
|
||||
}
|
||||
if (buffer.size() > 0) {
|
||||
concordia.addAllExamples(buffer);
|
||||
}
|
||||
text_file.close();
|
||||
boost::posix_time::ptime timeTotalEnd =
|
||||
boost::posix_time::microsec_clock::local_time();
|
||||
boost::posix_time::time_duration totalMsdiff =
|
||||
timeTotalEnd - timeStart;
|
||||
long totalTimeElapsed = totalMsdiff.total_milliseconds();
|
||||
double totalSpeed =
|
||||
static_cast<double>(1000 * lineCount / totalTimeElapsed);
|
||||
std::cout << "\tReading finished. Read and added to index "
|
||||
<< lineCount << " sentences in " << totalTimeElapsed <<
|
||||
"ms. Overall speed: " << totalSpeed <<
|
||||
" sentences per second" << std::endl;
|
||||
} else {
|
||||
std::cerr << "One of the options: generate-index, simple-search, "
|
||||
<< "read-file must be provided. See the "
|
||||
"options specification: "
|
||||
std::cerr << "Unable to open file: "<< filePath;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// generating SA
|
||||
std::cout << "\tGenerating SA from RAM..." << std::endl;
|
||||
boost::posix_time::ptime SAStart =
|
||||
boost::posix_time::microsec_clock::local_time();
|
||||
concordia.refreshSAfromRAM();
|
||||
boost::posix_time::ptime SAEnd =
|
||||
boost::posix_time::microsec_clock::local_time();
|
||||
boost::posix_time::time_duration SAdiff =
|
||||
SAEnd - SAStart;
|
||||
long SAtimeElapsed = SAdiff.total_milliseconds();
|
||||
std::cout << "\tSA generated in " << SAtimeElapsed
|
||||
<< "ms." << std::endl;
|
||||
|
||||
// searching
|
||||
std::ifstream test_file(filePath.c_str());
|
||||
if (test_file.is_open()) {
|
||||
long lineCount = 0;
|
||||
std::vector<std::string> buffer;
|
||||
while (getline(test_file, line)) {
|
||||
lineCount++;
|
||||
buffer.push_back(line);
|
||||
if (lineCount % READ_BUFFER_LENGTH == 0) {
|
||||
performSearch(concordia, buffer, lineCount);
|
||||
}
|
||||
}
|
||||
if (buffer.size() > 0) {
|
||||
performSearch(concordia, buffer, lineCount);
|
||||
}
|
||||
test_file.close();
|
||||
std::cout << "\tSearching finished. No errors reported."
|
||||
<< std::endl;
|
||||
|
||||
} else {
|
||||
std::cerr << "Unable to open file: "<< filePath;
|
||||
return 1;
|
||||
}
|
||||
|
||||
} else {
|
||||
std::cerr << "One of the options: simple-search, anubis-search, "
|
||||
<< "concordia-search or read-file must be provided."
|
||||
<< "See the options specification: "
|
||||
<< std::endl << desc << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
14
concordia-test-jrc.sh
Executable file
14
concordia-test-jrc.sh
Executable file
@ -0,0 +1,14 @@
|
||||
#!/bin/sh
# Decompresses the JRC test corpus, empties the temp directory and runs
# concordia-console in test mode (-t) on the decompressed file.

echo "CONCORDIA RUNNER: Decompressing test file"

# -k keeps the .xz archive; -f overwrites a decompressed copy left over
# from a previous run (the cleanup at the bottom is disabled), which
# would otherwise make xz refuse to decompress.
xz -dkf prod/resources/text-files/jrc_smaller.txt.xz

echo "CONCORDIA RUNNER: Running Concordia"

# -f: do not report an error when the temp directory is already empty.
rm -f prod/resources/temp/*

echo "CONCORDIA RUNNER: testing"
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -t prod/resources/text-files/jrc_smaller.txt

#rm prod/resources/text-files/jrc_smaller.txt
|
9
concordia-test-medium.sh
Executable file
9
concordia-test-medium.sh
Executable file
@ -0,0 +1,9 @@
|
||||
#!/bin/sh
# Empties the temp directory and runs concordia-console in test mode (-t)
# on the medium-sized text file.

echo "CONCORDIA RUNNER: Running Concordia"

# -f: do not report an error when the temp directory is already empty.
rm -f prod/resources/temp/*

echo "CONCORDIA RUNNER: testing"
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -t prod/resources/text-files/medium.txt
|
||||
|
Loading…
Reference in New Issue
Block a user