100% test in concordia-console. All passed!
Former-commit-id: 6e6186a148d637ba5a0d324d6d68c78708f0942d
This commit is contained in:
parent
d9112e209a
commit
04df67c6f0
7
TODO.txt
7
TODO.txt
@ -1,13 +1,14 @@
|
|||||||
IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie?
|
IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie?
|
||||||
- testy zużycia pamięci
|
- testy zużycia pamięci
|
||||||
- Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła.
|
- Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła.
|
||||||
- Multi-threading?
|
- Multi-threading? (przy concordia search jak najbardziej. Tylko wtedy trzeba by zastosować sortowanie po końcach przedziału przed liczeniem best overlay, co nawiasem mówiąc jest gotowe).
|
||||||
- concordia-server (zastanowić się, czy nie napisać CAT-a)
|
- concordia-server (zastanowić się, czy nie napisać CAT-a oraz nad tym, czy nie oddzielić projektu concordia-server).
|
||||||
- puścić 100% search test na jrc
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
---------------------------- Archive -----------------------------
|
---------------------------- Archive -----------------------------
|
||||||
|
DONE - puścić 100% search test na jrc
|
||||||
|
|
||||||
REJECTED - zastanowić się nad empty hash examples (rozwiązanie: w ogóle nie szukać fraz o pustym hashu, rzucać wyjątek).
|
REJECTED - zastanowić się nad empty hash examples (rozwiązanie: w ogóle nie szukać fraz o pustym hashu, rzucać wyjątek).
|
||||||
|
|
||||||
DONE - wyłączyć stopWords
|
DONE - wyłączyć stopWords
|
||||||
|
@ -16,6 +16,83 @@
|
|||||||
|
|
||||||
namespace po = boost::program_options;
|
namespace po = boost::program_options;
|
||||||
|
|
||||||
|
void reportError(long lineNumber, const std::string & error) {
|
||||||
|
std::stringstream ss;
|
||||||
|
ss << "Search error in line " << lineNumber << ": " << error;
|
||||||
|
throw ConcordiaException(ss.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
void checkConcordiaResults(
|
||||||
|
const std::vector<ConcordiaSearchResult> & results,
|
||||||
|
long baseLineCount) {
|
||||||
|
long lineIndex = 1;
|
||||||
|
BOOST_FOREACH(ConcordiaSearchResult result, results) {
|
||||||
|
SUFFIX_MARKER_TYPE patternSize = result.getTokenVector().size();
|
||||||
|
|
||||||
|
if (result.getBestOverlay().size() != 1) {
|
||||||
|
reportError(baseLineCount + lineIndex,
|
||||||
|
"best overlay has more than one fragment.");
|
||||||
|
}
|
||||||
|
if (result.getBestOverlay().at(0).getMatchedLength()
|
||||||
|
!= patternSize) {
|
||||||
|
reportError(baseLineCount + lineIndex,
|
||||||
|
"best overlay fragment has different size than pattern.");
|
||||||
|
}
|
||||||
|
if (result.getBestOverlayScore() != 1) {
|
||||||
|
reportError(baseLineCount + lineIndex,
|
||||||
|
"best overlay score is not 1.");
|
||||||
|
}
|
||||||
|
if (result.getFragments().size() == 0) {
|
||||||
|
reportError(baseLineCount + lineIndex,
|
||||||
|
"there are no matched fragments.");
|
||||||
|
}
|
||||||
|
if (result.getFragments().at(0).getMatchedLength()
|
||||||
|
!= patternSize) {
|
||||||
|
reportError(baseLineCount + lineIndex,
|
||||||
|
"the first fragment does not cover the whole pattern.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Runs concordia search for every pattern in the buffer, prints timing
// statistics to standard output and validates the results with
// checkConcordiaResults().
//
// concordia - search engine used to perform the searches
// buffer    - patterns (one per input line) to be searched; the buffer
//             is cleared before the function returns
// lineCount - total number of lines read so far, including the ones
//             currently held in the buffer
//
// Any exception thrown by the search or by the result check propagates
// to the caller.
void performSearch(Concordia & concordia,
                   std::vector<std::string> & buffer,
                   long lineCount) {
    // Number of lines that preceded this portion: a full portion ends
    // exactly on a multiple of READ_BUFFER_LENGTH, a trailing partial
    // portion starts at the last such multiple.
    long baseLineCount = 0;
    if (lineCount % READ_BUFFER_LENGTH == 0) {
        baseLineCount = lineCount - READ_BUFFER_LENGTH;
    } else {
        baseLineCount = (lineCount / READ_BUFFER_LENGTH)
                        * READ_BUFFER_LENGTH;
    }

    std::vector<ConcordiaSearchResult> results;
    results.reserve(buffer.size());

    boost::posix_time::ptime timeStart =
        boost::posix_time::microsec_clock::local_time();
    // perform searching
    BOOST_FOREACH(std::string pattern, buffer) {
        results.push_back(*concordia.concordiaSearch(pattern));
    }
    boost::posix_time::ptime timeEnd =
        boost::posix_time::microsec_clock::local_time();
    boost::posix_time::time_duration msdiff =
        timeEnd - timeStart;

    int sentencesSearched = buffer.size();
    buffer.clear();

    long timeElapsed = msdiff.total_milliseconds();
    // Compute the speed in floating point. The previous integer
    // expression truncated the result and divided by zero when the
    // portion was processed in less than one millisecond. In that case
    // the speed cannot be measured and 0 is reported.
    double speed = 0.0;
    if (timeElapsed > 0) {
        speed = 1000.0 * static_cast<double>(sentencesSearched)
                / static_cast<double>(timeElapsed);
    }

    std::cout << "\tSearched a portion of " <<
        sentencesSearched << " sentences in "
        << timeElapsed << "ms. The speed: " <<
        speed << " sentences per second" << std::endl;
    std::cout << "\tChecking this portion..." << std::endl;
    checkConcordiaResults(results, baseLineCount);
    std::cout << "\tno errors." << std::endl;
    std::cout << "\tTotal search progress: " <<
        lineCount << " sentences searched." << std::endl;
}
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
po::options_description desc("Allowed options");
|
po::options_description desc("Allowed options");
|
||||||
|
|
||||||
@ -32,7 +109,9 @@ int main(int argc, char** argv) {
|
|||||||
("concordia-search,x", boost::program_options::value<std::string>(),
|
("concordia-search,x", boost::program_options::value<std::string>(),
|
||||||
"Pattern to be searched by concordia search in the index")
|
"Pattern to be searched by concordia search in the index")
|
||||||
("read-file,r", boost::program_options::value<std::string>(),
|
("read-file,r", boost::program_options::value<std::string>(),
|
||||||
"File to be read and added to index");
|
"File to be read and added to index")
|
||||||
|
("test,t", boost::program_options::value<std::string>(),
|
||||||
|
"Run performance and correctness tests on file");
|
||||||
|
|
||||||
po::variables_map cli;
|
po::variables_map cli;
|
||||||
po::store(po::parse_command_line(argc, argv, desc), cli);
|
po::store(po::parse_command_line(argc, argv, desc), cli);
|
||||||
@ -203,10 +282,98 @@ int main(int argc, char** argv) {
|
|||||||
std::cerr << "Unable to open file: "<< filePath;
|
std::cerr << "Unable to open file: "<< filePath;
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
} else if (cli.count("test")) {
|
||||||
|
std::string filePath = cli["test"].as<std::string>();
|
||||||
|
std::cout << "\tTesting on file: " << filePath <<
|
||||||
|
std::endl;
|
||||||
|
// adding to index
|
||||||
|
std::ifstream text_file(filePath.c_str());
|
||||||
|
std::string line;
|
||||||
|
if (text_file.is_open()) {
|
||||||
|
long lineCount = 0;
|
||||||
|
std::vector<Example> buffer;
|
||||||
|
boost::posix_time::ptime timeStart =
|
||||||
|
boost::posix_time::microsec_clock::local_time();
|
||||||
|
while (getline(text_file, line)) {
|
||||||
|
lineCount++;
|
||||||
|
buffer.push_back(Example(line, lineCount));
|
||||||
|
if (lineCount % READ_BUFFER_LENGTH == 0) {
|
||||||
|
concordia.addAllExamples(buffer);
|
||||||
|
buffer.clear();
|
||||||
|
boost::posix_time::ptime timeEnd =
|
||||||
|
boost::posix_time::microsec_clock::local_time();
|
||||||
|
boost::posix_time::time_duration msdiff =
|
||||||
|
timeEnd - timeStart;
|
||||||
|
long timeElapsed = msdiff.total_milliseconds();
|
||||||
|
double speed = static_cast<double>(
|
||||||
|
1000 * lineCount / timeElapsed);
|
||||||
|
std::cout << "\tRead and added to index " <<
|
||||||
|
lineCount << " sentences in " << timeElapsed
|
||||||
|
<< "ms. Current speed: " << speed <<
|
||||||
|
" sentences per second" << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (buffer.size() > 0) {
|
||||||
|
concordia.addAllExamples(buffer);
|
||||||
|
}
|
||||||
|
text_file.close();
|
||||||
|
boost::posix_time::ptime timeTotalEnd =
|
||||||
|
boost::posix_time::microsec_clock::local_time();
|
||||||
|
boost::posix_time::time_duration totalMsdiff =
|
||||||
|
timeTotalEnd - timeStart;
|
||||||
|
long totalTimeElapsed = totalMsdiff.total_milliseconds();
|
||||||
|
double totalSpeed =
|
||||||
|
static_cast<double>(1000 * lineCount / totalTimeElapsed);
|
||||||
|
std::cout << "\tReading finished. Read and added to index "
|
||||||
|
<< lineCount << " sentences in " << totalTimeElapsed <<
|
||||||
|
"ms. Overall speed: " << totalSpeed <<
|
||||||
|
" sentences per second" << std::endl;
|
||||||
|
} else {
|
||||||
|
std::cerr << "Unable to open file: "<< filePath;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// generating SA
|
||||||
|
std::cout << "\tGenerating SA from RAM..." << std::endl;
|
||||||
|
boost::posix_time::ptime SAStart =
|
||||||
|
boost::posix_time::microsec_clock::local_time();
|
||||||
|
concordia.refreshSAfromRAM();
|
||||||
|
boost::posix_time::ptime SAEnd =
|
||||||
|
boost::posix_time::microsec_clock::local_time();
|
||||||
|
boost::posix_time::time_duration SAdiff =
|
||||||
|
SAEnd - SAStart;
|
||||||
|
long SAtimeElapsed = SAdiff.total_milliseconds();
|
||||||
|
std::cout << "\tSA generated in " << SAtimeElapsed
|
||||||
|
<< "ms." << std::endl;
|
||||||
|
|
||||||
|
// searching
|
||||||
|
std::ifstream test_file(filePath.c_str());
|
||||||
|
if (test_file.is_open()) {
|
||||||
|
long lineCount = 0;
|
||||||
|
std::vector<std::string> buffer;
|
||||||
|
while (getline(test_file, line)) {
|
||||||
|
lineCount++;
|
||||||
|
buffer.push_back(line);
|
||||||
|
if (lineCount % READ_BUFFER_LENGTH == 0) {
|
||||||
|
performSearch(concordia, buffer, lineCount);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (buffer.size() > 0) {
|
||||||
|
performSearch(concordia, buffer, lineCount);
|
||||||
|
}
|
||||||
|
test_file.close();
|
||||||
|
std::cout << "\tSearching finished. No errors reported."
|
||||||
|
<< std::endl;
|
||||||
|
|
||||||
|
} else {
|
||||||
|
std::cerr << "Unable to open file: "<< filePath;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
std::cerr << "One of the options: generate-index, simple-search, "
|
std::cerr << "One of the options: simple-search, anubis-search, "
|
||||||
<< "read-file must be provided. See the "
|
<< "concordia-search or read-file must be provided."
|
||||||
"options specification: "
|
<< "See the options specification: "
|
||||||
<< std::endl << desc << std::endl;
|
<< std::endl << desc << std::endl;
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
14
concordia-test-jrc.sh
Executable file
14
concordia-test-jrc.sh
Executable file
@ -0,0 +1,14 @@
|
|||||||
|
#!/bin/sh

# Decompresses the jrc_smaller text file and runs concordia-console
# with the -t option on it.

echo "CONCORDIA RUNNER: Decompressing test file"

# -k keeps the .xz archive. -f overwrites a decompressed copy left by a
# previous run (the cleanup at the end of this script is disabled), so
# the script can be run repeatedly without xz failing on an existing file.
xz -dkf prod/resources/text-files/jrc_smaller.txt.xz

echo "CONCORDIA RUNNER: Running Concordia"

# -f: do not report an error when the temp directory is already empty.
rm -f prod/resources/temp/*

echo "CONCORDIA RUNNER: testing"

./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -t prod/resources/text-files/jrc_smaller.txt

#rm prod/resources/text-files/jrc_smaller.txt
|
9
concordia-test-medium.sh
Executable file
9
concordia-test-medium.sh
Executable file
@ -0,0 +1,9 @@
|
|||||||
|
#!/bin/sh

# Runs concordia-console with the -t option on the medium text file.

echo "CONCORDIA RUNNER: Running Concordia"

# -f: do not report an error when the temp directory is already empty.
rm -f prod/resources/temp/*

echo "CONCORDIA RUNNER: testing"

./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -t prod/resources/text-files/medium.txt
|
||||||
|
|
@ -13,7 +13,7 @@ SentenceAnonymizer::SentenceAnonymizer(
|
|||||||
_createNeRules(config->getNamedEntitiesFilePath());
|
_createNeRules(config->getNamedEntitiesFilePath());
|
||||||
_createHtmlTagsRule(config->getHtmlTagsFilePath());
|
_createHtmlTagsRule(config->getHtmlTagsFilePath());
|
||||||
_stopWordsEnabled = config->isStopWordsEnabled();
|
_stopWordsEnabled = config->isStopWordsEnabled();
|
||||||
if (_stopWordsEnabled) {
|
if (_stopWordsEnabled) {
|
||||||
_stopWords = _getMultipleReplacementRule(
|
_stopWords = _getMultipleReplacementRule(
|
||||||
config->getStopWordsFilePath(), "", true);
|
config->getStopWordsFilePath(), "", true);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user