anubis search v1 - very slow for some patterns

Former-commit-id: ae327d7d24f4bc959d3749745a8c395093a17a50
This commit is contained in:
rjawor 2015-04-16 11:39:39 +02:00
parent fc41bb251a
commit 4e02afc897
16 changed files with 115 additions and 26 deletions

5
concordia-anubissearch-jrc.sh Executable file
View File

@ -0,0 +1,5 @@
#!/bin/sh
# Runs concordia-console in anubis-search mode (-a) using the production
# configuration file. The first command-line argument is passed as the pattern.
./build/concordia-console/concordia-console \
    -c prod/resources/concordia-config/concordia.cfg \
    -a "$1"

View File

@ -25,7 +25,9 @@ int main(int argc, char** argv) {
"Concordia configuration file (required)")
("simple-search,s", boost::program_options::value<std::string>(),
"Pattern to be searched in the index")
("silent,n", "While searching, do not output search results")
("silent,n", "While searching with simple-search, do not output search results")
("anubis-search,a", boost::program_options::value<std::string>(),
"Pattern to be searched by anubis search in the index")
("read-file,r", boost::program_options::value<std::string>(),
"File to be read and added to index");
@ -80,6 +82,26 @@ int main(int argc, char** argv) {
<< occurence.getId() << std::endl;
}
}
} else if (cli.count("anubis-search")) {
std::string pattern = cli["anubis-search"].as<std::string>();
std::cout << "\tAnubis searching for pattern: \"" << pattern <<
"\"" << std::endl;
time_start = boost::posix_time::microsec_clock::local_time();
std::vector<AnubisSearchResult> result =
concordia.anubisSearch(pattern);
time_end = boost::posix_time::microsec_clock::local_time();
msdiff = time_end - time_start;
std::cout << "\tFound: " << result.size() << " matches. "
<< "Search took: " <<
msdiff.total_milliseconds() << "ms." << std::endl;
if (!cli.count("silent")) {
BOOST_FOREACH(AnubisSearchResult searchResult, result) {
std::cout << "\t\tfound matching sentence number: "
<< searchResult.getExampleId()
<< ", score: " << searchResult.getScore()
<< std::endl;
}
}
} else if (cli.count("read-file")) {
std::string filePath = cli["read-file"].as<std::string>();
std::cout << "\tReading sentences from file: " << filePath <<

View File

@ -10,13 +10,18 @@ rm prod/resources/temp/*
echo "CONCORDIA RUNNER: reading from file"
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -r prod/resources/text-files/jrc_smaller.txt
echo "CONCORDIA RUNNER: anubis searching for pattern: \"Współpraca Państw Członkowskich i Komisji Europejskiej\""
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -a "Współpraca Państw Członkowskich i Komisji Europejskiej"
echo "CONCORDIA RUNNER: anubis searching for pattern: \"8. W odniesieniu do artykułu 45 ustęp 12\""
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -a "8. W odniesieniu do artykułu 45 ustęp 12"
echo "CONCORDIA RUNNER: searching for pattern: \"Parlamentu Europejskiego\""
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Parlamentu Europejskiego" -n
echo "CONCORDIA RUNNER: searching for pattern: \"Dostęp do zatrudnienia\""
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Dostęp do zatrudnienia" -n
echo "CONCORDIA RUNNER: searching for pattern: \"Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem\""
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem" -n
rm prod/resources/text-files/jrc_smaller.txt
#rm prod/resources/text-files/jrc_smaller.txt

View File

@ -25,7 +25,9 @@ public:
return _score;
}
// Compares two results by score: true when this result's score is strictly
// greater than the other's. Required by std::greater<AnubisSearchResult>.
bool operator > (const AnubisSearchResult & other) const {
    const bool hasHigherScore = _score > other.getScore();
    return hasHigherScore;
}
private:
SUFFIX_MARKER_TYPE _exampleId;

View File

@ -13,6 +13,7 @@ AnubisSearcher::~AnubisSearcher() {
std::vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
boost::shared_ptr<ConcordiaConfig> config,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
@ -21,8 +22,25 @@ std::vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
boost::shared_ptr<TmMatchesMap> tmMatchesMap =
getTmMatches(T, markers, SA, pattern);
// get the tmMatches list sorted descending by score
// 1. iterate over tmMatchesMap
// 2. calculate score for each tmMatches
// 3. create AnubisSearchResult from tmMatches with scores over threshold
// 4. sort the AnubisSearchResult vector descending
std::vector<AnubisSearchResult> result;
for(TmMatchesMapIterator iterator = tmMatchesMap->begin();
iterator != tmMatchesMap->end(); iterator++) {
TmMatches * tmMatches = iterator->second;
tmMatches->calculateScore();
if (tmMatches->getScore() >= config->getAnubisThreshold()) {
result.push_back(AnubisSearchResult(tmMatches->getExampleId(),
tmMatches->getScore()));
}
}
std::sort(result.begin(), result.end(), std::greater<AnubisSearchResult>());
return result;
}

View File

@ -7,6 +7,7 @@
#include "concordia/common/utils.hpp"
#include "concordia/substring_occurence.hpp"
#include "concordia/concordia_exception.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/anubis_search_result.hpp"
#include "concordia/tm_matches.hpp"
@ -27,6 +28,7 @@ public:
virtual ~AnubisSearcher();
std::vector<AnubisSearchResult> anubisSearch(
boost::shared_ptr<ConcordiaConfig> config,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,

View File

@ -148,7 +148,7 @@ std::vector<AnubisSearchResult> Concordia::anubisSearch(
const std::string & pattern)
throw(ConcordiaException) {
if (_T->size() > 0) {
return _searcher->anubisSearch(_hashGenerator, _T,
return _searcher->anubisSearch(_config, _hashGenerator, _T,
_markers, _SA, pattern);
} else {
std::vector<AnubisSearchResult> result;

View File

@ -1,4 +1,5 @@
#include <sstream>
#include <stdlib.h>
#include "concordia/concordia_config.hpp"
#include "concordia/common/logging.hpp"
@ -12,6 +13,7 @@
#define STOP_WORDS_PARAM "stop_words_path"
#define NAMED_ENTITIES_PARAM "named_entities_path"
#define STOP_SYMBOLS_PARAM "stop_symbols_path"
#define ANUBIS_THRESHOLD_PARAM "anubis_threshold"
ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
throw(ConcordiaException) {
@ -44,6 +46,9 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
ConcordiaConfig::_readConfigParameterStr(NAMED_ENTITIES_PARAM);
_stopSymbolsFilePath =
ConcordiaConfig::_readConfigParameterStr(STOP_SYMBOLS_PARAM);
_anubisThreshold =
atof(
ConcordiaConfig::_readConfigParameterStr(ANUBIS_THRESHOLD_PARAM).c_str());
}
ConcordiaConfig::~ConcordiaConfig() {

View File

@ -67,6 +67,10 @@ public:
return _stopSymbolsFilePath;
}
// Returns the anubis score threshold stored in _anubisThreshold.
double getAnubisThreshold() {
return _anubisThreshold;
}
private:
libconfig::Config _config;
@ -90,6 +94,8 @@ private:
std::string _stopSymbolsFilePath;
double _anubisThreshold;
std::string _readConfigParameterStr(const std::string & name)
throw(ConcordiaException);
};

View File

@ -49,6 +49,7 @@ std::vector<SubstringOccurence> IndexSearcher::simpleSearch(
}
std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
boost::shared_ptr<ConcordiaConfig> config,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@ -56,5 +57,5 @@ std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
const std::string & pattern) throw(ConcordiaException) {
std::vector<INDEX_CHARACTER_TYPE> hash =
hashGenerator->generateHash(pattern);
return _anubisSearcher->anubisSearch(T, markers, SA, hash);
return _anubisSearcher->anubisSearch(config, T, markers, SA, hash);
}

View File

@ -36,6 +36,7 @@ public:
const std::string & pattern) throw(ConcordiaException);
std::vector<AnubisSearchResult> anubisSearch(
boost::shared_ptr<ConcordiaConfig> config,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,

View File

@ -158,31 +158,41 @@ BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
14: "Ala posiada kota"
51: "Ala posiada rysia"
123: "Marysia posiada rysia"
*/
Test word map:
Ala -> 0
posiada -> 1
kota -> 2
rysia -> 3
Marysia -> 4
Test hashed index:
n: 0 1 2 3 4 5 6 7 8 9 10 11
T[n]: 0 1 2 | 0 1 3 | 4 1 3 |
Test suffix array:
n: 0 1 2 3 4 5 6 7 8 9 10 11
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
// the below expectations assume 0.3 anubis threshold
std::vector<AnubisSearchResult> searchResult1 = concordia.anubisSearch("posiada rysia chyba");
std::vector<AnubisSearchResult> searchResult2 = concordia.anubisSearch("posiada kota Ala");
BOOST_CHECK_EQUAL(searchResult1.size(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 51);
BOOST_CHECK_CLOSE(searchResult1.at(0).getScore(), 0.5609, 0.1);
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleId(), 123);
BOOST_CHECK_CLOSE(searchResult1.at(1).getScore(), 0.5609, 0.1);
std::vector<AnubisSearchResult> searchResult2 = concordia.anubisSearch("Marysia posiada rysia");
BOOST_CHECK_EQUAL(searchResult2.size(), 2);
BOOST_CHECK_EQUAL(searchResult2.at(0).getExampleId(), 123);
BOOST_CHECK_EQUAL(searchResult2.at(0).getScore(), 1);
BOOST_CHECK_EQUAL(searchResult2.at(1).getExampleId(), 51);
BOOST_CHECK_CLOSE(searchResult2.at(1).getScore(), 0.5609, 0.1);
std::vector<AnubisSearchResult> searchResult3 = concordia.anubisSearch("Nowe zdanie");
BOOST_CHECK_EQUAL(searchResult3.size(), 0);
std::vector<AnubisSearchResult> searchResult4 = concordia.anubisSearch("Ala posiada kota chyba");
BOOST_CHECK_EQUAL(searchResult4.size(), 2);
BOOST_CHECK_EQUAL(searchResult4.at(0).getExampleId(), 14);
BOOST_CHECK_CLOSE(searchResult4.at(0).getScore(), 0.848, 0.1);
BOOST_CHECK_EQUAL(searchResult4.at(1).getExampleId(), 51);
BOOST_CHECK_CLOSE(searchResult4.at(1).getScore(), 0.4707, 0.1);
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
/*
BOOST_CHECK_EQUAL(searchResult1.size(), 2);
BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 123);
BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 1);
BOOST_CHECK_EQUAL(searchResult1.at(1).getId(), 51);

3
getSentenceFromJRC.sh Executable file
View File

@ -0,0 +1,3 @@
#!/bin/sh
# Prints a single line of prod/resources/text-files/jrc_smaller.txt.
#
# Usage: getSentenceFromJRC.sh LINE_NUMBER
#
# LINE_NUMBER is 1-based: head keeps the first LINE_NUMBER lines and tail
# then keeps only the last of them.
# NOTE(review): presumably each line of the file is one sentence - confirm.
if [ "$#" -lt 1 ]; then
    echo "Usage: $0 LINE_NUMBER" >&2
    exit 1
fi

# Use the standard -n option form (the bare -NUMBER form is obsolescent)
# and quote the argument so it is passed to head as a single word.
head -n "$1" prod/resources/text-files/jrc_smaller.txt | tail -n 1

View File

@ -3,8 +3,11 @@
#---------------------------
#
# Anubis score threshold
anubis_threshold = "0.3"
#Path to the Puddle tagset
puddle_tagset_path = "@PROD_PUDDLE_TAGSET_PATH@";
puddle_tagset_path = "@PROD_PUDDLE_TAGSET_PATH@"
#-------------------------------------------------------------------------------
#Word map, hashed index and suffix array files are in a temporary directory

View File

@ -3,6 +3,9 @@
#---------------------------
#
# Anubis score threshold
anubis_threshold = "0.3"
#Path to the Puddle tagset
puddle_tagset_path = "puddle/tagset.txt";

View File

@ -3,6 +3,9 @@
#---------------------------
#
# Anubis score threshold
anubis_threshold = "0.3"
#Path to the Puddle tagset
puddle_tagset_path = "@TEST_PUDDLE_TAGSET_PATH@";