anubis search v1 - very slow for some patterns
Former-commit-id: ae327d7d24f4bc959d3749745a8c395093a17a50
This commit is contained in:
parent
fc41bb251a
commit
4e02afc897
5
concordia-anubissearch-jrc.sh
Executable file
5
concordia-anubissearch-jrc.sh
Executable file
@ -0,0 +1,5 @@
|
||||
#!/bin/sh
|
||||
|
||||
|
||||
# Run concordia-console in anubis-search mode (-a) with the pattern passed as
# the first script argument, using the config file given with -c.
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -a "$1"
|
||||
|
@ -25,7 +25,9 @@ int main(int argc, char** argv) {
|
||||
"Concordia configuration file (required)")
|
||||
("simple-search,s", boost::program_options::value<std::string>(),
|
||||
"Pattern to be searched in the index")
|
||||
("silent,n", "While searching, do not output search results")
|
||||
("silent,n", "While searching with simple-search, do not output search results")
|
||||
("anubis-search,a", boost::program_options::value<std::string>(),
|
||||
"Pattern to be searched by anubis search in the index")
|
||||
("read-file,r", boost::program_options::value<std::string>(),
|
||||
"File to be read and added to index");
|
||||
|
||||
@ -80,6 +82,26 @@ int main(int argc, char** argv) {
|
||||
<< occurence.getId() << std::endl;
|
||||
}
|
||||
}
|
||||
} else if (cli.count("anubis-search")) {
|
||||
std::string pattern = cli["anubis-search"].as<std::string>();
|
||||
std::cout << "\tAnubis searching for pattern: \"" << pattern <<
|
||||
"\"" << std::endl;
|
||||
time_start = boost::posix_time::microsec_clock::local_time();
|
||||
std::vector<AnubisSearchResult> result =
|
||||
concordia.anubisSearch(pattern);
|
||||
time_end = boost::posix_time::microsec_clock::local_time();
|
||||
msdiff = time_end - time_start;
|
||||
std::cout << "\tFound: " << result.size() << " matches. "
|
||||
<< "Search took: " <<
|
||||
msdiff.total_milliseconds() << "ms." << std::endl;
|
||||
if (!cli.count("silent")) {
|
||||
BOOST_FOREACH(AnubisSearchResult searchResult, result) {
|
||||
std::cout << "\t\tfound matching sentence number: "
|
||||
<< searchResult.getExampleId()
|
||||
<< ", score: " << searchResult.getScore()
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
} else if (cli.count("read-file")) {
|
||||
std::string filePath = cli["read-file"].as<std::string>();
|
||||
std::cout << "\tReading sentences from file: " << filePath <<
|
||||
|
@ -10,13 +10,18 @@ rm prod/resources/temp/*
|
||||
|
||||
echo "CONCORDIA RUNNER: reading from file"
|
||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -r prod/resources/text-files/jrc_smaller.txt
|
||||
|
||||
|
||||
echo "CONCORDIA RUNNER: anubis searching for pattern: \"Współpraca Państw Członkowskich i Komisji Europejskiej\""
|
||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -a "Współpraca Państw Członkowskich i Komisji Europejskiej"
|
||||
echo "CONCORDIA RUNNER: anubis searching for pattern: \"8. W odniesieniu do artykułu 45 ustęp 12\""
|
||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -a "8. W odniesieniu do artykułu 45 ustęp 12"
|
||||
|
||||
echo "CONCORDIA RUNNER: searching for pattern: \"Parlamentu Europejskiego\""
|
||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Parlamentu Europejskiego" -n
|
||||
echo "CONCORDIA RUNNER: searching for pattern: \"Dostęp do zatrudnienia\""
|
||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Dostęp do zatrudnienia" -n
|
||||
|
||||
echo "CONCORDIA RUNNER: searching for pattern: \"Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem\""
|
||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem" -n
|
||||
|
||||
|
||||
rm prod/resources/text-files/jrc_smaller.txt
|
||||
#rm prod/resources/text-files/jrc_smaller.txt
|
||||
|
@ -25,7 +25,9 @@ public:
|
||||
return _score;
|
||||
}
|
||||
|
||||
|
||||
// Score-based ordering: returns true when this result's _score is strictly
// greater than other.getScore(). The example id is not compared.
// Sorting a vector of results with std::greater<AnubisSearchResult>
// goes through this operator, yielding a descending-by-score order.
bool operator > (const AnubisSearchResult & other) const {
|
||||
return (_score > other.getScore());
|
||||
}
|
||||
private:
|
||||
SUFFIX_MARKER_TYPE _exampleId;
|
||||
|
||||
|
@ -13,6 +13,7 @@ AnubisSearcher::~AnubisSearcher() {
|
||||
|
||||
|
||||
std::vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
|
||||
boost::shared_ptr<ConcordiaConfig> config,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
@ -21,8 +22,25 @@ std::vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
|
||||
boost::shared_ptr<TmMatchesMap> tmMatchesMap =
|
||||
getTmMatches(T, markers, SA, pattern);
|
||||
|
||||
// get the tmMatches list sorted descending by score
|
||||
// 1. iterate over tmMatchesMap
|
||||
// 2. calculate score for each tmMatches
|
||||
// 3. create AnubisSearchResult from tmMatches with scores over threshold
|
||||
// 4. sort the AnubisSearchResult vector descending
|
||||
|
||||
std::vector<AnubisSearchResult> result;
|
||||
for(TmMatchesMapIterator iterator = tmMatchesMap->begin();
|
||||
iterator != tmMatchesMap->end(); iterator++) {
|
||||
TmMatches * tmMatches = iterator->second;
|
||||
tmMatches->calculateScore();
|
||||
|
||||
if (tmMatches->getScore() >= config->getAnubisThreshold()) {
|
||||
result.push_back(AnubisSearchResult(tmMatches->getExampleId(),
|
||||
tmMatches->getScore()));
|
||||
}
|
||||
}
|
||||
|
||||
std::sort(result.begin(), result.end(), std::greater<AnubisSearchResult>());
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include "concordia/common/utils.hpp"
|
||||
#include "concordia/substring_occurence.hpp"
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
#include "concordia/concordia_config.hpp"
|
||||
#include "concordia/anubis_search_result.hpp"
|
||||
#include "concordia/tm_matches.hpp"
|
||||
|
||||
@ -27,6 +28,7 @@ public:
|
||||
virtual ~AnubisSearcher();
|
||||
|
||||
std::vector<AnubisSearchResult> anubisSearch(
|
||||
boost::shared_ptr<ConcordiaConfig> config,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
|
@ -148,7 +148,7 @@ std::vector<AnubisSearchResult> Concordia::anubisSearch(
|
||||
const std::string & pattern)
|
||||
throw(ConcordiaException) {
|
||||
if (_T->size() > 0) {
|
||||
return _searcher->anubisSearch(_hashGenerator, _T,
|
||||
return _searcher->anubisSearch(_config, _hashGenerator, _T,
|
||||
_markers, _SA, pattern);
|
||||
} else {
|
||||
std::vector<AnubisSearchResult> result;
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include <sstream>
|
||||
#include <stdlib.h>
|
||||
#include "concordia/concordia_config.hpp"
|
||||
#include "concordia/common/logging.hpp"
|
||||
|
||||
@ -12,6 +13,7 @@
|
||||
#define STOP_WORDS_PARAM "stop_words_path"
|
||||
#define NAMED_ENTITIES_PARAM "named_entities_path"
|
||||
#define STOP_SYMBOLS_PARAM "stop_symbols_path"
|
||||
#define ANUBIS_THRESHOLD_PARAM "anubis_threshold"
|
||||
|
||||
ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
|
||||
throw(ConcordiaException) {
|
||||
@ -44,6 +46,9 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
|
||||
ConcordiaConfig::_readConfigParameterStr(NAMED_ENTITIES_PARAM);
|
||||
_stopSymbolsFilePath =
|
||||
ConcordiaConfig::_readConfigParameterStr(STOP_SYMBOLS_PARAM);
|
||||
_anubisThreshold =
|
||||
atof(
|
||||
ConcordiaConfig::_readConfigParameterStr(ANUBIS_THRESHOLD_PARAM).c_str());
|
||||
}
|
||||
|
||||
ConcordiaConfig::~ConcordiaConfig() {
|
||||
|
@ -67,6 +67,10 @@ public:
|
||||
return _stopSymbolsFilePath;
|
||||
}
|
||||
|
||||
// Returns the anubis score threshold stored in _anubisThreshold.
// The value is parsed with atof() from the "anubis_threshold" config
// parameter; the anubis search keeps only matches whose score is >= it.
// NOTE(review): the body only reads a member, so this getter could be
// declared const - confirm against the other getters of this class.
double getAnubisThreshold() {
|
||||
return _anubisThreshold;
|
||||
}
|
||||
|
||||
private:
|
||||
libconfig::Config _config;
|
||||
|
||||
@ -90,6 +94,8 @@ private:
|
||||
|
||||
std::string _stopSymbolsFilePath;
|
||||
|
||||
double _anubisThreshold;
|
||||
|
||||
std::string _readConfigParameterStr(const std::string & name)
|
||||
throw(ConcordiaException);
|
||||
};
|
||||
|
@ -49,6 +49,7 @@ std::vector<SubstringOccurence> IndexSearcher::simpleSearch(
|
||||
}
|
||||
|
||||
std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
|
||||
boost::shared_ptr<ConcordiaConfig> config,
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
@ -56,5 +57,5 @@ std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
|
||||
const std::string & pattern) throw(ConcordiaException) {
|
||||
std::vector<INDEX_CHARACTER_TYPE> hash =
|
||||
hashGenerator->generateHash(pattern);
|
||||
return _anubisSearcher->anubisSearch(T, markers, SA, hash);
|
||||
return _anubisSearcher->anubisSearch(config, T, markers, SA, hash);
|
||||
}
|
||||
|
@ -36,6 +36,7 @@ public:
|
||||
const std::string & pattern) throw(ConcordiaException);
|
||||
|
||||
std::vector<AnubisSearchResult> anubisSearch(
|
||||
boost::shared_ptr<ConcordiaConfig> config,
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
|
@ -158,31 +158,41 @@ BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
|
||||
14: "Ala posiada kota"
|
||||
51: "Ala posiada rysia"
|
||||
123: "Marysia posiada rysia"
|
||||
*/
|
||||
|
||||
Test word map:
|
||||
Ala -> 0
|
||||
posiada -> 1
|
||||
kota -> 2
|
||||
rysia -> 3
|
||||
Marysia -> 4
|
||||
|
||||
Test hashed index:
|
||||
n: 0 1 2 3 4 5 6 7 8 9 10 11
|
||||
T[n]: 0 1 2 | 0 1 3 | 4 1 3 |
|
||||
|
||||
Test suffix array:
|
||||
n: 0 1 2 3 4 5 6 7 8 9 10 11
|
||||
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
|
||||
// the below expectations assume 0.3 anubis threshold
|
||||
|
||||
std::vector<AnubisSearchResult> searchResult1 = concordia.anubisSearch("posiada rysia chyba");
|
||||
std::vector<AnubisSearchResult> searchResult2 = concordia.anubisSearch("posiada kota Ala");
|
||||
BOOST_CHECK_EQUAL(searchResult1.size(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 51);
|
||||
BOOST_CHECK_CLOSE(searchResult1.at(0).getScore(), 0.5609, 0.1);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleId(), 123);
|
||||
BOOST_CHECK_CLOSE(searchResult1.at(1).getScore(), 0.5609, 0.1);
|
||||
|
||||
|
||||
|
||||
std::vector<AnubisSearchResult> searchResult2 = concordia.anubisSearch("Marysia posiada rysia");
|
||||
BOOST_CHECK_EQUAL(searchResult2.size(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult2.at(0).getExampleId(), 123);
|
||||
BOOST_CHECK_EQUAL(searchResult2.at(0).getScore(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult2.at(1).getExampleId(), 51);
|
||||
BOOST_CHECK_CLOSE(searchResult2.at(1).getScore(), 0.5609, 0.1);
|
||||
|
||||
std::vector<AnubisSearchResult> searchResult3 = concordia.anubisSearch("Nowe zdanie");
|
||||
BOOST_CHECK_EQUAL(searchResult3.size(), 0);
|
||||
|
||||
std::vector<AnubisSearchResult> searchResult4 = concordia.anubisSearch("Ala posiada kota chyba");
|
||||
BOOST_CHECK_EQUAL(searchResult4.size(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult4.at(0).getExampleId(), 14);
|
||||
BOOST_CHECK_CLOSE(searchResult4.at(0).getScore(), 0.848, 0.1);
|
||||
BOOST_CHECK_EQUAL(searchResult4.at(1).getExampleId(), 51);
|
||||
BOOST_CHECK_CLOSE(searchResult4.at(1).getScore(), 0.4707, 0.1);
|
||||
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
|
||||
|
||||
/*
|
||||
BOOST_CHECK_EQUAL(searchResult1.size(), 2);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 123);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 1);
|
||||
BOOST_CHECK_EQUAL(searchResult1.at(1).getId(), 51);
|
||||
|
3
getSentenceFromJRC.sh
Executable file
3
getSentenceFromJRC.sh
Executable file
@ -0,0 +1,3 @@
|
||||
#!/bin/sh
|
||||
|
||||
# Print line number $1 of the JRC sample file: take the first $1 lines and
# keep only the last of them. Uses the POSIX "-n number" form (the bare
# "-number" form is obsolescent) and quotes the argument so it is passed to
# head as a single word.
head -n "$1" prod/resources/text-files/jrc_smaller.txt | tail -n 1
|
@ -3,8 +3,11 @@
|
||||
#---------------------------
|
||||
#
|
||||
|
||||
# Anubis score threshold
|
||||
anubis_threshold = "0.3"
|
||||
|
||||
#Path to the Puddle tagset
|
||||
puddle_tagset_path = "@PROD_PUDDLE_TAGSET_PATH@";
|
||||
puddle_tagset_path = "@PROD_PUDDLE_TAGSET_PATH@"
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
#Word map, hashed index and suffix array files are in a temporary directory
|
||||
|
@ -3,6 +3,9 @@
|
||||
#---------------------------
|
||||
#
|
||||
|
||||
# Anubis score threshold
|
||||
anubis_threshold = "0.3"
|
||||
|
||||
#Path to the Puddle tagset
|
||||
puddle_tagset_path = "puddle/tagset.txt";
|
||||
|
||||
|
@ -3,6 +3,9 @@
|
||||
#---------------------------
|
||||
#
|
||||
|
||||
# Anubis score threshold
|
||||
anubis_threshold = "0.3"
|
||||
|
||||
#Path to the Puddle tagset
|
||||
puddle_tagset_path = "@TEST_PUDDLE_TAGSET_PATH@";
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user