From 4e02afc89703b4a63e73cdfe97efd53bffd653b8 Mon Sep 17 00:00:00 2001 From: rjawor Date: Thu, 16 Apr 2015 11:39:39 +0200 Subject: [PATCH] anubis search v1 - very slow for some patterns Former-commit-id: ae327d7d24f4bc959d3749745a8c395093a17a50 --- concordia-anubissearch-jrc.sh | 5 +++ concordia-console/concordia-console.cpp | 24 +++++++++- concordia-runner-jrc.sh | 11 +++-- concordia/anubis_search_result.hpp | 4 +- concordia/anubis_searcher.cpp | 20 ++++++++- concordia/anubis_searcher.hpp | 2 + concordia/concordia.cpp | 2 +- concordia/concordia_config.cpp | 5 +++ concordia/concordia_config.hpp | 6 +++ concordia/index_searcher.cpp | 3 +- concordia/index_searcher.hpp | 1 + concordia/t/test_concordia.cpp | 44 ++++++++++++------- getSentenceFromJRC.sh | 3 ++ .../concordia-config/concordia.cfg.in | 5 ++- .../concordia-config/concordia-mock.cfg | 3 ++ .../concordia-config/concordia.cfg.in | 3 ++ 16 files changed, 115 insertions(+), 26 deletions(-) create mode 100755 concordia-anubissearch-jrc.sh create mode 100755 getSentenceFromJRC.sh diff --git a/concordia-anubissearch-jrc.sh b/concordia-anubissearch-jrc.sh new file mode 100755 index 0000000..9988208 --- /dev/null +++ b/concordia-anubissearch-jrc.sh @@ -0,0 +1,5 @@ +#!/bin/sh + + +./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -a "$1" + diff --git a/concordia-console/concordia-console.cpp b/concordia-console/concordia-console.cpp index f458583..ce63c5c 100644 --- a/concordia-console/concordia-console.cpp +++ b/concordia-console/concordia-console.cpp @@ -25,7 +25,9 @@ int main(int argc, char** argv) { "Concordia configuration file (required)") ("simple-search,s", boost::program_options::value(), "Pattern to be searched in the index") - ("silent,n", "While searching, do not output search results") + ("silent,n", "While searching with simple-search, do not output search results") + ("anubis-search,a", boost::program_options::value(), + "Pattern to be searched by anubis search 
in the index") ("read-file,r", boost::program_options::value(), "File to be read and added to index"); @@ -80,6 +82,26 @@ int main(int argc, char** argv) { << occurence.getId() << std::endl; } } + } else if (cli.count("anubis-search")) { + std::string pattern = cli["anubis-search"].as(); + std::cout << "\tAnubis searching for pattern: \"" << pattern << + "\"" << std::endl; + time_start = boost::posix_time::microsec_clock::local_time(); + std::vector result = + concordia.anubisSearch(pattern); + time_end = boost::posix_time::microsec_clock::local_time(); + msdiff = time_end - time_start; + std::cout << "\tFound: " << result.size() << " matches. " + << "Search took: " << + msdiff.total_milliseconds() << "ms." << std::endl; + if (!cli.count("silent")) { + BOOST_FOREACH(AnubisSearchResult searchResult, result) { + std::cout << "\t\tfound matching sentence number: " + << searchResult.getExampleId() + << ", score: " << searchResult.getScore() + << std::endl; + } + } } else if (cli.count("read-file")) { std::string filePath = cli["read-file"].as(); std::cout << "\tReading sentences from file: " << filePath << diff --git a/concordia-runner-jrc.sh b/concordia-runner-jrc.sh index 2495213..396f79a 100755 --- a/concordia-runner-jrc.sh +++ b/concordia-runner-jrc.sh @@ -10,13 +10,18 @@ rm prod/resources/temp/* echo "CONCORDIA RUNNER: reading from file" ./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -r prod/resources/text-files/jrc_smaller.txt + + +echo "CONCORDIA RUNNER: anubis searching for pattern: \"Współpraca Państw Członkowskich i Komisji Europejskiej\"" +./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -a "Współpraca Państw Członkowskich i Komisji Europejskiej" +echo "CONCORDIA RUNNER: anubis searching for pattern: \"8. W odniesieniu do artykułu 45 ustęp 12\"" +./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -a "8. 
W odniesieniu do artykułu 45 ustęp 12" + echo "CONCORDIA RUNNER: searching for pattern: \"Parlamentu Europejskiego\"" ./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Parlamentu Europejskiego" -n echo "CONCORDIA RUNNER: searching for pattern: \"Dostęp do zatrudnienia\"" ./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Dostęp do zatrudnienia" -n - echo "CONCORDIA RUNNER: searching for pattern: \"Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem\"" ./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem" -n - -rm prod/resources/text-files/jrc_smaller.txt +#rm prod/resources/text-files/jrc_smaller.txt diff --git a/concordia/anubis_search_result.hpp b/concordia/anubis_search_result.hpp index b036ecc..bb87efd 100644 --- a/concordia/anubis_search_result.hpp +++ b/concordia/anubis_search_result.hpp @@ -25,7 +25,9 @@ public: return _score; } - + bool operator > (const AnubisSearchResult & other) const { + return (_score > other.getScore()); + } private: SUFFIX_MARKER_TYPE _exampleId; diff --git a/concordia/anubis_searcher.cpp b/concordia/anubis_searcher.cpp index 6896757..4595c21 100644 --- a/concordia/anubis_searcher.cpp +++ b/concordia/anubis_searcher.cpp @@ -13,6 +13,7 @@ AnubisSearcher::~AnubisSearcher() { std::vector AnubisSearcher::anubisSearch( + boost::shared_ptr config, boost::shared_ptr > T, boost::shared_ptr > markers, boost::shared_ptr > SA, @@ -21,8 +22,25 @@ std::vector AnubisSearcher::anubisSearch( boost::shared_ptr tmMatchesMap = getTmMatches(T, markers, SA, pattern); - // get the tmMatches list sorted descending by score + // 1. iterate over tmMatchesMap + // 2. 
calculate score for each tmMatches + // 3. create AnubisSearchResult from tmMatches with scores over threshold + // 4. sort the AnubisSearchResult vector descending + std::vector result; + for(TmMatchesMapIterator iterator = tmMatchesMap->begin(); + iterator != tmMatchesMap->end(); iterator++) { + TmMatches * tmMatches = iterator->second; + tmMatches->calculateScore(); + + if (tmMatches->getScore() >= config->getAnubisThreshold()) { + result.push_back(AnubisSearchResult(tmMatches->getExampleId(), + tmMatches->getScore())); + } + } + + std::sort(result.begin(), result.end(), std::greater()); + return result; } diff --git a/concordia/anubis_searcher.hpp b/concordia/anubis_searcher.hpp index 72df7a4..c44b99c 100644 --- a/concordia/anubis_searcher.hpp +++ b/concordia/anubis_searcher.hpp @@ -7,6 +7,7 @@ #include "concordia/common/utils.hpp" #include "concordia/substring_occurence.hpp" #include "concordia/concordia_exception.hpp" +#include "concordia/concordia_config.hpp" #include "concordia/anubis_search_result.hpp" #include "concordia/tm_matches.hpp" @@ -27,6 +28,7 @@ public: virtual ~AnubisSearcher(); std::vector anubisSearch( + boost::shared_ptr config, boost::shared_ptr > T, boost::shared_ptr > markers, boost::shared_ptr > SA, diff --git a/concordia/concordia.cpp b/concordia/concordia.cpp index 976d21c..c9434f8 100644 --- a/concordia/concordia.cpp +++ b/concordia/concordia.cpp @@ -148,7 +148,7 @@ std::vector Concordia::anubisSearch( const std::string & pattern) throw(ConcordiaException) { if (_T->size() > 0) { - return _searcher->anubisSearch(_hashGenerator, _T, + return _searcher->anubisSearch(_config, _hashGenerator, _T, _markers, _SA, pattern); } else { std::vector result; diff --git a/concordia/concordia_config.cpp b/concordia/concordia_config.cpp index a60d42c..03462a0 100644 --- a/concordia/concordia_config.cpp +++ b/concordia/concordia_config.cpp @@ -1,4 +1,5 @@ #include +#include #include "concordia/concordia_config.hpp" #include
"concordia/common/logging.hpp" @@ -12,6 +13,7 @@ #define STOP_WORDS_PARAM "stop_words_path" #define NAMED_ENTITIES_PARAM "named_entities_path" #define STOP_SYMBOLS_PARAM "stop_symbols_path" +#define ANUBIS_THRESHOLD_PARAM "anubis_threshold" ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath) throw(ConcordiaException) { @@ -44,6 +46,9 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath) ConcordiaConfig::_readConfigParameterStr(NAMED_ENTITIES_PARAM); _stopSymbolsFilePath = ConcordiaConfig::_readConfigParameterStr(STOP_SYMBOLS_PARAM); + _anubisThreshold = + atof( + ConcordiaConfig::_readConfigParameterStr(ANUBIS_THRESHOLD_PARAM).c_str()); } ConcordiaConfig::~ConcordiaConfig() { diff --git a/concordia/concordia_config.hpp b/concordia/concordia_config.hpp index b8cebea..be69330 100644 --- a/concordia/concordia_config.hpp +++ b/concordia/concordia_config.hpp @@ -67,6 +67,10 @@ public: return _stopSymbolsFilePath; } + double getAnubisThreshold() { + return _anubisThreshold; + } + private: libconfig::Config _config; @@ -90,6 +94,8 @@ private: std::string _stopSymbolsFilePath; + double _anubisThreshold; + std::string _readConfigParameterStr(const std::string & name) throw(ConcordiaException); }; diff --git a/concordia/index_searcher.cpp b/concordia/index_searcher.cpp index f277a96..d42beb6 100644 --- a/concordia/index_searcher.cpp +++ b/concordia/index_searcher.cpp @@ -49,6 +49,7 @@ std::vector IndexSearcher::simpleSearch( } std::vector IndexSearcher::anubisSearch( + boost::shared_ptr config, boost::shared_ptr hashGenerator, boost::shared_ptr > T, boost::shared_ptr > markers, @@ -56,5 +57,5 @@ std::vector IndexSearcher::anubisSearch( const std::string & pattern) throw(ConcordiaException) { std::vector hash = hashGenerator->generateHash(pattern); - return _anubisSearcher->anubisSearch(T, markers, SA, hash); + return _anubisSearcher->anubisSearch(config, T, markers, SA, hash); } diff --git a/concordia/index_searcher.hpp 
b/concordia/index_searcher.hpp index 5fdcbad..068832d 100644 --- a/concordia/index_searcher.hpp +++ b/concordia/index_searcher.hpp @@ -36,6 +36,7 @@ public: const std::string & pattern) throw(ConcordiaException); std::vector anubisSearch( + boost::shared_ptr config, boost::shared_ptr hashGenerator, boost::shared_ptr > T, boost::shared_ptr > markers, diff --git a/concordia/t/test_concordia.cpp b/concordia/t/test_concordia.cpp index 6e07757..0554636 100644 --- a/concordia/t/test_concordia.cpp +++ b/concordia/t/test_concordia.cpp @@ -158,31 +158,41 @@ BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 ) 14: "Ala posiada kota" 51: "Ala posiada rysia" 123: "Marysia posiada rysia" - - Test word map: - Ala -> 0 - posiada -> 1 - kota -> 2 - rysia -> 3 - Marysia -> 4 - - Test hashed index: - n: 0 1 2 3 4 5 6 7 8 9 10 11 - T[n]: 0 1 2 | 0 1 3 | 4 1 3 | - - Test suffix array: - n: 0 1 2 3 4 5 6 7 8 9 10 11 - SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7 + */ + + // the below expectations assume 0.3 anubis threshold std::vector searchResult1 = concordia.anubisSearch("posiada rysia chyba"); - std::vector searchResult2 = concordia.anubisSearch("posiada kota Ala"); + BOOST_CHECK_EQUAL(searchResult1.size(), 2); + BOOST_CHECK_EQUAL(searchResult1.at(0).getExampleId(), 51); + BOOST_CHECK_CLOSE(searchResult1.at(0).getScore(), 0.5609, 0.1); + BOOST_CHECK_EQUAL(searchResult1.at(1).getExampleId(), 123); + BOOST_CHECK_CLOSE(searchResult1.at(1).getScore(), 0.5609, 0.1); + + + + std::vector searchResult2 = concordia.anubisSearch("Marysia posiada rysia"); + BOOST_CHECK_EQUAL(searchResult2.size(), 2); + BOOST_CHECK_EQUAL(searchResult2.at(0).getExampleId(), 123); + BOOST_CHECK_EQUAL(searchResult2.at(0).getScore(), 1); + BOOST_CHECK_EQUAL(searchResult2.at(1).getExampleId(), 51); + BOOST_CHECK_CLOSE(searchResult2.at(1).getScore(), 0.5609, 0.1); + + std::vector searchResult3 = concordia.anubisSearch("Nowe zdanie"); + BOOST_CHECK_EQUAL(searchResult3.size(), 0); + + std::vector searchResult4 = 
concordia.anubisSearch("Ala posiada kota chyba"); + BOOST_CHECK_EQUAL(searchResult4.size(), 2); + BOOST_CHECK_EQUAL(searchResult4.at(0).getExampleId(), 14); + BOOST_CHECK_CLOSE(searchResult4.at(0).getScore(), 0.848, 0.1); + BOOST_CHECK_EQUAL(searchResult4.at(1).getExampleId(), 51); + BOOST_CHECK_CLOSE(searchResult4.at(1).getScore(), 0.4707, 0.1); boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP)); boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS)); boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX)); /* - BOOST_CHECK_EQUAL(searchResult1.size(), 2); BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 123); BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 1); BOOST_CHECK_EQUAL(searchResult1.at(1).getId(), 51); diff --git a/getSentenceFromJRC.sh b/getSentenceFromJRC.sh new file mode 100755 index 0000000..3401a89 --- /dev/null +++ b/getSentenceFromJRC.sh @@ -0,0 +1,3 @@ +#!/bin/sh + +head -$1 prod/resources/text-files/jrc_smaller.txt | tail -1 diff --git a/prod/resources/concordia-config/concordia.cfg.in b/prod/resources/concordia-config/concordia.cfg.in index 416674c..38a13ed 100644 --- a/prod/resources/concordia-config/concordia.cfg.in +++ b/prod/resources/concordia-config/concordia.cfg.in @@ -3,8 +3,11 @@ #--------------------------- # +# Anubis score threshold +anubis_threshold = "0.3" + #Path to the Puddle tagset -puddle_tagset_path = "@PROD_PUDDLE_TAGSET_PATH@"; +puddle_tagset_path = "@PROD_PUDDLE_TAGSET_PATH@" #------------------------------------------------------------------------------- #Word map, hashed index and suffix array files are in a temporary directory diff --git a/tests/resources/concordia-config/concordia-mock.cfg b/tests/resources/concordia-config/concordia-mock.cfg index 1d9f7bd..cb7cab7 100644 --- a/tests/resources/concordia-config/concordia-mock.cfg +++ b/tests/resources/concordia-config/concordia-mock.cfg @@ -3,6 +3,9 @@ 
#--------------------------- # +# Anubis score threshold +anubis_threshold = "0.3" + #Path to the Puddle tagset puddle_tagset_path = "puddle/tagset.txt"; diff --git a/tests/resources/concordia-config/concordia.cfg.in b/tests/resources/concordia-config/concordia.cfg.in index 688f796..a5a2bea 100644 --- a/tests/resources/concordia-config/concordia.cfg.in +++ b/tests/resources/concordia-config/concordia.cfg.in @@ -3,6 +3,9 @@ #--------------------------- # +# Anubis score threshold +anubis_threshold = "0.3" + #Path to the Puddle tagset puddle_tagset_path = "@TEST_PUDDLE_TAGSET_PATH@";