anubis searcher -> concordia searcher

Former-commit-id: 8afe194adf3163ee62caa30732d9c9dd095df66b
This commit is contained in:
rjawor 2015-04-24 11:48:32 +02:00
parent 23aa113747
commit bb7608d05e
8 changed files with 31 additions and 29 deletions

View File

@ -1,6 +1,6 @@
---------------------------- Developer's private notes ----------------------------- ---------------------------- Developer's private notes -----------------------------
- document the code
IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie?
- testy zużycia pamięci - testy zużycia pamięci
- Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła. - Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła.

View File

@ -8,7 +8,7 @@ endforeach(dir)
add_library(concordia SHARED add_library(concordia SHARED
concordia_search_result.cpp concordia_search_result.cpp
matched_pattern_fragment.cpp matched_pattern_fragment.cpp
anubis_searcher.cpp concordia_searcher.cpp
regex_replacement.cpp regex_replacement.cpp
sentence_anonymizer.cpp sentence_anonymizer.cpp
interval.cpp interval.cpp
@ -33,7 +33,9 @@ add_subdirectory(t)
install(TARGETS concordia DESTINATION lib/) install(TARGETS concordia DESTINATION lib/)
install(FILES install(FILES
anubis_searcher.hpp concordia_search_result.hpp
matched_pattern_fragment.hpp
concordia_searcher.hpp
regex_replacement.hpp regex_replacement.hpp
sentence_anonymizer.hpp sentence_anonymizer.hpp
interval.hpp interval.hpp

View File

@ -1,17 +1,17 @@
#include "concordia/anubis_searcher.hpp" #include "concordia/concordia_searcher.hpp"
#include "concordia/common/logging.hpp" #include "concordia/common/logging.hpp"
#include <boost/foreach.hpp> #include <boost/foreach.hpp>
#include <iostream> #include <iostream>
AnubisSearcher::AnubisSearcher() { ConcordiaSearcher::ConcordiaSearcher() {
} }
AnubisSearcher::~AnubisSearcher() { ConcordiaSearcher::~ConcordiaSearcher() {
} }
void AnubisSearcher::concordiaSearch( void ConcordiaSearcher::concordiaSearch(
boost::shared_ptr<ConcordiaSearchResult> result, boost::shared_ptr<ConcordiaSearchResult> result,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@ -51,7 +51,7 @@ void AnubisSearcher::concordiaSearch(
result->sortFragments(); result->sortFragments();
} }
std::vector<AnubisSearchResult> AnubisSearcher::anubisSearch( std::vector<AnubisSearchResult> ConcordiaSearcher::anubisSearch(
boost::shared_ptr<ConcordiaConfig> config, boost::shared_ptr<ConcordiaConfig> config,
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
@ -83,7 +83,7 @@ std::vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
return result; return result;
} }
boost::shared_ptr<TmMatchesMap> AnubisSearcher::getTmMatches( boost::shared_ptr<TmMatchesMap> ConcordiaSearcher::getTmMatches(
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA, boost::shared_ptr<std::vector<saidx_t> > SA,
@ -156,7 +156,7 @@ boost::shared_ptr<TmMatchesMap> AnubisSearcher::getTmMatches(
return tmMatchesMap; return tmMatchesMap;
} }
std::vector<SubstringOccurence> AnubisSearcher::lcpSearch( std::vector<SubstringOccurence> ConcordiaSearcher::lcpSearch(
boost::shared_ptr<std::vector<sauchar_t> > T, boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA, boost::shared_ptr<std::vector<saidx_t> > SA,
@ -209,7 +209,7 @@ std::vector<SubstringOccurence> AnubisSearcher::lcpSearch(
return result; return result;
} }
void AnubisSearcher::_collectResults( void ConcordiaSearcher::_collectResults(
std::vector<SubstringOccurence> & result, std::vector<SubstringOccurence> & result,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA, boost::shared_ptr<std::vector<saidx_t> > SA,
@ -232,7 +232,7 @@ void AnubisSearcher::_collectResults(
} }
} }
void AnubisSearcher::_addToMap(boost::shared_ptr<std::vector<saidx_t> > SA, void ConcordiaSearcher::_addToMap(boost::shared_ptr<std::vector<saidx_t> > SA,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TmMatchesMap> tmMatchesMap, boost::shared_ptr<TmMatchesMap> tmMatchesMap,
saidx_t sa_pos, saidx_t sa_pos,
@ -249,7 +249,7 @@ void AnubisSearcher::_addToMap(boost::shared_ptr<std::vector<saidx_t> > SA,
} }
} }
bool AnubisSearcher::_getOccurenceFromSA( bool ConcordiaSearcher::_getOccurenceFromSA(
boost::shared_ptr<std::vector<saidx_t> > SA, boost::shared_ptr<std::vector<saidx_t> > SA,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers, boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
saidx_t sa_pos, saidx_t sa_pos,
@ -263,7 +263,7 @@ bool AnubisSearcher::_getOccurenceFromSA(
} }
} }
void AnubisSearcher::_addOccurenceToMap( void ConcordiaSearcher::_addOccurenceToMap(
boost::shared_ptr<TmMatchesMap> tmMatchesMap, boost::shared_ptr<TmMatchesMap> tmMatchesMap,
SubstringOccurence & occurence, SubstringOccurence & occurence,
SUFFIX_MARKER_TYPE totalPatternLength, SUFFIX_MARKER_TYPE totalPatternLength,

View File

@ -16,17 +16,17 @@
#include <divsufsort.h> #include <divsufsort.h>
/*! /*!
Class for searching using Anubis algorithm. Class for searching using Concordia algorithm.
*/ */
class AnubisSearcher { class ConcordiaSearcher {
public: public:
explicit AnubisSearcher(); explicit ConcordiaSearcher();
/*! Destructor. /*! Destructor.
*/ */
virtual ~AnubisSearcher(); virtual ~ConcordiaSearcher();
void concordiaSearch( void concordiaSearch(
boost::shared_ptr<ConcordiaSearchResult> result, boost::shared_ptr<ConcordiaSearchResult> result,

View File

@ -4,8 +4,8 @@
#include <boost/filesystem.hpp> #include <boost/filesystem.hpp>
IndexSearcher::IndexSearcher() { IndexSearcher::IndexSearcher() {
_anubisSearcher = boost::shared_ptr<AnubisSearcher>( _concordiaSearcher = boost::shared_ptr<ConcordiaSearcher>(
new AnubisSearcher()); new ConcordiaSearcher());
} }
@ -57,7 +57,7 @@ std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
const std::string & pattern) throw(ConcordiaException) { const std::string & pattern) throw(ConcordiaException) {
std::vector<INDEX_CHARACTER_TYPE> hash = std::vector<INDEX_CHARACTER_TYPE> hash =
hashGenerator->generateHash(pattern); hashGenerator->generateHash(pattern);
return _anubisSearcher->anubisSearch(config, T, markers, SA, hash); return _concordiaSearcher->anubisSearch(config, T, markers, SA, hash);
} }
boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch( boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
@ -72,6 +72,6 @@ boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
boost::shared_ptr<ConcordiaSearchResult>( boost::shared_ptr<ConcordiaSearchResult>(
new ConcordiaSearchResult(hashGenerator->generateTokenVector(pattern))); new ConcordiaSearchResult(hashGenerator->generateTokenVector(pattern)));
_anubisSearcher->concordiaSearch(result, T, markers, SA, hash); _concordiaSearcher->concordiaSearch(result, T, markers, SA, hash);
return result; return result;
} }

View File

@ -10,7 +10,7 @@
#include "concordia/substring_occurence.hpp" #include "concordia/substring_occurence.hpp"
#include "concordia/hash_generator.hpp" #include "concordia/hash_generator.hpp"
#include "concordia/concordia_exception.hpp" #include "concordia/concordia_exception.hpp"
#include "concordia/anubis_searcher.hpp" #include "concordia/concordia_searcher.hpp"
#include "concordia/anubis_search_result.hpp" #include "concordia/anubis_search_result.hpp"
#include <divsufsort.h> #include <divsufsort.h>
@ -51,7 +51,7 @@ public:
const std::string & pattern) throw(ConcordiaException); const std::string & pattern) throw(ConcordiaException);
private: private:
boost::shared_ptr<AnubisSearcher> _anubisSearcher; boost::shared_ptr<ConcordiaSearcher> _concordiaSearcher;
}; };
#endif #endif

View File

@ -1,5 +1,5 @@
add_library(concordia-tests add_library(concordia-tests
test_anubis_searcher.cpp test_concordia_searcher.cpp
test_sentence_anonymizer.cpp test_sentence_anonymizer.cpp
test_text_utils.cpp test_text_utils.cpp
test_regex_replacement.cpp test_regex_replacement.cpp

View File

@ -2,7 +2,7 @@
#include "tests/unit-tests/unit_tests_globals.hpp" #include "tests/unit-tests/unit_tests_globals.hpp"
#include "concordia/tm_matches.hpp" #include "concordia/tm_matches.hpp"
#include "concordia/anubis_searcher.hpp" #include "concordia/concordia_searcher.hpp"
#include "concordia/concordia_index.hpp" #include "concordia/concordia_index.hpp"
#include "concordia/concordia_config.hpp" #include "concordia/concordia_config.hpp"
#include "concordia/example.hpp" #include "concordia/example.hpp"
@ -12,11 +12,11 @@
#include "concordia/common/logging.hpp" #include "concordia/common/logging.hpp"
#include "tests/common/test_resources_manager.hpp" #include "tests/common/test_resources_manager.hpp"
BOOST_AUTO_TEST_SUITE(anubis_searcher) BOOST_AUTO_TEST_SUITE(concordia_searcher)
BOOST_AUTO_TEST_CASE( LcpSearch1 ) BOOST_AUTO_TEST_CASE( LcpSearch1 )
{ {
AnubisSearcher searcher; ConcordiaSearcher searcher;
boost::shared_ptr<std::vector<sauchar_t> > T(new std::vector<sauchar_t>()); boost::shared_ptr<std::vector<sauchar_t> > T(new std::vector<sauchar_t>());
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers(new std::vector<SUFFIX_MARKER_TYPE>()); boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers(new std::vector<SUFFIX_MARKER_TYPE>());
boost::shared_ptr<std::vector<saidx_t> > SA(new std::vector<saidx_t>()); boost::shared_ptr<std::vector<saidx_t> > SA(new std::vector<saidx_t>());
@ -333,7 +333,7 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
BOOST_AUTO_TEST_CASE( TmMatchesTest ) BOOST_AUTO_TEST_CASE( TmMatchesTest )
{ {
AnubisSearcher searcher; ConcordiaSearcher searcher;
/*The test index contains 3 sentences: /*The test index contains 3 sentences:
14: "Ala posiada kota" 14: "Ala posiada kota"