anubis searcher -> concordia searcher
Former-commit-id: 8afe194adf3163ee62caa30732d9c9dd095df66b
This commit is contained in:
parent
23aa113747
commit
bb7608d05e
2
TODO.txt
2
TODO.txt
@ -1,6 +1,6 @@
|
|||||||
---------------------------- Developer's private notes -----------------------------
|
---------------------------- Developer's private notes -----------------------------
|
||||||
|
|
||||||
|
- document the code
|
||||||
IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie?
|
IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie?
|
||||||
- testy zużycia pamięci
|
- testy zużycia pamięci
|
||||||
- Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła.
|
- Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła.
|
||||||
|
@ -8,7 +8,7 @@ endforeach(dir)
|
|||||||
add_library(concordia SHARED
|
add_library(concordia SHARED
|
||||||
concordia_search_result.cpp
|
concordia_search_result.cpp
|
||||||
matched_pattern_fragment.cpp
|
matched_pattern_fragment.cpp
|
||||||
anubis_searcher.cpp
|
concordia_searcher.cpp
|
||||||
regex_replacement.cpp
|
regex_replacement.cpp
|
||||||
sentence_anonymizer.cpp
|
sentence_anonymizer.cpp
|
||||||
interval.cpp
|
interval.cpp
|
||||||
@ -33,7 +33,9 @@ add_subdirectory(t)
|
|||||||
|
|
||||||
install(TARGETS concordia DESTINATION lib/)
|
install(TARGETS concordia DESTINATION lib/)
|
||||||
install(FILES
|
install(FILES
|
||||||
anubis_searcher.hpp
|
concordia_search_result.hpp
|
||||||
|
matched_pattern_fragment.hpp
|
||||||
|
concordia_searcher.hpp
|
||||||
regex_replacement.hpp
|
regex_replacement.hpp
|
||||||
sentence_anonymizer.hpp
|
sentence_anonymizer.hpp
|
||||||
interval.hpp
|
interval.hpp
|
||||||
|
@ -1,17 +1,17 @@
|
|||||||
#include "concordia/anubis_searcher.hpp"
|
#include "concordia/concordia_searcher.hpp"
|
||||||
|
|
||||||
#include "concordia/common/logging.hpp"
|
#include "concordia/common/logging.hpp"
|
||||||
#include <boost/foreach.hpp>
|
#include <boost/foreach.hpp>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
AnubisSearcher::AnubisSearcher() {
|
ConcordiaSearcher::ConcordiaSearcher() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
AnubisSearcher::~AnubisSearcher() {
|
ConcordiaSearcher::~ConcordiaSearcher() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void AnubisSearcher::concordiaSearch(
|
void ConcordiaSearcher::concordiaSearch(
|
||||||
boost::shared_ptr<ConcordiaSearchResult> result,
|
boost::shared_ptr<ConcordiaSearchResult> result,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
@ -51,7 +51,7 @@ void AnubisSearcher::concordiaSearch(
|
|||||||
result->sortFragments();
|
result->sortFragments();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
|
std::vector<AnubisSearchResult> ConcordiaSearcher::anubisSearch(
|
||||||
boost::shared_ptr<ConcordiaConfig> config,
|
boost::shared_ptr<ConcordiaConfig> config,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
@ -83,7 +83,7 @@ std::vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
boost::shared_ptr<TmMatchesMap> AnubisSearcher::getTmMatches(
|
boost::shared_ptr<TmMatchesMap> ConcordiaSearcher::getTmMatches(
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
@ -156,7 +156,7 @@ boost::shared_ptr<TmMatchesMap> AnubisSearcher::getTmMatches(
|
|||||||
return tmMatchesMap;
|
return tmMatchesMap;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<SubstringOccurence> AnubisSearcher::lcpSearch(
|
std::vector<SubstringOccurence> ConcordiaSearcher::lcpSearch(
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
@ -209,7 +209,7 @@ std::vector<SubstringOccurence> AnubisSearcher::lcpSearch(
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
void AnubisSearcher::_collectResults(
|
void ConcordiaSearcher::_collectResults(
|
||||||
std::vector<SubstringOccurence> & result,
|
std::vector<SubstringOccurence> & result,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
@ -232,7 +232,7 @@ void AnubisSearcher::_collectResults(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void AnubisSearcher::_addToMap(boost::shared_ptr<std::vector<saidx_t> > SA,
|
void ConcordiaSearcher::_addToMap(boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
boost::shared_ptr<TmMatchesMap> tmMatchesMap,
|
boost::shared_ptr<TmMatchesMap> tmMatchesMap,
|
||||||
saidx_t sa_pos,
|
saidx_t sa_pos,
|
||||||
@ -249,7 +249,7 @@ void AnubisSearcher::_addToMap(boost::shared_ptr<std::vector<saidx_t> > SA,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool AnubisSearcher::_getOccurenceFromSA(
|
bool ConcordiaSearcher::_getOccurenceFromSA(
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
saidx_t sa_pos,
|
saidx_t sa_pos,
|
||||||
@ -263,7 +263,7 @@ bool AnubisSearcher::_getOccurenceFromSA(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void AnubisSearcher::_addOccurenceToMap(
|
void ConcordiaSearcher::_addOccurenceToMap(
|
||||||
boost::shared_ptr<TmMatchesMap> tmMatchesMap,
|
boost::shared_ptr<TmMatchesMap> tmMatchesMap,
|
||||||
SubstringOccurence & occurence,
|
SubstringOccurence & occurence,
|
||||||
SUFFIX_MARKER_TYPE totalPatternLength,
|
SUFFIX_MARKER_TYPE totalPatternLength,
|
@ -16,17 +16,17 @@
|
|||||||
#include <divsufsort.h>
|
#include <divsufsort.h>
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class for searching using the Anubis algorithm.
|
Class for searching using the Concordia algorithm.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
class AnubisSearcher {
|
class ConcordiaSearcher {
|
||||||
public:
|
public:
|
||||||
explicit AnubisSearcher();
|
explicit ConcordiaSearcher();
|
||||||
|
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
virtual ~AnubisSearcher();
|
virtual ~ConcordiaSearcher();
|
||||||
|
|
||||||
void concordiaSearch(
|
void concordiaSearch(
|
||||||
boost::shared_ptr<ConcordiaSearchResult> result,
|
boost::shared_ptr<ConcordiaSearchResult> result,
|
@ -4,8 +4,8 @@
|
|||||||
#include <boost/filesystem.hpp>
|
#include <boost/filesystem.hpp>
|
||||||
|
|
||||||
IndexSearcher::IndexSearcher() {
|
IndexSearcher::IndexSearcher() {
|
||||||
_anubisSearcher = boost::shared_ptr<AnubisSearcher>(
|
_concordiaSearcher = boost::shared_ptr<ConcordiaSearcher>(
|
||||||
new AnubisSearcher());
|
new ConcordiaSearcher());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -57,7 +57,7 @@ std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
|
|||||||
const std::string & pattern) throw(ConcordiaException) {
|
const std::string & pattern) throw(ConcordiaException) {
|
||||||
std::vector<INDEX_CHARACTER_TYPE> hash =
|
std::vector<INDEX_CHARACTER_TYPE> hash =
|
||||||
hashGenerator->generateHash(pattern);
|
hashGenerator->generateHash(pattern);
|
||||||
return _anubisSearcher->anubisSearch(config, T, markers, SA, hash);
|
return _concordiaSearcher->anubisSearch(config, T, markers, SA, hash);
|
||||||
}
|
}
|
||||||
|
|
||||||
boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
|
boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
|
||||||
@ -72,6 +72,6 @@ boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
|
|||||||
boost::shared_ptr<ConcordiaSearchResult>(
|
boost::shared_ptr<ConcordiaSearchResult>(
|
||||||
new ConcordiaSearchResult(hashGenerator->generateTokenVector(pattern)));
|
new ConcordiaSearchResult(hashGenerator->generateTokenVector(pattern)));
|
||||||
|
|
||||||
_anubisSearcher->concordiaSearch(result, T, markers, SA, hash);
|
_concordiaSearcher->concordiaSearch(result, T, markers, SA, hash);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
@ -10,7 +10,7 @@
|
|||||||
#include "concordia/substring_occurence.hpp"
|
#include "concordia/substring_occurence.hpp"
|
||||||
#include "concordia/hash_generator.hpp"
|
#include "concordia/hash_generator.hpp"
|
||||||
#include "concordia/concordia_exception.hpp"
|
#include "concordia/concordia_exception.hpp"
|
||||||
#include "concordia/anubis_searcher.hpp"
|
#include "concordia/concordia_searcher.hpp"
|
||||||
#include "concordia/anubis_search_result.hpp"
|
#include "concordia/anubis_search_result.hpp"
|
||||||
|
|
||||||
#include <divsufsort.h>
|
#include <divsufsort.h>
|
||||||
@ -51,7 +51,7 @@ public:
|
|||||||
const std::string & pattern) throw(ConcordiaException);
|
const std::string & pattern) throw(ConcordiaException);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
boost::shared_ptr<AnubisSearcher> _anubisSearcher;
|
boost::shared_ptr<ConcordiaSearcher> _concordiaSearcher;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
add_library(concordia-tests
|
add_library(concordia-tests
|
||||||
test_anubis_searcher.cpp
|
test_concordia_searcher.cpp
|
||||||
test_sentence_anonymizer.cpp
|
test_sentence_anonymizer.cpp
|
||||||
test_text_utils.cpp
|
test_text_utils.cpp
|
||||||
test_regex_replacement.cpp
|
test_regex_replacement.cpp
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||||
#include "concordia/tm_matches.hpp"
|
#include "concordia/tm_matches.hpp"
|
||||||
#include "concordia/anubis_searcher.hpp"
|
#include "concordia/concordia_searcher.hpp"
|
||||||
#include "concordia/concordia_index.hpp"
|
#include "concordia/concordia_index.hpp"
|
||||||
#include "concordia/concordia_config.hpp"
|
#include "concordia/concordia_config.hpp"
|
||||||
#include "concordia/example.hpp"
|
#include "concordia/example.hpp"
|
||||||
@ -12,11 +12,11 @@
|
|||||||
#include "concordia/common/logging.hpp"
|
#include "concordia/common/logging.hpp"
|
||||||
#include "tests/common/test_resources_manager.hpp"
|
#include "tests/common/test_resources_manager.hpp"
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE(anubis_searcher)
|
BOOST_AUTO_TEST_SUITE(concordia_searcher)
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
||||||
{
|
{
|
||||||
AnubisSearcher searcher;
|
ConcordiaSearcher searcher;
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T(new std::vector<sauchar_t>());
|
boost::shared_ptr<std::vector<sauchar_t> > T(new std::vector<sauchar_t>());
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers(new std::vector<SUFFIX_MARKER_TYPE>());
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers(new std::vector<SUFFIX_MARKER_TYPE>());
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA(new std::vector<saidx_t>());
|
boost::shared_ptr<std::vector<saidx_t> > SA(new std::vector<saidx_t>());
|
||||||
@ -333,7 +333,7 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
|||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( TmMatchesTest )
|
BOOST_AUTO_TEST_CASE( TmMatchesTest )
|
||||||
{
|
{
|
||||||
AnubisSearcher searcher;
|
ConcordiaSearcher searcher;
|
||||||
|
|
||||||
/*The test index contains 3 sentences:
|
/*The test index contains 3 sentences:
|
||||||
14: "Ala posiada kota"
|
14: "Ala posiada kota"
|
Loading…
Reference in New Issue
Block a user