concordia search
Former-commit-id: 609c3a54e930ebae45a2e9a07f63991ec4abc9a6
This commit is contained in:
parent
0927e2ed1f
commit
024fbf72aa
5
TODO.txt
5
TODO.txt
@ -1,8 +1,8 @@
|
|||||||
- wyszukiwanie zdania: wyszukanie najdłuższych pasujących fragmentów Anubisem, 1D (approximate) bin packing. Nazwijmy to concordia search. Wyszukiwane są wszystkie najdłuższe dopasowania patternu dzięki LCP search. Zwracany jest wynik w postaci listy najdłuższych dopasowanych fragmentów, posortowanych malejąco po długości, z maksymalnie 3 przedstawicielami każdej długości. Dodatkowo obliczany jest zestaw optymalnego pokrycia patternu.
|
- wyłączyć stopWords
|
||||||
|
- Przy concordia search dodatkowo obliczany ma być zestaw optymalnego pokrycia patternu. Może siłowo? (jeśli przyjąć max dł. zdania 500 tokenów, to nie powinno być źle)
|
||||||
- Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła.
|
- Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła.
|
||||||
- Multi-threading?
|
- Multi-threading?
|
||||||
- concordia-server (zastanowić się, czy nie napisać CAT-a)
|
- concordia-server (zastanowić się, czy nie napisać CAT-a)
|
||||||
- wyłączyć stopWords
|
|
||||||
- zastanowić się nad empty hash examples (rozwiązanie: w ogóle nie szukać fraz o pustym hashu, rzucać wyjątek).
|
- zastanowić się nad empty hash examples (rozwiązanie: w ogóle nie szukać fraz o pustym hashu, rzucać wyjątek).
|
||||||
- puścić 100% search test na jrc
|
- puścić 100% search test na jrc
|
||||||
|
|
||||||
@ -10,6 +10,7 @@
|
|||||||
|
|
||||||
---------------------------- Archive -----------------------------
|
---------------------------- Archive -----------------------------
|
||||||
|
|
||||||
|
DONE - wyszukiwanie zdania: wyszukanie najdłuższych pasujących fragmentów Anubisem, 1D (approximate) bin packing. Nazwijmy to concordia search. Wyszukiwane są wszystkie najdłuższe dopasowania patternu dzięki LCP search. Zwracany jest wynik w postaci listy najdłuższych dopasowanych fragmentów, posortowanych malejąco po długości, z maksymalnie 3 przedstawicielami każdej długości.
|
||||||
|
|
||||||
DONE 1. lokalizowane to_lower (wykorzystać utf8case, naprawić testy)
|
DONE 1. lokalizowane to_lower (wykorzystać utf8case, naprawić testy)
|
||||||
DONE 2. anonimizacja zdań
|
DONE 2. anonimizacja zdań
|
||||||
|
5
concordia-concordiasearch-jrc.sh
Executable file
5
concordia-concordiasearch-jrc.sh
Executable file
@ -0,0 +1,5 @@
|
|||||||
|
#!/bin/sh
# Runs concordia-console in concordia-search mode (-x) with the production
# configuration file, searching for the pattern given as the first argument.
#
# Usage: concordia-concordiasearch-jrc.sh PATTERN

# Refuse to run without a pattern instead of silently searching for "".
if [ "$#" -lt 1 ]; then
    echo "Usage: $0 PATTERN" >&2
    exit 1
fi

./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -x "$1"
|
||||||
|
|
@ -25,9 +25,12 @@ int main(int argc, char** argv) {
|
|||||||
"Concordia configuration file (required)")
|
"Concordia configuration file (required)")
|
||||||
("simple-search,s", boost::program_options::value<std::string>(),
|
("simple-search,s", boost::program_options::value<std::string>(),
|
||||||
"Pattern to be searched in the index")
|
"Pattern to be searched in the index")
|
||||||
("silent,n", "While searching with simple-search, do not output search results")
|
("silent,n",
|
||||||
|
"While searching with simple-search, do not output search results")
|
||||||
("anubis-search,a", boost::program_options::value<std::string>(),
|
("anubis-search,a", boost::program_options::value<std::string>(),
|
||||||
"Pattern to be searched by anubis search in the index")
|
"Pattern to be searched by anubis search in the index")
|
||||||
|
("concordia-search,x", boost::program_options::value<std::string>(),
|
||||||
|
"Pattern to be searched by concordia search in the index")
|
||||||
("read-file,r", boost::program_options::value<std::string>(),
|
("read-file,r", boost::program_options::value<std::string>(),
|
||||||
"File to be read and added to index");
|
"File to be read and added to index");
|
||||||
|
|
||||||
@ -102,6 +105,38 @@ int main(int argc, char** argv) {
|
|||||||
<< std::endl;
|
<< std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else if (cli.count("concordia-search")) {
|
||||||
|
std::string pattern = cli["concordia-search"].as<std::string>();
|
||||||
|
std::cout << "\tConcordia searching for pattern: \"" << pattern <<
|
||||||
|
"\"" << std::endl;
|
||||||
|
time_start = boost::posix_time::microsec_clock::local_time();
|
||||||
|
boost::shared_ptr<ConcordiaSearchResult> result =
|
||||||
|
concordia.concordiaSearch(pattern);
|
||||||
|
time_end = boost::posix_time::microsec_clock::local_time();
|
||||||
|
msdiff = time_end - time_start;
|
||||||
|
|
||||||
|
std::cout << "\tPattern used: " << std::endl << "\t\t";
|
||||||
|
BOOST_FOREACH(std::string token, result->getTokenVector()) {
|
||||||
|
std::cout << token << " ";
|
||||||
|
}
|
||||||
|
std::cout << std::endl;
|
||||||
|
|
||||||
|
std::cout << "\tFound: " << result->getFragments().size()
|
||||||
|
<< " matches. " << "Search took: " <<
|
||||||
|
msdiff.total_milliseconds() << "ms." << std::endl;
|
||||||
|
if (!cli.count("silent")) {
|
||||||
|
BOOST_FOREACH(MatchedPatternFragment fragment,
|
||||||
|
result->getFragments()) {
|
||||||
|
std::cout << "\t\tfound matching fragment "
|
||||||
|
<< "(exampleId, exampleOffset,"
|
||||||
|
<< " patternOffset, length): "
|
||||||
|
<< fragment.getExampleId() << ","
|
||||||
|
<< fragment.getExampleOffset() << ","
|
||||||
|
<< fragment.getPatternOffset() << ","
|
||||||
|
<< fragment.getMatchedLength() << ","
|
||||||
|
<< std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
} else if (cli.count("read-file")) {
|
} else if (cli.count("read-file")) {
|
||||||
std::string filePath = cli["read-file"].as<std::string>();
|
std::string filePath = cli["read-file"].as<std::string>();
|
||||||
std::cout << "\tReading sentences from file: " << filePath <<
|
std::cout << "\tReading sentences from file: " << filePath <<
|
||||||
|
@ -12,10 +12,10 @@ echo "CONCORDIA RUNNER: reading from file"
|
|||||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -r prod/resources/text-files/jrc_smaller.txt
|
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -r prod/resources/text-files/jrc_smaller.txt
|
||||||
|
|
||||||
|
|
||||||
echo "CONCORDIA RUNNER: anubis searching for pattern: \"Współpraca Państw Członkowskich i Komisji Europejskiej\""
|
echo "CONCORDIA RUNNER: concordia searching for pattern: \"Współpraca Państw Członkowskich i Komisji Europejskiej\""
|
||||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -a "Współpraca Państw Członkowskich i Komisji Europejskiej"
|
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -x "Współpraca Państw Członkowskich i Komisji Europejskiej"
|
||||||
echo "CONCORDIA RUNNER: anubis searching for pattern: \"8. W odniesieniu do artykułu 45 ustęp 12\""
|
echo "CONCORDIA RUNNER: concordia searching for pattern: \"8. W odniesieniu do artykułu 45 ustęp 12\""
|
||||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -a "8. W odniesieniu do artykułu 45 ustęp 12"
|
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -x "8. W odniesieniu do artykułu 45 ustęp 12"
|
||||||
|
|
||||||
echo "CONCORDIA RUNNER: searching for pattern: \"Parlamentu Europejskiego\""
|
echo "CONCORDIA RUNNER: searching for pattern: \"Parlamentu Europejskiego\""
|
||||||
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Parlamentu Europejskiego" -n
|
./build/concordia-console/concordia-console -c prod/resources/concordia-config/concordia.cfg -s "Parlamentu Europejskiego" -n
|
||||||
|
@ -6,6 +6,8 @@ foreach(dir ${ALL_DIRECTORIES})
|
|||||||
endforeach(dir)
|
endforeach(dir)
|
||||||
|
|
||||||
add_library(concordia SHARED
|
add_library(concordia SHARED
|
||||||
|
concordia_search_result.cpp
|
||||||
|
matched_pattern_fragment.cpp
|
||||||
anubis_searcher.cpp
|
anubis_searcher.cpp
|
||||||
regex_replacement.cpp
|
regex_replacement.cpp
|
||||||
sentence_anonymizer.cpp
|
sentence_anonymizer.cpp
|
||||||
|
@ -11,6 +11,42 @@ AnubisSearcher::AnubisSearcher() {
|
|||||||
AnubisSearcher::~AnubisSearcher() {
|
AnubisSearcher::~AnubisSearcher() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Collects matched pattern fragments for concordia search.
    For every offset of the pattern, the remaining part of the pattern
    (starting at that offset) is handed to lcpSearch. Every occurence
    returned by lcpSearch is added to the result as a fragment, together
    with the pattern offset and the length reported by lcpSearch scaled
    down by sizeof(INDEX_CHARACTER_TYPE). Finally the fragments are sorted
    by the result object.
  \param result search result object which receives the fragments
  \param T index data, passed through to lcpSearch
  \param markers suffix markers, passed through to lcpSearch
  \param SA suffix array, passed through to lcpSearch
  \param pattern hashed pattern to search for
  \throws ConcordiaException when the converted pattern does not have
          the expected size
*/
void AnubisSearcher::concordiaSearch(
                boost::shared_ptr<ConcordiaSearchResult> result,
                boost::shared_ptr<std::vector<sauchar_t> > T,
                boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
                boost::shared_ptr<std::vector<saidx_t> > SA,
                const std::vector<INDEX_CHARACTER_TYPE> & pattern)
                                                 throw(ConcordiaException) {
    // Number of sauchar_t elements that make up one pattern element.
    const std::size_t charWidth = sizeof(INDEX_CHARACTER_TYPE);

    const std::vector<sauchar_t> patternVector =
        Utils::indexVectorToSaucharVector(pattern);

    if (patternVector.size() != pattern.size() * charWidth) {
        throw ConcordiaException("Increasing pattern resolution went wrong.");
    }

    // Unsigned index: avoids the signed/unsigned comparison with size().
    for (std::size_t offset = 0; offset < pattern.size(); ++offset) {
        // Suffix of the pattern starting at the current offset.
        std::vector<sauchar_t> currentPattern(
            patternVector.begin() + offset * charWidth, patternVector.end());

        // Initialised so that the division below never reads an
        // indeterminate value, whatever lcpSearch does with it.
        SUFFIX_MARKER_TYPE lcpLength = 0;
        std::vector<SubstringOccurence> occurences =
            lcpSearch(T, markers, SA, currentPattern, lcpLength);

        // Iterate by reference, there is no need to copy each occurence.
        BOOST_FOREACH(SubstringOccurence & occurence, occurences) {
            result->addFragment(MatchedPatternFragment(
                occurence.getId(),
                occurence.getOffset(),
                static_cast<SUFFIX_MARKER_TYPE>(offset),
                lcpLength / charWidth));
        }
    }

    result->sortFragments();
}
|
||||||
|
|
||||||
std::vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
|
std::vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
|
||||||
boost::shared_ptr<ConcordiaConfig> config,
|
boost::shared_ptr<ConcordiaConfig> config,
|
||||||
@ -28,8 +64,8 @@ std::vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
|
|||||||
// 4. sort the AnubisSearchResult vector descending
|
// 4. sort the AnubisSearchResult vector descending
|
||||||
|
|
||||||
std::vector<AnubisSearchResult> result;
|
std::vector<AnubisSearchResult> result;
|
||||||
for(TmMatchesMapIterator iterator = tmMatchesMap->begin();
|
for (TmMatchesMapIterator iterator = tmMatchesMap->begin();
|
||||||
iterator != tmMatchesMap->end(); iterator++) {
|
iterator != tmMatchesMap->end(); ++iterator) {
|
||||||
TmMatches * tmMatches = iterator->second;
|
TmMatches * tmMatches = iterator->second;
|
||||||
tmMatches->calculateScore();
|
tmMatches->calculateScore();
|
||||||
|
|
||||||
@ -175,6 +211,7 @@ void AnubisSearcher::_collectResults(
|
|||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
saidx_t left, saidx_t size) {
|
saidx_t left, saidx_t size) {
|
||||||
|
int resultsCount = 0;
|
||||||
for (saidx_t i = 0; i < size; i++) {
|
for (saidx_t i = 0; i < size; i++) {
|
||||||
saidx_t resultPos = SA->at(left + i);
|
saidx_t resultPos = SA->at(left + i);
|
||||||
|
|
||||||
@ -182,6 +219,12 @@ void AnubisSearcher::_collectResults(
|
|||||||
SUFFIX_MARKER_TYPE marker =
|
SUFFIX_MARKER_TYPE marker =
|
||||||
markers->at(resultPos / sizeof(INDEX_CHARACTER_TYPE));
|
markers->at(resultPos / sizeof(INDEX_CHARACTER_TYPE));
|
||||||
result.push_back(SubstringOccurence(marker));
|
result.push_back(SubstringOccurence(marker));
|
||||||
|
|
||||||
|
// truncate results,
|
||||||
|
// we don't need too many identical pattern overlays
|
||||||
|
if (++resultsCount >= CONCORDIA_SEARCH_MAX_RESULTS) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -8,10 +8,11 @@
|
|||||||
#include "concordia/substring_occurence.hpp"
|
#include "concordia/substring_occurence.hpp"
|
||||||
#include "concordia/concordia_exception.hpp"
|
#include "concordia/concordia_exception.hpp"
|
||||||
#include "concordia/concordia_config.hpp"
|
#include "concordia/concordia_config.hpp"
|
||||||
|
#include "concordia/concordia_search_result.hpp"
|
||||||
#include "concordia/anubis_search_result.hpp"
|
#include "concordia/anubis_search_result.hpp"
|
||||||
#include "concordia/tm_matches.hpp"
|
#include "concordia/tm_matches.hpp"
|
||||||
|
|
||||||
#include<vector>
|
#include <vector>
|
||||||
#include <divsufsort.h>
|
#include <divsufsort.h>
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
@ -27,6 +28,14 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~AnubisSearcher();
|
virtual ~AnubisSearcher();
|
||||||
|
|
||||||
|
void concordiaSearch(
|
||||||
|
boost::shared_ptr<ConcordiaSearchResult> result,
|
||||||
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
|
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
|
||||||
|
throw(ConcordiaException);
|
||||||
|
|
||||||
std::vector<AnubisSearchResult> anubisSearch(
|
std::vector<AnubisSearchResult> anubisSearch(
|
||||||
boost::shared_ptr<ConcordiaConfig> config,
|
boost::shared_ptr<ConcordiaConfig> config,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
|
@ -28,3 +28,5 @@ typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
|
|||||||
// The sentence marker is built as follows: its first bytes store the
|
// The sentence marker is built as follows: its first bytes store the
|
||||||
// sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES store the suffix offset
|
// sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES store the suffix offset
|
||||||
// and the last SUFFIX_MARKER_SENTENCE_BYTES store the sentence length.
|
// and the last SUFFIX_MARKER_SENTENCE_BYTES store the sentence length.
|
||||||
|
|
||||||
|
#define CONCORDIA_SEARCH_MAX_RESULTS 3
|
||||||
|
@ -156,3 +156,17 @@ std::vector<AnubisSearchResult> Concordia::anubisSearch(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Performs concordia search for the given pattern.
    When the index data (_T) is empty, the searcher is not consulted and
    a result constructed from an empty token vector is returned instead.
  \param pattern pattern to be searched
  \returns shared pointer to the search result
*/
boost::shared_ptr<ConcordiaSearchResult> Concordia::concordiaSearch(
        const std::string & pattern) throw(ConcordiaException) {
    // Nothing indexed: hand back a result with no tokens.
    if (_T->empty()) {
        const std::vector<std::string> noTokens;
        return boost::shared_ptr<ConcordiaSearchResult>(
            new ConcordiaSearchResult(noTokens));
    }

    return _searcher->concordiaSearch(
        _hashGenerator, _T, _markers, _SA, pattern);
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -12,6 +12,7 @@
|
|||||||
#include "concordia/concordia_config.hpp"
|
#include "concordia/concordia_config.hpp"
|
||||||
#include "concordia/concordia_index.hpp"
|
#include "concordia/concordia_index.hpp"
|
||||||
#include "concordia/index_searcher.hpp"
|
#include "concordia/index_searcher.hpp"
|
||||||
|
#include "concordia/concordia_search_result.hpp"
|
||||||
#include "concordia/anubis_search_result.hpp"
|
#include "concordia/anubis_search_result.hpp"
|
||||||
#include <divsufsort.h>
|
#include <divsufsort.h>
|
||||||
|
|
||||||
@ -49,6 +50,10 @@ public:
|
|||||||
std::vector<AnubisSearchResult> anubisSearch(const std::string & pattern)
|
std::vector<AnubisSearchResult> anubisSearch(const std::string & pattern)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
|
boost::shared_ptr<ConcordiaSearchResult> concordiaSearch(
|
||||||
|
const std::string & pattern)
|
||||||
|
throw(ConcordiaException);
|
||||||
|
|
||||||
void loadRAMIndexFromDisk() throw(ConcordiaException);
|
void loadRAMIndexFromDisk() throw(ConcordiaException);
|
||||||
|
|
||||||
void refreshSAfromRAM() throw(ConcordiaException);
|
void refreshSAfromRAM() throw(ConcordiaException);
|
||||||
|
@ -46,9 +46,8 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
|
|||||||
ConcordiaConfig::_readConfigParameterStr(NAMED_ENTITIES_PARAM);
|
ConcordiaConfig::_readConfigParameterStr(NAMED_ENTITIES_PARAM);
|
||||||
_stopSymbolsFilePath =
|
_stopSymbolsFilePath =
|
||||||
ConcordiaConfig::_readConfigParameterStr(STOP_SYMBOLS_PARAM);
|
ConcordiaConfig::_readConfigParameterStr(STOP_SYMBOLS_PARAM);
|
||||||
_anubisThreshold =
|
_anubisThreshold = atof(ConcordiaConfig::_readConfigParameterStr(
|
||||||
atof(
|
ANUBIS_THRESHOLD_PARAM).c_str());
|
||||||
ConcordiaConfig::_readConfigParameterStr(ANUBIS_THRESHOLD_PARAM).c_str());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ConcordiaConfig::~ConcordiaConfig() {
|
ConcordiaConfig::~ConcordiaConfig() {
|
||||||
|
22
concordia/concordia_search_result.cpp
Normal file
22
concordia/concordia_search_result.cpp
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
#include "concordia/concordia_search_result.hpp"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
|
/*! Constructor. Stores a copy of the given token vector; the list of
    matched fragments starts out empty.
  \param tokenVector tokens to be kept in this result
*/
ConcordiaSearchResult::ConcordiaSearchResult(
        const std::vector<std::string> & tokenVector)
    : _tokenVector(tokenVector) {
}
|
||||||
|
|
||||||
|
/*! Destructor. Performs no work of its own. */
ConcordiaSearchResult::~ConcordiaSearchResult() {}
|
||||||
|
|
||||||
|
void ConcordiaSearchResult::addFragment(
|
||||||
|
const MatchedPatternFragment & fragment) {
|
||||||
|
_matchedPatternFragments.push_back(fragment);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*! Sorts the stored fragments in descending order, as defined by
    MatchedPatternFragment::operator>.
    A stable sort is used on purpose: fragments which compare equal keep
    the order in which they were added, so the resulting sequence is
    deterministic (std::sort leaves the order of equal elements
    unspecified).
*/
void ConcordiaSearchResult::sortFragments() {
    std::stable_sort(_matchedPatternFragments.begin(),
                     _matchedPatternFragments.end(),
                     std::greater<MatchedPatternFragment>());
}
|
41
concordia/concordia_search_result.hpp
Normal file
41
concordia/concordia_search_result.hpp
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
#ifndef CONCORDIA_SEARCH_RESULT_HDR
|
||||||
|
#define CONCORDIA_SEARCH_RESULT_HDR
|
||||||
|
|
||||||
|
#include "concordia/matched_pattern_fragment.hpp"
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Class representing result of concordia search.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
class ConcordiaSearchResult {
|
||||||
|
public:
|
||||||
|
explicit ConcordiaSearchResult(
|
||||||
|
const std::vector<std::string> & tokenVector);
|
||||||
|
|
||||||
|
/*! Destructor.
|
||||||
|
*/
|
||||||
|
virtual ~ConcordiaSearchResult();
|
||||||
|
|
||||||
|
void addFragment(const MatchedPatternFragment & fragment);
|
||||||
|
|
||||||
|
void sortFragments();
|
||||||
|
|
||||||
|
std::vector<std::string> getTokenVector() const {
|
||||||
|
return _tokenVector;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<MatchedPatternFragment> getFragments() const {
|
||||||
|
return _matchedPatternFragments;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::vector<std::string> _tokenVector;
|
||||||
|
|
||||||
|
std::vector<MatchedPatternFragment> _matchedPatternFragments;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
@ -59,3 +59,19 @@ std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
|
|||||||
hashGenerator->generateHash(pattern);
|
hashGenerator->generateHash(pattern);
|
||||||
return _anubisSearcher->anubisSearch(config, T, markers, SA, hash);
|
return _anubisSearcher->anubisSearch(config, T, markers, SA, hash);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! Performs concordia search. The pattern is first hashed with the hash
    generator, then a result object is created from the pattern's token
    vector, and finally the anubis searcher fills the result with
    matched fragments.
  \param hashGenerator generator used to hash and tokenize the pattern
  \param T index data, passed through to the anubis searcher
  \param markers suffix markers, passed through to the anubis searcher
  \param SA suffix array, passed through to the anubis searcher
  \param pattern pattern to be searched
  \returns shared pointer to the filled search result
*/
boost::shared_ptr<ConcordiaSearchResult> IndexSearcher::concordiaSearch(
        boost::shared_ptr<HashGenerator> hashGenerator,
        boost::shared_ptr<std::vector<sauchar_t> > T,
        boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
        boost::shared_ptr<std::vector<saidx_t> > SA,
        const std::string & pattern) throw(ConcordiaException) {
    // Hash first, tokenize second - same call order as before.
    std::vector<INDEX_CHARACTER_TYPE> hashedPattern =
        hashGenerator->generateHash(pattern);

    boost::shared_ptr<ConcordiaSearchResult> searchResult(
        new ConcordiaSearchResult(
            hashGenerator->generateTokenVector(pattern)));

    _anubisSearcher->concordiaSearch(
        searchResult, T, markers, SA, hashedPattern);

    return searchResult;
}
|
||||||
|
@ -42,6 +42,14 @@ public:
|
|||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
const std::string & pattern) throw(ConcordiaException);
|
const std::string & pattern) throw(ConcordiaException);
|
||||||
|
|
||||||
|
boost::shared_ptr<ConcordiaSearchResult> concordiaSearch(
|
||||||
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
|
const std::string & pattern) throw(ConcordiaException);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
boost::shared_ptr<AnubisSearcher> _anubisSearcher;
|
boost::shared_ptr<AnubisSearcher> _anubisSearcher;
|
||||||
};
|
};
|
||||||
|
16
concordia/matched_pattern_fragment.cpp
Normal file
16
concordia/matched_pattern_fragment.cpp
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
#include "concordia/matched_pattern_fragment.hpp"
|
||||||
|
|
||||||
|
/*! Constructor. Copies the four given values into the corresponding
    fields of the fragment.
  \param exampleId id of the example
  \param exampleOffset offset within the example
  \param patternOffset offset within the pattern
  \param matchedLength length of the match
*/
MatchedPatternFragment::MatchedPatternFragment(
        const SUFFIX_MARKER_TYPE & exampleId,
        const SUFFIX_MARKER_TYPE & exampleOffset,
        const SUFFIX_MARKER_TYPE & patternOffset,
        const SUFFIX_MARKER_TYPE & matchedLength)
    : _exampleId(exampleId),
      _exampleOffset(exampleOffset),
      _patternOffset(patternOffset),
      _matchedLength(matchedLength) {
}
|
||||||
|
|
||||||
|
/*! Destructor. Performs no work of its own. */
MatchedPatternFragment::~MatchedPatternFragment() {}
|
||||||
|
|
51
concordia/matched_pattern_fragment.hpp
Normal file
51
concordia/matched_pattern_fragment.hpp
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
#ifndef MATCHED_PATTERN_FRAGMENT_HDR
|
||||||
|
#define MATCHED_PATTERN_FRAGMENT_HDR
|
||||||
|
|
||||||
|
#include "concordia/common/config.hpp"
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Class representing matched pattern fragment in concordia search.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
class MatchedPatternFragment {
|
||||||
|
public:
|
||||||
|
MatchedPatternFragment(const SUFFIX_MARKER_TYPE & exampleId,
|
||||||
|
const SUFFIX_MARKER_TYPE & exampleOffset,
|
||||||
|
const SUFFIX_MARKER_TYPE & patternOffset,
|
||||||
|
const SUFFIX_MARKER_TYPE & matchedLength);
|
||||||
|
/*! Destructor.
|
||||||
|
*/
|
||||||
|
virtual ~MatchedPatternFragment();
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE getExampleId() const {
|
||||||
|
return _exampleId;
|
||||||
|
}
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE getExampleOffset() const {
|
||||||
|
return _exampleOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE getPatternOffset() const {
|
||||||
|
return _patternOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE getMatchedLength() const {
|
||||||
|
return _matchedLength;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool operator > (const MatchedPatternFragment & other) const {
|
||||||
|
return (_matchedLength > other.getMatchedLength());
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
SUFFIX_MARKER_TYPE _exampleId;
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE _exampleOffset;
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE _patternOffset;
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE _matchedLength;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
@ -191,16 +191,70 @@ BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
|
|||||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
||||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
||||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
|
||||||
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( ConcordiaSearch1 )
|
||||||
|
{
|
||||||
|
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||||
|
concordia.addExample(Example("Ala posiada kota",14));
|
||||||
|
concordia.addExample(Example("Ala posiada rysia",51));
|
||||||
|
concordia.addExample(Example("Marysia posiada rysia",123));
|
||||||
|
concordia.addExample(Example("Gosia chyba posiada rysia też",167));
|
||||||
|
concordia.addExample(Example("Ania od wczoraj posiada rysia",45));
|
||||||
|
concordia.refreshSAfromRAM();
|
||||||
|
|
||||||
|
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("posiada rysia chyba");
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().size(), 7);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getId(), 123);
|
addFragment 45,2,0,2
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(0).getOffset(), 1);
|
addFragment 51,1,0,2
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(1).getId(), 51);
|
addFragment 123,1,0,2
|
||||||
BOOST_CHECK_EQUAL(searchResult1.at(1).getOffset(), 1);
|
addFragment 45,3,1,1
|
||||||
|
addFragment 51,2,1,1
|
||||||
// Checking pattern spanning over 2 segments
|
addFragment 123,2,1,1
|
||||||
BOOST_CHECK_EQUAL(searchResult2.size(), 0);
|
addFragment 167,1,2,1
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleId(), 45);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getExampleOffset(), 2);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getPatternOffset(), 0);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(0).getMatchedLength(), 2);
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleId(), 51);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getExampleOffset(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getPatternOffset(), 0);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(1).getMatchedLength(), 2);
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleId(), 123);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getExampleOffset(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getPatternOffset(), 0);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(2).getMatchedLength(), 2);
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleId(), 45);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getExampleOffset(), 3);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getPatternOffset(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(3).getMatchedLength(), 1);
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleId(), 51);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getExampleOffset(), 2);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getPatternOffset(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(4).getMatchedLength(), 1);
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleId(), 123);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getExampleOffset(), 2);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getPatternOffset(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(5).getMatchedLength(), 1);
|
||||||
|
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getExampleId(), 167);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getExampleOffset(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getPatternOffset(), 2);
|
||||||
|
BOOST_CHECK_EQUAL(searchResult1->getFragments().at(6).getMatchedLength(), 1);
|
||||||
|
|
||||||
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
||||||
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
||||||
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE_END()
|
BOOST_AUTO_TEST_SUITE_END()
|
||||||
|
Loading…
Reference in New Issue
Block a user