getTmMatches
Former-commit-id: 94aa3db2db88195c61c6ac70006c0e1d743dc854
This commit is contained in:
parent
f03b4ad954
commit
e02bbaa0fa
8
TODO.txt
8
TODO.txt
@ -8,7 +8,7 @@ DONE 3. Dzielenie zdań (max 255 tokenów)
|
|||||||
DONE Anubis search się komplikuje! Przy tworzeniu obiektu tmMatches dla przykładu trzeba podać id przykładu, długość patternu i długość przykładu. Dwa pierwsze mamy, ale niestety nie ma skąd wziąć długości przykładu. Pamiętamy tylko offset sufiksu.
|
DONE Anubis search się komplikuje! Przy tworzeniu obiektu tmMatches dla przykładu trzeba podać id przykładu, długość patternu i długość przykładu. Dwa pierwsze mamy, ale niestety nie ma skąd wziąć długości przykładu. Pamiętamy tylko offset sufiksu.
|
||||||
|
|
||||||
DONE 1. Bitwise operators (i stałe!) przy rozmiarze index character oraz markerów
|
DONE 1. Bitwise operators (i stałe!) przy rozmiarze index character oraz markerów
|
||||||
2. Wykonać anubis search na nowych markerach z długością zdania
|
IN PROGRESS 2. Wykonać anubis search na nowych markerach z długością zdania
|
||||||
3. Multi-threading?
|
3. Multi-threading?
|
||||||
|
|
||||||
- concordia-server
|
- concordia-server
|
||||||
@ -19,6 +19,8 @@ DONE 1. Bitwise operators (i stałe!) przy rozmiarze index character oraz marker
|
|||||||
|
|
||||||
|
|
||||||
zastanowić się nad optymalizacją:
|
zastanowić się nad optymalizacją:
|
||||||
- unordered_map tmMatchesMap
|
- tmMatchesMap jako normalna mapa (nie ptr_map)
|
||||||
- LCP array
|
- REJECTED LCP array
|
||||||
|
- !important! rezygnacja z ptr_vector (wycieki!)
|
||||||
|
- zwracanie wektorów
|
||||||
|
|
||||||
|
@ -1,15 +1,8 @@
|
|||||||
#include "concordia/anubis_searcher.hpp"
|
#include "concordia/anubis_searcher.hpp"
|
||||||
#include "concordia/tm_matches.hpp"
|
|
||||||
#include "concordia/common/logging.hpp"
|
|
||||||
|
|
||||||
#include <boost/ptr_container/ptr_map.hpp>
|
#include "concordia/common/logging.hpp"
|
||||||
#include <boost/assign/ptr_map_inserter.hpp>
|
|
||||||
#include <boost/foreach.hpp>
|
#include <boost/foreach.hpp>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <map>
|
|
||||||
|
|
||||||
typedef boost::ptr_map<SUFFIX_MARKER_TYPE, TmMatches> TmMatchesMap;
|
|
||||||
typedef TmMatchesMap::iterator TmMatchesMapIterator;
|
|
||||||
|
|
||||||
AnubisSearcher::AnubisSearcher() {
|
AnubisSearcher::AnubisSearcher() {
|
||||||
}
|
}
|
||||||
@ -25,11 +18,19 @@ boost::ptr_vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
|
|||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
|
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
|
||||||
throw(ConcordiaException) {
|
throw(ConcordiaException) {
|
||||||
SET_LOGGER_FILE("/tmp/concordia.log");
|
boost::shared_ptr<TmMatchesMap> tmMatchesMap = getTmMatches(T, markers, SA, pattern);
|
||||||
SET_LOGGING_LEVEL("ERROR");
|
|
||||||
INFO("AnubisSearcher::anubisSearch");
|
// get the tmMatches list sorted descending by score
|
||||||
|
|
||||||
boost::ptr_vector<AnubisSearchResult> result;
|
boost::ptr_vector<AnubisSearchResult> result;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
boost::shared_ptr<TmMatchesMap> AnubisSearcher::getTmMatches(
|
||||||
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
|
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
|
||||||
|
throw(ConcordiaException) {
|
||||||
|
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > patternVector =
|
boost::shared_ptr<std::vector<sauchar_t> > patternVector =
|
||||||
Utils::indexVectorToSaucharVector(pattern);
|
Utils::indexVectorToSaucharVector(pattern);
|
||||||
@ -39,67 +40,61 @@ boost::ptr_vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
|
|||||||
throw ConcordiaException("Increasing pattern resolution went wrong.");
|
throw ConcordiaException("Increasing pattern resolution went wrong.");
|
||||||
}
|
}
|
||||||
|
|
||||||
INFO("AnubisSearcher::anubisSearch - about to create tmMatchesMap");
|
boost::shared_ptr<TmMatchesMap> tmMatchesMap(new TmMatchesMap());
|
||||||
TmMatchesMap tmMatchesMap;
|
|
||||||
for (int offset = 0; offset < pattern->size(); offset++) {
|
for (int offset = 0; offset < pattern->size(); offset++) {
|
||||||
INFO("AnubisSearcher::anubisSearch - offset: ");
|
|
||||||
INFO(offset);
|
|
||||||
|
|
||||||
int highResOffset = offset * sizeof(INDEX_CHARACTER_TYPE);
|
int highResOffset = offset * sizeof(INDEX_CHARACTER_TYPE);
|
||||||
INFO("AnubisSearcher::anubisSearch - high res offset: ");
|
|
||||||
INFO(highResOffset);
|
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > currentPattern =
|
boost::shared_ptr<std::vector<sauchar_t> > currentPattern =
|
||||||
boost::shared_ptr<std::vector<sauchar_t> >
|
boost::shared_ptr<std::vector<sauchar_t> >
|
||||||
(new std::vector<sauchar_t>(
|
(new std::vector<sauchar_t>(
|
||||||
patternVector->begin()+highResOffset, patternVector->end()));
|
patternVector->begin()+highResOffset, patternVector->end()));
|
||||||
SUFFIX_MARKER_TYPE highResLongestPrefixesLength;
|
|
||||||
INFO("AnubisSearcher::anubisSearch - about to get longest prefixes");
|
saidx_t patternLength = 0;
|
||||||
boost::ptr_vector<SubstringOccurence> longestPrefixes =
|
saidx_t size = SA->size();
|
||||||
lcpSearch(T, markers, SA, currentPattern, highResLongestPrefixesLength);
|
saidx_t left = 0;
|
||||||
|
|
||||||
INFO("AnubisSearcher::anubisSearch - longest prefixes got");
|
|
||||||
SUFFIX_MARKER_TYPE longestPrefixesLength = highResLongestPrefixesLength /
|
|
||||||
sizeof(INDEX_CHARACTER_TYPE);
|
|
||||||
INFO("AnubisSearcher::anubisSearch - longest prefixes high res length");
|
|
||||||
INFO(highResLongestPrefixesLength);
|
|
||||||
INFO("AnubisSearcher::anubisSearch - longest prefixes length");
|
|
||||||
INFO(longestPrefixesLength);
|
|
||||||
|
|
||||||
if (longestPrefixesLength > 0) {
|
sauchar_t * patternArray = currentPattern->data();
|
||||||
BOOST_FOREACH(SubstringOccurence & occurence, longestPrefixes) {
|
|
||||||
boost::shared_ptr<TmMatches> tmMatches;
|
|
||||||
|
|
||||||
TmMatchesMapIterator mapIterator = tmMatchesMap.find(
|
saidx_t * SAleft = SA->data();
|
||||||
occurence.getId());
|
|
||||||
if (mapIterator != tmMatchesMap.end()) {
|
saidx_t prevLeft;
|
||||||
tmMatches = boost::shared_ptr<TmMatches>(
|
saidx_t prevSize;
|
||||||
mapIterator->second
|
do {
|
||||||
);
|
prevLeft = left;
|
||||||
} else {
|
prevSize = size;
|
||||||
tmMatches = boost::shared_ptr<TmMatches>(
|
|
||||||
new TmMatches(
|
patternLength += sizeof(INDEX_CHARACTER_TYPE);
|
||||||
occurence.getId(),
|
|
||||||
occurence.getExampleLength(),
|
saidx_t localLeft;
|
||||||
patternVector->size()
|
size = sa_search(T->data(), (saidx_t) T->size(),
|
||||||
));
|
(const sauchar_t *) patternArray, patternLength,
|
||||||
}
|
SAleft, size, &localLeft);
|
||||||
|
|
||||||
|
|
||||||
|
left += localLeft;
|
||||||
|
SAleft += localLeft;
|
||||||
|
|
||||||
|
if (patternLength > sizeof(INDEX_CHARACTER_TYPE)) {
|
||||||
|
// Add to tm matches map results surrounding the main stream.
|
||||||
|
// from left
|
||||||
|
for (saidx_t i = prevLeft; i < left; i++) {
|
||||||
|
_addToMap(SA, markers, tmMatchesMap, i, pattern->size(), (patternLength / sizeof(INDEX_CHARACTER_TYPE)) -1, offset);
|
||||||
|
}
|
||||||
|
// from right
|
||||||
|
for (saidx_t i = left+size; i < prevLeft+prevSize; i++) {
|
||||||
|
_addToMap(SA, markers, tmMatchesMap, i, pattern->size(), (patternLength / sizeof(INDEX_CHARACTER_TYPE)) -1, offset);
|
||||||
|
}
|
||||||
|
|
||||||
// add intervals to tmMatches
|
|
||||||
tmMatches->addExampleInterval(
|
|
||||||
occurence.getOffset(),
|
|
||||||
occurence.getOffset() + longestPrefixesLength
|
|
||||||
);
|
|
||||||
tmMatches->addPatternInterval(
|
|
||||||
offset,
|
|
||||||
offset + longestPrefixesLength
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
} while (patternLength < currentPattern->size() && size > 0);
|
||||||
|
|
||||||
|
if (size > 0) {
|
||||||
|
for (saidx_t i = left; i < left+size; i++) {
|
||||||
|
_addToMap(SA, markers, tmMatchesMap, i, pattern->size(), patternLength / sizeof(INDEX_CHARACTER_TYPE), offset);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// get the tmMatches list sorted descending by score
|
|
||||||
|
|
||||||
return result;
|
return tmMatchesMap;
|
||||||
}
|
}
|
||||||
|
|
||||||
boost::ptr_vector<SubstringOccurence> AnubisSearcher::lcpSearch(
|
boost::ptr_vector<SubstringOccurence> AnubisSearcher::lcpSearch(
|
||||||
@ -128,8 +123,7 @@ boost::ptr_vector<SubstringOccurence> AnubisSearcher::lcpSearch(
|
|||||||
saidx_t localLeft;
|
saidx_t localLeft;
|
||||||
size = sa_search(T->data(), (saidx_t) T->size(),
|
size = sa_search(T->data(), (saidx_t) T->size(),
|
||||||
(const sauchar_t *) patternArray, patternLength,
|
(const sauchar_t *) patternArray, patternLength,
|
||||||
SAleft, size, &localLeft);
|
SAleft, size, &localLeft);
|
||||||
|
|
||||||
left += localLeft;
|
left += localLeft;
|
||||||
SAleft += localLeft;
|
SAleft += localLeft;
|
||||||
} while (patternLength < pattern->size() && size > 0);
|
} while (patternLength < pattern->size() && size > 0);
|
||||||
@ -170,3 +164,67 @@ void AnubisSearcher::_collectResults(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void AnubisSearcher::_addToMap(boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
boost::shared_ptr<TmMatchesMap> tmMatchesMap,
|
||||||
|
saidx_t sa_pos,
|
||||||
|
SUFFIX_MARKER_TYPE totalPatternLength,
|
||||||
|
SUFFIX_MARKER_TYPE matchedFragmentLength,
|
||||||
|
SUFFIX_MARKER_TYPE patternOffset) {
|
||||||
|
SubstringOccurence occurence;
|
||||||
|
if (_getOccurenceFromSA(SA, markers, sa_pos, occurence)) {
|
||||||
|
_addOccurenceToMap(tmMatchesMap,
|
||||||
|
occurence,
|
||||||
|
totalPatternLength,
|
||||||
|
matchedFragmentLength,
|
||||||
|
patternOffset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bool AnubisSearcher::_getOccurenceFromSA(
|
||||||
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
saidx_t sa_pos,
|
||||||
|
SubstringOccurence & occurence) {
|
||||||
|
saidx_t resultPos = SA->at(sa_pos);
|
||||||
|
|
||||||
|
if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
|
||||||
|
SUFFIX_MARKER_TYPE marker = markers->at(resultPos / sizeof(INDEX_CHARACTER_TYPE));
|
||||||
|
occurence.enterDataFromMarker(marker);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void AnubisSearcher::_addOccurenceToMap(boost::shared_ptr<TmMatchesMap> tmMatchesMap,
|
||||||
|
SubstringOccurence & occurence,
|
||||||
|
SUFFIX_MARKER_TYPE totalPatternLength,
|
||||||
|
SUFFIX_MARKER_TYPE matchedFragmentLength,
|
||||||
|
SUFFIX_MARKER_TYPE patternOffset) {
|
||||||
|
TmMatches * tmMatches;
|
||||||
|
|
||||||
|
TmMatchesMapIterator mapIterator = tmMatchesMap->find(
|
||||||
|
occurence.getId());
|
||||||
|
if (mapIterator != tmMatchesMap->end()) {
|
||||||
|
tmMatches = mapIterator->second;
|
||||||
|
} else {
|
||||||
|
tmMatches = new TmMatches(occurence.getId(),
|
||||||
|
occurence.getExampleLength(),
|
||||||
|
totalPatternLength);
|
||||||
|
SUFFIX_MARKER_TYPE key = occurence.getId();
|
||||||
|
tmMatchesMap->insert(key, tmMatches);
|
||||||
|
}
|
||||||
|
|
||||||
|
// add intervals to tmMatches
|
||||||
|
tmMatches->addExampleInterval(
|
||||||
|
occurence.getOffset(),
|
||||||
|
occurence.getOffset() + matchedFragmentLength
|
||||||
|
);
|
||||||
|
tmMatches->addPatternInterval(
|
||||||
|
patternOffset,
|
||||||
|
patternOffset + matchedFragmentLength
|
||||||
|
);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
#include "concordia/substring_occurence.hpp"
|
#include "concordia/substring_occurence.hpp"
|
||||||
#include "concordia/concordia_exception.hpp"
|
#include "concordia/concordia_exception.hpp"
|
||||||
#include "concordia/anubis_search_result.hpp"
|
#include "concordia/anubis_search_result.hpp"
|
||||||
|
#include "concordia/tm_matches.hpp"
|
||||||
|
|
||||||
#include <divsufsort.h>
|
#include <divsufsort.h>
|
||||||
|
|
||||||
@ -34,6 +35,13 @@ public:
|
|||||||
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
|
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
|
boost::shared_ptr<TmMatchesMap> getTmMatches(
|
||||||
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
|
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
|
||||||
|
throw(ConcordiaException);
|
||||||
|
|
||||||
boost::ptr_vector<SubstringOccurence> lcpSearch(
|
boost::ptr_vector<SubstringOccurence> lcpSearch(
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
@ -46,6 +54,25 @@ private:
|
|||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
saidx_t left, saidx_t size);
|
saidx_t left, saidx_t size);
|
||||||
|
|
||||||
|
void _addToMap(boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
boost::shared_ptr<TmMatchesMap> tmMatchesMap,
|
||||||
|
saidx_t sa_pos,
|
||||||
|
SUFFIX_MARKER_TYPE totalPatternLength,
|
||||||
|
SUFFIX_MARKER_TYPE matchedFragmentLength,
|
||||||
|
SUFFIX_MARKER_TYPE patternOffset);
|
||||||
|
|
||||||
|
bool _getOccurenceFromSA(boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
saidx_t sa_pos,
|
||||||
|
SubstringOccurence & occurence);
|
||||||
|
|
||||||
|
void _addOccurenceToMap(boost::shared_ptr<TmMatchesMap> tmMatchesMap,
|
||||||
|
SubstringOccurence & occurence,
|
||||||
|
SUFFIX_MARKER_TYPE totalPatternLength,
|
||||||
|
SUFFIX_MARKER_TYPE matchedFragmentLength,
|
||||||
|
SUFFIX_MARKER_TYPE patternOffset);
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -28,4 +28,3 @@ typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
|
|||||||
//The sentence marker is build as follows: its first bytes store the
|
//The sentence marker is build as follows: its first bytes store the
|
||||||
// sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES store the suffix offset
|
// sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES store the suffix offset
|
||||||
// and the last SUFFIX_MARKER_SENTENCE_BYTES store the sentence length.
|
// and the last SUFFIX_MARKER_SENTENCE_BYTES store the sentence length.
|
||||||
|
|
||||||
|
@ -67,7 +67,7 @@ private:
|
|||||||
template <typename T>
|
template <typename T>
|
||||||
void Utils::printVector(boost::shared_ptr<std::vector<T> > vector) {
|
void Utils::printVector(boost::shared_ptr<std::vector<T> > vector) {
|
||||||
for (int i = 0; i < vector->size(); i++) {
|
for (int i = 0; i < vector->size(); i++) {
|
||||||
cout << vector->at(i) << " ";
|
cout << static_cast<int>(vector->at(i)) << " ";
|
||||||
}
|
}
|
||||||
cout << endl;
|
cout << endl;
|
||||||
}
|
}
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
#include "concordia/interval.hpp"
|
#include "concordia/interval.hpp"
|
||||||
|
|
||||||
|
|
||||||
Interval::Interval(const unsigned char start, const unsigned char end):
|
Interval::Interval(const SUFFIX_MARKER_TYPE start, const SUFFIX_MARKER_TYPE end):
|
||||||
_start(start),
|
_start(start),
|
||||||
_end(end) {
|
_end(end) {
|
||||||
}
|
}
|
||||||
@ -14,7 +14,7 @@ bool Interval::intersects(Interval & interval) {
|
|||||||
interval.getEnd() - 1 < _start);
|
interval.getEnd() - 1 < _start);
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned char Interval::getLength() {
|
SUFFIX_MARKER_TYPE Interval::getLength() {
|
||||||
return _end - _start;
|
return _end - _start;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
#ifndef INTERVAL_HDR
|
#ifndef INTERVAL_HDR
|
||||||
#define INTERVAL_HDR
|
#define INTERVAL_HDR
|
||||||
|
|
||||||
|
#include "concordia/common/config.hpp"
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class representing word interval.
|
Class representing word interval.
|
||||||
|
|
||||||
@ -10,7 +12,7 @@ using namespace std;
|
|||||||
|
|
||||||
class Interval {
|
class Interval {
|
||||||
public:
|
public:
|
||||||
explicit Interval(const unsigned char start, const unsigned char end);
|
explicit Interval(const SUFFIX_MARKER_TYPE start, const SUFFIX_MARKER_TYPE end);
|
||||||
|
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
@ -18,20 +20,20 @@ public:
|
|||||||
|
|
||||||
bool intersects(Interval & interval);
|
bool intersects(Interval & interval);
|
||||||
|
|
||||||
unsigned char getLength();
|
SUFFIX_MARKER_TYPE getLength();
|
||||||
|
|
||||||
unsigned char getStart() const {
|
SUFFIX_MARKER_TYPE getStart() const {
|
||||||
return _start;
|
return _start;
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned char getEnd() const {
|
SUFFIX_MARKER_TYPE getEnd() const {
|
||||||
return _end;
|
return _end;
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
unsigned char _start;
|
SUFFIX_MARKER_TYPE _start;
|
||||||
|
|
||||||
unsigned char _end;
|
SUFFIX_MARKER_TYPE _end;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
#include "concordia/substring_occurence.hpp"
|
#include "concordia/substring_occurence.hpp"
|
||||||
#include "concordia/common/utils.hpp"
|
#include "concordia/common/utils.hpp"
|
||||||
|
|
||||||
|
SubstringOccurence::SubstringOccurence() {
|
||||||
|
}
|
||||||
|
|
||||||
SubstringOccurence::SubstringOccurence(const SUFFIX_MARKER_TYPE & marker) {
|
SubstringOccurence::SubstringOccurence(const SUFFIX_MARKER_TYPE & marker) {
|
||||||
_id = Utils::getIdFromMarker(marker);
|
_id = Utils::getIdFromMarker(marker);
|
||||||
@ -8,6 +10,12 @@ SubstringOccurence::SubstringOccurence(const SUFFIX_MARKER_TYPE & marker) {
|
|||||||
_exampleLength = Utils::getLengthFromMarker(marker);
|
_exampleLength = Utils::getLengthFromMarker(marker);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void SubstringOccurence::enterDataFromMarker(const SUFFIX_MARKER_TYPE & marker) {
|
||||||
|
_id = Utils::getIdFromMarker(marker);
|
||||||
|
_offset = Utils::getOffsetFromMarker(marker);
|
||||||
|
_exampleLength = Utils::getLengthFromMarker(marker);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
SubstringOccurence::SubstringOccurence(
|
SubstringOccurence::SubstringOccurence(
|
||||||
const SUFFIX_MARKER_TYPE & id,
|
const SUFFIX_MARKER_TYPE & id,
|
||||||
|
@ -13,6 +13,8 @@ using namespace std;
|
|||||||
|
|
||||||
class SubstringOccurence {
|
class SubstringOccurence {
|
||||||
public:
|
public:
|
||||||
|
SubstringOccurence();
|
||||||
|
|
||||||
explicit SubstringOccurence(const SUFFIX_MARKER_TYPE & marker);
|
explicit SubstringOccurence(const SUFFIX_MARKER_TYPE & marker);
|
||||||
|
|
||||||
SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
|
SubstringOccurence(const SUFFIX_MARKER_TYPE & id,
|
||||||
@ -33,6 +35,8 @@ public:
|
|||||||
SUFFIX_MARKER_TYPE getExampleLength() const {
|
SUFFIX_MARKER_TYPE getExampleLength() const {
|
||||||
return _exampleLength;
|
return _exampleLength;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void enterDataFromMarker(const SUFFIX_MARKER_TYPE & marker);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
SUFFIX_MARKER_TYPE _id;
|
SUFFIX_MARKER_TYPE _id;
|
||||||
|
@ -1,7 +1,16 @@
|
|||||||
|
#include <iostream>
|
||||||
|
|
||||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||||
|
#include "concordia/tm_matches.hpp"
|
||||||
#include "concordia/anubis_searcher.hpp"
|
#include "concordia/anubis_searcher.hpp"
|
||||||
|
#include "concordia/concordia_index.hpp"
|
||||||
|
#include "concordia/concordia_config.hpp"
|
||||||
|
#include "concordia/example.hpp"
|
||||||
|
#include "concordia/hash_generator.hpp"
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
#include "concordia/common/utils.hpp"
|
#include "concordia/common/utils.hpp"
|
||||||
|
#include "concordia/common/logging.hpp"
|
||||||
|
#include "tests/common/test_resources_manager.hpp"
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
@ -324,9 +333,109 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( AnubisSearch1 )
|
BOOST_AUTO_TEST_CASE( TmMatchesTest )
|
||||||
{
|
{
|
||||||
|
AnubisSearcher searcher;
|
||||||
|
|
||||||
|
/*The test index contains 3 sentences:
|
||||||
|
14: "Ala posiada kota"
|
||||||
|
51: "Ala posiada rysia"
|
||||||
|
123: "Marysia posiada rysia"
|
||||||
|
|
||||||
|
Test word map:
|
||||||
|
Ala -> 0
|
||||||
|
posiada -> 1
|
||||||
|
kota -> 2
|
||||||
|
rysia -> 3
|
||||||
|
Marysia -> 4
|
||||||
|
|
||||||
|
Test hashed index:
|
||||||
|
n: 0 1 2 3 4 5 6 7 8 9 10 11
|
||||||
|
T[n]: 0 1 2 | 0 1 3 | 4 1 3 |
|
||||||
|
|
||||||
|
Test suffix array:
|
||||||
|
n: 0 1 2 3 4 5 6 7 8 9 10 11
|
||||||
|
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX),
|
||||||
|
TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
||||||
|
boost::shared_ptr<ConcordiaConfig> config(
|
||||||
|
new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
||||||
|
boost::shared_ptr<HashGenerator> hashGenerator(new HashGenerator(config));
|
||||||
|
|
||||||
|
|
||||||
|
boost::shared_ptr<std::vector<sauchar_t> > T(new std::vector<sauchar_t>());
|
||||||
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers(new std::vector<SUFFIX_MARKER_TYPE>());
|
||||||
|
|
||||||
|
index.addExample(hashGenerator, T, markers, Example("Ala posiada kota",14));
|
||||||
|
index.addExample(hashGenerator, T, markers, Example("Ala posiada rysia",51));
|
||||||
|
index.addExample(hashGenerator, T, markers, Example("Marysia posiada rysia",123));
|
||||||
|
|
||||||
|
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(T);
|
||||||
|
|
||||||
|
|
||||||
|
// searching for pattern "Ola posiada rysia Marysia" (5 1 3 4)
|
||||||
|
|
||||||
|
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern = hashGenerator->generateHash("Ola posiada rysia Marysia");
|
||||||
|
|
||||||
|
boost::shared_ptr<TmMatchesMap> tmMatchesMap = searcher.getTmMatches(T, markers, SA, pattern);
|
||||||
|
BOOST_CHECK_EQUAL(tmMatchesMap->size(), 3);
|
||||||
|
|
||||||
|
TmMatches * tmMatches14 = tmMatchesMap->find(14)->second;
|
||||||
|
TmMatches * tmMatches51 = tmMatchesMap->find(51)->second;
|
||||||
|
TmMatches * tmMatches123 = tmMatchesMap->find(123)->second;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(tmMatches14->getExampleId(), 14);
|
||||||
|
BOOST_CHECK_EQUAL(tmMatches51->getExampleId(), 51);
|
||||||
|
BOOST_CHECK_EQUAL(tmMatches123->getExampleId(), 123);
|
||||||
|
|
||||||
|
// example 14
|
||||||
|
// example interval list: [(1,2)]
|
||||||
|
boost::ptr_vector<Interval> exampleIntervals14 = tmMatches14->getExampleIntervals();
|
||||||
|
BOOST_CHECK_EQUAL(exampleIntervals14.size(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(exampleIntervals14[0].getStart(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(exampleIntervals14[0].getEnd(), 2);
|
||||||
|
// pattern interval list: [(1,2)]
|
||||||
|
boost::ptr_vector<Interval> patternIntervals14 = tmMatches14->getPatternIntervals();
|
||||||
|
BOOST_CHECK_EQUAL(patternIntervals14.size(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(patternIntervals14[0].getStart(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(patternIntervals14[0].getEnd(), 2);
|
||||||
|
|
||||||
|
// example 51
|
||||||
|
// example interval list: [(1,3)]
|
||||||
|
boost::ptr_vector<Interval> exampleIntervals51 = tmMatches51->getExampleIntervals();
|
||||||
|
BOOST_CHECK_EQUAL(exampleIntervals51.size(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(exampleIntervals51[0].getStart(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(exampleIntervals51[0].getEnd(), 3);
|
||||||
|
// pattern interval list: [(1,3)]
|
||||||
|
boost::ptr_vector<Interval> patternIntervals51 = tmMatches51->getPatternIntervals();
|
||||||
|
BOOST_CHECK_EQUAL(patternIntervals51.size(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(patternIntervals51[0].getStart(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(patternIntervals51[0].getEnd(), 3);
|
||||||
|
|
||||||
|
// example 123
|
||||||
|
// example interval list: [(1,3), (0,1)]
|
||||||
|
boost::ptr_vector<Interval> exampleIntervals123 = tmMatches123->getExampleIntervals();
|
||||||
|
BOOST_CHECK_EQUAL(exampleIntervals123.size(), 2);
|
||||||
|
BOOST_CHECK_EQUAL(exampleIntervals123[0].getStart(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(exampleIntervals123[0].getEnd(), 3);
|
||||||
|
BOOST_CHECK_EQUAL(exampleIntervals123[1].getStart(), 0);
|
||||||
|
BOOST_CHECK_EQUAL(exampleIntervals123[1].getEnd(), 1);
|
||||||
|
// pattern interval list: [(1,3), (3,4)]
|
||||||
|
boost::ptr_vector<Interval> patternIntervals123 = tmMatches123->getPatternIntervals();
|
||||||
|
BOOST_CHECK_EQUAL(patternIntervals123.size(), 2);
|
||||||
|
BOOST_CHECK_EQUAL(patternIntervals123[0].getStart(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(patternIntervals123[0].getEnd(), 3);
|
||||||
|
BOOST_CHECK_EQUAL(patternIntervals123[1].getStart(), 3);
|
||||||
|
BOOST_CHECK_EQUAL(patternIntervals123[1].getEnd(), 4);
|
||||||
|
|
||||||
|
|
||||||
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
||||||
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
||||||
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_HASHED_INDEX));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -177,7 +177,6 @@ BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
|
|||||||
n: 0 1 2 3 4 5 6 7 8 9 10 11
|
n: 0 1 2 3 4 5 6 7 8 9 10 11
|
||||||
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
|
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
|
||||||
|
|
||||||
*/
|
|
||||||
boost::ptr_vector<AnubisSearchResult> searchResult1 = concordia.anubisSearch("posiada rysia chyba");
|
boost::ptr_vector<AnubisSearchResult> searchResult1 = concordia.anubisSearch("posiada rysia chyba");
|
||||||
boost::ptr_vector<AnubisSearchResult> searchResult2 = concordia.anubisSearch("posiada kota Ala");
|
boost::ptr_vector<AnubisSearchResult> searchResult2 = concordia.anubisSearch("posiada kota Ala");
|
||||||
|
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
#include "concordia/interval.hpp"
|
#include "concordia/interval.hpp"
|
||||||
#include <boost/ptr_container/ptr_vector.hpp>
|
#include <boost/ptr_container/ptr_vector.hpp>
|
||||||
|
#include <boost/ptr_container/ptr_map.hpp>
|
||||||
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
@ -28,6 +29,14 @@ public:
|
|||||||
return _score;
|
return _score;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
boost::ptr_vector<Interval> getExampleIntervals() const {
|
||||||
|
return _exampleMatchedRegions;
|
||||||
|
}
|
||||||
|
|
||||||
|
boost::ptr_vector<Interval> getPatternIntervals() const {
|
||||||
|
return _patternMatchedRegions;
|
||||||
|
}
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE getExampleId() const {
|
SUFFIX_MARKER_TYPE getExampleId() const {
|
||||||
return _exampleId;
|
return _exampleId;
|
||||||
}
|
}
|
||||||
@ -61,4 +70,7 @@ private:
|
|||||||
double _score;
|
double _score;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
typedef boost::ptr_map<SUFFIX_MARKER_TYPE, TmMatches> TmMatchesMap;
|
||||||
|
typedef TmMatchesMap::iterator TmMatchesMapIterator;
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
Loading…
Reference in New Issue
Block a user