std vectors
Former-commit-id: 5816e87c856f7edc242cc707851a0e2ad05aeb38
This commit is contained in:
parent
e02bbaa0fa
commit
3a03b01f42
4
TODO.txt
4
TODO.txt
@ -21,6 +21,8 @@ IN PROGRESS 2. Wykonać anubis search na nowych markerach z długością zdania
|
||||
zastanowić się nad optymalizacją:
|
||||
- tmMatchesMap jako normalna mapa (nie ptr_map)
|
||||
- REJECTED LCP array
|
||||
- !important! rezygnacja z ptr_vector (wycieki!)
|
||||
- !important! rezygnacja z ptr_vector
|
||||
- zwracanie wektorów
|
||||
- powyrzucać using namespace std
|
||||
- profiling
|
||||
|
||||
|
@ -3,7 +3,6 @@
|
||||
#include <boost/program_options.hpp>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <boost/date_time/posix_time/posix_time.hpp>
|
||||
#include <boost/ptr_container/ptr_vector.hpp>
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/foreach.hpp>
|
||||
|
||||
@ -68,7 +67,7 @@ int main(int argc, char** argv) {
|
||||
std::cout << "\tSearching for pattern: \"" << pattern <<
|
||||
"\"" << std::endl;
|
||||
time_start = boost::posix_time::microsec_clock::local_time();
|
||||
boost::ptr_vector<SubstringOccurence> result =
|
||||
std::vector<SubstringOccurence> result =
|
||||
concordia.simpleSearch(pattern);
|
||||
time_end = boost::posix_time::microsec_clock::local_time();
|
||||
msdiff = time_end - time_start;
|
||||
@ -89,12 +88,12 @@ int main(int argc, char** argv) {
|
||||
std::string line;
|
||||
if (text_file.is_open()) {
|
||||
long lineCount = 0;
|
||||
boost::ptr_vector<Example> buffer;
|
||||
vector<Example> buffer;
|
||||
boost::posix_time::ptime timeStart =
|
||||
boost::posix_time::microsec_clock::local_time();
|
||||
while (getline(text_file, line)) {
|
||||
lineCount++;
|
||||
buffer.push_back(new Example(line, lineCount));
|
||||
buffer.push_back(Example(line, lineCount));
|
||||
if (lineCount % READ_BUFFER_LENGTH == 0) {
|
||||
concordia.addAllExamples(buffer);
|
||||
buffer.clear();
|
||||
|
@ -12,16 +12,16 @@ AnubisSearcher::~AnubisSearcher() {
|
||||
}
|
||||
|
||||
|
||||
boost::ptr_vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
|
||||
std::vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
|
||||
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
|
||||
throw(ConcordiaException) {
|
||||
boost::shared_ptr<TmMatchesMap> tmMatchesMap = getTmMatches(T, markers, SA, pattern);
|
||||
|
||||
// get the tmMatches list sorted descending by score
|
||||
boost::ptr_vector<AnubisSearchResult> result;
|
||||
std::vector<AnubisSearchResult> result;
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -29,30 +29,28 @@ boost::shared_ptr<TmMatchesMap> AnubisSearcher::getTmMatches(
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
|
||||
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
|
||||
throw(ConcordiaException) {
|
||||
|
||||
boost::shared_ptr<std::vector<sauchar_t> > patternVector =
|
||||
std::vector<sauchar_t> patternVector =
|
||||
Utils::indexVectorToSaucharVector(pattern);
|
||||
|
||||
if (patternVector->size() !=
|
||||
pattern->size() * sizeof(INDEX_CHARACTER_TYPE)) {
|
||||
if (patternVector.size() !=
|
||||
pattern.size() * sizeof(INDEX_CHARACTER_TYPE)) {
|
||||
throw ConcordiaException("Increasing pattern resolution went wrong.");
|
||||
}
|
||||
|
||||
boost::shared_ptr<TmMatchesMap> tmMatchesMap(new TmMatchesMap());
|
||||
for (int offset = 0; offset < pattern->size(); offset++) {
|
||||
for (int offset = 0; offset < pattern.size(); offset++) {
|
||||
int highResOffset = offset * sizeof(INDEX_CHARACTER_TYPE);
|
||||
boost::shared_ptr<std::vector<sauchar_t> > currentPattern =
|
||||
boost::shared_ptr<std::vector<sauchar_t> >
|
||||
(new std::vector<sauchar_t>(
|
||||
patternVector->begin()+highResOffset, patternVector->end()));
|
||||
std::vector<sauchar_t> currentPattern(
|
||||
patternVector.begin()+highResOffset, patternVector.end());
|
||||
|
||||
saidx_t patternLength = 0;
|
||||
saidx_t size = SA->size();
|
||||
saidx_t left = 0;
|
||||
|
||||
sauchar_t * patternArray = currentPattern->data();
|
||||
sauchar_t * patternArray = currentPattern.data();
|
||||
|
||||
saidx_t * SAleft = SA->data();
|
||||
|
||||
@ -77,19 +75,19 @@ boost::shared_ptr<TmMatchesMap> AnubisSearcher::getTmMatches(
|
||||
// Add to tm matches map results surrounding the main stream.
|
||||
// from left
|
||||
for (saidx_t i = prevLeft; i < left; i++) {
|
||||
_addToMap(SA, markers, tmMatchesMap, i, pattern->size(), (patternLength / sizeof(INDEX_CHARACTER_TYPE)) -1, offset);
|
||||
_addToMap(SA, markers, tmMatchesMap, i, pattern.size(), (patternLength / sizeof(INDEX_CHARACTER_TYPE)) -1, offset);
|
||||
}
|
||||
// from right
|
||||
for (saidx_t i = left+size; i < prevLeft+prevSize; i++) {
|
||||
_addToMap(SA, markers, tmMatchesMap, i, pattern->size(), (patternLength / sizeof(INDEX_CHARACTER_TYPE)) -1, offset);
|
||||
_addToMap(SA, markers, tmMatchesMap, i, pattern.size(), (patternLength / sizeof(INDEX_CHARACTER_TYPE)) -1, offset);
|
||||
}
|
||||
|
||||
}
|
||||
} while (patternLength < currentPattern->size() && size > 0);
|
||||
} while (patternLength < currentPattern.size() && size > 0);
|
||||
|
||||
if (size > 0) {
|
||||
for (saidx_t i = left; i < left+size; i++) {
|
||||
_addToMap(SA, markers, tmMatchesMap, i, pattern->size(), patternLength / sizeof(INDEX_CHARACTER_TYPE), offset);
|
||||
_addToMap(SA, markers, tmMatchesMap, i, pattern.size(), patternLength / sizeof(INDEX_CHARACTER_TYPE), offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -97,18 +95,18 @@ boost::shared_ptr<TmMatchesMap> AnubisSearcher::getTmMatches(
|
||||
return tmMatchesMap;
|
||||
}
|
||||
|
||||
boost::ptr_vector<SubstringOccurence> AnubisSearcher::lcpSearch(
|
||||
std::vector<SubstringOccurence> AnubisSearcher::lcpSearch(
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > pattern,
|
||||
const std::vector<sauchar_t> & pattern,
|
||||
SUFFIX_MARKER_TYPE & length)
|
||||
throw(ConcordiaException) {
|
||||
saidx_t patternLength = 0;
|
||||
saidx_t size = SA->size();
|
||||
saidx_t left = 0;
|
||||
|
||||
sauchar_t * patternArray = pattern->data();
|
||||
const sauchar_t * patternArray = pattern.data();
|
||||
|
||||
saidx_t * SAleft = SA->data();
|
||||
|
||||
@ -126,9 +124,9 @@ boost::ptr_vector<SubstringOccurence> AnubisSearcher::lcpSearch(
|
||||
SAleft, size, &localLeft);
|
||||
left += localLeft;
|
||||
SAleft += localLeft;
|
||||
} while (patternLength < pattern->size() && size > 0);
|
||||
} while (patternLength < pattern.size() && size > 0);
|
||||
|
||||
boost::ptr_vector<SubstringOccurence> result;
|
||||
vector<SubstringOccurence> result;
|
||||
|
||||
if (size == 0) {
|
||||
// The search managed to find exactly the longest common prefixes.
|
||||
@ -151,7 +149,7 @@ boost::ptr_vector<SubstringOccurence> AnubisSearcher::lcpSearch(
|
||||
}
|
||||
|
||||
void AnubisSearcher::_collectResults(
|
||||
boost::ptr_vector<SubstringOccurence> & result,
|
||||
vector<SubstringOccurence> & result,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
saidx_t left, saidx_t size) {
|
||||
@ -160,7 +158,7 @@ void AnubisSearcher::_collectResults(
|
||||
|
||||
if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
|
||||
SUFFIX_MARKER_TYPE marker = markers->at(resultPos / sizeof(INDEX_CHARACTER_TYPE));
|
||||
result.push_back(new SubstringOccurence(marker));
|
||||
result.push_back(SubstringOccurence(marker));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2,7 +2,6 @@
|
||||
#define ANUBIS_SEARCHER_HDR
|
||||
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/ptr_container/ptr_vector.hpp>
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/common/utils.hpp"
|
||||
@ -28,29 +27,29 @@ public:
|
||||
*/
|
||||
virtual ~AnubisSearcher();
|
||||
|
||||
boost::ptr_vector<AnubisSearchResult> anubisSearch(
|
||||
std::vector<AnubisSearchResult> anubisSearch(
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
|
||||
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
|
||||
throw(ConcordiaException);
|
||||
|
||||
boost::shared_ptr<TmMatchesMap> getTmMatches(
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
|
||||
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
|
||||
throw(ConcordiaException);
|
||||
|
||||
boost::ptr_vector<SubstringOccurence> lcpSearch(
|
||||
std::vector<SubstringOccurence> lcpSearch(
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > pattern,
|
||||
const std::vector<sauchar_t> & pattern,
|
||||
SUFFIX_MARKER_TYPE & length) throw(ConcordiaException);
|
||||
|
||||
private:
|
||||
void _collectResults(boost::ptr_vector<SubstringOccurence> & result,
|
||||
void _collectResults(vector<SubstringOccurence> & result,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
saidx_t left, saidx_t size);
|
||||
|
@ -30,31 +30,39 @@ SUFFIX_MARKER_TYPE Utils::readMarker(ifstream & file) {
|
||||
}
|
||||
|
||||
sauchar_t * Utils::indexVectorToSaucharArray(
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > input) {
|
||||
const int kArraySize = input->size()*sizeof(INDEX_CHARACTER_TYPE);
|
||||
const vector<INDEX_CHARACTER_TYPE> & input) {
|
||||
const int kArraySize = input.size()*sizeof(INDEX_CHARACTER_TYPE);
|
||||
sauchar_t * patternArray =
|
||||
new sauchar_t[kArraySize];
|
||||
int pos = 0;
|
||||
for (vector<INDEX_CHARACTER_TYPE>::iterator it = input->begin();
|
||||
it != input->end(); ++it) {
|
||||
for (vector<INDEX_CHARACTER_TYPE>::const_iterator it = input.begin();
|
||||
it != input.end(); ++it) {
|
||||
_insertCharToSaucharArray(patternArray, *it, pos);
|
||||
pos += sizeof(INDEX_CHARACTER_TYPE);
|
||||
}
|
||||
return patternArray;
|
||||
}
|
||||
|
||||
boost::shared_ptr<std::vector<sauchar_t> > Utils::indexVectorToSaucharVector(
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > input) {
|
||||
boost::shared_ptr<std::vector<sauchar_t> > result =
|
||||
boost::shared_ptr<std::vector<sauchar_t> >(new std::vector<sauchar_t>);
|
||||
|
||||
for (vector<INDEX_CHARACTER_TYPE>::iterator it = input->begin();
|
||||
it != input->end(); ++it) {
|
||||
std::vector<sauchar_t> Utils::indexVectorToSaucharVector(
|
||||
const vector<INDEX_CHARACTER_TYPE> & input) {
|
||||
std::vector<sauchar_t> result;
|
||||
for (vector<INDEX_CHARACTER_TYPE>::const_iterator it = input.begin();
|
||||
it != input.end(); ++it) {
|
||||
appendCharToSaucharVector(result, *it);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
void Utils::appendCharToSaucharVector(
|
||||
std::vector<sauchar_t> & vector,
|
||||
INDEX_CHARACTER_TYPE character) {
|
||||
sauchar_t * characterArray = reinterpret_cast<sauchar_t *>(&character);
|
||||
for (int i = 0; i < sizeof(character); i++) {
|
||||
vector.push_back(characterArray[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void Utils::appendCharToSaucharVector(
|
||||
boost::shared_ptr<std::vector<sauchar_t> > vector,
|
||||
INDEX_CHARACTER_TYPE character) {
|
||||
|
@ -32,18 +32,21 @@ public:
|
||||
static SUFFIX_MARKER_TYPE readMarker(ifstream & file);
|
||||
|
||||
static sauchar_t * indexVectorToSaucharArray(
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > input);
|
||||
const vector<INDEX_CHARACTER_TYPE> & input);
|
||||
|
||||
static boost::shared_ptr<std::vector<sauchar_t> >
|
||||
indexVectorToSaucharVector(
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > input);
|
||||
static std::vector<sauchar_t> indexVectorToSaucharVector(
|
||||
const vector<INDEX_CHARACTER_TYPE> & input);
|
||||
|
||||
static void appendCharToSaucharVector(
|
||||
boost::shared_ptr<std::vector<sauchar_t> > vector,
|
||||
INDEX_CHARACTER_TYPE character);
|
||||
|
||||
static void appendCharToSaucharVector(
|
||||
std::vector<sauchar_t> & vector,
|
||||
INDEX_CHARACTER_TYPE character);
|
||||
|
||||
template <typename T>
|
||||
static void printVector(boost::shared_ptr<std::vector<T> > vector);
|
||||
static void printVector(const std::vector<T> & vector);
|
||||
|
||||
static SUFFIX_MARKER_TYPE getIdFromMarker(SUFFIX_MARKER_TYPE marker);
|
||||
|
||||
@ -65,9 +68,9 @@ private:
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
void Utils::printVector(boost::shared_ptr<std::vector<T> > vector) {
|
||||
for (int i = 0; i < vector->size(); i++) {
|
||||
cout << static_cast<int>(vector->at(i)) << " ";
|
||||
void Utils::printVector(const std::vector<T> & vector) {
|
||||
for (int i = 0; i < vector.size(); i++) {
|
||||
cout << static_cast<int>(vector.at(i)) << " ";
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
|
@ -51,7 +51,7 @@ void Concordia::addExample(const Example & example)
|
||||
|
||||
// Sentences are written to disk and added to T.
|
||||
// SA is generated on command by other methods.
|
||||
void Concordia::addAllExamples(const boost::ptr_vector<Example > & examples)
|
||||
void Concordia::addAllExamples(const std::vector<Example> & examples)
|
||||
throw(ConcordiaException) {
|
||||
_index->addAllExamples(_hashGenerator, _T, _markers, examples);
|
||||
}
|
||||
@ -131,26 +131,26 @@ void Concordia::_initializeIndex() throw(ConcordiaException) {
|
||||
}
|
||||
}
|
||||
|
||||
boost::ptr_vector<SubstringOccurence> Concordia::simpleSearch(
|
||||
std::vector<SubstringOccurence> Concordia::simpleSearch(
|
||||
const string & pattern)
|
||||
throw(ConcordiaException) {
|
||||
if (_T->size() > 0) {
|
||||
return _searcher->simpleSearch(_hashGenerator, _T,
|
||||
_markers, _SA, pattern);
|
||||
} else {
|
||||
boost::ptr_vector<SubstringOccurence> result;
|
||||
std::vector<SubstringOccurence> result;
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
boost::ptr_vector<AnubisSearchResult> Concordia::anubisSearch(
|
||||
std::vector<AnubisSearchResult> Concordia::anubisSearch(
|
||||
const string & pattern)
|
||||
throw(ConcordiaException) {
|
||||
if (_T->size() > 0) {
|
||||
return _searcher->anubisSearch(_hashGenerator, _T,
|
||||
_markers, _SA, pattern);
|
||||
_markers, _SA, pattern);
|
||||
} else {
|
||||
boost::ptr_vector<AnubisSearchResult> result;
|
||||
std::vector<AnubisSearchResult> result;
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
@ -4,7 +4,6 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/ptr_container/ptr_vector.hpp>
|
||||
#include <boost/filesystem.hpp>
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
@ -41,16 +40,14 @@ public:
|
||||
|
||||
void addExample(const Example & example) throw(ConcordiaException);
|
||||
|
||||
void addAllExamples(const boost::ptr_vector<Example > & examples)
|
||||
void addAllExamples(const std::vector<Example> & examples)
|
||||
throw(ConcordiaException);
|
||||
|
||||
boost::ptr_vector<SubstringOccurence> simpleSearch(
|
||||
const std::string & pattern)
|
||||
throw(ConcordiaException);
|
||||
std::vector<SubstringOccurence> simpleSearch(const std::string & pattern)
|
||||
throw(ConcordiaException);
|
||||
|
||||
boost::ptr_vector<AnubisSearchResult> anubisSearch(
|
||||
const std::string & pattern)
|
||||
throw(ConcordiaException);
|
||||
std::vector<AnubisSearchResult> anubisSearch(const std::string & pattern)
|
||||
throw(ConcordiaException);
|
||||
|
||||
void loadRAMIndexFromDisk() throw(ConcordiaException);
|
||||
|
||||
|
@ -56,7 +56,7 @@ void ConcordiaIndex::addAllExamples(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<vector<sauchar_t> > T,
|
||||
boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
const boost::ptr_vector<Example > & examples) {
|
||||
const vector<Example> & examples) {
|
||||
ofstream hashedIndexFile;
|
||||
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
|
||||
ios::app|ios::binary);
|
||||
@ -81,11 +81,11 @@ void ConcordiaIndex::_addSingleExample(
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
const Example & example) {
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash
|
||||
vector<INDEX_CHARACTER_TYPE> hash
|
||||
= hashGenerator->generateHash(example.getSentence());
|
||||
int offset = 0;
|
||||
for (vector<INDEX_CHARACTER_TYPE>::iterator it = hash->begin();
|
||||
it != hash->end(); ++it) {
|
||||
for (vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
|
||||
it != hash.end(); ++it) {
|
||||
INDEX_CHARACTER_TYPE character = *it;
|
||||
Utils::writeIndexCharacter(hashedIndexFile, character);
|
||||
Utils::appendCharToSaucharVector(T, character);
|
||||
@ -95,7 +95,7 @@ void ConcordiaIndex::_addSingleExample(
|
||||
SUFFIX_MARKER_TYPE marker = Utils::createMarker(
|
||||
example.getId(),
|
||||
offset,
|
||||
hash->size());
|
||||
hash.size());
|
||||
|
||||
Utils::writeMarker(markersFile, marker);
|
||||
markers->push_back(marker);
|
||||
|
@ -2,10 +2,10 @@
|
||||
#define CONCORDIA_INDEX_HDR
|
||||
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/ptr_container/ptr_vector.hpp>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/example.hpp"
|
||||
@ -40,7 +40,7 @@ public:
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<vector<sauchar_t> > T,
|
||||
boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
const boost::ptr_vector<Example > & examples);
|
||||
const vector<Example> & examples);
|
||||
|
||||
boost::shared_ptr<vector<saidx_t> > generateSuffixArray(
|
||||
boost::shared_ptr<vector<sauchar_t> > T);
|
||||
|
@ -25,31 +25,28 @@ HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
|
||||
HashGenerator::~HashGenerator() {
|
||||
}
|
||||
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > HashGenerator::generateHash(
|
||||
vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
|
||||
const string & sentence) throw(ConcordiaException) {
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> >
|
||||
result(new vector<INDEX_CHARACTER_TYPE>());
|
||||
boost::shared_ptr<vector<string> > tokenTexts =
|
||||
generateTokenVector(sentence);
|
||||
if (tokenTexts->size() > Utils::maxSentenceSize) {
|
||||
vector<INDEX_CHARACTER_TYPE> result;
|
||||
vector<string> tokenTexts = generateTokenVector(sentence);
|
||||
if (tokenTexts.size() > Utils::maxSentenceSize) {
|
||||
throw ConcordiaException("Trying to add too long sentence.");
|
||||
}
|
||||
for (vector<string>::iterator it = tokenTexts->begin();
|
||||
it != tokenTexts->end(); ++it) {
|
||||
for (vector<string>::iterator it = tokenTexts.begin();
|
||||
it != tokenTexts.end(); ++it) {
|
||||
string token = *it;
|
||||
INDEX_CHARACTER_TYPE code = _wordMap->getWordCode(token);
|
||||
result->push_back(code);
|
||||
result.push_back(code);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
boost::shared_ptr<vector<string> >
|
||||
HashGenerator::generateTokenVector(const string & sentence) {
|
||||
vector<string> HashGenerator::generateTokenVector(const string & sentence) {
|
||||
string anonymizedSentence = _sentenceAnonymizer->anonymize(sentence);
|
||||
boost::trim(anonymizedSentence);
|
||||
boost::shared_ptr<vector<string> > tokenTexts(new vector<string>());
|
||||
boost::split(*tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"),
|
||||
vector<string> tokenTexts;
|
||||
boost::split(tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"),
|
||||
boost::algorithm::token_compress_on);
|
||||
return tokenTexts;
|
||||
}
|
||||
|
@ -29,12 +29,10 @@ public:
|
||||
*/
|
||||
virtual ~HashGenerator();
|
||||
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> >
|
||||
generateHash(const string & sentence)
|
||||
vector<INDEX_CHARACTER_TYPE> generateHash(const string & sentence)
|
||||
throw(ConcordiaException);
|
||||
|
||||
boost::shared_ptr<vector<string> >
|
||||
generateTokenVector(const string & sentence);
|
||||
vector<string> generateTokenVector(const string & sentence);
|
||||
|
||||
void serializeWordMap();
|
||||
|
||||
|
@ -12,18 +12,17 @@ IndexSearcher::IndexSearcher() {
|
||||
IndexSearcher::~IndexSearcher() {
|
||||
}
|
||||
|
||||
boost::ptr_vector<SubstringOccurence> IndexSearcher::simpleSearch(
|
||||
vector<SubstringOccurence> IndexSearcher::simpleSearch(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
const string & pattern) throw(ConcordiaException) {
|
||||
boost::ptr_vector<SubstringOccurence> result;
|
||||
vector<SubstringOccurence> result;
|
||||
|
||||
int left;
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash =
|
||||
hashGenerator->generateHash(pattern);
|
||||
saidx_t patternLength = hash->size()*sizeof(INDEX_CHARACTER_TYPE);
|
||||
vector<INDEX_CHARACTER_TYPE> hash = hashGenerator->generateHash(pattern);
|
||||
saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
|
||||
sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
|
||||
|
||||
int size = sa_search(T->data(), (saidx_t) T->size(),
|
||||
@ -40,7 +39,7 @@ boost::ptr_vector<SubstringOccurence> IndexSearcher::simpleSearch(
|
||||
saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
|
||||
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);
|
||||
|
||||
result.push_back(new SubstringOccurence(marker));
|
||||
result.push_back(SubstringOccurence(marker));
|
||||
}
|
||||
}
|
||||
|
||||
@ -48,13 +47,12 @@ boost::ptr_vector<SubstringOccurence> IndexSearcher::simpleSearch(
|
||||
return result;
|
||||
}
|
||||
|
||||
boost::ptr_vector<AnubisSearchResult> IndexSearcher::anubisSearch(
|
||||
vector<AnubisSearchResult> IndexSearcher::anubisSearch(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
const string & pattern) throw(ConcordiaException) {
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash =
|
||||
hashGenerator->generateHash(pattern);
|
||||
vector<INDEX_CHARACTER_TYPE> hash = hashGenerator->generateHash(pattern);
|
||||
return _anubisSearcher->anubisSearch(T, markers, SA, hash);
|
||||
}
|
||||
|
@ -2,9 +2,9 @@
|
||||
#define INDEX_SEARCHER_HDR
|
||||
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/ptr_container/ptr_vector.hpp>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/substring_occurence.hpp"
|
||||
@ -30,14 +30,14 @@ public:
|
||||
*/
|
||||
virtual ~IndexSearcher();
|
||||
|
||||
boost::ptr_vector<SubstringOccurence> simpleSearch(
|
||||
vector<SubstringOccurence> simpleSearch(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||
const string & pattern) throw(ConcordiaException);
|
||||
|
||||
boost::ptr_vector<AnubisSearchResult> anubisSearch(
|
||||
vector<AnubisSearchResult> anubisSearch(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||
|
@ -59,7 +59,7 @@ void SentenceAnonymizer::_createNeRules(string & namedEntitiesPath) {
|
||||
<< " in NE file: " << namedEntitiesPath;
|
||||
throw ConcordiaException(ss.str());
|
||||
} else {
|
||||
_namedEntities.push_back(new RegexReplacement(
|
||||
_namedEntities.push_back(RegexReplacement(
|
||||
tokenTexts->at(0), tokenTexts->at(1)));
|
||||
}
|
||||
}
|
||||
|
@ -2,12 +2,12 @@
|
||||
#define SENTENCE_ANONYMIZER_HDR
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/regex_replacement.hpp"
|
||||
#include "concordia/concordia_config.hpp"
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/ptr_container/ptr_vector.hpp>
|
||||
#include <boost/filesystem.hpp>
|
||||
|
||||
|
||||
@ -39,7 +39,7 @@ private:
|
||||
string replacement,
|
||||
bool wholeWord = false);
|
||||
|
||||
boost::ptr_vector<RegexReplacement> _namedEntities;
|
||||
vector<RegexReplacement> _namedEntities;
|
||||
|
||||
boost::shared_ptr<RegexReplacement> _htmlTags;
|
||||
|
||||
|
@ -22,7 +22,6 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
||||
boost::shared_ptr<std::vector<sauchar_t> > T(new std::vector<sauchar_t>());
|
||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers(new std::vector<SUFFIX_MARKER_TYPE>());
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA(new std::vector<saidx_t>());
|
||||
boost::shared_ptr<std::vector<sauchar_t> > pattern(new std::vector<sauchar_t>());
|
||||
|
||||
/* Search in text: "banana"
|
||||
T = 123232 (all one sentence id=34)
|
||||
@ -64,25 +63,26 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
||||
markers->push_back(Utils::createMarker(34,i,6));
|
||||
}
|
||||
|
||||
pattern->push_back(0);
|
||||
pattern->push_back(0);
|
||||
pattern->push_back(0);
|
||||
pattern->push_back(2);
|
||||
std::vector<sauchar_t> pattern;
|
||||
pattern.push_back(0);
|
||||
pattern.push_back(0);
|
||||
pattern.push_back(0);
|
||||
pattern.push_back(2);
|
||||
|
||||
pattern->push_back(0);
|
||||
pattern->push_back(0);
|
||||
pattern->push_back(0);
|
||||
pattern->push_back(3);
|
||||
pattern.push_back(0);
|
||||
pattern.push_back(0);
|
||||
pattern.push_back(0);
|
||||
pattern.push_back(3);
|
||||
|
||||
pattern->push_back(0);
|
||||
pattern->push_back(0);
|
||||
pattern->push_back(0);
|
||||
pattern->push_back(4);
|
||||
pattern.push_back(0);
|
||||
pattern.push_back(0);
|
||||
pattern.push_back(0);
|
||||
pattern.push_back(4);
|
||||
|
||||
pattern->push_back(0);
|
||||
pattern->push_back(0);
|
||||
pattern->push_back(0);
|
||||
pattern->push_back(4);
|
||||
pattern.push_back(0);
|
||||
pattern.push_back(0);
|
||||
pattern.push_back(0);
|
||||
pattern.push_back(4);
|
||||
|
||||
/* Suffix array for the hashed index: 0001 0002 0003 0002 0003 0002
|
||||
0:000100020003000200030002
|
||||
@ -137,7 +137,7 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
||||
SA->push_back(11);
|
||||
|
||||
SUFFIX_MARKER_TYPE highResLength;
|
||||
boost::ptr_vector<SubstringOccurence> result = searcher.lcpSearch(T, markers, SA, pattern, highResLength);
|
||||
std::vector<SubstringOccurence> result = searcher.lcpSearch(T, markers, SA, pattern, highResLength);
|
||||
SUFFIX_MARKER_TYPE length = highResLength / sizeof(INDEX_CHARACTER_TYPE);
|
||||
|
||||
/* Expecting to get the following results from SA:
|
||||
@ -155,39 +155,39 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
||||
|
||||
//--------pattern banana
|
||||
|
||||
boost::shared_ptr<std::vector<sauchar_t> > pattern2(new std::vector<sauchar_t>());
|
||||
pattern2->push_back(0);
|
||||
pattern2->push_back(0);
|
||||
pattern2->push_back(0);
|
||||
pattern2->push_back(1);
|
||||
std::vector<sauchar_t> pattern2;
|
||||
pattern2.push_back(0);
|
||||
pattern2.push_back(0);
|
||||
pattern2.push_back(0);
|
||||
pattern2.push_back(1);
|
||||
|
||||
pattern2->push_back(0);
|
||||
pattern2->push_back(0);
|
||||
pattern2->push_back(0);
|
||||
pattern2->push_back(2);
|
||||
pattern2.push_back(0);
|
||||
pattern2.push_back(0);
|
||||
pattern2.push_back(0);
|
||||
pattern2.push_back(2);
|
||||
|
||||
pattern2->push_back(0);
|
||||
pattern2->push_back(0);
|
||||
pattern2->push_back(0);
|
||||
pattern2->push_back(3);
|
||||
pattern2.push_back(0);
|
||||
pattern2.push_back(0);
|
||||
pattern2.push_back(0);
|
||||
pattern2.push_back(3);
|
||||
|
||||
pattern2->push_back(0);
|
||||
pattern2->push_back(0);
|
||||
pattern2->push_back(0);
|
||||
pattern2->push_back(2);
|
||||
pattern2.push_back(0);
|
||||
pattern2.push_back(0);
|
||||
pattern2.push_back(0);
|
||||
pattern2.push_back(2);
|
||||
|
||||
pattern2->push_back(0);
|
||||
pattern2->push_back(0);
|
||||
pattern2->push_back(0);
|
||||
pattern2->push_back(3);
|
||||
pattern2.push_back(0);
|
||||
pattern2.push_back(0);
|
||||
pattern2.push_back(0);
|
||||
pattern2.push_back(3);
|
||||
|
||||
pattern2->push_back(0);
|
||||
pattern2->push_back(0);
|
||||
pattern2->push_back(0);
|
||||
pattern2->push_back(2);
|
||||
pattern2.push_back(0);
|
||||
pattern2.push_back(0);
|
||||
pattern2.push_back(0);
|
||||
pattern2.push_back(2);
|
||||
|
||||
SUFFIX_MARKER_TYPE highResLength2;
|
||||
boost::ptr_vector<SubstringOccurence> result2 = searcher.lcpSearch(T, markers, SA, pattern2, highResLength2);
|
||||
vector<SubstringOccurence> result2 = searcher.lcpSearch(T, markers, SA, pattern2, highResLength2);
|
||||
SUFFIX_MARKER_TYPE length2 = highResLength2 / sizeof(INDEX_CHARACTER_TYPE);
|
||||
|
||||
/* Expecting to get one result from SA:
|
||||
@ -203,34 +203,34 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
||||
|
||||
//--------pattern banan
|
||||
|
||||
boost::shared_ptr<std::vector<sauchar_t> > pattern3(new std::vector<sauchar_t>());
|
||||
pattern3->push_back(0);
|
||||
pattern3->push_back(0);
|
||||
pattern3->push_back(0);
|
||||
pattern3->push_back(1);
|
||||
std::vector<sauchar_t> pattern3;
|
||||
pattern3.push_back(0);
|
||||
pattern3.push_back(0);
|
||||
pattern3.push_back(0);
|
||||
pattern3.push_back(1);
|
||||
|
||||
pattern3->push_back(0);
|
||||
pattern3->push_back(0);
|
||||
pattern3->push_back(0);
|
||||
pattern3->push_back(2);
|
||||
pattern3.push_back(0);
|
||||
pattern3.push_back(0);
|
||||
pattern3.push_back(0);
|
||||
pattern3.push_back(2);
|
||||
|
||||
pattern3->push_back(0);
|
||||
pattern3->push_back(0);
|
||||
pattern3->push_back(0);
|
||||
pattern3->push_back(3);
|
||||
pattern3.push_back(0);
|
||||
pattern3.push_back(0);
|
||||
pattern3.push_back(0);
|
||||
pattern3.push_back(3);
|
||||
|
||||
pattern3->push_back(0);
|
||||
pattern3->push_back(0);
|
||||
pattern3->push_back(0);
|
||||
pattern3->push_back(2);
|
||||
pattern3.push_back(0);
|
||||
pattern3.push_back(0);
|
||||
pattern3.push_back(0);
|
||||
pattern3.push_back(2);
|
||||
|
||||
pattern3->push_back(0);
|
||||
pattern3->push_back(0);
|
||||
pattern3->push_back(0);
|
||||
pattern3->push_back(3);
|
||||
pattern3.push_back(0);
|
||||
pattern3.push_back(0);
|
||||
pattern3.push_back(0);
|
||||
pattern3.push_back(3);
|
||||
|
||||
SUFFIX_MARKER_TYPE highResLength3;
|
||||
boost::ptr_vector<SubstringOccurence> result3 = searcher.lcpSearch(T, markers, SA, pattern3, highResLength3);
|
||||
vector<SubstringOccurence> result3 = searcher.lcpSearch(T, markers, SA, pattern3, highResLength3);
|
||||
SUFFIX_MARKER_TYPE length3 = highResLength3 / sizeof(INDEX_CHARACTER_TYPE);
|
||||
|
||||
/* Expecting to get one result from SA:
|
||||
@ -245,29 +245,29 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
||||
|
||||
//--------pattern nazz
|
||||
|
||||
boost::shared_ptr<std::vector<sauchar_t> > pattern4(new std::vector<sauchar_t>());
|
||||
pattern4->push_back(0);
|
||||
pattern4->push_back(0);
|
||||
pattern4->push_back(0);
|
||||
pattern4->push_back(3);
|
||||
std::vector<sauchar_t> pattern4;
|
||||
pattern4.push_back(0);
|
||||
pattern4.push_back(0);
|
||||
pattern4.push_back(0);
|
||||
pattern4.push_back(3);
|
||||
|
||||
pattern4->push_back(0);
|
||||
pattern4->push_back(0);
|
||||
pattern4->push_back(0);
|
||||
pattern4->push_back(2);
|
||||
pattern4.push_back(0);
|
||||
pattern4.push_back(0);
|
||||
pattern4.push_back(0);
|
||||
pattern4.push_back(2);
|
||||
|
||||
pattern4->push_back(0);
|
||||
pattern4->push_back(0);
|
||||
pattern4->push_back(0);
|
||||
pattern4->push_back(4);
|
||||
pattern4.push_back(0);
|
||||
pattern4.push_back(0);
|
||||
pattern4.push_back(0);
|
||||
pattern4.push_back(4);
|
||||
|
||||
pattern4->push_back(0);
|
||||
pattern4->push_back(0);
|
||||
pattern4->push_back(0);
|
||||
pattern4->push_back(4);
|
||||
pattern4.push_back(0);
|
||||
pattern4.push_back(0);
|
||||
pattern4.push_back(0);
|
||||
pattern4.push_back(4);
|
||||
|
||||
SUFFIX_MARKER_TYPE highResLength4;
|
||||
boost::ptr_vector<SubstringOccurence> result4 = searcher.lcpSearch(T, markers, SA, pattern4, highResLength4);
|
||||
vector<SubstringOccurence> result4 = searcher.lcpSearch(T, markers, SA, pattern4, highResLength4);
|
||||
SUFFIX_MARKER_TYPE length4 = highResLength4 / sizeof(INDEX_CHARACTER_TYPE);
|
||||
|
||||
/* Expecting to get 2 results from SA:
|
||||
@ -286,19 +286,19 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
||||
|
||||
//--------pattern zz
|
||||
|
||||
boost::shared_ptr<std::vector<sauchar_t> > pattern5(new std::vector<sauchar_t>());
|
||||
pattern5->push_back(0);
|
||||
pattern5->push_back(0);
|
||||
pattern5->push_back(0);
|
||||
pattern5->push_back(4);
|
||||
std::vector<sauchar_t> pattern5;
|
||||
pattern5.push_back(0);
|
||||
pattern5.push_back(0);
|
||||
pattern5.push_back(0);
|
||||
pattern5.push_back(4);
|
||||
|
||||
pattern5->push_back(0);
|
||||
pattern5->push_back(0);
|
||||
pattern5->push_back(0);
|
||||
pattern5->push_back(4);
|
||||
pattern5.push_back(0);
|
||||
pattern5.push_back(0);
|
||||
pattern5.push_back(0);
|
||||
pattern5.push_back(4);
|
||||
|
||||
SUFFIX_MARKER_TYPE highResLength5;
|
||||
boost::ptr_vector<SubstringOccurence> result5 = searcher.lcpSearch(T, markers, SA, pattern5, highResLength5);
|
||||
vector<SubstringOccurence> result5 = searcher.lcpSearch(T, markers, SA, pattern5, highResLength5);
|
||||
SUFFIX_MARKER_TYPE length5 = highResLength5 / sizeof(INDEX_CHARACTER_TYPE);
|
||||
|
||||
/* Expecting to get 0 results from SA, lcp length = 0;
|
||||
@ -309,20 +309,20 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
||||
|
||||
//--------pattern existing in the text but spanning over parts of characters
|
||||
|
||||
boost::shared_ptr<std::vector<sauchar_t> > pattern6(new std::vector<sauchar_t>());
|
||||
pattern6->push_back(0);
|
||||
pattern6->push_back(0);
|
||||
pattern6->push_back(3);
|
||||
std::vector<sauchar_t> pattern6;
|
||||
pattern6.push_back(0);
|
||||
pattern6.push_back(0);
|
||||
pattern6.push_back(3);
|
||||
|
||||
pattern6->push_back(0);
|
||||
pattern6->push_back(0);
|
||||
pattern6->push_back(0);
|
||||
pattern6->push_back(2);
|
||||
pattern6.push_back(0);
|
||||
pattern6.push_back(0);
|
||||
pattern6.push_back(0);
|
||||
pattern6.push_back(2);
|
||||
|
||||
pattern6->push_back(0);
|
||||
pattern6.push_back(0);
|
||||
|
||||
SUFFIX_MARKER_TYPE highResLength6;
|
||||
boost::ptr_vector<SubstringOccurence> result6 = searcher.lcpSearch(T, markers, SA, pattern5, highResLength6);
|
||||
vector<SubstringOccurence> result6 = searcher.lcpSearch(T, markers, SA, pattern5, highResLength6);
|
||||
SUFFIX_MARKER_TYPE length6 = highResLength6 / sizeof(INDEX_CHARACTER_TYPE);
|
||||
|
||||
/* Expecting to get 0 results from SA, lcp length = 0;
|
||||
@ -378,7 +378,7 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )
|
||||
|
||||
// searching for pattern "Ola posiada rysia Marysia" (5 1 3 4)
|
||||
|
||||
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern = hashGenerator->generateHash("Ola posiada rysia Marysia");
|
||||
std::vector<INDEX_CHARACTER_TYPE> pattern = hashGenerator->generateHash("Ola posiada rysia Marysia");
|
||||
|
||||
boost::shared_ptr<TmMatchesMap> tmMatchesMap = searcher.getTmMatches(T, markers, SA, pattern);
|
||||
BOOST_CHECK_EQUAL(tmMatchesMap->size(), 3);
|
||||
@ -393,38 +393,38 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )
|
||||
|
||||
// example 14
|
||||
// example interval list: [(1,2)]
|
||||
boost::ptr_vector<Interval> exampleIntervals14 = tmMatches14->getExampleIntervals();
|
||||
vector<Interval> exampleIntervals14 = tmMatches14->getExampleIntervals();
|
||||
BOOST_CHECK_EQUAL(exampleIntervals14.size(), 1);
|
||||
BOOST_CHECK_EQUAL(exampleIntervals14[0].getStart(), 1);
|
||||
BOOST_CHECK_EQUAL(exampleIntervals14[0].getEnd(), 2);
|
||||
// pattern interval list: [(1,2)]
|
||||
boost::ptr_vector<Interval> patternIntervals14 = tmMatches14->getPatternIntervals();
|
||||
vector<Interval> patternIntervals14 = tmMatches14->getPatternIntervals();
|
||||
BOOST_CHECK_EQUAL(patternIntervals14.size(), 1);
|
||||
BOOST_CHECK_EQUAL(patternIntervals14[0].getStart(), 1);
|
||||
BOOST_CHECK_EQUAL(patternIntervals14[0].getEnd(), 2);
|
||||
|
||||
// example 51
|
||||
// example interval list: [(1,3)]
|
||||
boost::ptr_vector<Interval> exampleIntervals51 = tmMatches51->getExampleIntervals();
|
||||
vector<Interval> exampleIntervals51 = tmMatches51->getExampleIntervals();
|
||||
BOOST_CHECK_EQUAL(exampleIntervals51.size(), 1);
|
||||
BOOST_CHECK_EQUAL(exampleIntervals51[0].getStart(), 1);
|
||||
BOOST_CHECK_EQUAL(exampleIntervals51[0].getEnd(), 3);
|
||||
// pattern interval list: [(1,3)]
|
||||
boost::ptr_vector<Interval> patternIntervals51 = tmMatches51->getPatternIntervals();
|
||||
vector<Interval> patternIntervals51 = tmMatches51->getPatternIntervals();
|
||||
BOOST_CHECK_EQUAL(patternIntervals51.size(), 1);
|
||||
BOOST_CHECK_EQUAL(patternIntervals51[0].getStart(), 1);
|
||||
BOOST_CHECK_EQUAL(patternIntervals51[0].getEnd(), 3);
|
||||
|
||||
// example 123
|
||||
// example interval list: [(1,3), (0,1)]
|
||||
boost::ptr_vector<Interval> exampleIntervals123 = tmMatches123->getExampleIntervals();
|
||||
vector<Interval> exampleIntervals123 = tmMatches123->getExampleIntervals();
|
||||
BOOST_CHECK_EQUAL(exampleIntervals123.size(), 2);
|
||||
BOOST_CHECK_EQUAL(exampleIntervals123[0].getStart(), 1);
|
||||
BOOST_CHECK_EQUAL(exampleIntervals123[0].getEnd(), 3);
|
||||
BOOST_CHECK_EQUAL(exampleIntervals123[1].getStart(), 0);
|
||||
BOOST_CHECK_EQUAL(exampleIntervals123[1].getEnd(), 1);
|
||||
// pattern interval list: [(1,3), (3,4)]
|
||||
boost::ptr_vector<Interval> patternIntervals123 = tmMatches123->getPatternIntervals();
|
||||
vector<Interval> patternIntervals123 = tmMatches123->getPatternIntervals();
|
||||
BOOST_CHECK_EQUAL(patternIntervals123.size(), 2);
|
||||
BOOST_CHECK_EQUAL(patternIntervals123[0].getStart(), 1);
|
||||
BOOST_CHECK_EQUAL(patternIntervals123[0].getEnd(), 3);
|
||||
|
@ -5,7 +5,6 @@
|
||||
#include "concordia/common/config.hpp"
|
||||
|
||||
#include <boost/algorithm/string/predicate.hpp>
|
||||
#include <boost/ptr_container/ptr_vector.hpp>
|
||||
#include <boost/filesystem.hpp>
|
||||
|
||||
#include <string>
|
||||
@ -52,8 +51,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
|
||||
|
||||
*/
|
||||
|
||||
boost::ptr_vector<SubstringOccurence> searchResult1 = concordia.simpleSearch("posiada rysia");
|
||||
boost::ptr_vector<SubstringOccurence> searchResult2 = concordia.simpleSearch("posiada kota Ala");
|
||||
vector<SubstringOccurence> searchResult1 = concordia.simpleSearch("posiada rysia");
|
||||
vector<SubstringOccurence> searchResult2 = concordia.simpleSearch("posiada kota Ala");
|
||||
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
||||
@ -74,11 +73,11 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
||||
{
|
||||
// modified stop words to avoid anonymization
|
||||
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||
boost::ptr_vector<Example> testExamples;
|
||||
testExamples.push_back(new Example("xto xjest okno",312));
|
||||
testExamples.push_back(new Example("czy xjest okno otwarte",202));
|
||||
testExamples.push_back(new Example("chyba xto xjest xtutaj",45));
|
||||
testExamples.push_back(new Example("xto xjest",29));
|
||||
vector<Example> testExamples;
|
||||
testExamples.push_back(Example("xto xjest okno",312));
|
||||
testExamples.push_back(Example("czy xjest okno otwarte",202));
|
||||
testExamples.push_back(Example("chyba xto xjest xtutaj",45));
|
||||
testExamples.push_back(Example("xto xjest",29));
|
||||
concordia.addAllExamples(testExamples);
|
||||
|
||||
/*The test index contains 4 sentences:
|
||||
@ -107,8 +106,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
||||
*/
|
||||
|
||||
Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||
boost::ptr_vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("xto xjest");
|
||||
boost::ptr_vector<SubstringOccurence> searchResult2 = concordia2.simpleSearch("xjest okno");
|
||||
vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("xto xjest");
|
||||
vector<SubstringOccurence> searchResult2 = concordia2.simpleSearch("xjest okno");
|
||||
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
||||
@ -132,13 +131,13 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
||||
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
|
||||
{
|
||||
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||
boost::ptr_vector<Example> testExamples;
|
||||
testExamples.push_back(new Example("2. Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem, z jakiego korzystają obywatele tego państwa.",312));
|
||||
testExamples.push_back(new Example("czy xjest żółte otwarte",202));
|
||||
vector<Example> testExamples;
|
||||
testExamples.push_back(Example("2. Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem, z jakiego korzystają obywatele tego państwa.",312));
|
||||
testExamples.push_back(Example("czy xjest żółte otwarte",202));
|
||||
concordia.addAllExamples(testExamples);
|
||||
|
||||
Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||
boost::ptr_vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia");
|
||||
vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia");
|
||||
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
||||
@ -177,8 +176,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
|
||||
n: 0 1 2 3 4 5 6 7 8 9 10 11
|
||||
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
|
||||
|
||||
boost::ptr_vector<AnubisSearchResult> searchResult1 = concordia.anubisSearch("posiada rysia chyba");
|
||||
boost::ptr_vector<AnubisSearchResult> searchResult2 = concordia.anubisSearch("posiada kota Ala");
|
||||
vector<AnubisSearchResult> searchResult1 = concordia.anubisSearch("posiada rysia chyba");
|
||||
vector<AnubisSearchResult> searchResult2 = concordia.anubisSearch("posiada kota Ala");
|
||||
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
||||
|
@ -23,13 +23,13 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
|
||||
|
||||
HashGenerator hashGenerator = HashGenerator(config);
|
||||
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash = hashGenerator.generateHash("Ala posiada kota");
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected(new vector<INDEX_CHARACTER_TYPE>());
|
||||
expected->push_back(0);
|
||||
expected->push_back(1);
|
||||
expected->push_back(2);
|
||||
vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala posiada kota");
|
||||
vector<INDEX_CHARACTER_TYPE> expected;
|
||||
expected.push_back(0);
|
||||
expected.push_back(1);
|
||||
expected.push_back(2);
|
||||
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(hash->begin(), hash->end(), expected->begin(), expected->end());
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(hash.begin(), hash.end(), expected.begin(), expected.end());
|
||||
}
|
||||
|
||||
/* Commentet out - the test takes too long. Run it once whenever the SUFFIX_MARKER_SENTENCE_BYTES parameter changes.
|
||||
@ -76,22 +76,22 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
||||
|
||||
HashGenerator hashGenerator1 = HashGenerator(config);
|
||||
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash1 = hashGenerator1.generateHash("Ala posiada kota");
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected1(new vector<INDEX_CHARACTER_TYPE>());
|
||||
expected1->push_back(0);
|
||||
expected1->push_back(1);
|
||||
expected1->push_back(2);
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(hash1->begin(), hash1->end(), expected1->begin(), expected1->end());
|
||||
vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala posiada kota");
|
||||
vector<INDEX_CHARACTER_TYPE> expected1;
|
||||
expected1.push_back(0);
|
||||
expected1.push_back(1);
|
||||
expected1.push_back(2);
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(hash1.begin(), hash1.end(), expected1.begin(), expected1.end());
|
||||
|
||||
hashGenerator1.serializeWordMap();
|
||||
|
||||
HashGenerator hashGenerator2 = HashGenerator(config);
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash2 = hashGenerator2.generateHash("Ala posiada psa");
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected2(new vector<INDEX_CHARACTER_TYPE>());
|
||||
expected2->push_back(0);
|
||||
expected2->push_back(1);
|
||||
expected2->push_back(3);
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(hash2->begin(), hash2->end(), expected2->begin(), expected2->end());
|
||||
vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala posiada psa");
|
||||
vector<INDEX_CHARACTER_TYPE> expected2;
|
||||
expected2.push_back(0);
|
||||
expected2.push_back(1);
|
||||
expected2.push_back(3);
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(hash2.begin(), hash2.end(), expected2.begin(), expected2.end());
|
||||
|
||||
boost::filesystem::remove(config->getWordMapFilePath());
|
||||
}
|
||||
@ -106,23 +106,23 @@ BOOST_AUTO_TEST_CASE( TokenVectorTest )
|
||||
|
||||
HashGenerator hashGenerator = HashGenerator(config);
|
||||
|
||||
boost::shared_ptr<vector<string> > tokenVector = hashGenerator.generateTokenVector("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że kierowca zaparkował samochód.");
|
||||
boost::shared_ptr<vector<string> > expected(new vector<string>());
|
||||
expected->push_back("ne_date");
|
||||
expected->push_back("godzinie");
|
||||
expected->push_back("ne_number");
|
||||
expected->push_back("ne_number");
|
||||
expected->push_back("doszło");
|
||||
expected->push_back("kolizji");
|
||||
expected->push_back("ulicy");
|
||||
expected->push_back("grobla");
|
||||
expected->push_back("policjanci");
|
||||
expected->push_back("ustalili");
|
||||
expected->push_back("kierowca");
|
||||
expected->push_back("zaparkował");
|
||||
expected->push_back("samochód");
|
||||
vector<string> tokenVector = hashGenerator.generateTokenVector("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że kierowca zaparkował samochód.");
|
||||
vector<string> expected;
|
||||
expected.push_back("ne_date");
|
||||
expected.push_back("godzinie");
|
||||
expected.push_back("ne_number");
|
||||
expected.push_back("ne_number");
|
||||
expected.push_back("doszło");
|
||||
expected.push_back("kolizji");
|
||||
expected.push_back("ulicy");
|
||||
expected.push_back("grobla");
|
||||
expected.push_back("policjanci");
|
||||
expected.push_back("ustalili");
|
||||
expected.push_back("kierowca");
|
||||
expected.push_back("zaparkował");
|
||||
expected.push_back("samochód");
|
||||
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(tokenVector->begin(), tokenVector->end(), expected->begin(), expected->end());
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(tokenVector.begin(), tokenVector.end(), expected.begin(), expected.end());
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
|
@ -31,58 +31,58 @@ BOOST_AUTO_TEST_CASE( WriteReadSingleCharacter )
|
||||
|
||||
BOOST_AUTO_TEST_CASE( IndexVectorToSaucharArray )
|
||||
{
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash(new vector<INDEX_CHARACTER_TYPE>());
|
||||
hash->push_back(123456789); // in hex: 75BCD15
|
||||
vector<INDEX_CHARACTER_TYPE> hash;
|
||||
hash.push_back(123456789); // in hex: 75BCD15
|
||||
// in memory: 15 cd 5b 07
|
||||
// in memory DEC: 21 205 91 7
|
||||
|
||||
hash->push_back(987654321); // in hex: 3ADE68B1
|
||||
hash.push_back(987654321); // in hex: 3ADE68B1
|
||||
// in memory: b1 68 de 3a
|
||||
// in memory DEC: 177 104 222 58
|
||||
sauchar_t * dataArray = Utils::indexVectorToSaucharArray(hash);
|
||||
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > result(new vector<INDEX_CHARACTER_TYPE>());
|
||||
vector<INDEX_CHARACTER_TYPE> result;
|
||||
for (int i=0;i<8;i++) {
|
||||
INDEX_CHARACTER_TYPE a = dataArray[i];
|
||||
result->push_back(a);
|
||||
result.push_back(a);
|
||||
}
|
||||
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected(new vector<INDEX_CHARACTER_TYPE>());
|
||||
expected->push_back(21);
|
||||
expected->push_back(205);
|
||||
expected->push_back(91);
|
||||
expected->push_back(7);
|
||||
expected->push_back(177);
|
||||
expected->push_back(104);
|
||||
expected->push_back(222);
|
||||
expected->push_back(58);
|
||||
vector<INDEX_CHARACTER_TYPE> expected;
|
||||
expected.push_back(21);
|
||||
expected.push_back(205);
|
||||
expected.push_back(91);
|
||||
expected.push_back(7);
|
||||
expected.push_back(177);
|
||||
expected.push_back(104);
|
||||
expected.push_back(222);
|
||||
expected.push_back(58);
|
||||
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(result->begin(), result->end(), expected->begin(), expected->end());
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(result.begin(), result.end(), expected.begin(), expected.end());
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( IndexVectorToSaucharVector )
|
||||
{
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash(new vector<INDEX_CHARACTER_TYPE>());
|
||||
hash->push_back(123456789); // in hex: 75BCD15
|
||||
vector<INDEX_CHARACTER_TYPE> hash;
|
||||
hash.push_back(123456789); // in hex: 75BCD15
|
||||
// in memory: 15 cd 5b 07
|
||||
// in memory DEC: 21 205 91 7
|
||||
|
||||
hash->push_back(987654321); // in hex: 3ADE68B1
|
||||
hash.push_back(987654321); // in hex: 3ADE68B1
|
||||
// in memory: b1 68 de 3a
|
||||
// in memory DEC: 177 104 222 58
|
||||
boost::shared_ptr<vector<sauchar_t> > result = Utils::indexVectorToSaucharVector(hash);
|
||||
vector<sauchar_t> result = Utils::indexVectorToSaucharVector(hash);
|
||||
|
||||
boost::shared_ptr<vector<sauchar_t> > expected(new vector<sauchar_t>());
|
||||
expected->push_back(21);
|
||||
expected->push_back(205);
|
||||
expected->push_back(91);
|
||||
expected->push_back(7);
|
||||
expected->push_back(177);
|
||||
expected->push_back(104);
|
||||
expected->push_back(222);
|
||||
expected->push_back(58);
|
||||
vector<sauchar_t> expected;
|
||||
expected.push_back(21);
|
||||
expected.push_back(205);
|
||||
expected.push_back(91);
|
||||
expected.push_back(7);
|
||||
expected.push_back(177);
|
||||
expected.push_back(104);
|
||||
expected.push_back(222);
|
||||
expected.push_back(58);
|
||||
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(result->begin(), result->end(), expected->begin(), expected->end());
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(result.begin(), result.end(), expected.begin(), expected.end());
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( MaxSentenceSize )
|
||||
|
@ -40,37 +40,34 @@ void TmMatches::calculateSimpleScore() {
|
||||
|
||||
void TmMatches::addExampleInterval(int start, int end) {
|
||||
if (!_alreadyIntersects(_exampleMatchedRegions, start, end)) {
|
||||
_exampleMatchedRegions.push_back(new Interval(start, end));
|
||||
_exampleMatchedRegions.push_back(Interval(start, end));
|
||||
}
|
||||
}
|
||||
|
||||
void TmMatches::addPatternInterval(int start, int end) {
|
||||
if (!_alreadyIntersects(_patternMatchedRegions, start, end)) {
|
||||
_patternMatchedRegions.push_back(new Interval(start, end));
|
||||
_patternMatchedRegions.push_back(Interval(start, end));
|
||||
}
|
||||
}
|
||||
|
||||
bool TmMatches::_alreadyIntersects(
|
||||
boost::ptr_vector<Interval> intervalList,
|
||||
const vector<Interval> & intervalList,
|
||||
int start, int end) {
|
||||
Interval * tempInterval = new Interval(start, end);
|
||||
BOOST_FOREACH(Interval & oldInterval, intervalList) {
|
||||
if (oldInterval.intersects(*tempInterval)) {
|
||||
delete tempInterval;
|
||||
Interval tempInterval(start, end);
|
||||
BOOST_FOREACH(Interval oldInterval, intervalList) {
|
||||
if (oldInterval.intersects(tempInterval)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
delete tempInterval;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
double TmMatches::_getLogarithmicOverlay(
|
||||
boost::ptr_vector<Interval> intervalList,
|
||||
const vector<Interval> & intervalList,
|
||||
unsigned char sentenceSize,
|
||||
double k) {
|
||||
double overlayScore = 0;
|
||||
BOOST_FOREACH(Interval & interval, intervalList) {
|
||||
BOOST_FOREACH(Interval interval, intervalList) {
|
||||
double intervalOverlay = static_cast<double>(interval.getLength())
|
||||
/ static_cast<double>(sentenceSize);
|
||||
double significanceFactor = pow(log(interval.getLength()+1)
|
||||
|
@ -2,9 +2,9 @@
|
||||
#define TM_MATCHES_HDR
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/interval.hpp"
|
||||
#include <boost/ptr_container/ptr_vector.hpp>
|
||||
#include <boost/ptr_container/ptr_map.hpp>
|
||||
|
||||
|
||||
@ -29,11 +29,11 @@ public:
|
||||
return _score;
|
||||
}
|
||||
|
||||
boost::ptr_vector<Interval> getExampleIntervals() const {
|
||||
vector<Interval> getExampleIntervals() const {
|
||||
return _exampleMatchedRegions;
|
||||
}
|
||||
|
||||
boost::ptr_vector<Interval> getPatternIntervals() const {
|
||||
vector<Interval> getPatternIntervals() const {
|
||||
return _patternMatchedRegions;
|
||||
}
|
||||
|
||||
@ -50,18 +50,18 @@ public:
|
||||
void addPatternInterval(int start, int end);
|
||||
|
||||
private:
|
||||
bool _alreadyIntersects(boost::ptr_vector<Interval> intervalList,
|
||||
int start, int end);
|
||||
bool _alreadyIntersects(const vector<Interval> & intervalList,
|
||||
int start, int end);
|
||||
|
||||
double _getLogarithmicOverlay(boost::ptr_vector<Interval> intervalList,
|
||||
unsigned char sentenceSize,
|
||||
double k);
|
||||
double _getLogarithmicOverlay(const vector<Interval> & intervalList,
|
||||
unsigned char sentenceSize,
|
||||
double k);
|
||||
|
||||
SUFFIX_MARKER_TYPE _exampleId;
|
||||
|
||||
boost::ptr_vector<Interval> _exampleMatchedRegions;
|
||||
vector<Interval> _exampleMatchedRegions;
|
||||
|
||||
boost::ptr_vector<Interval> _patternMatchedRegions;
|
||||
vector<Interval> _patternMatchedRegions;
|
||||
|
||||
unsigned char _patternSize;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user