std vectors

Former-commit-id: 5816e87c856f7edc242cc707851a0e2ad05aeb38
This commit is contained in:
rjawor 2015-04-15 10:55:26 +02:00
parent e02bbaa0fa
commit 3a03b01f42
22 changed files with 314 additions and 319 deletions

View File

@ -21,6 +21,8 @@ IN PROGRESS 2. Wykonać anubis search na nowych markerach z długością zdania
zastanowić się nad optymalizacją:
- tmMatchesMap jako normalna mapa (nie ptr_map)
- REJECTED LCP array
- !important! rezygnacja z ptr_vector (wycieki!)
- !important! rezygnacja z ptr_vector
- zwracanie wektorów
- powyrzucać using namespace std
- profiling

View File

@ -3,7 +3,6 @@
#include <boost/program_options.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/date_time/posix_time/posix_time.hpp>
#include <boost/ptr_container/ptr_vector.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp>
@ -68,7 +67,7 @@ int main(int argc, char** argv) {
std::cout << "\tSearching for pattern: \"" << pattern <<
"\"" << std::endl;
time_start = boost::posix_time::microsec_clock::local_time();
boost::ptr_vector<SubstringOccurence> result =
std::vector<SubstringOccurence> result =
concordia.simpleSearch(pattern);
time_end = boost::posix_time::microsec_clock::local_time();
msdiff = time_end - time_start;
@ -89,12 +88,12 @@ int main(int argc, char** argv) {
std::string line;
if (text_file.is_open()) {
long lineCount = 0;
boost::ptr_vector<Example> buffer;
vector<Example> buffer;
boost::posix_time::ptime timeStart =
boost::posix_time::microsec_clock::local_time();
while (getline(text_file, line)) {
lineCount++;
buffer.push_back(new Example(line, lineCount));
buffer.push_back(Example(line, lineCount));
if (lineCount % READ_BUFFER_LENGTH == 0) {
concordia.addAllExamples(buffer);
buffer.clear();

View File

@ -12,16 +12,16 @@ AnubisSearcher::~AnubisSearcher() {
}
// Entry point of the anubis search for an already hashed pattern.
// T, markers and SA are forwarded unchanged to getTmMatches() together
// with the pattern. Declared to throw ConcordiaException.
// NOTE(review): in the code shown here the returned vector is never filled,
// so this function always returns an empty result.
boost::ptr_vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
std::vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
throw(ConcordiaException) {
// Collect the TM matches for the pattern; the map is not read again below.
boost::shared_ptr<TmMatchesMap> tmMatchesMap = getTmMatches(T, markers, SA, pattern);
// get the tmMatches list sorted descending by score
// (the step above is only described by the comment; no code implements it here)
boost::ptr_vector<AnubisSearchResult> result;
std::vector<AnubisSearchResult> result;
return result;
}
@ -29,30 +29,28 @@ boost::shared_ptr<TmMatchesMap> AnubisSearcher::getTmMatches(
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
throw(ConcordiaException) {
boost::shared_ptr<std::vector<sauchar_t> > patternVector =
std::vector<sauchar_t> patternVector =
Utils::indexVectorToSaucharVector(pattern);
if (patternVector->size() !=
pattern->size() * sizeof(INDEX_CHARACTER_TYPE)) {
if (patternVector.size() !=
pattern.size() * sizeof(INDEX_CHARACTER_TYPE)) {
throw ConcordiaException("Increasing pattern resolution went wrong.");
}
boost::shared_ptr<TmMatchesMap> tmMatchesMap(new TmMatchesMap());
for (int offset = 0; offset < pattern->size(); offset++) {
for (int offset = 0; offset < pattern.size(); offset++) {
int highResOffset = offset * sizeof(INDEX_CHARACTER_TYPE);
boost::shared_ptr<std::vector<sauchar_t> > currentPattern =
boost::shared_ptr<std::vector<sauchar_t> >
(new std::vector<sauchar_t>(
patternVector->begin()+highResOffset, patternVector->end()));
std::vector<sauchar_t> currentPattern(
patternVector.begin()+highResOffset, patternVector.end());
saidx_t patternLength = 0;
saidx_t size = SA->size();
saidx_t left = 0;
sauchar_t * patternArray = currentPattern->data();
sauchar_t * patternArray = currentPattern.data();
saidx_t * SAleft = SA->data();
@ -77,19 +75,19 @@ boost::shared_ptr<TmMatchesMap> AnubisSearcher::getTmMatches(
// Add to tm matches map results surrounding the main stream.
// from left
for (saidx_t i = prevLeft; i < left; i++) {
_addToMap(SA, markers, tmMatchesMap, i, pattern->size(), (patternLength / sizeof(INDEX_CHARACTER_TYPE)) -1, offset);
_addToMap(SA, markers, tmMatchesMap, i, pattern.size(), (patternLength / sizeof(INDEX_CHARACTER_TYPE)) -1, offset);
}
// from right
for (saidx_t i = left+size; i < prevLeft+prevSize; i++) {
_addToMap(SA, markers, tmMatchesMap, i, pattern->size(), (patternLength / sizeof(INDEX_CHARACTER_TYPE)) -1, offset);
_addToMap(SA, markers, tmMatchesMap, i, pattern.size(), (patternLength / sizeof(INDEX_CHARACTER_TYPE)) -1, offset);
}
}
} while (patternLength < currentPattern->size() && size > 0);
} while (patternLength < currentPattern.size() && size > 0);
if (size > 0) {
for (saidx_t i = left; i < left+size; i++) {
_addToMap(SA, markers, tmMatchesMap, i, pattern->size(), patternLength / sizeof(INDEX_CHARACTER_TYPE), offset);
_addToMap(SA, markers, tmMatchesMap, i, pattern.size(), patternLength / sizeof(INDEX_CHARACTER_TYPE), offset);
}
}
}
@ -97,18 +95,18 @@ boost::shared_ptr<TmMatchesMap> AnubisSearcher::getTmMatches(
return tmMatchesMap;
}
boost::ptr_vector<SubstringOccurence> AnubisSearcher::lcpSearch(
std::vector<SubstringOccurence> AnubisSearcher::lcpSearch(
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
boost::shared_ptr<std::vector<sauchar_t> > pattern,
const std::vector<sauchar_t> & pattern,
SUFFIX_MARKER_TYPE & length)
throw(ConcordiaException) {
saidx_t patternLength = 0;
saidx_t size = SA->size();
saidx_t left = 0;
sauchar_t * patternArray = pattern->data();
const sauchar_t * patternArray = pattern.data();
saidx_t * SAleft = SA->data();
@ -126,9 +124,9 @@ boost::ptr_vector<SubstringOccurence> AnubisSearcher::lcpSearch(
SAleft, size, &localLeft);
left += localLeft;
SAleft += localLeft;
} while (patternLength < pattern->size() && size > 0);
} while (patternLength < pattern.size() && size > 0);
boost::ptr_vector<SubstringOccurence> result;
vector<SubstringOccurence> result;
if (size == 0) {
// The search managed to find exactly the longest common prefixes.
@ -151,7 +149,7 @@ boost::ptr_vector<SubstringOccurence> AnubisSearcher::lcpSearch(
}
void AnubisSearcher::_collectResults(
boost::ptr_vector<SubstringOccurence> & result,
vector<SubstringOccurence> & result,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
saidx_t left, saidx_t size) {
@ -160,7 +158,7 @@ void AnubisSearcher::_collectResults(
if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
SUFFIX_MARKER_TYPE marker = markers->at(resultPos / sizeof(INDEX_CHARACTER_TYPE));
result.push_back(new SubstringOccurence(marker));
result.push_back(SubstringOccurence(marker));
}
}
}

View File

@ -2,7 +2,6 @@
#define ANUBIS_SEARCHER_HDR
#include <boost/shared_ptr.hpp>
#include <boost/ptr_container/ptr_vector.hpp>
#include "concordia/common/config.hpp"
#include "concordia/common/utils.hpp"
@ -28,29 +27,29 @@ public:
*/
virtual ~AnubisSearcher();
boost::ptr_vector<AnubisSearchResult> anubisSearch(
std::vector<AnubisSearchResult> anubisSearch(
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
throw(ConcordiaException);
boost::shared_ptr<TmMatchesMap> getTmMatches(
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern)
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
throw(ConcordiaException);
boost::ptr_vector<SubstringOccurence> lcpSearch(
std::vector<SubstringOccurence> lcpSearch(
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
boost::shared_ptr<std::vector<sauchar_t> > pattern,
const std::vector<sauchar_t> & pattern,
SUFFIX_MARKER_TYPE & length) throw(ConcordiaException);
private:
void _collectResults(boost::ptr_vector<SubstringOccurence> & result,
void _collectResults(vector<SubstringOccurence> & result,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
saidx_t left, saidx_t size);

View File

@ -30,31 +30,39 @@ SUFFIX_MARKER_TYPE Utils::readMarker(ifstream & file) {
}
// Converts a vector of index characters into a freshly allocated sauchar_t
// array holding input.size() * sizeof(INDEX_CHARACTER_TYPE) elements.
// Every input character is written by _insertCharToSaucharArray at an offset
// that advances by sizeof(INDEX_CHARACTER_TYPE) per character.
// The array is allocated with new[] and returned as a raw pointer; this
// function never releases it.
// NOTE(review): callers presumably have to delete[] the result - confirm at
// the call sites.
sauchar_t * Utils::indexVectorToSaucharArray(
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > input) {
const int kArraySize = input->size()*sizeof(INDEX_CHARACTER_TYPE);
const vector<INDEX_CHARACTER_TYPE> & input) {
const int kArraySize = input.size()*sizeof(INDEX_CHARACTER_TYPE);
sauchar_t * patternArray =
new sauchar_t[kArraySize];
// pos is the write offset inside patternArray for the current character.
int pos = 0;
for (vector<INDEX_CHARACTER_TYPE>::iterator it = input->begin();
it != input->end(); ++it) {
for (vector<INDEX_CHARACTER_TYPE>::const_iterator it = input.begin();
it != input.end(); ++it) {
_insertCharToSaucharArray(patternArray, *it, pos);
pos += sizeof(INDEX_CHARACTER_TYPE);
}
return patternArray;
}
// Builds a sauchar_t vector from a vector of index characters by calling
// appendCharToSaucharVector once per input character, in input order.
// An empty input yields an empty result.
boost::shared_ptr<std::vector<sauchar_t> > Utils::indexVectorToSaucharVector(
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > input) {
boost::shared_ptr<std::vector<sauchar_t> > result =
boost::shared_ptr<std::vector<sauchar_t> >(new std::vector<sauchar_t>);
for (vector<INDEX_CHARACTER_TYPE>::iterator it = input->begin();
it != input->end(); ++it) {
std::vector<sauchar_t> Utils::indexVectorToSaucharVector(
const vector<INDEX_CHARACTER_TYPE> & input) {
std::vector<sauchar_t> result;
for (vector<INDEX_CHARACTER_TYPE>::const_iterator it = input.begin();
it != input.end(); ++it) {
appendCharToSaucharVector(result, *it);
}
return result;
}
void Utils::appendCharToSaucharVector(
std::vector<sauchar_t> & vector,
INDEX_CHARACTER_TYPE character) {
sauchar_t * characterArray = reinterpret_cast<sauchar_t *>(&character);
for (int i = 0; i < sizeof(character); i++) {
vector.push_back(characterArray[i]);
}
}
void Utils::appendCharToSaucharVector(
boost::shared_ptr<std::vector<sauchar_t> > vector,
INDEX_CHARACTER_TYPE character) {

View File

@ -32,18 +32,21 @@ public:
static SUFFIX_MARKER_TYPE readMarker(ifstream & file);
static sauchar_t * indexVectorToSaucharArray(
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > input);
const vector<INDEX_CHARACTER_TYPE> & input);
static boost::shared_ptr<std::vector<sauchar_t> >
indexVectorToSaucharVector(
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > input);
static std::vector<sauchar_t> indexVectorToSaucharVector(
const vector<INDEX_CHARACTER_TYPE> & input);
static void appendCharToSaucharVector(
boost::shared_ptr<std::vector<sauchar_t> > vector,
INDEX_CHARACTER_TYPE character);
static void appendCharToSaucharVector(
std::vector<sauchar_t> & vector,
INDEX_CHARACTER_TYPE character);
template <typename T>
static void printVector(boost::shared_ptr<std::vector<T> > vector);
static void printVector(const std::vector<T> & vector);
static SUFFIX_MARKER_TYPE getIdFromMarker(SUFFIX_MARKER_TYPE marker);
@ -65,9 +68,9 @@ private:
};
// Debug helper: writes every element of the vector to cout, converted to int
// with static_cast and followed by a single space, then ends the line.
// Elements are read with at(), i.e. with bounds checking.
template <typename T>
void Utils::printVector(boost::shared_ptr<std::vector<T> > vector) {
for (int i = 0; i < vector->size(); i++) {
cout << static_cast<int>(vector->at(i)) << " ";
void Utils::printVector(const std::vector<T> & vector) {
for (int i = 0; i < vector.size(); i++) {
cout << static_cast<int>(vector.at(i)) << " ";
}
cout << endl;
}

View File

@ -51,7 +51,7 @@ void Concordia::addExample(const Example & example)
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
// Thin wrapper: forwards the examples to _index->addAllExamples together with
// the members _hashGenerator, _T and _markers.
// Declared to throw ConcordiaException.
void Concordia::addAllExamples(const boost::ptr_vector<Example > & examples)
void Concordia::addAllExamples(const std::vector<Example> & examples)
throw(ConcordiaException) {
_index->addAllExamples(_hashGenerator, _T, _markers, examples);
}
@ -131,26 +131,26 @@ void Concordia::_initializeIndex() throw(ConcordiaException) {
}
}
// Runs a simple search for `pattern`.
// When _T holds at least one element the call is delegated to
// _searcher->simpleSearch with _hashGenerator, _T, _markers and _SA;
// otherwise an empty result is returned without touching the searcher.
// Declared to throw ConcordiaException.
boost::ptr_vector<SubstringOccurence> Concordia::simpleSearch(
std::vector<SubstringOccurence> Concordia::simpleSearch(
const string & pattern)
throw(ConcordiaException) {
if (_T->size() > 0) {
return _searcher->simpleSearch(_hashGenerator, _T,
_markers, _SA, pattern);
} else {
// _T is empty: return an empty result.
boost::ptr_vector<SubstringOccurence> result;
std::vector<SubstringOccurence> result;
return result;
}
}
// Runs an anubis search for `pattern`.
// When _T holds at least one element the call is delegated to
// _searcher->anubisSearch with _hashGenerator, _T, _markers and _SA;
// otherwise an empty result is returned without touching the searcher.
// Declared to throw ConcordiaException.
boost::ptr_vector<AnubisSearchResult> Concordia::anubisSearch(
std::vector<AnubisSearchResult> Concordia::anubisSearch(
const string & pattern)
throw(ConcordiaException) {
if (_T->size() > 0) {
return _searcher->anubisSearch(_hashGenerator, _T,
_markers, _SA, pattern);
_markers, _SA, pattern);
} else {
// _T is empty: return an empty result.
boost::ptr_vector<AnubisSearchResult> result;
std::vector<AnubisSearchResult> result;
return result;
}
}

View File

@ -4,7 +4,6 @@
#include <string>
#include <vector>
#include <boost/shared_ptr.hpp>
#include <boost/ptr_container/ptr_vector.hpp>
#include <boost/filesystem.hpp>
#include "concordia/common/config.hpp"
@ -41,16 +40,14 @@ public:
void addExample(const Example & example) throw(ConcordiaException);
void addAllExamples(const boost::ptr_vector<Example > & examples)
void addAllExamples(const std::vector<Example> & examples)
throw(ConcordiaException);
boost::ptr_vector<SubstringOccurence> simpleSearch(
const std::string & pattern)
throw(ConcordiaException);
std::vector<SubstringOccurence> simpleSearch(const std::string & pattern)
throw(ConcordiaException);
boost::ptr_vector<AnubisSearchResult> anubisSearch(
const std::string & pattern)
throw(ConcordiaException);
std::vector<AnubisSearchResult> anubisSearch(const std::string & pattern)
throw(ConcordiaException);
void loadRAMIndexFromDisk() throw(ConcordiaException);

View File

@ -56,7 +56,7 @@ void ConcordiaIndex::addAllExamples(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<vector<sauchar_t> > T,
boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
const boost::ptr_vector<Example > & examples) {
const vector<Example> & examples) {
ofstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
ios::app|ios::binary);
@ -81,11 +81,11 @@ void ConcordiaIndex::_addSingleExample(
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example) {
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash
vector<INDEX_CHARACTER_TYPE> hash
= hashGenerator->generateHash(example.getSentence());
int offset = 0;
for (vector<INDEX_CHARACTER_TYPE>::iterator it = hash->begin();
it != hash->end(); ++it) {
for (vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
it != hash.end(); ++it) {
INDEX_CHARACTER_TYPE character = *it;
Utils::writeIndexCharacter(hashedIndexFile, character);
Utils::appendCharToSaucharVector(T, character);
@ -95,7 +95,7 @@ void ConcordiaIndex::_addSingleExample(
SUFFIX_MARKER_TYPE marker = Utils::createMarker(
example.getId(),
offset,
hash->size());
hash.size());
Utils::writeMarker(markersFile, marker);
markers->push_back(marker);

View File

@ -2,10 +2,10 @@
#define CONCORDIA_INDEX_HDR
#include <boost/shared_ptr.hpp>
#include <boost/ptr_container/ptr_vector.hpp>
#include <fstream>
#include <iostream>
#include <sstream>
#include <vector>
#include "concordia/common/config.hpp"
#include "concordia/example.hpp"
@ -40,7 +40,7 @@ public:
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<vector<sauchar_t> > T,
boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
const boost::ptr_vector<Example > & examples);
const vector<Example> & examples);
boost::shared_ptr<vector<saidx_t> > generateSuffixArray(
boost::shared_ptr<vector<sauchar_t> > T);

View File

@ -25,31 +25,28 @@ HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
HashGenerator::~HashGenerator() {
}
// Turns a sentence into a sequence of word codes.
// The sentence is tokenized with generateTokenVector(); each token is then
// mapped to a code through _wordMap->getWordCode and appended to the result
// in token order.
// Throws ConcordiaException when the sentence has more than
// Utils::maxSentenceSize tokens.
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > HashGenerator::generateHash(
vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
const string & sentence) throw(ConcordiaException) {
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> >
result(new vector<INDEX_CHARACTER_TYPE>());
boost::shared_ptr<vector<string> > tokenTexts =
generateTokenVector(sentence);
if (tokenTexts->size() > Utils::maxSentenceSize) {
vector<INDEX_CHARACTER_TYPE> result;
vector<string> tokenTexts = generateTokenVector(sentence);
if (tokenTexts.size() > Utils::maxSentenceSize) {
throw ConcordiaException("Trying to add too long sentence.");
}
for (vector<string>::iterator it = tokenTexts->begin();
it != tokenTexts->end(); ++it) {
for (vector<string>::iterator it = tokenTexts.begin();
it != tokenTexts.end(); ++it) {
string token = *it;
INDEX_CHARACTER_TYPE code = _wordMap->getWordCode(token);
result->push_back(code);
result.push_back(code);
}
return result;
}
// Splits a sentence into tokens.
// The sentence is first passed through _sentenceAnonymizer->anonymize and
// trimmed with boost::trim, then split on space, tab, CR and LF.
// token_compress_on makes a run of adjacent separators count as one.
boost::shared_ptr<vector<string> >
HashGenerator::generateTokenVector(const string & sentence) {
vector<string> HashGenerator::generateTokenVector(const string & sentence) {
string anonymizedSentence = _sentenceAnonymizer->anonymize(sentence);
boost::trim(anonymizedSentence);
boost::shared_ptr<vector<string> > tokenTexts(new vector<string>());
boost::split(*tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"),
vector<string> tokenTexts;
boost::split(tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"),
boost::algorithm::token_compress_on);
return tokenTexts;
}

View File

@ -29,12 +29,10 @@ public:
*/
virtual ~HashGenerator();
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> >
generateHash(const string & sentence)
vector<INDEX_CHARACTER_TYPE> generateHash(const string & sentence)
throw(ConcordiaException);
boost::shared_ptr<vector<string> >
generateTokenVector(const string & sentence);
vector<string> generateTokenVector(const string & sentence);
void serializeWordMap();

View File

@ -12,18 +12,17 @@ IndexSearcher::IndexSearcher() {
IndexSearcher::~IndexSearcher() {
}
boost::ptr_vector<SubstringOccurence> IndexSearcher::simpleSearch(
vector<SubstringOccurence> IndexSearcher::simpleSearch(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const string & pattern) throw(ConcordiaException) {
boost::ptr_vector<SubstringOccurence> result;
vector<SubstringOccurence> result;
int left;
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash =
hashGenerator->generateHash(pattern);
saidx_t patternLength = hash->size()*sizeof(INDEX_CHARACTER_TYPE);
vector<INDEX_CHARACTER_TYPE> hash = hashGenerator->generateHash(pattern);
saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
int size = sa_search(T->data(), (saidx_t) T->size(),
@ -40,7 +39,7 @@ boost::ptr_vector<SubstringOccurence> IndexSearcher::simpleSearch(
saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);
result.push_back(new SubstringOccurence(marker));
result.push_back(SubstringOccurence(marker));
}
}
@ -48,13 +47,12 @@ boost::ptr_vector<SubstringOccurence> IndexSearcher::simpleSearch(
return result;
}
// Anubis search for a plain-text pattern.
// The pattern is hashed with hashGenerator->generateHash and the hash is
// passed, together with T, markers and SA, to _anubisSearcher->anubisSearch,
// whose result is returned unchanged.
// Declared to throw ConcordiaException.
boost::ptr_vector<AnubisSearchResult> IndexSearcher::anubisSearch(
vector<AnubisSearchResult> IndexSearcher::anubisSearch(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const string & pattern) throw(ConcordiaException) {
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash =
hashGenerator->generateHash(pattern);
vector<INDEX_CHARACTER_TYPE> hash = hashGenerator->generateHash(pattern);
return _anubisSearcher->anubisSearch(T, markers, SA, hash);
}

View File

@ -2,9 +2,9 @@
#define INDEX_SEARCHER_HDR
#include <boost/shared_ptr.hpp>
#include <boost/ptr_container/ptr_vector.hpp>
#include <fstream>
#include <iostream>
#include <vector>
#include "concordia/common/config.hpp"
#include "concordia/substring_occurence.hpp"
@ -30,14 +30,14 @@ public:
*/
virtual ~IndexSearcher();
boost::ptr_vector<SubstringOccurence> simpleSearch(
vector<SubstringOccurence> simpleSearch(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<std::vector<saidx_t> > SA,
const string & pattern) throw(ConcordiaException);
boost::ptr_vector<AnubisSearchResult> anubisSearch(
vector<AnubisSearchResult> anubisSearch(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,

View File

@ -59,7 +59,7 @@ void SentenceAnonymizer::_createNeRules(string & namedEntitiesPath) {
<< " in NE file: " << namedEntitiesPath;
throw ConcordiaException(ss.str());
} else {
_namedEntities.push_back(new RegexReplacement(
_namedEntities.push_back(RegexReplacement(
tokenTexts->at(0), tokenTexts->at(1)));
}
}

View File

@ -2,12 +2,12 @@
#define SENTENCE_ANONYMIZER_HDR
#include <string>
#include <vector>
#include "concordia/common/config.hpp"
#include "concordia/regex_replacement.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_exception.hpp"
#include <boost/shared_ptr.hpp>
#include <boost/ptr_container/ptr_vector.hpp>
#include <boost/filesystem.hpp>
@ -39,7 +39,7 @@ private:
string replacement,
bool wholeWord = false);
boost::ptr_vector<RegexReplacement> _namedEntities;
vector<RegexReplacement> _namedEntities;
boost::shared_ptr<RegexReplacement> _htmlTags;

View File

@ -22,7 +22,6 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
boost::shared_ptr<std::vector<sauchar_t> > T(new std::vector<sauchar_t>());
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers(new std::vector<SUFFIX_MARKER_TYPE>());
boost::shared_ptr<std::vector<saidx_t> > SA(new std::vector<saidx_t>());
boost::shared_ptr<std::vector<sauchar_t> > pattern(new std::vector<sauchar_t>());
/* Search in text: "banana"
T = 123232 (all one sentence id=34)
@ -64,25 +63,26 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
markers->push_back(Utils::createMarker(34,i,6));
}
pattern->push_back(0);
pattern->push_back(0);
pattern->push_back(0);
pattern->push_back(2);
std::vector<sauchar_t> pattern;
pattern.push_back(0);
pattern.push_back(0);
pattern.push_back(0);
pattern.push_back(2);
pattern->push_back(0);
pattern->push_back(0);
pattern->push_back(0);
pattern->push_back(3);
pattern.push_back(0);
pattern.push_back(0);
pattern.push_back(0);
pattern.push_back(3);
pattern->push_back(0);
pattern->push_back(0);
pattern->push_back(0);
pattern->push_back(4);
pattern.push_back(0);
pattern.push_back(0);
pattern.push_back(0);
pattern.push_back(4);
pattern->push_back(0);
pattern->push_back(0);
pattern->push_back(0);
pattern->push_back(4);
pattern.push_back(0);
pattern.push_back(0);
pattern.push_back(0);
pattern.push_back(4);
/* Suffix array for the hashed index: 0001 0002 0003 0002 0003 0002
0:000100020003000200030002
@ -137,7 +137,7 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
SA->push_back(11);
SUFFIX_MARKER_TYPE highResLength;
boost::ptr_vector<SubstringOccurence> result = searcher.lcpSearch(T, markers, SA, pattern, highResLength);
std::vector<SubstringOccurence> result = searcher.lcpSearch(T, markers, SA, pattern, highResLength);
SUFFIX_MARKER_TYPE length = highResLength / sizeof(INDEX_CHARACTER_TYPE);
/* Expecting to get the following results from SA:
@ -155,39 +155,39 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
//--------pattern banana
boost::shared_ptr<std::vector<sauchar_t> > pattern2(new std::vector<sauchar_t>());
pattern2->push_back(0);
pattern2->push_back(0);
pattern2->push_back(0);
pattern2->push_back(1);
std::vector<sauchar_t> pattern2;
pattern2.push_back(0);
pattern2.push_back(0);
pattern2.push_back(0);
pattern2.push_back(1);
pattern2->push_back(0);
pattern2->push_back(0);
pattern2->push_back(0);
pattern2->push_back(2);
pattern2.push_back(0);
pattern2.push_back(0);
pattern2.push_back(0);
pattern2.push_back(2);
pattern2->push_back(0);
pattern2->push_back(0);
pattern2->push_back(0);
pattern2->push_back(3);
pattern2.push_back(0);
pattern2.push_back(0);
pattern2.push_back(0);
pattern2.push_back(3);
pattern2->push_back(0);
pattern2->push_back(0);
pattern2->push_back(0);
pattern2->push_back(2);
pattern2.push_back(0);
pattern2.push_back(0);
pattern2.push_back(0);
pattern2.push_back(2);
pattern2->push_back(0);
pattern2->push_back(0);
pattern2->push_back(0);
pattern2->push_back(3);
pattern2.push_back(0);
pattern2.push_back(0);
pattern2.push_back(0);
pattern2.push_back(3);
pattern2->push_back(0);
pattern2->push_back(0);
pattern2->push_back(0);
pattern2->push_back(2);
pattern2.push_back(0);
pattern2.push_back(0);
pattern2.push_back(0);
pattern2.push_back(2);
SUFFIX_MARKER_TYPE highResLength2;
boost::ptr_vector<SubstringOccurence> result2 = searcher.lcpSearch(T, markers, SA, pattern2, highResLength2);
vector<SubstringOccurence> result2 = searcher.lcpSearch(T, markers, SA, pattern2, highResLength2);
SUFFIX_MARKER_TYPE length2 = highResLength2 / sizeof(INDEX_CHARACTER_TYPE);
/* Expecting to get one result from SA:
@ -203,34 +203,34 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
//--------pattern banan
boost::shared_ptr<std::vector<sauchar_t> > pattern3(new std::vector<sauchar_t>());
pattern3->push_back(0);
pattern3->push_back(0);
pattern3->push_back(0);
pattern3->push_back(1);
std::vector<sauchar_t> pattern3;
pattern3.push_back(0);
pattern3.push_back(0);
pattern3.push_back(0);
pattern3.push_back(1);
pattern3->push_back(0);
pattern3->push_back(0);
pattern3->push_back(0);
pattern3->push_back(2);
pattern3.push_back(0);
pattern3.push_back(0);
pattern3.push_back(0);
pattern3.push_back(2);
pattern3->push_back(0);
pattern3->push_back(0);
pattern3->push_back(0);
pattern3->push_back(3);
pattern3.push_back(0);
pattern3.push_back(0);
pattern3.push_back(0);
pattern3.push_back(3);
pattern3->push_back(0);
pattern3->push_back(0);
pattern3->push_back(0);
pattern3->push_back(2);
pattern3.push_back(0);
pattern3.push_back(0);
pattern3.push_back(0);
pattern3.push_back(2);
pattern3->push_back(0);
pattern3->push_back(0);
pattern3->push_back(0);
pattern3->push_back(3);
pattern3.push_back(0);
pattern3.push_back(0);
pattern3.push_back(0);
pattern3.push_back(3);
SUFFIX_MARKER_TYPE highResLength3;
boost::ptr_vector<SubstringOccurence> result3 = searcher.lcpSearch(T, markers, SA, pattern3, highResLength3);
vector<SubstringOccurence> result3 = searcher.lcpSearch(T, markers, SA, pattern3, highResLength3);
SUFFIX_MARKER_TYPE length3 = highResLength3 / sizeof(INDEX_CHARACTER_TYPE);
/* Expecting to get one result from SA:
@ -245,29 +245,29 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
//--------pattern nazz
boost::shared_ptr<std::vector<sauchar_t> > pattern4(new std::vector<sauchar_t>());
pattern4->push_back(0);
pattern4->push_back(0);
pattern4->push_back(0);
pattern4->push_back(3);
std::vector<sauchar_t> pattern4;
pattern4.push_back(0);
pattern4.push_back(0);
pattern4.push_back(0);
pattern4.push_back(3);
pattern4->push_back(0);
pattern4->push_back(0);
pattern4->push_back(0);
pattern4->push_back(2);
pattern4.push_back(0);
pattern4.push_back(0);
pattern4.push_back(0);
pattern4.push_back(2);
pattern4->push_back(0);
pattern4->push_back(0);
pattern4->push_back(0);
pattern4->push_back(4);
pattern4.push_back(0);
pattern4.push_back(0);
pattern4.push_back(0);
pattern4.push_back(4);
pattern4->push_back(0);
pattern4->push_back(0);
pattern4->push_back(0);
pattern4->push_back(4);
pattern4.push_back(0);
pattern4.push_back(0);
pattern4.push_back(0);
pattern4.push_back(4);
SUFFIX_MARKER_TYPE highResLength4;
boost::ptr_vector<SubstringOccurence> result4 = searcher.lcpSearch(T, markers, SA, pattern4, highResLength4);
vector<SubstringOccurence> result4 = searcher.lcpSearch(T, markers, SA, pattern4, highResLength4);
SUFFIX_MARKER_TYPE length4 = highResLength4 / sizeof(INDEX_CHARACTER_TYPE);
/* Expecting to get 2 results from SA:
@ -286,19 +286,19 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
//--------pattern zz
boost::shared_ptr<std::vector<sauchar_t> > pattern5(new std::vector<sauchar_t>());
pattern5->push_back(0);
pattern5->push_back(0);
pattern5->push_back(0);
pattern5->push_back(4);
std::vector<sauchar_t> pattern5;
pattern5.push_back(0);
pattern5.push_back(0);
pattern5.push_back(0);
pattern5.push_back(4);
pattern5->push_back(0);
pattern5->push_back(0);
pattern5->push_back(0);
pattern5->push_back(4);
pattern5.push_back(0);
pattern5.push_back(0);
pattern5.push_back(0);
pattern5.push_back(4);
SUFFIX_MARKER_TYPE highResLength5;
boost::ptr_vector<SubstringOccurence> result5 = searcher.lcpSearch(T, markers, SA, pattern5, highResLength5);
vector<SubstringOccurence> result5 = searcher.lcpSearch(T, markers, SA, pattern5, highResLength5);
SUFFIX_MARKER_TYPE length5 = highResLength5 / sizeof(INDEX_CHARACTER_TYPE);
/* Expecting to get 0 results from SA, lcp length = 0;
@ -309,20 +309,20 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
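//--------pattern existing in the text but spanning over parts of characters
// NOTE(review): pattern6 is built below, but the lcpSearch call that fills
// result6 passes pattern5 - confirm which pattern this case is meant to test.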
boost::shared_ptr<std::vector<sauchar_t> > pattern6(new std::vector<sauchar_t>());
pattern6->push_back(0);
pattern6->push_back(0);
pattern6->push_back(3);
std::vector<sauchar_t> pattern6;
pattern6.push_back(0);
pattern6.push_back(0);
pattern6.push_back(3);
pattern6->push_back(0);
pattern6->push_back(0);
pattern6->push_back(0);
pattern6->push_back(2);
pattern6.push_back(0);
pattern6.push_back(0);
pattern6.push_back(0);
pattern6.push_back(2);
pattern6->push_back(0);
pattern6.push_back(0);
SUFFIX_MARKER_TYPE highResLength6;
boost::ptr_vector<SubstringOccurence> result6 = searcher.lcpSearch(T, markers, SA, pattern5, highResLength6);
vector<SubstringOccurence> result6 = searcher.lcpSearch(T, markers, SA, pattern5, highResLength6);
SUFFIX_MARKER_TYPE length6 = highResLength6 / sizeof(INDEX_CHARACTER_TYPE);
/* Expecting to get 0 results from SA, lcp length = 0;
@ -378,7 +378,7 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )
// searching for pattern "Ola posiada rysia Marysia" (5 1 3 4)
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > pattern = hashGenerator->generateHash("Ola posiada rysia Marysia");
std::vector<INDEX_CHARACTER_TYPE> pattern = hashGenerator->generateHash("Ola posiada rysia Marysia");
boost::shared_ptr<TmMatchesMap> tmMatchesMap = searcher.getTmMatches(T, markers, SA, pattern);
BOOST_CHECK_EQUAL(tmMatchesMap->size(), 3);
@ -393,38 +393,38 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )
// example 14
// example interval list: [(1,2)]
boost::ptr_vector<Interval> exampleIntervals14 = tmMatches14->getExampleIntervals();
vector<Interval> exampleIntervals14 = tmMatches14->getExampleIntervals();
BOOST_CHECK_EQUAL(exampleIntervals14.size(), 1);
BOOST_CHECK_EQUAL(exampleIntervals14[0].getStart(), 1);
BOOST_CHECK_EQUAL(exampleIntervals14[0].getEnd(), 2);
// pattern interval list: [(1,2)]
boost::ptr_vector<Interval> patternIntervals14 = tmMatches14->getPatternIntervals();
vector<Interval> patternIntervals14 = tmMatches14->getPatternIntervals();
BOOST_CHECK_EQUAL(patternIntervals14.size(), 1);
BOOST_CHECK_EQUAL(patternIntervals14[0].getStart(), 1);
BOOST_CHECK_EQUAL(patternIntervals14[0].getEnd(), 2);
// example 51
// example interval list: [(1,3)]
boost::ptr_vector<Interval> exampleIntervals51 = tmMatches51->getExampleIntervals();
vector<Interval> exampleIntervals51 = tmMatches51->getExampleIntervals();
BOOST_CHECK_EQUAL(exampleIntervals51.size(), 1);
BOOST_CHECK_EQUAL(exampleIntervals51[0].getStart(), 1);
BOOST_CHECK_EQUAL(exampleIntervals51[0].getEnd(), 3);
// pattern interval list: [(1,3)]
boost::ptr_vector<Interval> patternIntervals51 = tmMatches51->getPatternIntervals();
vector<Interval> patternIntervals51 = tmMatches51->getPatternIntervals();
BOOST_CHECK_EQUAL(patternIntervals51.size(), 1);
BOOST_CHECK_EQUAL(patternIntervals51[0].getStart(), 1);
BOOST_CHECK_EQUAL(patternIntervals51[0].getEnd(), 3);
// example 123
// example interval list: [(1,3), (0,1)]
boost::ptr_vector<Interval> exampleIntervals123 = tmMatches123->getExampleIntervals();
vector<Interval> exampleIntervals123 = tmMatches123->getExampleIntervals();
BOOST_CHECK_EQUAL(exampleIntervals123.size(), 2);
BOOST_CHECK_EQUAL(exampleIntervals123[0].getStart(), 1);
BOOST_CHECK_EQUAL(exampleIntervals123[0].getEnd(), 3);
BOOST_CHECK_EQUAL(exampleIntervals123[1].getStart(), 0);
BOOST_CHECK_EQUAL(exampleIntervals123[1].getEnd(), 1);
// pattern interval list: [(1,3), (3,4)]
boost::ptr_vector<Interval> patternIntervals123 = tmMatches123->getPatternIntervals();
vector<Interval> patternIntervals123 = tmMatches123->getPatternIntervals();
BOOST_CHECK_EQUAL(patternIntervals123.size(), 2);
BOOST_CHECK_EQUAL(patternIntervals123[0].getStart(), 1);
BOOST_CHECK_EQUAL(patternIntervals123[0].getEnd(), 3);

View File

@ -5,7 +5,6 @@
#include "concordia/common/config.hpp"
#include <boost/algorithm/string/predicate.hpp>
#include <boost/ptr_container/ptr_vector.hpp>
#include <boost/filesystem.hpp>
#include <string>
@ -52,8 +51,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
*/
boost::ptr_vector<SubstringOccurence> searchResult1 = concordia.simpleSearch("posiada rysia");
boost::ptr_vector<SubstringOccurence> searchResult2 = concordia.simpleSearch("posiada kota Ala");
vector<SubstringOccurence> searchResult1 = concordia.simpleSearch("posiada rysia");
vector<SubstringOccurence> searchResult2 = concordia.simpleSearch("posiada kota Ala");
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
@ -74,11 +73,11 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
{
// modified stop words to avoid anonymization
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
boost::ptr_vector<Example> testExamples;
testExamples.push_back(new Example("xto xjest okno",312));
testExamples.push_back(new Example("czy xjest okno otwarte",202));
testExamples.push_back(new Example("chyba xto xjest xtutaj",45));
testExamples.push_back(new Example("xto xjest",29));
vector<Example> testExamples;
testExamples.push_back(Example("xto xjest okno",312));
testExamples.push_back(Example("czy xjest okno otwarte",202));
testExamples.push_back(Example("chyba xto xjest xtutaj",45));
testExamples.push_back(Example("xto xjest",29));
concordia.addAllExamples(testExamples);
/*The test index contains 4 sentences:
@ -107,8 +106,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
*/
Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
boost::ptr_vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("xto xjest");
boost::ptr_vector<SubstringOccurence> searchResult2 = concordia2.simpleSearch("xjest okno");
vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("xto xjest");
vector<SubstringOccurence> searchResult2 = concordia2.simpleSearch("xjest okno");
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
@ -132,13 +131,13 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
{
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
boost::ptr_vector<Example> testExamples;
testExamples.push_back(new Example("2. Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem, z jakiego korzystają obywatele tego państwa.",312));
testExamples.push_back(new Example("czy xjest żółte otwarte",202));
vector<Example> testExamples;
testExamples.push_back(Example("2. Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem, z jakiego korzystają obywatele tego państwa.",312));
testExamples.push_back(Example("czy xjest żółte otwarte",202));
concordia.addAllExamples(testExamples);
Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
boost::ptr_vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia");
vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia");
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
@ -177,8 +176,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
n: 0 1 2 3 4 5 6 7 8 9 10 11
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
boost::ptr_vector<AnubisSearchResult> searchResult1 = concordia.anubisSearch("posiada rysia chyba");
boost::ptr_vector<AnubisSearchResult> searchResult2 = concordia.anubisSearch("posiada kota Ala");
vector<AnubisSearchResult> searchResult1 = concordia.anubisSearch("posiada rysia chyba");
vector<AnubisSearchResult> searchResult2 = concordia.anubisSearch("posiada kota Ala");
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));

View File

@ -23,13 +23,13 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
HashGenerator hashGenerator = HashGenerator(config);
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash = hashGenerator.generateHash("Ala posiada kota");
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected(new vector<INDEX_CHARACTER_TYPE>());
expected->push_back(0);
expected->push_back(1);
expected->push_back(2);
vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala posiada kota");
vector<INDEX_CHARACTER_TYPE> expected;
expected.push_back(0);
expected.push_back(1);
expected.push_back(2);
BOOST_CHECK_EQUAL_COLLECTIONS(hash->begin(), hash->end(), expected->begin(), expected->end());
BOOST_CHECK_EQUAL_COLLECTIONS(hash.begin(), hash.end(), expected.begin(), expected.end());
}
/* Commented out - the test takes too long. Run it once whenever the SUFFIX_MARKER_SENTENCE_BYTES parameter changes.
@ -76,22 +76,22 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
HashGenerator hashGenerator1 = HashGenerator(config);
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash1 = hashGenerator1.generateHash("Ala posiada kota");
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected1(new vector<INDEX_CHARACTER_TYPE>());
expected1->push_back(0);
expected1->push_back(1);
expected1->push_back(2);
BOOST_CHECK_EQUAL_COLLECTIONS(hash1->begin(), hash1->end(), expected1->begin(), expected1->end());
vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala posiada kota");
vector<INDEX_CHARACTER_TYPE> expected1;
expected1.push_back(0);
expected1.push_back(1);
expected1.push_back(2);
BOOST_CHECK_EQUAL_COLLECTIONS(hash1.begin(), hash1.end(), expected1.begin(), expected1.end());
hashGenerator1.serializeWordMap();
HashGenerator hashGenerator2 = HashGenerator(config);
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash2 = hashGenerator2.generateHash("Ala posiada psa");
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected2(new vector<INDEX_CHARACTER_TYPE>());
expected2->push_back(0);
expected2->push_back(1);
expected2->push_back(3);
BOOST_CHECK_EQUAL_COLLECTIONS(hash2->begin(), hash2->end(), expected2->begin(), expected2->end());
vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala posiada psa");
vector<INDEX_CHARACTER_TYPE> expected2;
expected2.push_back(0);
expected2.push_back(1);
expected2.push_back(3);
BOOST_CHECK_EQUAL_COLLECTIONS(hash2.begin(), hash2.end(), expected2.begin(), expected2.end());
boost::filesystem::remove(config->getWordMapFilePath());
}
@ -106,23 +106,23 @@ BOOST_AUTO_TEST_CASE( TokenVectorTest )
HashGenerator hashGenerator = HashGenerator(config);
boost::shared_ptr<vector<string> > tokenVector = hashGenerator.generateTokenVector("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że kierowca zaparkował samochód.");
boost::shared_ptr<vector<string> > expected(new vector<string>());
expected->push_back("ne_date");
expected->push_back("godzinie");
expected->push_back("ne_number");
expected->push_back("ne_number");
expected->push_back("doszło");
expected->push_back("kolizji");
expected->push_back("ulicy");
expected->push_back("grobla");
expected->push_back("policjanci");
expected->push_back("ustalili");
expected->push_back("kierowca");
expected->push_back("zaparkował");
expected->push_back("samochód");
vector<string> tokenVector = hashGenerator.generateTokenVector("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że kierowca zaparkował samochód.");
vector<string> expected;
expected.push_back("ne_date");
expected.push_back("godzinie");
expected.push_back("ne_number");
expected.push_back("ne_number");
expected.push_back("doszło");
expected.push_back("kolizji");
expected.push_back("ulicy");
expected.push_back("grobla");
expected.push_back("policjanci");
expected.push_back("ustalili");
expected.push_back("kierowca");
expected.push_back("zaparkował");
expected.push_back("samochód");
BOOST_CHECK_EQUAL_COLLECTIONS(tokenVector->begin(), tokenVector->end(), expected->begin(), expected->end());
BOOST_CHECK_EQUAL_COLLECTIONS(tokenVector.begin(), tokenVector.end(), expected.begin(), expected.end());
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -31,58 +31,58 @@ BOOST_AUTO_TEST_CASE( WriteReadSingleCharacter )
BOOST_AUTO_TEST_CASE( IndexVectorToSaucharArray )
{
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash(new vector<INDEX_CHARACTER_TYPE>());
hash->push_back(123456789); // in hex: 75BCD15
vector<INDEX_CHARACTER_TYPE> hash;
hash.push_back(123456789); // in hex: 75BCD15
// in memory: 15 cd 5b 07
// in memory DEC: 21 205 91 7
hash->push_back(987654321); // in hex: 3ADE68B1
hash.push_back(987654321); // in hex: 3ADE68B1
// in memory: b1 68 de 3a
// in memory DEC: 177 104 222 58
sauchar_t * dataArray = Utils::indexVectorToSaucharArray(hash);
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > result(new vector<INDEX_CHARACTER_TYPE>());
vector<INDEX_CHARACTER_TYPE> result;
for (int i=0;i<8;i++) {
INDEX_CHARACTER_TYPE a = dataArray[i];
result->push_back(a);
result.push_back(a);
}
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected(new vector<INDEX_CHARACTER_TYPE>());
expected->push_back(21);
expected->push_back(205);
expected->push_back(91);
expected->push_back(7);
expected->push_back(177);
expected->push_back(104);
expected->push_back(222);
expected->push_back(58);
vector<INDEX_CHARACTER_TYPE> expected;
expected.push_back(21);
expected.push_back(205);
expected.push_back(91);
expected.push_back(7);
expected.push_back(177);
expected.push_back(104);
expected.push_back(222);
expected.push_back(58);
BOOST_CHECK_EQUAL_COLLECTIONS(result->begin(), result->end(), expected->begin(), expected->end());
BOOST_CHECK_EQUAL_COLLECTIONS(result.begin(), result.end(), expected.begin(), expected.end());
}
BOOST_AUTO_TEST_CASE( IndexVectorToSaucharVector )
{
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash(new vector<INDEX_CHARACTER_TYPE>());
hash->push_back(123456789); // in hex: 75BCD15
vector<INDEX_CHARACTER_TYPE> hash;
hash.push_back(123456789); // in hex: 75BCD15
// in memory: 15 cd 5b 07
// in memory DEC: 21 205 91 7
hash->push_back(987654321); // in hex: 3ADE68B1
hash.push_back(987654321); // in hex: 3ADE68B1
// in memory: b1 68 de 3a
// in memory DEC: 177 104 222 58
boost::shared_ptr<vector<sauchar_t> > result = Utils::indexVectorToSaucharVector(hash);
vector<sauchar_t> result = Utils::indexVectorToSaucharVector(hash);
boost::shared_ptr<vector<sauchar_t> > expected(new vector<sauchar_t>());
expected->push_back(21);
expected->push_back(205);
expected->push_back(91);
expected->push_back(7);
expected->push_back(177);
expected->push_back(104);
expected->push_back(222);
expected->push_back(58);
vector<sauchar_t> expected;
expected.push_back(21);
expected.push_back(205);
expected.push_back(91);
expected.push_back(7);
expected.push_back(177);
expected.push_back(104);
expected.push_back(222);
expected.push_back(58);
BOOST_CHECK_EQUAL_COLLECTIONS(result->begin(), result->end(), expected->begin(), expected->end());
BOOST_CHECK_EQUAL_COLLECTIONS(result.begin(), result.end(), expected.begin(), expected.end());
}
BOOST_AUTO_TEST_CASE( MaxSentenceSize )

View File

@ -40,37 +40,34 @@ void TmMatches::calculateSimpleScore() {
void TmMatches::addExampleInterval(int start, int end) {
if (!_alreadyIntersects(_exampleMatchedRegions, start, end)) {
_exampleMatchedRegions.push_back(new Interval(start, end));
_exampleMatchedRegions.push_back(Interval(start, end));
}
}
void TmMatches::addPatternInterval(int start, int end) {
if (!_alreadyIntersects(_patternMatchedRegions, start, end)) {
_patternMatchedRegions.push_back(new Interval(start, end));
_patternMatchedRegions.push_back(Interval(start, end));
}
}
bool TmMatches::_alreadyIntersects(
boost::ptr_vector<Interval> intervalList,
const vector<Interval> & intervalList,
int start, int end) {
Interval * tempInterval = new Interval(start, end);
BOOST_FOREACH(Interval & oldInterval, intervalList) {
if (oldInterval.intersects(*tempInterval)) {
delete tempInterval;
Interval tempInterval(start, end);
BOOST_FOREACH(Interval oldInterval, intervalList) {
if (oldInterval.intersects(tempInterval)) {
return true;
}
}
delete tempInterval;
return false;
}
double TmMatches::_getLogarithmicOverlay(
boost::ptr_vector<Interval> intervalList,
const vector<Interval> & intervalList,
unsigned char sentenceSize,
double k) {
double overlayScore = 0;
BOOST_FOREACH(Interval & interval, intervalList) {
BOOST_FOREACH(Interval interval, intervalList) {
double intervalOverlay = static_cast<double>(interval.getLength())
/ static_cast<double>(sentenceSize);
double significanceFactor = pow(log(interval.getLength()+1)

View File

@ -2,9 +2,9 @@
#define TM_MATCHES_HDR
#include <string>
#include <vector>
#include "concordia/common/config.hpp"
#include "concordia/interval.hpp"
#include <boost/ptr_container/ptr_vector.hpp>
#include <boost/ptr_container/ptr_map.hpp>
@ -29,11 +29,11 @@ public:
return _score;
}
boost::ptr_vector<Interval> getExampleIntervals() const {
vector<Interval> getExampleIntervals() const {
return _exampleMatchedRegions;
}
boost::ptr_vector<Interval> getPatternIntervals() const {
vector<Interval> getPatternIntervals() const {
return _patternMatchedRegions;
}
@ -50,18 +50,18 @@ public:
void addPatternInterval(int start, int end);
private:
bool _alreadyIntersects(boost::ptr_vector<Interval> intervalList,
int start, int end);
bool _alreadyIntersects(const vector<Interval> & intervalList,
int start, int end);
double _getLogarithmicOverlay(boost::ptr_vector<Interval> intervalList,
unsigned char sentenceSize,
double k);
double _getLogarithmicOverlay(const vector<Interval> & intervalList,
unsigned char sentenceSize,
double k);
SUFFIX_MARKER_TYPE _exampleId;
boost::ptr_vector<Interval> _exampleMatchedRegions;
vector<Interval> _exampleMatchedRegions;
boost::ptr_vector<Interval> _patternMatchedRegions;
vector<Interval> _patternMatchedRegions;
unsigned char _patternSize;