removed using namespace std
Former-commit-id: dbb5129e1f94d83eca887ada0f89d6bb45250f1e
This commit is contained in:
parent
a09999c130
commit
0d4bdf12de
@ -84,11 +84,11 @@ int main(int argc, char** argv) {
|
|||||||
std::string filePath = cli["read-file"].as<std::string>();
|
std::string filePath = cli["read-file"].as<std::string>();
|
||||||
std::cout << "\tReading sentences from file: " << filePath <<
|
std::cout << "\tReading sentences from file: " << filePath <<
|
||||||
std::endl;
|
std::endl;
|
||||||
ifstream text_file(filePath.c_str());
|
std::ifstream text_file(filePath.c_str());
|
||||||
std::string line;
|
std::string line;
|
||||||
if (text_file.is_open()) {
|
if (text_file.is_open()) {
|
||||||
long lineCount = 0;
|
long lineCount = 0;
|
||||||
vector<Example> buffer;
|
std::vector<Example> buffer;
|
||||||
boost::posix_time::ptime timeStart =
|
boost::posix_time::ptime timeStart =
|
||||||
boost::posix_time::microsec_clock::local_time();
|
boost::posix_time::microsec_clock::local_time();
|
||||||
while (getline(text_file, line)) {
|
while (getline(text_file, line)) {
|
||||||
@ -147,7 +147,7 @@ int main(int argc, char** argv) {
|
|||||||
<< "Terminating execution."
|
<< "Terminating execution."
|
||||||
<< std::endl;
|
<< std::endl;
|
||||||
return 1;
|
return 1;
|
||||||
} catch(exception & e) {
|
} catch(std::exception & e) {
|
||||||
std::cerr << "Unexpected exception caught with message: "
|
std::cerr << "Unexpected exception caught with message: "
|
||||||
<< std::endl
|
<< std::endl
|
||||||
<< e.what()
|
<< e.what()
|
||||||
|
@ -8,8 +8,6 @@
|
|||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
class AnubisSearchResult {
|
class AnubisSearchResult {
|
||||||
public:
|
public:
|
||||||
explicit AnubisSearchResult(const SUFFIX_MARKER_TYPE & exampleId,
|
explicit AnubisSearchResult(const SUFFIX_MARKER_TYPE & exampleId,
|
||||||
|
@ -18,7 +18,8 @@ std::vector<AnubisSearchResult> AnubisSearcher::anubisSearch(
|
|||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
|
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
|
||||||
throw(ConcordiaException) {
|
throw(ConcordiaException) {
|
||||||
boost::shared_ptr<TmMatchesMap> tmMatchesMap = getTmMatches(T, markers, SA, pattern);
|
boost::shared_ptr<TmMatchesMap> tmMatchesMap =
|
||||||
|
getTmMatches(T, markers, SA, pattern);
|
||||||
|
|
||||||
// get the tmMatches list sorted descending by score
|
// get the tmMatches list sorted descending by score
|
||||||
std::vector<AnubisSearchResult> result;
|
std::vector<AnubisSearchResult> result;
|
||||||
@ -31,7 +32,6 @@ boost::shared_ptr<TmMatchesMap> AnubisSearcher::getTmMatches(
|
|||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
|
const std::vector<INDEX_CHARACTER_TYPE> & pattern)
|
||||||
throw(ConcordiaException) {
|
throw(ConcordiaException) {
|
||||||
|
|
||||||
std::vector<sauchar_t> patternVector =
|
std::vector<sauchar_t> patternVector =
|
||||||
Utils::indexVectorToSaucharVector(pattern);
|
Utils::indexVectorToSaucharVector(pattern);
|
||||||
|
|
||||||
@ -75,19 +75,23 @@ boost::shared_ptr<TmMatchesMap> AnubisSearcher::getTmMatches(
|
|||||||
// Add to tm matches map results surrounding the main stream.
|
// Add to tm matches map results surrounding the main stream.
|
||||||
// from left
|
// from left
|
||||||
for (saidx_t i = prevLeft; i < left; i++) {
|
for (saidx_t i = prevLeft; i < left; i++) {
|
||||||
_addToMap(SA, markers, tmMatchesMap, i, pattern.size(), (patternLength / sizeof(INDEX_CHARACTER_TYPE)) -1, offset);
|
_addToMap(SA, markers, tmMatchesMap, i, pattern.size(),
|
||||||
|
(patternLength / sizeof(INDEX_CHARACTER_TYPE)) -1,
|
||||||
|
offset);
|
||||||
}
|
}
|
||||||
// from right
|
// from right
|
||||||
for (saidx_t i = left+size; i < prevLeft+prevSize; i++) {
|
for (saidx_t i = left+size; i < prevLeft+prevSize; i++) {
|
||||||
_addToMap(SA, markers, tmMatchesMap, i, pattern.size(), (patternLength / sizeof(INDEX_CHARACTER_TYPE)) -1, offset);
|
_addToMap(SA, markers, tmMatchesMap, i, pattern.size(),
|
||||||
|
(patternLength / sizeof(INDEX_CHARACTER_TYPE)) - 1,
|
||||||
|
offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
} while (patternLength < currentPattern.size() && size > 0);
|
} while (patternLength < currentPattern.size() && size > 0);
|
||||||
|
|
||||||
if (size > 0) {
|
if (size > 0) {
|
||||||
for (saidx_t i = left; i < left+size; i++) {
|
for (saidx_t i = left; i < left+size; i++) {
|
||||||
_addToMap(SA, markers, tmMatchesMap, i, pattern.size(), patternLength / sizeof(INDEX_CHARACTER_TYPE), offset);
|
_addToMap(SA, markers, tmMatchesMap, i, pattern.size(),
|
||||||
|
patternLength / sizeof(INDEX_CHARACTER_TYPE), offset);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -126,7 +130,7 @@ std::vector<SubstringOccurence> AnubisSearcher::lcpSearch(
|
|||||||
SAleft += localLeft;
|
SAleft += localLeft;
|
||||||
} while (patternLength < pattern.size() && size > 0);
|
} while (patternLength < pattern.size() && size > 0);
|
||||||
|
|
||||||
vector<SubstringOccurence> result;
|
std::vector<SubstringOccurence> result;
|
||||||
|
|
||||||
if (size == 0) {
|
if (size == 0) {
|
||||||
// The search managed to find exactly the longest common prefixes.
|
// The search managed to find exactly the longest common prefixes.
|
||||||
@ -149,7 +153,7 @@ std::vector<SubstringOccurence> AnubisSearcher::lcpSearch(
|
|||||||
}
|
}
|
||||||
|
|
||||||
void AnubisSearcher::_collectResults(
|
void AnubisSearcher::_collectResults(
|
||||||
vector<SubstringOccurence> & result,
|
std::vector<SubstringOccurence> & result,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
saidx_t left, saidx_t size) {
|
saidx_t left, saidx_t size) {
|
||||||
@ -157,7 +161,8 @@ void AnubisSearcher::_collectResults(
|
|||||||
saidx_t resultPos = SA->at(left + i);
|
saidx_t resultPos = SA->at(left + i);
|
||||||
|
|
||||||
if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
|
if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
|
||||||
SUFFIX_MARKER_TYPE marker = markers->at(resultPos / sizeof(INDEX_CHARACTER_TYPE));
|
SUFFIX_MARKER_TYPE marker =
|
||||||
|
markers->at(resultPos / sizeof(INDEX_CHARACTER_TYPE));
|
||||||
result.push_back(SubstringOccurence(marker));
|
result.push_back(SubstringOccurence(marker));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -180,7 +185,6 @@ void AnubisSearcher::_addToMap(boost::shared_ptr<std::vector<saidx_t> > SA,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
bool AnubisSearcher::_getOccurenceFromSA(
|
bool AnubisSearcher::_getOccurenceFromSA(
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
@ -189,13 +193,14 @@ bool AnubisSearcher::_getOccurenceFromSA(
|
|||||||
saidx_t resultPos = SA->at(sa_pos);
|
saidx_t resultPos = SA->at(sa_pos);
|
||||||
|
|
||||||
if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
|
if (resultPos % sizeof(INDEX_CHARACTER_TYPE) == 0) {
|
||||||
SUFFIX_MARKER_TYPE marker = markers->at(resultPos / sizeof(INDEX_CHARACTER_TYPE));
|
SUFFIX_MARKER_TYPE marker =
|
||||||
|
markers->at(resultPos / sizeof(INDEX_CHARACTER_TYPE));
|
||||||
occurence.enterDataFromMarker(marker);
|
occurence.enterDataFromMarker(marker);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void AnubisSearcher::_addOccurenceToMap(boost::shared_ptr<TmMatchesMap> tmMatchesMap,
|
void AnubisSearcher::_addOccurenceToMap(
|
||||||
|
boost::shared_ptr<TmMatchesMap> tmMatchesMap,
|
||||||
SubstringOccurence & occurence,
|
SubstringOccurence & occurence,
|
||||||
SUFFIX_MARKER_TYPE totalPatternLength,
|
SUFFIX_MARKER_TYPE totalPatternLength,
|
||||||
SUFFIX_MARKER_TYPE matchedFragmentLength,
|
SUFFIX_MARKER_TYPE matchedFragmentLength,
|
||||||
@ -217,12 +222,8 @@ void AnubisSearcher::_addOccurenceToMap(boost::shared_ptr<TmMatchesMap> tmMatche
|
|||||||
// add intervals to tmMatches
|
// add intervals to tmMatches
|
||||||
tmMatches->addExampleInterval(
|
tmMatches->addExampleInterval(
|
||||||
occurence.getOffset(),
|
occurence.getOffset(),
|
||||||
occurence.getOffset() + matchedFragmentLength
|
occurence.getOffset() + matchedFragmentLength);
|
||||||
);
|
|
||||||
tmMatches->addPatternInterval(
|
tmMatches->addPatternInterval(
|
||||||
patternOffset,
|
patternOffset,
|
||||||
patternOffset + matchedFragmentLength
|
patternOffset + matchedFragmentLength);
|
||||||
);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -10,6 +10,7 @@
|
|||||||
#include "concordia/anubis_search_result.hpp"
|
#include "concordia/anubis_search_result.hpp"
|
||||||
#include "concordia/tm_matches.hpp"
|
#include "concordia/tm_matches.hpp"
|
||||||
|
|
||||||
|
#include<vector>
|
||||||
#include <divsufsort.h>
|
#include <divsufsort.h>
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
@ -17,8 +18,6 @@
|
|||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
class AnubisSearcher {
|
class AnubisSearcher {
|
||||||
public:
|
public:
|
||||||
explicit AnubisSearcher();
|
explicit AnubisSearcher();
|
||||||
@ -49,7 +48,7 @@ public:
|
|||||||
SUFFIX_MARKER_TYPE & length) throw(ConcordiaException);
|
SUFFIX_MARKER_TYPE & length) throw(ConcordiaException);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void _collectResults(vector<SubstringOccurence> & result,
|
void _collectResults(std::vector<SubstringOccurence> & result,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
saidx_t left, saidx_t size);
|
saidx_t left, saidx_t size);
|
||||||
|
@ -10,10 +10,10 @@ TextUtils::TextUtils() {
|
|||||||
StringCaseConverterManager::getInstance().getUpperCaseConverter("pl");
|
StringCaseConverterManager::getInstance().getUpperCaseConverter("pl");
|
||||||
}
|
}
|
||||||
|
|
||||||
string TextUtils::toLowerCase(const string & text) {
|
std::string TextUtils::toLowerCase(const std::string & text) {
|
||||||
return simpleConvert(*_lowerConverter, text);
|
return simpleConvert(*_lowerConverter, text);
|
||||||
}
|
}
|
||||||
|
|
||||||
string TextUtils::toUpperCase(const string & text) {
|
std::string TextUtils::toUpperCase(const std::string & text) {
|
||||||
return simpleConvert(*_upperConverter, text);
|
return simpleConvert(*_upperConverter, text);
|
||||||
}
|
}
|
||||||
|
@ -7,9 +7,6 @@
|
|||||||
#include "utf8case/case_converter_factory.hpp"
|
#include "utf8case/case_converter_factory.hpp"
|
||||||
#include "utf8case/string_case_converter_manager.hpp"
|
#include "utf8case/string_case_converter_manager.hpp"
|
||||||
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
/*! Utility class for performing simple string operations.
|
/*! Utility class for performing simple string operations.
|
||||||
*/
|
*/
|
||||||
class TextUtils {
|
class TextUtils {
|
||||||
@ -26,13 +23,13 @@ public:
|
|||||||
\param text input string
|
\param text input string
|
||||||
\returns lower case version of the input string.
|
\returns lower case version of the input string.
|
||||||
*/
|
*/
|
||||||
string toLowerCase(const string & text);
|
std::string toLowerCase(const std::string & text);
|
||||||
|
|
||||||
/*! A method for converting all string letters to upper case.
|
/*! A method for converting all string letters to upper case.
|
||||||
\param text input string
|
\param text input string
|
||||||
\returns upper case version of the input string.
|
\returns upper case version of the input string.
|
||||||
*/
|
*/
|
||||||
string toUpperCase(const string & text);
|
std::string toUpperCase(const std::string & text);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
explicit TextUtils(TextUtils const&); // Don't Implement
|
explicit TextUtils(TextUtils const&); // Don't Implement
|
||||||
|
@ -7,35 +7,35 @@ Utils::Utils() {
|
|||||||
Utils::~Utils() {
|
Utils::~Utils() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void Utils::writeIndexCharacter(ofstream & file,
|
void Utils::writeIndexCharacter(std::ofstream & file,
|
||||||
INDEX_CHARACTER_TYPE character) {
|
INDEX_CHARACTER_TYPE character) {
|
||||||
file.write(reinterpret_cast<char *>(&character), sizeof(character));
|
file.write(reinterpret_cast<char *>(&character), sizeof(character));
|
||||||
}
|
}
|
||||||
|
|
||||||
void Utils::writeMarker(ofstream & file,
|
void Utils::writeMarker(std::ofstream & file,
|
||||||
SUFFIX_MARKER_TYPE marker) {
|
SUFFIX_MARKER_TYPE marker) {
|
||||||
file.write(reinterpret_cast<char *>(&marker), sizeof(marker));
|
file.write(reinterpret_cast<char *>(&marker), sizeof(marker));
|
||||||
}
|
}
|
||||||
|
|
||||||
INDEX_CHARACTER_TYPE Utils::readIndexCharacter(ifstream & file) {
|
INDEX_CHARACTER_TYPE Utils::readIndexCharacter(std::ifstream & file) {
|
||||||
INDEX_CHARACTER_TYPE character;
|
INDEX_CHARACTER_TYPE character;
|
||||||
file.read(reinterpret_cast<char *>(&character), sizeof(character));
|
file.read(reinterpret_cast<char *>(&character), sizeof(character));
|
||||||
return character;
|
return character;
|
||||||
}
|
}
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE Utils::readMarker(ifstream & file) {
|
SUFFIX_MARKER_TYPE Utils::readMarker(std::ifstream & file) {
|
||||||
SUFFIX_MARKER_TYPE marker;
|
SUFFIX_MARKER_TYPE marker;
|
||||||
file.read(reinterpret_cast<char *>(&marker), sizeof(marker));
|
file.read(reinterpret_cast<char *>(&marker), sizeof(marker));
|
||||||
return marker;
|
return marker;
|
||||||
}
|
}
|
||||||
|
|
||||||
sauchar_t * Utils::indexVectorToSaucharArray(
|
sauchar_t * Utils::indexVectorToSaucharArray(
|
||||||
const vector<INDEX_CHARACTER_TYPE> & input) {
|
const std::vector<INDEX_CHARACTER_TYPE> & input) {
|
||||||
const int kArraySize = input.size()*sizeof(INDEX_CHARACTER_TYPE);
|
const int kArraySize = input.size()*sizeof(INDEX_CHARACTER_TYPE);
|
||||||
sauchar_t * patternArray =
|
sauchar_t * patternArray =
|
||||||
new sauchar_t[kArraySize];
|
new sauchar_t[kArraySize];
|
||||||
int pos = 0;
|
int pos = 0;
|
||||||
for (vector<INDEX_CHARACTER_TYPE>::const_iterator it = input.begin();
|
for (std::vector<INDEX_CHARACTER_TYPE>::const_iterator it = input.begin();
|
||||||
it != input.end(); ++it) {
|
it != input.end(); ++it) {
|
||||||
_insertCharToSaucharArray(patternArray, *it, pos);
|
_insertCharToSaucharArray(patternArray, *it, pos);
|
||||||
pos += sizeof(INDEX_CHARACTER_TYPE);
|
pos += sizeof(INDEX_CHARACTER_TYPE);
|
||||||
@ -44,9 +44,9 @@ sauchar_t * Utils::indexVectorToSaucharArray(
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::vector<sauchar_t> Utils::indexVectorToSaucharVector(
|
std::vector<sauchar_t> Utils::indexVectorToSaucharVector(
|
||||||
const vector<INDEX_CHARACTER_TYPE> & input) {
|
const std::vector<INDEX_CHARACTER_TYPE> & input) {
|
||||||
std::vector<sauchar_t> result;
|
std::vector<sauchar_t> result;
|
||||||
for (vector<INDEX_CHARACTER_TYPE>::const_iterator it = input.begin();
|
for (std::vector<INDEX_CHARACTER_TYPE>::const_iterator it = input.begin();
|
||||||
it != input.end(); ++it) {
|
it != input.end(); ++it) {
|
||||||
appendCharToSaucharVector(result, *it);
|
appendCharToSaucharVector(result, *it);
|
||||||
}
|
}
|
||||||
|
@ -11,8 +11,6 @@
|
|||||||
#include "concordia/concordia_exception.hpp"
|
#include "concordia/concordia_exception.hpp"
|
||||||
#include <divsufsort.h>
|
#include <divsufsort.h>
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
class Utils {
|
class Utils {
|
||||||
public:
|
public:
|
||||||
explicit Utils();
|
explicit Utils();
|
||||||
@ -21,21 +19,21 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~Utils();
|
virtual ~Utils();
|
||||||
|
|
||||||
static void writeIndexCharacter(ofstream & file,
|
static void writeIndexCharacter(std::ofstream & file,
|
||||||
INDEX_CHARACTER_TYPE character);
|
INDEX_CHARACTER_TYPE character);
|
||||||
|
|
||||||
static void writeMarker(ofstream & file,
|
static void writeMarker(std::ofstream & file,
|
||||||
SUFFIX_MARKER_TYPE marker);
|
SUFFIX_MARKER_TYPE marker);
|
||||||
|
|
||||||
static INDEX_CHARACTER_TYPE readIndexCharacter(ifstream & file);
|
static INDEX_CHARACTER_TYPE readIndexCharacter(std::ifstream & file);
|
||||||
|
|
||||||
static SUFFIX_MARKER_TYPE readMarker(ifstream & file);
|
static SUFFIX_MARKER_TYPE readMarker(std::ifstream & file);
|
||||||
|
|
||||||
static sauchar_t * indexVectorToSaucharArray(
|
static sauchar_t * indexVectorToSaucharArray(
|
||||||
const vector<INDEX_CHARACTER_TYPE> & input);
|
const std::vector<INDEX_CHARACTER_TYPE> & input);
|
||||||
|
|
||||||
static std::vector<sauchar_t> indexVectorToSaucharVector(
|
static std::vector<sauchar_t> indexVectorToSaucharVector(
|
||||||
const vector<INDEX_CHARACTER_TYPE> & input);
|
const std::vector<INDEX_CHARACTER_TYPE> & input);
|
||||||
|
|
||||||
static void appendCharToSaucharVector(
|
static void appendCharToSaucharVector(
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > vector,
|
boost::shared_ptr<std::vector<sauchar_t> > vector,
|
||||||
@ -70,8 +68,8 @@ private:
|
|||||||
template <typename T>
|
template <typename T>
|
||||||
void Utils::printVector(const std::vector<T> & vector) {
|
void Utils::printVector(const std::vector<T> & vector) {
|
||||||
for (int i = 0; i < vector.size(); i++) {
|
for (int i = 0; i < vector.size(); i++) {
|
||||||
cout << static_cast<int>(vector.at(i)) << " ";
|
std::cout << static_cast<int>(vector.at(i)) << " ";
|
||||||
}
|
}
|
||||||
cout << endl;
|
std::cout << std::endl;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -62,12 +62,13 @@ void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) {
|
|||||||
&& boost::filesystem::exists(_config->getMarkersFilePath())) {
|
&& boost::filesystem::exists(_config->getMarkersFilePath())) {
|
||||||
// reading index from file
|
// reading index from file
|
||||||
_T->clear();
|
_T->clear();
|
||||||
ifstream hashedIndexFile;
|
std::ifstream hashedIndexFile;
|
||||||
hashedIndexFile.open(_config->getHashedIndexFilePath().c_str(), ios::in
|
hashedIndexFile.open(
|
||||||
| ios::ate | ios::binary);
|
_config->getHashedIndexFilePath().c_str(), std::ios::in
|
||||||
|
| std::ios::ate | std::ios::binary);
|
||||||
saidx_t hiFileSize = hashedIndexFile.tellg();
|
saidx_t hiFileSize = hashedIndexFile.tellg();
|
||||||
if (hiFileSize > 0) {
|
if (hiFileSize > 0) {
|
||||||
hashedIndexFile.seekg(0, ios::beg);
|
hashedIndexFile.seekg(0, std::ios::beg);
|
||||||
|
|
||||||
while (!hashedIndexFile.eof()) {
|
while (!hashedIndexFile.eof()) {
|
||||||
INDEX_CHARACTER_TYPE character =
|
INDEX_CHARACTER_TYPE character =
|
||||||
@ -82,12 +83,12 @@ void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) {
|
|||||||
|
|
||||||
// reading markers from file
|
// reading markers from file
|
||||||
_markers->clear();
|
_markers->clear();
|
||||||
ifstream markersFile;
|
std::ifstream markersFile;
|
||||||
markersFile.open(_config->getMarkersFilePath().c_str(), ios::in
|
markersFile.open(_config->getMarkersFilePath().c_str(), std::ios::in
|
||||||
| ios::ate | ios::binary);
|
| std::ios::ate | std::ios::binary);
|
||||||
saidx_t maFileSize = markersFile.tellg();
|
saidx_t maFileSize = markersFile.tellg();
|
||||||
if (maFileSize > 0) {
|
if (maFileSize > 0) {
|
||||||
markersFile.seekg(0, ios::beg);
|
markersFile.seekg(0, std::ios::beg);
|
||||||
|
|
||||||
while (!markersFile.eof()) {
|
while (!markersFile.eof()) {
|
||||||
SUFFIX_MARKER_TYPE marker =
|
SUFFIX_MARKER_TYPE marker =
|
||||||
@ -132,7 +133,7 @@ void Concordia::_initializeIndex() throw(ConcordiaException) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::vector<SubstringOccurence> Concordia::simpleSearch(
|
std::vector<SubstringOccurence> Concordia::simpleSearch(
|
||||||
const string & pattern)
|
const std::string & pattern)
|
||||||
throw(ConcordiaException) {
|
throw(ConcordiaException) {
|
||||||
if (_T->size() > 0) {
|
if (_T->size() > 0) {
|
||||||
return _searcher->simpleSearch(_hashGenerator, _T,
|
return _searcher->simpleSearch(_hashGenerator, _T,
|
||||||
@ -144,7 +145,7 @@ std::vector<SubstringOccurence> Concordia::simpleSearch(
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::vector<AnubisSearchResult> Concordia::anubisSearch(
|
std::vector<AnubisSearchResult> Concordia::anubisSearch(
|
||||||
const string & pattern)
|
const std::string & pattern)
|
||||||
throw(ConcordiaException) {
|
throw(ConcordiaException) {
|
||||||
if (_T->size() > 0) {
|
if (_T->size() > 0) {
|
||||||
return _searcher->anubisSearch(_hashGenerator, _T,
|
return _searcher->anubisSearch(_hashGenerator, _T,
|
||||||
|
@ -13,13 +13,13 @@
|
|||||||
#define NAMED_ENTITIES_PARAM "named_entities_path"
|
#define NAMED_ENTITIES_PARAM "named_entities_path"
|
||||||
#define STOP_SYMBOLS_PARAM "stop_symbols_path"
|
#define STOP_SYMBOLS_PARAM "stop_symbols_path"
|
||||||
|
|
||||||
ConcordiaConfig::ConcordiaConfig(const string & configFilePath)
|
ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
|
||||||
throw(ConcordiaException) {
|
throw(ConcordiaException) {
|
||||||
try {
|
try {
|
||||||
_config.readFile(configFilePath.c_str());
|
_config.readFile(configFilePath.c_str());
|
||||||
} catch(ParseException & e) {
|
} catch(libconfig::ParseException & e) {
|
||||||
throw ConcordiaException("Error parsing config file: "+configFilePath);
|
throw ConcordiaException("Error parsing config file: "+configFilePath);
|
||||||
} catch(FileIOException & e) {
|
} catch(libconfig::FileIOException & e) {
|
||||||
throw ConcordiaException("I/O error reading config file: "
|
throw ConcordiaException("I/O error reading config file: "
|
||||||
+configFilePath);
|
+configFilePath);
|
||||||
}
|
}
|
||||||
@ -49,7 +49,7 @@ ConcordiaConfig::ConcordiaConfig(const string & configFilePath)
|
|||||||
ConcordiaConfig::~ConcordiaConfig() {
|
ConcordiaConfig::~ConcordiaConfig() {
|
||||||
}
|
}
|
||||||
|
|
||||||
string ConcordiaConfig::_readConfigParameterStr(const string & name)
|
std::string ConcordiaConfig::_readConfigParameterStr(const std::string & name)
|
||||||
throw(ConcordiaException) {
|
throw(ConcordiaException) {
|
||||||
if (!_config.exists(name)) {
|
if (!_config.exists(name)) {
|
||||||
throw ConcordiaException("Config error: "+name+" setting not found");
|
throw ConcordiaException("Config error: "+name+" setting not found");
|
||||||
|
@ -7,9 +7,6 @@
|
|||||||
|
|
||||||
#include "concordia/concordia_exception.hpp"
|
#include "concordia/concordia_exception.hpp"
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
using namespace libconfig;
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class representing the Concordia configuration.
|
Class representing the Concordia configuration.
|
||||||
*/
|
*/
|
||||||
@ -20,7 +17,7 @@ public:
|
|||||||
\param configFilePath path of the configuration file (see \ref running3 for file specification).
|
\param configFilePath path of the configuration file (see \ref running3 for file specification).
|
||||||
\throws ConcordiaException
|
\throws ConcordiaException
|
||||||
*/
|
*/
|
||||||
explicit ConcordiaConfig(const string & configFilePath)
|
explicit ConcordiaConfig(const std::string & configFilePath)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
@ -30,70 +27,70 @@ public:
|
|||||||
/*! Getter for the puddle file path parameter.
|
/*! Getter for the puddle file path parameter.
|
||||||
\returns file path of the puddle tagset
|
\returns file path of the puddle tagset
|
||||||
*/
|
*/
|
||||||
string & getPuddleTagsetFilePath() {
|
std::string & getPuddleTagsetFilePath() {
|
||||||
return _puddleTagsetFilePath;
|
return _puddleTagsetFilePath;
|
||||||
}
|
}
|
||||||
|
|
||||||
string & getWordMapFilePath() {
|
std::string & getWordMapFilePath() {
|
||||||
return _wordMapFilePath;
|
return _wordMapFilePath;
|
||||||
}
|
}
|
||||||
|
|
||||||
string & getHashedIndexFilePath() {
|
std::string & getHashedIndexFilePath() {
|
||||||
return _hashedIndexFilePath;
|
return _hashedIndexFilePath;
|
||||||
}
|
}
|
||||||
|
|
||||||
string & getMarkersFilePath() {
|
std::string & getMarkersFilePath() {
|
||||||
return _markersFilePath;
|
return _markersFilePath;
|
||||||
}
|
}
|
||||||
|
|
||||||
string & getSuffixArrayFilePath() {
|
std::string & getSuffixArrayFilePath() {
|
||||||
return _suffixArrayFilePath;
|
return _suffixArrayFilePath;
|
||||||
}
|
}
|
||||||
|
|
||||||
string & getHtmlTagsFilePath() {
|
std::string & getHtmlTagsFilePath() {
|
||||||
return _htmlTagsFilePath;
|
return _htmlTagsFilePath;
|
||||||
}
|
}
|
||||||
|
|
||||||
string & getSpaceSymbolsFilePath() {
|
std::string & getSpaceSymbolsFilePath() {
|
||||||
return _spaceSymbolsFilePath;
|
return _spaceSymbolsFilePath;
|
||||||
}
|
}
|
||||||
|
|
||||||
string & getStopWordsFilePath() {
|
std::string & getStopWordsFilePath() {
|
||||||
return _stopWordsFilePath;
|
return _stopWordsFilePath;
|
||||||
}
|
}
|
||||||
|
|
||||||
string & getNamedEntitiesFilePath() {
|
std::string & getNamedEntitiesFilePath() {
|
||||||
return _namedEntitiesFilePath;
|
return _namedEntitiesFilePath;
|
||||||
}
|
}
|
||||||
|
|
||||||
string & getStopSymbolsFilePath() {
|
std::string & getStopSymbolsFilePath() {
|
||||||
return _stopSymbolsFilePath;
|
return _stopSymbolsFilePath;
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Config _config;
|
libconfig::Config _config;
|
||||||
|
|
||||||
string _puddleTagsetFilePath;
|
std::string _puddleTagsetFilePath;
|
||||||
|
|
||||||
string _wordMapFilePath;
|
std::string _wordMapFilePath;
|
||||||
|
|
||||||
string _hashedIndexFilePath;
|
std::string _hashedIndexFilePath;
|
||||||
|
|
||||||
string _markersFilePath;
|
std::string _markersFilePath;
|
||||||
|
|
||||||
string _suffixArrayFilePath;
|
std::string _suffixArrayFilePath;
|
||||||
|
|
||||||
string _htmlTagsFilePath;
|
std::string _htmlTagsFilePath;
|
||||||
|
|
||||||
string _spaceSymbolsFilePath;
|
std::string _spaceSymbolsFilePath;
|
||||||
|
|
||||||
string _stopWordsFilePath;
|
std::string _stopWordsFilePath;
|
||||||
|
|
||||||
string _namedEntitiesFilePath;
|
std::string _namedEntitiesFilePath;
|
||||||
|
|
||||||
string _stopSymbolsFilePath;
|
std::string _stopSymbolsFilePath;
|
||||||
|
|
||||||
string _readConfigParameterStr(const string & name)
|
std::string _readConfigParameterStr(const std::string & name)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -4,7 +4,7 @@ ConcordiaException::ConcordiaException() throw():
|
|||||||
_message("Concordia exception") {
|
_message("Concordia exception") {
|
||||||
}
|
}
|
||||||
|
|
||||||
ConcordiaException::ConcordiaException(const string & message) throw():
|
ConcordiaException::ConcordiaException(const std::string & message) throw():
|
||||||
_message(message) {
|
_message(message) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5,12 +5,10 @@
|
|||||||
#include<string>
|
#include<string>
|
||||||
#include<string.h>
|
#include<string.h>
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class representing an internal exception thrown in the Concordia library.
|
Class representing an internal exception thrown in the Concordia library.
|
||||||
*/
|
*/
|
||||||
class ConcordiaException : public exception {
|
class ConcordiaException : public std::exception {
|
||||||
public:
|
public:
|
||||||
/*! Constructor.
|
/*! Constructor.
|
||||||
*/
|
*/
|
||||||
@ -19,7 +17,7 @@ public:
|
|||||||
/*! Constructor with a message.
|
/*! Constructor with a message.
|
||||||
\param message message of the exception
|
\param message message of the exception
|
||||||
*/
|
*/
|
||||||
explicit ConcordiaException(const string & message) throw();
|
explicit ConcordiaException(const std::string & message) throw();
|
||||||
|
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
@ -30,7 +28,7 @@ public:
|
|||||||
virtual const char* what() const throw();
|
virtual const char* what() const throw();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
string _message;
|
std::string _message;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -7,8 +7,8 @@
|
|||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <climits>
|
#include <climits>
|
||||||
|
|
||||||
ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath,
|
ConcordiaIndex::ConcordiaIndex(const std::string & hashedIndexFilePath,
|
||||||
const string & markersFilePath)
|
const std::string & markersFilePath)
|
||||||
throw(ConcordiaException) :
|
throw(ConcordiaException) :
|
||||||
_hashedIndexFilePath(hashedIndexFilePath),
|
_hashedIndexFilePath(hashedIndexFilePath),
|
||||||
_markersFilePath(markersFilePath) {
|
_markersFilePath(markersFilePath) {
|
||||||
@ -17,15 +17,15 @@ ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath,
|
|||||||
ConcordiaIndex::~ConcordiaIndex() {
|
ConcordiaIndex::~ConcordiaIndex() {
|
||||||
}
|
}
|
||||||
|
|
||||||
boost::shared_ptr<vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
|
boost::shared_ptr<std::vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
|
||||||
boost::shared_ptr<vector<sauchar_t> > T) {
|
boost::shared_ptr<std::vector<sauchar_t> > T) {
|
||||||
saidx_t * SA_array = new saidx_t[T->size()];
|
saidx_t * SA_array = new saidx_t[T->size()];
|
||||||
if (divsufsort(T->data(), SA_array, (saidx_t) T->size()) != 0) {
|
if (divsufsort(T->data(), SA_array, (saidx_t) T->size()) != 0) {
|
||||||
throw ConcordiaException("Error creating suffix array.");
|
throw ConcordiaException("Error creating suffix array.");
|
||||||
}
|
}
|
||||||
|
|
||||||
boost::shared_ptr<vector<saidx_t> > result =
|
boost::shared_ptr<std::vector<saidx_t> > result =
|
||||||
boost::shared_ptr<vector<saidx_t> >(new vector<saidx_t>);
|
boost::shared_ptr<std::vector<saidx_t> >(new std::vector<saidx_t>);
|
||||||
for (int i = 0; i < T->size(); i++) {
|
for (int i = 0; i < T->size(); i++) {
|
||||||
result->push_back(SA_array[i]);
|
result->push_back(SA_array[i]);
|
||||||
}
|
}
|
||||||
@ -36,15 +36,15 @@ boost::shared_ptr<vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
|
|||||||
|
|
||||||
void ConcordiaIndex::addExample(
|
void ConcordiaIndex::addExample(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
const Example & example) {
|
const Example & example) {
|
||||||
ofstream hashedIndexFile;
|
std::ofstream hashedIndexFile;
|
||||||
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
|
hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
|
||||||
ios::app|ios::binary);
|
std::ios::app|std::ios::binary);
|
||||||
ofstream markersFile;
|
std::ofstream markersFile;
|
||||||
markersFile.open(_markersFilePath.c_str(), ios::out|
|
markersFile.open(_markersFilePath.c_str(), std::ios::out|
|
||||||
ios::app|ios::binary);
|
std::ios::app|std::ios::binary);
|
||||||
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
|
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
|
||||||
T, markers, example);
|
T, markers, example);
|
||||||
hashedIndexFile.close();
|
hashedIndexFile.close();
|
||||||
@ -54,15 +54,15 @@ void ConcordiaIndex::addExample(
|
|||||||
|
|
||||||
void ConcordiaIndex::addAllExamples(
|
void ConcordiaIndex::addAllExamples(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
const vector<Example> & examples) {
|
const std::vector<Example> & examples) {
|
||||||
ofstream hashedIndexFile;
|
std::ofstream hashedIndexFile;
|
||||||
hashedIndexFile.open(_hashedIndexFilePath.c_str(), ios::out|
|
hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
|
||||||
ios::app|ios::binary);
|
std::ios::app|std::ios::binary);
|
||||||
ofstream markersFile;
|
std::ofstream markersFile;
|
||||||
markersFile.open(_markersFilePath.c_str(), ios::out|
|
markersFile.open(_markersFilePath.c_str(), std::ios::out|
|
||||||
ios::app|ios::binary);
|
std::ios::app|std::ios::binary);
|
||||||
|
|
||||||
BOOST_FOREACH(Example example, examples) {
|
BOOST_FOREACH(Example example, examples) {
|
||||||
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
|
_addSingleExample(hashedIndexFile, markersFile, hashGenerator,
|
||||||
@ -75,16 +75,16 @@ void ConcordiaIndex::addAllExamples(
|
|||||||
}
|
}
|
||||||
|
|
||||||
void ConcordiaIndex::_addSingleExample(
|
void ConcordiaIndex::_addSingleExample(
|
||||||
ofstream & hashedIndexFile,
|
std::ofstream & hashedIndexFile,
|
||||||
ofstream & markersFile,
|
std::ofstream & markersFile,
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
const Example & example) {
|
const Example & example) {
|
||||||
vector<INDEX_CHARACTER_TYPE> hash
|
std::vector<INDEX_CHARACTER_TYPE> hash
|
||||||
= hashGenerator->generateHash(example.getSentence());
|
= hashGenerator->generateHash(example.getSentence());
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
for (vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
|
for (std::vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
|
||||||
it != hash.end(); ++it) {
|
it != hash.end(); ++it) {
|
||||||
INDEX_CHARACTER_TYPE character = *it;
|
INDEX_CHARACTER_TYPE character = *it;
|
||||||
Utils::writeIndexCharacter(hashedIndexFile, character);
|
Utils::writeIndexCharacter(hashedIndexFile, character);
|
||||||
|
@ -18,12 +18,10 @@
|
|||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
class ConcordiaIndex {
|
class ConcordiaIndex {
|
||||||
public:
|
public:
|
||||||
explicit ConcordiaIndex(const string & hashedIndexFilePath,
|
explicit ConcordiaIndex(const std::string & hashedIndexFilePath,
|
||||||
const string & markersFilePath)
|
const std::string & markersFilePath)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
@ -32,31 +30,31 @@ public:
|
|||||||
|
|
||||||
void addExample(
|
void addExample(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
const Example & example);
|
const Example & example);
|
||||||
|
|
||||||
void addAllExamples(
|
void addAllExamples(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
const vector<Example> & examples);
|
const std::vector<Example> & examples);
|
||||||
|
|
||||||
boost::shared_ptr<vector<saidx_t> > generateSuffixArray(
|
boost::shared_ptr<std::vector<saidx_t> > generateSuffixArray(
|
||||||
boost::shared_ptr<vector<sauchar_t> > T);
|
boost::shared_ptr<std::vector<sauchar_t> > T);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Add example to disk index and update RAM index.
|
// Add example to disk index and update RAM index.
|
||||||
void _addSingleExample(ofstream & hashedIndexFile,
|
void _addSingleExample(std::ofstream & hashedIndexFile,
|
||||||
ofstream & markersFile,
|
std::ofstream & markersFile,
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
const Example & example);
|
const Example & example);
|
||||||
|
|
||||||
string _hashedIndexFilePath;
|
std::string _hashedIndexFilePath;
|
||||||
|
|
||||||
string _markersFilePath;
|
std::string _markersFilePath;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -1,8 +1,7 @@
|
|||||||
#include "concordia/example.hpp"
|
#include "concordia/example.hpp"
|
||||||
#include <climits>
|
#include <climits>
|
||||||
#include <iostream>
|
|
||||||
|
|
||||||
Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id)
|
Example::Example(const std::string & sentence, const SUFFIX_MARKER_TYPE & id)
|
||||||
throw(ConcordiaException):
|
throw(ConcordiaException):
|
||||||
_sentence(sentence),
|
_sentence(sentence),
|
||||||
_id(id) {
|
_id(id) {
|
||||||
|
@ -10,18 +10,17 @@
|
|||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
class Example {
|
class Example {
|
||||||
public:
|
public:
|
||||||
explicit Example(const string & sentence, const SUFFIX_MARKER_TYPE & id)
|
explicit Example(const std::string & sentence,
|
||||||
|
const SUFFIX_MARKER_TYPE & id)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
virtual ~Example();
|
virtual ~Example();
|
||||||
|
|
||||||
string getSentence() const {
|
std::string getSentence() const {
|
||||||
return _sentence;
|
return _sentence;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -30,7 +29,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
string _sentence;
|
std::string _sentence;
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE _id;
|
SUFFIX_MARKER_TYPE _id;
|
||||||
};
|
};
|
||||||
|
@ -15,7 +15,7 @@ HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
|
|||||||
_sentenceAnonymizer(boost::shared_ptr<SentenceAnonymizer>(
|
_sentenceAnonymizer(boost::shared_ptr<SentenceAnonymizer>(
|
||||||
new SentenceAnonymizer(config))) {
|
new SentenceAnonymizer(config))) {
|
||||||
if (boost::filesystem::exists(_wordMapFilePath)) {
|
if (boost::filesystem::exists(_wordMapFilePath)) {
|
||||||
ifstream ifs(_wordMapFilePath.c_str(), std::ios::binary);
|
std::ifstream ifs(_wordMapFilePath.c_str(), std::ios::binary);
|
||||||
boost::archive::binary_iarchive ia(ifs);
|
boost::archive::binary_iarchive ia(ifs);
|
||||||
boost::shared_ptr<WordMap> restoredWordMap(new WordMap);
|
boost::shared_ptr<WordMap> restoredWordMap(new WordMap);
|
||||||
ia >> *_wordMap;
|
ia >> *_wordMap;
|
||||||
@ -25,16 +25,16 @@ HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
|
|||||||
HashGenerator::~HashGenerator() {
|
HashGenerator::~HashGenerator() {
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
|
std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
|
||||||
const string & sentence) throw(ConcordiaException) {
|
const std::string & sentence) throw(ConcordiaException) {
|
||||||
vector<INDEX_CHARACTER_TYPE> result;
|
std::vector<INDEX_CHARACTER_TYPE> result;
|
||||||
vector<string> tokenTexts = generateTokenVector(sentence);
|
std::vector<std::string> tokenTexts = generateTokenVector(sentence);
|
||||||
if (tokenTexts.size() > Utils::maxSentenceSize) {
|
if (tokenTexts.size() > Utils::maxSentenceSize) {
|
||||||
throw ConcordiaException("Trying to add too long sentence.");
|
throw ConcordiaException("Trying to add too long sentence.");
|
||||||
}
|
}
|
||||||
for (vector<string>::iterator it = tokenTexts.begin();
|
for (std::vector<std::string>::iterator it = tokenTexts.begin();
|
||||||
it != tokenTexts.end(); ++it) {
|
it != tokenTexts.end(); ++it) {
|
||||||
string token = *it;
|
std::string token = *it;
|
||||||
INDEX_CHARACTER_TYPE code = _wordMap->getWordCode(token);
|
INDEX_CHARACTER_TYPE code = _wordMap->getWordCode(token);
|
||||||
result.push_back(code);
|
result.push_back(code);
|
||||||
}
|
}
|
||||||
@ -42,10 +42,11 @@ vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<string> HashGenerator::generateTokenVector(const string & sentence) {
|
std::vector<std::string> HashGenerator::generateTokenVector(
|
||||||
string anonymizedSentence = _sentenceAnonymizer->anonymize(sentence);
|
const std::string & sentence) {
|
||||||
|
std::string anonymizedSentence = _sentenceAnonymizer->anonymize(sentence);
|
||||||
boost::trim(anonymizedSentence);
|
boost::trim(anonymizedSentence);
|
||||||
vector<string> tokenTexts;
|
std::vector<std::string> tokenTexts;
|
||||||
boost::split(tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"),
|
boost::split(tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"),
|
||||||
boost::algorithm::token_compress_on);
|
boost::algorithm::token_compress_on);
|
||||||
return tokenTexts;
|
return tokenTexts;
|
||||||
@ -53,7 +54,7 @@ vector<string> HashGenerator::generateTokenVector(const string & sentence) {
|
|||||||
|
|
||||||
|
|
||||||
void HashGenerator::serializeWordMap() {
|
void HashGenerator::serializeWordMap() {
|
||||||
ofstream ofs(_wordMapFilePath.c_str(), std::ios::binary);
|
std::ofstream ofs(_wordMapFilePath.c_str(), std::ios::binary);
|
||||||
boost::archive::binary_oarchive oa(ofs);
|
boost::archive::binary_oarchive oa(ofs);
|
||||||
oa << *_wordMap;
|
oa << *_wordMap;
|
||||||
}
|
}
|
||||||
|
@ -18,8 +18,6 @@
|
|||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
class HashGenerator {
|
class HashGenerator {
|
||||||
public:
|
public:
|
||||||
explicit HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
|
explicit HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
|
||||||
@ -29,10 +27,10 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~HashGenerator();
|
virtual ~HashGenerator();
|
||||||
|
|
||||||
vector<INDEX_CHARACTER_TYPE> generateHash(const string & sentence)
|
std::vector<INDEX_CHARACTER_TYPE> generateHash(const std::string & sentence)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
vector<string> generateTokenVector(const string & sentence);
|
std::vector<std::string> generateTokenVector(const std::string & sentence);
|
||||||
|
|
||||||
void serializeWordMap();
|
void serializeWordMap();
|
||||||
|
|
||||||
@ -41,7 +39,7 @@ private:
|
|||||||
|
|
||||||
boost::shared_ptr<SentenceAnonymizer> _sentenceAnonymizer;
|
boost::shared_ptr<SentenceAnonymizer> _sentenceAnonymizer;
|
||||||
|
|
||||||
string _wordMapFilePath;
|
std::string _wordMapFilePath;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -12,16 +12,17 @@ IndexSearcher::IndexSearcher() {
|
|||||||
IndexSearcher::~IndexSearcher() {
|
IndexSearcher::~IndexSearcher() {
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<SubstringOccurence> IndexSearcher::simpleSearch(
|
std::vector<SubstringOccurence> IndexSearcher::simpleSearch(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
const string & pattern) throw(ConcordiaException) {
|
const std::string & pattern) throw(ConcordiaException) {
|
||||||
vector<SubstringOccurence> result;
|
std::vector<SubstringOccurence> result;
|
||||||
|
|
||||||
int left;
|
int left;
|
||||||
vector<INDEX_CHARACTER_TYPE> hash = hashGenerator->generateHash(pattern);
|
std::vector<INDEX_CHARACTER_TYPE> hash =
|
||||||
|
hashGenerator->generateHash(pattern);
|
||||||
saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
|
saidx_t patternLength = hash.size()*sizeof(INDEX_CHARACTER_TYPE);
|
||||||
sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
|
sauchar_t * patternArray = Utils::indexVectorToSaucharArray(hash);
|
||||||
|
|
||||||
@ -47,12 +48,13 @@ vector<SubstringOccurence> IndexSearcher::simpleSearch(
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<AnubisSearchResult> IndexSearcher::anubisSearch(
|
std::vector<AnubisSearchResult> IndexSearcher::anubisSearch(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
const string & pattern) throw(ConcordiaException) {
|
const std::string & pattern) throw(ConcordiaException) {
|
||||||
vector<INDEX_CHARACTER_TYPE> hash = hashGenerator->generateHash(pattern);
|
std::vector<INDEX_CHARACTER_TYPE> hash =
|
||||||
|
hashGenerator->generateHash(pattern);
|
||||||
return _anubisSearcher->anubisSearch(T, markers, SA, hash);
|
return _anubisSearcher->anubisSearch(T, markers, SA, hash);
|
||||||
}
|
}
|
||||||
|
@ -20,8 +20,6 @@
|
|||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
class IndexSearcher {
|
class IndexSearcher {
|
||||||
public:
|
public:
|
||||||
explicit IndexSearcher();
|
explicit IndexSearcher();
|
||||||
@ -30,19 +28,19 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~IndexSearcher();
|
virtual ~IndexSearcher();
|
||||||
|
|
||||||
vector<SubstringOccurence> simpleSearch(
|
std::vector<SubstringOccurence> simpleSearch(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
const string & pattern) throw(ConcordiaException);
|
const std::string & pattern) throw(ConcordiaException);
|
||||||
|
|
||||||
vector<AnubisSearchResult> anubisSearch(
|
std::vector<AnubisSearchResult> anubisSearch(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA,
|
boost::shared_ptr<std::vector<saidx_t> > SA,
|
||||||
const string & pattern) throw(ConcordiaException);
|
const std::string & pattern) throw(ConcordiaException);
|
||||||
private:
|
private:
|
||||||
boost::shared_ptr<AnubisSearcher> _anubisSearcher;
|
boost::shared_ptr<AnubisSearcher> _anubisSearcher;
|
||||||
};
|
};
|
||||||
|
@ -1,7 +1,8 @@
|
|||||||
#include "concordia/interval.hpp"
|
#include "concordia/interval.hpp"
|
||||||
|
|
||||||
|
|
||||||
Interval::Interval(const SUFFIX_MARKER_TYPE start, const SUFFIX_MARKER_TYPE end):
|
Interval::Interval(const SUFFIX_MARKER_TYPE start,
|
||||||
|
const SUFFIX_MARKER_TYPE end):
|
||||||
_start(start),
|
_start(start),
|
||||||
_end(end) {
|
_end(end) {
|
||||||
}
|
}
|
||||||
|
@ -8,11 +8,10 @@
|
|||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
class Interval {
|
class Interval {
|
||||||
public:
|
public:
|
||||||
explicit Interval(const SUFFIX_MARKER_TYPE start, const SUFFIX_MARKER_TYPE end);
|
explicit Interval(const SUFFIX_MARKER_TYPE start,
|
||||||
|
const SUFFIX_MARKER_TYPE end);
|
||||||
|
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
|
@ -3,7 +3,8 @@
|
|||||||
#include <boost/exception/all.hpp>
|
#include <boost/exception/all.hpp>
|
||||||
#include <boost/throw_exception.hpp>
|
#include <boost/throw_exception.hpp>
|
||||||
|
|
||||||
RegexReplacement::RegexReplacement(string patternString, string replacement,
|
RegexReplacement::RegexReplacement(std::string patternString,
|
||||||
|
std::string replacement,
|
||||||
bool caseSensitive)
|
bool caseSensitive)
|
||||||
throw(ConcordiaException):
|
throw(ConcordiaException):
|
||||||
_replacement(replacement) {
|
_replacement(replacement) {
|
||||||
@ -15,7 +16,7 @@ RegexReplacement::RegexReplacement(string patternString, string replacement,
|
|||||||
boost::regex::icase);
|
boost::regex::icase);
|
||||||
}
|
}
|
||||||
} catch(const std::exception & e) {
|
} catch(const std::exception & e) {
|
||||||
stringstream ss;
|
std::stringstream ss;
|
||||||
|
|
||||||
ss << "Bad regex pattern: " << patternString <<
|
ss << "Bad regex pattern: " << patternString <<
|
||||||
" Detailed info: " << e.what();
|
" Detailed info: " << e.what();
|
||||||
@ -31,7 +32,7 @@ RegexReplacement::RegexReplacement(string patternString, string replacement,
|
|||||||
RegexReplacement::~RegexReplacement() {
|
RegexReplacement::~RegexReplacement() {
|
||||||
}
|
}
|
||||||
|
|
||||||
string RegexReplacement::apply(const string & text) {
|
std::string RegexReplacement::apply(const std::string & text) {
|
||||||
try {
|
try {
|
||||||
return boost::u32regex_replace(text, _pattern, _replacement,
|
return boost::u32regex_replace(text, _pattern, _replacement,
|
||||||
boost::match_default | boost::format_all);
|
boost::match_default | boost::format_all);
|
||||||
|
@ -14,13 +14,11 @@
|
|||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
typedef boost::error_info<struct my_tag, std::string> my_tag_error_info;
|
typedef boost::error_info<struct my_tag, std::string> my_tag_error_info;
|
||||||
|
|
||||||
class RegexReplacement {
|
class RegexReplacement {
|
||||||
public:
|
public:
|
||||||
RegexReplacement(string patternString, string replacement,
|
RegexReplacement(std::string patternString, std::string replacement,
|
||||||
bool caseSensitive = true)
|
bool caseSensitive = true)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
@ -28,12 +26,12 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~RegexReplacement();
|
virtual ~RegexReplacement();
|
||||||
|
|
||||||
string apply(const string & text);
|
std::string apply(const std::string & text);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
boost::u32regex _pattern;
|
boost::u32regex _pattern;
|
||||||
|
|
||||||
string _replacement;
|
std::string _replacement;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -23,8 +23,8 @@ SentenceAnonymizer::SentenceAnonymizer(
|
|||||||
SentenceAnonymizer::~SentenceAnonymizer() {
|
SentenceAnonymizer::~SentenceAnonymizer() {
|
||||||
}
|
}
|
||||||
|
|
||||||
string SentenceAnonymizer::anonymize(const string & sentence) {
|
std::string SentenceAnonymizer::anonymize(const std::string & sentence) {
|
||||||
string result = sentence;
|
std::string result = sentence;
|
||||||
|
|
||||||
result = _htmlTags->apply(result);
|
result = _htmlTags->apply(result);
|
||||||
|
|
||||||
@ -41,20 +41,20 @@ string SentenceAnonymizer::anonymize(const string & sentence) {
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
void SentenceAnonymizer::_createNeRules(string & namedEntitiesPath) {
|
void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) {
|
||||||
if (boost::filesystem::exists(namedEntitiesPath)) {
|
if (boost::filesystem::exists(namedEntitiesPath)) {
|
||||||
string line;
|
std::string line;
|
||||||
ifstream neFile(namedEntitiesPath.c_str());
|
std::ifstream neFile(namedEntitiesPath.c_str());
|
||||||
if (neFile.is_open()) {
|
if (neFile.is_open()) {
|
||||||
int lineCounter = 0;
|
int lineCounter = 0;
|
||||||
while (getline(neFile, line)) {
|
while (getline(neFile, line)) {
|
||||||
lineCounter++;
|
lineCounter++;
|
||||||
boost::shared_ptr<vector<string> >
|
boost::shared_ptr<std::vector<std::string> >
|
||||||
tokenTexts(new vector<string>());
|
tokenTexts(new std::vector<std::string>());
|
||||||
boost::split(*tokenTexts, line, boost::is_any_of(" "),
|
boost::split(*tokenTexts, line, boost::is_any_of(" "),
|
||||||
boost::token_compress_on);
|
boost::token_compress_on);
|
||||||
if (tokenTexts->size() != 2) {
|
if (tokenTexts->size() != 2) {
|
||||||
stringstream ss;
|
std::stringstream ss;
|
||||||
ss << "Invalid line: " << lineCounter
|
ss << "Invalid line: " << lineCounter
|
||||||
<< " in NE file: " << namedEntitiesPath;
|
<< " in NE file: " << namedEntitiesPath;
|
||||||
throw ConcordiaException(ss.str());
|
throw ConcordiaException(ss.str());
|
||||||
@ -72,11 +72,11 @@ void SentenceAnonymizer::_createNeRules(string & namedEntitiesPath) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void SentenceAnonymizer::_createHtmlTagsRule(string & htmlTagsPath) {
|
void SentenceAnonymizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
|
||||||
string tagsExpression = "<\\/?(";
|
std::string tagsExpression = "<\\/?(";
|
||||||
if (boost::filesystem::exists(htmlTagsPath)) {
|
if (boost::filesystem::exists(htmlTagsPath)) {
|
||||||
string line;
|
std::string line;
|
||||||
ifstream tagsFile(htmlTagsPath.c_str());
|
std::ifstream tagsFile(htmlTagsPath.c_str());
|
||||||
if (tagsFile.is_open()) {
|
if (tagsFile.is_open()) {
|
||||||
while (getline(tagsFile, line)) {
|
while (getline(tagsFile, line)) {
|
||||||
tagsExpression += "|";
|
tagsExpression += "|";
|
||||||
@ -96,11 +96,11 @@ void SentenceAnonymizer::_createHtmlTagsRule(string & htmlTagsPath) {
|
|||||||
|
|
||||||
boost::shared_ptr<RegexReplacement>
|
boost::shared_ptr<RegexReplacement>
|
||||||
SentenceAnonymizer::_getMultipleReplacementRule(
|
SentenceAnonymizer::_getMultipleReplacementRule(
|
||||||
string & filePath, string replacement, bool wholeWord) {
|
std::string & filePath, std::string replacement, bool wholeWord) {
|
||||||
string expression = "(";
|
std::string expression = "(";
|
||||||
if (boost::filesystem::exists(filePath)) {
|
if (boost::filesystem::exists(filePath)) {
|
||||||
string line;
|
std::string line;
|
||||||
ifstream ruleFile(filePath.c_str());
|
std::ifstream ruleFile(filePath.c_str());
|
||||||
if (ruleFile.is_open()) {
|
if (ruleFile.is_open()) {
|
||||||
while (getline(ruleFile, line)) {
|
while (getline(ruleFile, line)) {
|
||||||
if (wholeWord) {
|
if (wholeWord) {
|
||||||
|
@ -16,8 +16,6 @@
|
|||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
class SentenceAnonymizer {
|
class SentenceAnonymizer {
|
||||||
public:
|
public:
|
||||||
explicit SentenceAnonymizer(boost::shared_ptr<ConcordiaConfig> config)
|
explicit SentenceAnonymizer(boost::shared_ptr<ConcordiaConfig> config)
|
||||||
@ -27,19 +25,19 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~SentenceAnonymizer();
|
virtual ~SentenceAnonymizer();
|
||||||
|
|
||||||
string anonymize(const string & sentence);
|
std::string anonymize(const std::string & sentence);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void _createNeRules(string & namedEntitiesPath);
|
void _createNeRules(std::string & namedEntitiesPath);
|
||||||
|
|
||||||
void _createHtmlTagsRule(string & htmlTagsPath);
|
void _createHtmlTagsRule(std::string & htmlTagsPath);
|
||||||
|
|
||||||
boost::shared_ptr<RegexReplacement> _getMultipleReplacementRule(
|
boost::shared_ptr<RegexReplacement> _getMultipleReplacementRule(
|
||||||
string & filePath,
|
std::string & filePath,
|
||||||
string replacement,
|
std::string replacement,
|
||||||
bool wholeWord = false);
|
bool wholeWord = false);
|
||||||
|
|
||||||
vector<RegexReplacement> _namedEntities;
|
std::vector<RegexReplacement> _namedEntities;
|
||||||
|
|
||||||
boost::shared_ptr<RegexReplacement> _htmlTags;
|
boost::shared_ptr<RegexReplacement> _htmlTags;
|
||||||
|
|
||||||
|
@ -10,7 +10,8 @@ SubstringOccurence::SubstringOccurence(const SUFFIX_MARKER_TYPE & marker) {
|
|||||||
_exampleLength = Utils::getLengthFromMarker(marker);
|
_exampleLength = Utils::getLengthFromMarker(marker);
|
||||||
}
|
}
|
||||||
|
|
||||||
void SubstringOccurence::enterDataFromMarker(const SUFFIX_MARKER_TYPE & marker) {
|
void SubstringOccurence::enterDataFromMarker(
|
||||||
|
const SUFFIX_MARKER_TYPE & marker) {
|
||||||
_id = Utils::getIdFromMarker(marker);
|
_id = Utils::getIdFromMarker(marker);
|
||||||
_offset = Utils::getOffsetFromMarker(marker);
|
_offset = Utils::getOffsetFromMarker(marker);
|
||||||
_exampleLength = Utils::getLengthFromMarker(marker);
|
_exampleLength = Utils::getLengthFromMarker(marker);
|
||||||
|
@ -9,8 +9,6 @@
|
|||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
class SubstringOccurence {
|
class SubstringOccurence {
|
||||||
public:
|
public:
|
||||||
SubstringOccurence();
|
SubstringOccurence();
|
||||||
|
@ -12,8 +12,6 @@
|
|||||||
#include "concordia/common/logging.hpp"
|
#include "concordia/common/logging.hpp"
|
||||||
#include "tests/common/test_resources_manager.hpp"
|
#include "tests/common/test_resources_manager.hpp"
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE(anubis_searcher)
|
BOOST_AUTO_TEST_SUITE(anubis_searcher)
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
||||||
@ -187,7 +185,7 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
|||||||
pattern2.push_back(2);
|
pattern2.push_back(2);
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE highResLength2;
|
SUFFIX_MARKER_TYPE highResLength2;
|
||||||
vector<SubstringOccurence> result2 = searcher.lcpSearch(T, markers, SA, pattern2, highResLength2);
|
std::vector<SubstringOccurence> result2 = searcher.lcpSearch(T, markers, SA, pattern2, highResLength2);
|
||||||
SUFFIX_MARKER_TYPE length2 = highResLength2 / sizeof(INDEX_CHARACTER_TYPE);
|
SUFFIX_MARKER_TYPE length2 = highResLength2 / sizeof(INDEX_CHARACTER_TYPE);
|
||||||
|
|
||||||
/* Expecting to get one result from SA:
|
/* Expecting to get one result from SA:
|
||||||
@ -230,7 +228,7 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
|||||||
pattern3.push_back(3);
|
pattern3.push_back(3);
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE highResLength3;
|
SUFFIX_MARKER_TYPE highResLength3;
|
||||||
vector<SubstringOccurence> result3 = searcher.lcpSearch(T, markers, SA, pattern3, highResLength3);
|
std::vector<SubstringOccurence> result3 = searcher.lcpSearch(T, markers, SA, pattern3, highResLength3);
|
||||||
SUFFIX_MARKER_TYPE length3 = highResLength3 / sizeof(INDEX_CHARACTER_TYPE);
|
SUFFIX_MARKER_TYPE length3 = highResLength3 / sizeof(INDEX_CHARACTER_TYPE);
|
||||||
|
|
||||||
/* Expecting to get one result from SA:
|
/* Expecting to get one result from SA:
|
||||||
@ -267,7 +265,7 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
|||||||
pattern4.push_back(4);
|
pattern4.push_back(4);
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE highResLength4;
|
SUFFIX_MARKER_TYPE highResLength4;
|
||||||
vector<SubstringOccurence> result4 = searcher.lcpSearch(T, markers, SA, pattern4, highResLength4);
|
std::vector<SubstringOccurence> result4 = searcher.lcpSearch(T, markers, SA, pattern4, highResLength4);
|
||||||
SUFFIX_MARKER_TYPE length4 = highResLength4 / sizeof(INDEX_CHARACTER_TYPE);
|
SUFFIX_MARKER_TYPE length4 = highResLength4 / sizeof(INDEX_CHARACTER_TYPE);
|
||||||
|
|
||||||
/* Expecting to get 2 results from SA:
|
/* Expecting to get 2 results from SA:
|
||||||
@ -298,7 +296,7 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
|||||||
pattern5.push_back(4);
|
pattern5.push_back(4);
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE highResLength5;
|
SUFFIX_MARKER_TYPE highResLength5;
|
||||||
vector<SubstringOccurence> result5 = searcher.lcpSearch(T, markers, SA, pattern5, highResLength5);
|
std::vector<SubstringOccurence> result5 = searcher.lcpSearch(T, markers, SA, pattern5, highResLength5);
|
||||||
SUFFIX_MARKER_TYPE length5 = highResLength5 / sizeof(INDEX_CHARACTER_TYPE);
|
SUFFIX_MARKER_TYPE length5 = highResLength5 / sizeof(INDEX_CHARACTER_TYPE);
|
||||||
|
|
||||||
/* Expecting to get 0 results from SA, lcp length = 0;
|
/* Expecting to get 0 results from SA, lcp length = 0;
|
||||||
@ -322,7 +320,7 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
|||||||
pattern6.push_back(0);
|
pattern6.push_back(0);
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE highResLength6;
|
SUFFIX_MARKER_TYPE highResLength6;
|
||||||
vector<SubstringOccurence> result6 = searcher.lcpSearch(T, markers, SA, pattern5, highResLength6);
|
std::vector<SubstringOccurence> result6 = searcher.lcpSearch(T, markers, SA, pattern5, highResLength6);
|
||||||
SUFFIX_MARKER_TYPE length6 = highResLength6 / sizeof(INDEX_CHARACTER_TYPE);
|
SUFFIX_MARKER_TYPE length6 = highResLength6 / sizeof(INDEX_CHARACTER_TYPE);
|
||||||
|
|
||||||
/* Expecting to get 0 results from SA, lcp length = 0;
|
/* Expecting to get 0 results from SA, lcp length = 0;
|
||||||
@ -393,38 +391,38 @@ BOOST_AUTO_TEST_CASE( TmMatchesTest )
|
|||||||
|
|
||||||
// example 14
|
// example 14
|
||||||
// example interval list: [(1,2)]
|
// example interval list: [(1,2)]
|
||||||
vector<Interval> exampleIntervals14 = tmMatches14->getExampleIntervals();
|
std::vector<Interval> exampleIntervals14 = tmMatches14->getExampleIntervals();
|
||||||
BOOST_CHECK_EQUAL(exampleIntervals14.size(), 1);
|
BOOST_CHECK_EQUAL(exampleIntervals14.size(), 1);
|
||||||
BOOST_CHECK_EQUAL(exampleIntervals14[0].getStart(), 1);
|
BOOST_CHECK_EQUAL(exampleIntervals14[0].getStart(), 1);
|
||||||
BOOST_CHECK_EQUAL(exampleIntervals14[0].getEnd(), 2);
|
BOOST_CHECK_EQUAL(exampleIntervals14[0].getEnd(), 2);
|
||||||
// pattern interval list: [(1,2)]
|
// pattern interval list: [(1,2)]
|
||||||
vector<Interval> patternIntervals14 = tmMatches14->getPatternIntervals();
|
std::vector<Interval> patternIntervals14 = tmMatches14->getPatternIntervals();
|
||||||
BOOST_CHECK_EQUAL(patternIntervals14.size(), 1);
|
BOOST_CHECK_EQUAL(patternIntervals14.size(), 1);
|
||||||
BOOST_CHECK_EQUAL(patternIntervals14[0].getStart(), 1);
|
BOOST_CHECK_EQUAL(patternIntervals14[0].getStart(), 1);
|
||||||
BOOST_CHECK_EQUAL(patternIntervals14[0].getEnd(), 2);
|
BOOST_CHECK_EQUAL(patternIntervals14[0].getEnd(), 2);
|
||||||
|
|
||||||
// example 51
|
// example 51
|
||||||
// example interval list: [(1,3)]
|
// example interval list: [(1,3)]
|
||||||
vector<Interval> exampleIntervals51 = tmMatches51->getExampleIntervals();
|
std::vector<Interval> exampleIntervals51 = tmMatches51->getExampleIntervals();
|
||||||
BOOST_CHECK_EQUAL(exampleIntervals51.size(), 1);
|
BOOST_CHECK_EQUAL(exampleIntervals51.size(), 1);
|
||||||
BOOST_CHECK_EQUAL(exampleIntervals51[0].getStart(), 1);
|
BOOST_CHECK_EQUAL(exampleIntervals51[0].getStart(), 1);
|
||||||
BOOST_CHECK_EQUAL(exampleIntervals51[0].getEnd(), 3);
|
BOOST_CHECK_EQUAL(exampleIntervals51[0].getEnd(), 3);
|
||||||
// pattern interval list: [(1,3)]
|
// pattern interval list: [(1,3)]
|
||||||
vector<Interval> patternIntervals51 = tmMatches51->getPatternIntervals();
|
std::vector<Interval> patternIntervals51 = tmMatches51->getPatternIntervals();
|
||||||
BOOST_CHECK_EQUAL(patternIntervals51.size(), 1);
|
BOOST_CHECK_EQUAL(patternIntervals51.size(), 1);
|
||||||
BOOST_CHECK_EQUAL(patternIntervals51[0].getStart(), 1);
|
BOOST_CHECK_EQUAL(patternIntervals51[0].getStart(), 1);
|
||||||
BOOST_CHECK_EQUAL(patternIntervals51[0].getEnd(), 3);
|
BOOST_CHECK_EQUAL(patternIntervals51[0].getEnd(), 3);
|
||||||
|
|
||||||
// example 123
|
// example 123
|
||||||
// example interval list: [(1,3), (0,1)]
|
// example interval list: [(1,3), (0,1)]
|
||||||
vector<Interval> exampleIntervals123 = tmMatches123->getExampleIntervals();
|
std::vector<Interval> exampleIntervals123 = tmMatches123->getExampleIntervals();
|
||||||
BOOST_CHECK_EQUAL(exampleIntervals123.size(), 2);
|
BOOST_CHECK_EQUAL(exampleIntervals123.size(), 2);
|
||||||
BOOST_CHECK_EQUAL(exampleIntervals123[0].getStart(), 1);
|
BOOST_CHECK_EQUAL(exampleIntervals123[0].getStart(), 1);
|
||||||
BOOST_CHECK_EQUAL(exampleIntervals123[0].getEnd(), 3);
|
BOOST_CHECK_EQUAL(exampleIntervals123[0].getEnd(), 3);
|
||||||
BOOST_CHECK_EQUAL(exampleIntervals123[1].getStart(), 0);
|
BOOST_CHECK_EQUAL(exampleIntervals123[1].getStart(), 0);
|
||||||
BOOST_CHECK_EQUAL(exampleIntervals123[1].getEnd(), 1);
|
BOOST_CHECK_EQUAL(exampleIntervals123[1].getEnd(), 1);
|
||||||
// pattern interval list: [(1,3), (3,4)]
|
// pattern interval list: [(1,3), (3,4)]
|
||||||
vector<Interval> patternIntervals123 = tmMatches123->getPatternIntervals();
|
std::vector<Interval> patternIntervals123 = tmMatches123->getPatternIntervals();
|
||||||
BOOST_CHECK_EQUAL(patternIntervals123.size(), 2);
|
BOOST_CHECK_EQUAL(patternIntervals123.size(), 2);
|
||||||
BOOST_CHECK_EQUAL(patternIntervals123[0].getStart(), 1);
|
BOOST_CHECK_EQUAL(patternIntervals123[0].getStart(), 1);
|
||||||
BOOST_CHECK_EQUAL(patternIntervals123[0].getEnd(), 3);
|
BOOST_CHECK_EQUAL(patternIntervals123[0].getEnd(), 3);
|
||||||
|
@ -9,14 +9,12 @@
|
|||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE(concordia_main)
|
BOOST_AUTO_TEST_SUITE(concordia_main)
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( ConcordiaVersion )
|
BOOST_AUTO_TEST_CASE( ConcordiaVersion )
|
||||||
{
|
{
|
||||||
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||||
string version = concordia.getVersion();
|
std::string version = concordia.getVersion();
|
||||||
BOOST_CHECK_EQUAL( version , "0.1");
|
BOOST_CHECK_EQUAL( version , "0.1");
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -51,8 +49,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch1 )
|
|||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
vector<SubstringOccurence> searchResult1 = concordia.simpleSearch("posiada rysia");
|
std::vector<SubstringOccurence> searchResult1 = concordia.simpleSearch("posiada rysia");
|
||||||
vector<SubstringOccurence> searchResult2 = concordia.simpleSearch("posiada kota Ala");
|
std::vector<SubstringOccurence> searchResult2 = concordia.simpleSearch("posiada kota Ala");
|
||||||
|
|
||||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
||||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
||||||
@ -73,7 +71,7 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
|||||||
{
|
{
|
||||||
// modified stop words to avoid anonymization
|
// modified stop words to avoid anonymization
|
||||||
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||||
vector<Example> testExamples;
|
std::vector<Example> testExamples;
|
||||||
testExamples.push_back(Example("xto xjest okno",312));
|
testExamples.push_back(Example("xto xjest okno",312));
|
||||||
testExamples.push_back(Example("czy xjest okno otwarte",202));
|
testExamples.push_back(Example("czy xjest okno otwarte",202));
|
||||||
testExamples.push_back(Example("chyba xto xjest xtutaj",45));
|
testExamples.push_back(Example("chyba xto xjest xtutaj",45));
|
||||||
@ -106,8 +104,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||||
vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("xto xjest");
|
std::vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("xto xjest");
|
||||||
vector<SubstringOccurence> searchResult2 = concordia2.simpleSearch("xjest okno");
|
std::vector<SubstringOccurence> searchResult2 = concordia2.simpleSearch("xjest okno");
|
||||||
|
|
||||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
||||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
||||||
@ -131,13 +129,13 @@ BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch2 )
|
|||||||
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
|
BOOST_AUTO_TEST_CASE( ConcordiaSimpleSearch3 )
|
||||||
{
|
{
|
||||||
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||||
vector<Example> testExamples;
|
std::vector<Example> testExamples;
|
||||||
testExamples.push_back(Example("2. Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem, z jakiego korzystają obywatele tego państwa.",312));
|
testExamples.push_back(Example("2. Ma on w szczególności prawo do podjęcia zatrudnienia dostępnego na terytorium innego Państwa Członkowskiego z takim samym pierwszeństwem, z jakiego korzystają obywatele tego państwa.",312));
|
||||||
testExamples.push_back(Example("czy xjest żółte otwarte",202));
|
testExamples.push_back(Example("czy xjest żółte otwarte",202));
|
||||||
concordia.addAllExamples(testExamples);
|
concordia.addAllExamples(testExamples);
|
||||||
|
|
||||||
Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
Concordia concordia2 = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||||
vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia");
|
std::vector<SubstringOccurence> searchResult1 = concordia2.simpleSearch("on w szczególności prawo do podjęcia");
|
||||||
|
|
||||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
||||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
||||||
@ -176,8 +174,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaAnubisSearch1 )
|
|||||||
n: 0 1 2 3 4 5 6 7 8 9 10 11
|
n: 0 1 2 3 4 5 6 7 8 9 10 11
|
||||||
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
|
SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
|
||||||
|
|
||||||
vector<AnubisSearchResult> searchResult1 = concordia.anubisSearch("posiada rysia chyba");
|
std::vector<AnubisSearchResult> searchResult1 = concordia.anubisSearch("posiada rysia chyba");
|
||||||
vector<AnubisSearchResult> searchResult2 = concordia.anubisSearch("posiada kota Ala");
|
std::vector<AnubisSearchResult> searchResult2 = concordia.anubisSearch("posiada kota Ala");
|
||||||
|
|
||||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_WORD_MAP));
|
||||||
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
boost::filesystem::remove(TestResourcesManager::getTestFilePath("temp",TEMP_MARKERS));
|
||||||
|
@ -7,8 +7,6 @@
|
|||||||
#include <list>
|
#include <list>
|
||||||
#include <boost/algorithm/string/predicate.hpp>
|
#include <boost/algorithm/string/predicate.hpp>
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE(concordia_config)
|
BOOST_AUTO_TEST_SUITE(concordia_config)
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( ConfigParameters )
|
BOOST_AUTO_TEST_CASE( ConfigParameters )
|
||||||
@ -29,7 +27,7 @@ BOOST_AUTO_TEST_CASE( ConfigParameters )
|
|||||||
BOOST_AUTO_TEST_CASE( NonexistentConfigTest )
|
BOOST_AUTO_TEST_CASE( NonexistentConfigTest )
|
||||||
{
|
{
|
||||||
bool exceptionThrown = false;
|
bool exceptionThrown = false;
|
||||||
string message = "";
|
std::string message = "";
|
||||||
try {
|
try {
|
||||||
ConcordiaConfig config(TestResourcesManager::getTestConcordiaConfigFilePath("foo.cfg"));
|
ConcordiaConfig config(TestResourcesManager::getTestConcordiaConfigFilePath("foo.cfg"));
|
||||||
} catch (ConcordiaException & e) {
|
} catch (ConcordiaException & e) {
|
||||||
@ -44,7 +42,7 @@ BOOST_AUTO_TEST_CASE( NonexistentConfigTest )
|
|||||||
BOOST_AUTO_TEST_CASE( InvalidConfigTest )
|
BOOST_AUTO_TEST_CASE( InvalidConfigTest )
|
||||||
{
|
{
|
||||||
bool exceptionThrown = false;
|
bool exceptionThrown = false;
|
||||||
string message = "";
|
std::string message = "";
|
||||||
try {
|
try {
|
||||||
ConcordiaConfig config(TestResourcesManager::getTestConcordiaConfigFilePath("invalid.cfg"));
|
ConcordiaConfig config(TestResourcesManager::getTestConcordiaConfigFilePath("invalid.cfg"));
|
||||||
} catch (ConcordiaException & e) {
|
} catch (ConcordiaException & e) {
|
||||||
|
@ -7,8 +7,6 @@
|
|||||||
#include <boost/algorithm/string/predicate.hpp>
|
#include <boost/algorithm/string/predicate.hpp>
|
||||||
#include <boost/filesystem.hpp>
|
#include <boost/filesystem.hpp>
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE(concordia_index)
|
BOOST_AUTO_TEST_SUITE(concordia_index)
|
||||||
|
|
||||||
|
|
||||||
@ -16,7 +14,7 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
|
|||||||
{
|
{
|
||||||
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
|
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
|
||||||
TestResourcesManager::getTestFilePath("temp","test_markers.bin"));
|
TestResourcesManager::getTestFilePath("temp","test_markers.bin"));
|
||||||
boost::shared_ptr<vector<sauchar_t> > T = boost::shared_ptr<vector<sauchar_t> >(new vector<sauchar_t>());
|
boost::shared_ptr<std::vector<sauchar_t> > T = boost::shared_ptr<std::vector<sauchar_t> >(new std::vector<sauchar_t>());
|
||||||
// Test hashed index:
|
// Test hashed index:
|
||||||
// n: 0 1 2 3 4 5 6 7 8
|
// n: 0 1 2 3 4 5 6 7 8
|
||||||
// T[n]: 0 1 2 0 1 3 4 1 3
|
// T[n]: 0 1 2 0 1 3 4 1 3
|
||||||
@ -36,7 +34,7 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
|
|||||||
|
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(T);
|
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(T);
|
||||||
|
|
||||||
boost::shared_ptr<vector<saidx_t> > expectedSA = boost::shared_ptr<vector<saidx_t> >(new vector<saidx_t>());
|
boost::shared_ptr<std::vector<saidx_t> > expectedSA = boost::shared_ptr<std::vector<saidx_t> >(new std::vector<saidx_t>());
|
||||||
expectedSA->push_back(0);
|
expectedSA->push_back(0);
|
||||||
expectedSA->push_back(3);
|
expectedSA->push_back(3);
|
||||||
expectedSA->push_back(1);
|
expectedSA->push_back(1);
|
||||||
@ -53,7 +51,7 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest2 )
|
|||||||
{
|
{
|
||||||
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
|
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
|
||||||
TestResourcesManager::getTestFilePath("temp","test_markers.bin"));
|
TestResourcesManager::getTestFilePath("temp","test_markers.bin"));
|
||||||
boost::shared_ptr<vector<sauchar_t> > T = boost::shared_ptr<vector<sauchar_t> >(new vector<sauchar_t>());
|
boost::shared_ptr<std::vector<sauchar_t> > T = boost::shared_ptr<std::vector<sauchar_t> >(new std::vector<sauchar_t>());
|
||||||
|
|
||||||
//Test hashed index:
|
//Test hashed index:
|
||||||
// n: 0 1 2 3 4 5 6 7 8 9 10 11
|
// n: 0 1 2 3 4 5 6 7 8 9 10 11
|
||||||
@ -77,7 +75,7 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest2 )
|
|||||||
|
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(T);
|
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(T);
|
||||||
|
|
||||||
boost::shared_ptr<vector<saidx_t> > expectedSA = boost::shared_ptr<vector<saidx_t> >(new vector<saidx_t>());
|
boost::shared_ptr<std::vector<saidx_t> > expectedSA = boost::shared_ptr<std::vector<saidx_t> >(new std::vector<saidx_t>());
|
||||||
expectedSA->push_back(0);
|
expectedSA->push_back(0);
|
||||||
expectedSA->push_back(4);
|
expectedSA->push_back(4);
|
||||||
expectedSA->push_back(1);
|
expectedSA->push_back(1);
|
||||||
@ -97,7 +95,7 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest3 )
|
|||||||
{
|
{
|
||||||
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
|
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
|
||||||
TestResourcesManager::getTestFilePath("temp","test_markers.bin"));
|
TestResourcesManager::getTestFilePath("temp","test_markers.bin"));
|
||||||
boost::shared_ptr<vector<sauchar_t> > T = boost::shared_ptr<vector<sauchar_t> >(new vector<sauchar_t>());
|
boost::shared_ptr<std::vector<sauchar_t> > T = boost::shared_ptr<std::vector<sauchar_t> >(new std::vector<sauchar_t>());
|
||||||
|
|
||||||
//Test hashed index:
|
//Test hashed index:
|
||||||
// n: 0 1 2 3 4 5
|
// n: 0 1 2 3 4 5
|
||||||
@ -115,7 +113,7 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest3 )
|
|||||||
|
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(T);
|
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(T);
|
||||||
|
|
||||||
boost::shared_ptr<vector<saidx_t> > expectedSA = boost::shared_ptr<vector<saidx_t> >(new vector<saidx_t>());
|
boost::shared_ptr<std::vector<saidx_t> > expectedSA = boost::shared_ptr<std::vector<saidx_t> >(new std::vector<saidx_t>());
|
||||||
expectedSA->push_back(0);
|
expectedSA->push_back(0);
|
||||||
expectedSA->push_back(5);
|
expectedSA->push_back(5);
|
||||||
expectedSA->push_back(3);
|
expectedSA->push_back(3);
|
||||||
|
@ -5,8 +5,6 @@
|
|||||||
|
|
||||||
#include "concordia/example.hpp"
|
#include "concordia/example.hpp"
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE(exampleTest)
|
BOOST_AUTO_TEST_SUITE(exampleTest)
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( ExceedingId )
|
BOOST_AUTO_TEST_CASE( ExceedingId )
|
||||||
@ -15,7 +13,7 @@ BOOST_AUTO_TEST_CASE( ExceedingId )
|
|||||||
Example example1("Test", maxId);
|
Example example1("Test", maxId);
|
||||||
|
|
||||||
bool exceptionThrown = false;
|
bool exceptionThrown = false;
|
||||||
string message = "";
|
std::string message = "";
|
||||||
try {
|
try {
|
||||||
Example example2("Test", maxId+1);
|
Example example2("Test", maxId+1);
|
||||||
} catch (ConcordiaException & e) {
|
} catch (ConcordiaException & e) {
|
||||||
|
@ -8,9 +8,6 @@
|
|||||||
#include "concordia/hash_generator.hpp"
|
#include "concordia/hash_generator.hpp"
|
||||||
#include "tests/common/test_resources_manager.hpp"
|
#include "tests/common/test_resources_manager.hpp"
|
||||||
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE(hash_generator)
|
BOOST_AUTO_TEST_SUITE(hash_generator)
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( SimpleHashTest )
|
BOOST_AUTO_TEST_CASE( SimpleHashTest )
|
||||||
@ -23,8 +20,8 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
|
|||||||
|
|
||||||
HashGenerator hashGenerator = HashGenerator(config);
|
HashGenerator hashGenerator = HashGenerator(config);
|
||||||
|
|
||||||
vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala posiada kota");
|
std::vector<INDEX_CHARACTER_TYPE> hash = hashGenerator.generateHash("Ala posiada kota");
|
||||||
vector<INDEX_CHARACTER_TYPE> expected;
|
std::vector<INDEX_CHARACTER_TYPE> expected;
|
||||||
expected.push_back(0);
|
expected.push_back(0);
|
||||||
expected.push_back(1);
|
expected.push_back(1);
|
||||||
expected.push_back(2);
|
expected.push_back(2);
|
||||||
@ -44,17 +41,17 @@ BOOST_AUTO_TEST_CASE( TooLongHashTest )
|
|||||||
|
|
||||||
HashGenerator hashGenerator = HashGenerator(config);
|
HashGenerator hashGenerator = HashGenerator(config);
|
||||||
|
|
||||||
stringstream ss;
|
std::stringstream ss;
|
||||||
for (int i=0;i<65537;i++) {
|
for (int i=0;i<65537;i++) {
|
||||||
ss << "xx" << i << " ";
|
ss << "xx" << i << " ";
|
||||||
}
|
}
|
||||||
|
|
||||||
string longSentence = ss.str();
|
std::string longSentence = ss.str();
|
||||||
|
|
||||||
bool exceptionThrown = false;
|
bool exceptionThrown = false;
|
||||||
string message = "";
|
std::string message = "";
|
||||||
try {
|
try {
|
||||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash = hashGenerator.generateHash(longSentence);
|
boost::shared_ptr<std::vector<INDEX_CHARACTER_TYPE> > hash = hashGenerator.generateHash(longSentence);
|
||||||
} catch (ConcordiaException & e) {
|
} catch (ConcordiaException & e) {
|
||||||
exceptionThrown = true;
|
exceptionThrown = true;
|
||||||
message = e.what();
|
message = e.what();
|
||||||
@ -76,8 +73,8 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
|||||||
|
|
||||||
HashGenerator hashGenerator1 = HashGenerator(config);
|
HashGenerator hashGenerator1 = HashGenerator(config);
|
||||||
|
|
||||||
vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala posiada kota");
|
std::vector<INDEX_CHARACTER_TYPE> hash1 = hashGenerator1.generateHash("Ala posiada kota");
|
||||||
vector<INDEX_CHARACTER_TYPE> expected1;
|
std::vector<INDEX_CHARACTER_TYPE> expected1;
|
||||||
expected1.push_back(0);
|
expected1.push_back(0);
|
||||||
expected1.push_back(1);
|
expected1.push_back(1);
|
||||||
expected1.push_back(2);
|
expected1.push_back(2);
|
||||||
@ -86,8 +83,8 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
|||||||
hashGenerator1.serializeWordMap();
|
hashGenerator1.serializeWordMap();
|
||||||
|
|
||||||
HashGenerator hashGenerator2 = HashGenerator(config);
|
HashGenerator hashGenerator2 = HashGenerator(config);
|
||||||
vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala posiada psa");
|
std::vector<INDEX_CHARACTER_TYPE> hash2 = hashGenerator2.generateHash("Ala posiada psa");
|
||||||
vector<INDEX_CHARACTER_TYPE> expected2;
|
std::vector<INDEX_CHARACTER_TYPE> expected2;
|
||||||
expected2.push_back(0);
|
expected2.push_back(0);
|
||||||
expected2.push_back(1);
|
expected2.push_back(1);
|
||||||
expected2.push_back(3);
|
expected2.push_back(3);
|
||||||
@ -106,8 +103,8 @@ BOOST_AUTO_TEST_CASE( TokenVectorTest )
|
|||||||
|
|
||||||
HashGenerator hashGenerator = HashGenerator(config);
|
HashGenerator hashGenerator = HashGenerator(config);
|
||||||
|
|
||||||
vector<string> tokenVector = hashGenerator.generateTokenVector("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że kierowca zaparkował samochód.");
|
std::vector<std::string> tokenVector = hashGenerator.generateTokenVector("12.02.2014 o godzinie 17:40 doszło do kolizji na ulicy Grobla; policjanci ustalili, że kierowca zaparkował samochód.");
|
||||||
vector<string> expected;
|
std::vector<std::string> expected;
|
||||||
expected.push_back("ne_date");
|
expected.push_back("ne_date");
|
||||||
expected.push_back("godzinie");
|
expected.push_back("godzinie");
|
||||||
expected.push_back("ne_number");
|
expected.push_back("ne_number");
|
||||||
|
@ -2,8 +2,6 @@
|
|||||||
#include "concordia/interval.hpp"
|
#include "concordia/interval.hpp"
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE(interval)
|
BOOST_AUTO_TEST_SUITE(interval)
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( IntervalIntersects1 )
|
BOOST_AUTO_TEST_CASE( IntervalIntersects1 )
|
||||||
|
@ -6,8 +6,6 @@
|
|||||||
|
|
||||||
#define TMP_LOG_FILE "/tmp/concordia.log"
|
#define TMP_LOG_FILE "/tmp/concordia.log"
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE(logging)
|
BOOST_AUTO_TEST_SUITE(logging)
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( LoggingTest )
|
BOOST_AUTO_TEST_CASE( LoggingTest )
|
||||||
|
@ -5,8 +5,6 @@
|
|||||||
#include <boost/locale.hpp>
|
#include <boost/locale.hpp>
|
||||||
#include <boost/algorithm/string/case_conv.hpp>
|
#include <boost/algorithm/string/case_conv.hpp>
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE(regex_replacement)
|
BOOST_AUTO_TEST_SUITE(regex_replacement)
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( SimpleReplacement )
|
BOOST_AUTO_TEST_CASE( SimpleReplacement )
|
||||||
@ -18,7 +16,7 @@ BOOST_AUTO_TEST_CASE( SimpleReplacement )
|
|||||||
BOOST_AUTO_TEST_CASE( BadRegex )
|
BOOST_AUTO_TEST_CASE( BadRegex )
|
||||||
{
|
{
|
||||||
bool exceptionThrown = false;
|
bool exceptionThrown = false;
|
||||||
string message = "";
|
std::string message = "";
|
||||||
try {
|
try {
|
||||||
RegexReplacement rr("+a","b");
|
RegexReplacement rr("+a","b");
|
||||||
} catch (ConcordiaException & e) {
|
} catch (ConcordiaException & e) {
|
||||||
|
@ -8,9 +8,6 @@
|
|||||||
#include "concordia/sentence_anonymizer.hpp"
|
#include "concordia/sentence_anonymizer.hpp"
|
||||||
#include "tests/common/test_resources_manager.hpp"
|
#include "tests/common/test_resources_manager.hpp"
|
||||||
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE(sentence_anonymizer)
|
BOOST_AUTO_TEST_SUITE(sentence_anonymizer)
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( NETest )
|
BOOST_AUTO_TEST_CASE( NETest )
|
||||||
@ -19,7 +16,7 @@ BOOST_AUTO_TEST_CASE( NETest )
|
|||||||
SentenceAnonymizer anonymizer(config);
|
SentenceAnonymizer anonymizer(config);
|
||||||
|
|
||||||
|
|
||||||
string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34";
|
std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34";
|
||||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"date ne_date mail ne_email number ne_number");
|
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"date ne_date mail ne_email number ne_number");
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -29,7 +26,7 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
|
|||||||
SentenceAnonymizer anonymizer(config);
|
SentenceAnonymizer anonymizer(config);
|
||||||
|
|
||||||
|
|
||||||
string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
|
std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
|
||||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"link and bold and newline ");
|
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"link and bold and newline ");
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -40,7 +37,7 @@ BOOST_AUTO_TEST_CASE( StopWordsTest )
|
|||||||
SentenceAnonymizer anonymizer(config);
|
SentenceAnonymizer anonymizer(config);
|
||||||
|
|
||||||
|
|
||||||
string sentence = "Aczkolwiek nie wiem, czy to konieczne";
|
std::string sentence = "Aczkolwiek nie wiem, czy to konieczne";
|
||||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)," wiem konieczne");
|
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)," wiem konieczne");
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -51,7 +48,7 @@ BOOST_AUTO_TEST_CASE( StopSymbolsTest )
|
|||||||
SentenceAnonymizer anonymizer(config);
|
SentenceAnonymizer anonymizer(config);
|
||||||
|
|
||||||
|
|
||||||
string sentence = "xxx, . xxx # xx $xx@ xx";
|
std::string sentence = "xxx, . xxx # xx $xx@ xx";
|
||||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"xxx xxx xx xx xx");
|
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"xxx xxx xx xx xx");
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -62,7 +59,7 @@ BOOST_AUTO_TEST_CASE( SpaceSymbolsTest )
|
|||||||
SentenceAnonymizer anonymizer(config);
|
SentenceAnonymizer anonymizer(config);
|
||||||
|
|
||||||
|
|
||||||
string sentence = "xxx-xxx xx|xx";
|
std::string sentence = "xxx-xxx xx|xx";
|
||||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"xxx xxx xx xx");
|
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"xxx xxx xx xx");
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -72,7 +69,7 @@ BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
|
|||||||
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
||||||
SentenceAnonymizer anonymizer(config);
|
SentenceAnonymizer anonymizer(config);
|
||||||
|
|
||||||
string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
|
std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
|
||||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"sony dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap rm mne_numbergrm mne_numbere — ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number ");
|
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"sony dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap rm mne_numbergrm mne_numbere — ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number ");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -2,19 +2,17 @@
|
|||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
#include "concordia/common/text_utils.hpp"
|
#include "concordia/common/text_utils.hpp"
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE(text_utils)
|
BOOST_AUTO_TEST_SUITE(text_utils)
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( ToLower )
|
BOOST_AUTO_TEST_CASE( ToLower )
|
||||||
{
|
{
|
||||||
string str = "ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
|
std::string str = "ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
|
||||||
BOOST_CHECK_EQUAL(TextUtils::getInstance().toLowerCase(str),"zażółć gęślą jaźń");
|
BOOST_CHECK_EQUAL(TextUtils::getInstance().toLowerCase(str),"zażółć gęślą jaźń");
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( ToUpper )
|
BOOST_AUTO_TEST_CASE( ToUpper )
|
||||||
{
|
{
|
||||||
string str = "zażółć gęślą jaźń";
|
std::string str = "zażółć gęślą jaźń";
|
||||||
BOOST_CHECK_EQUAL(TextUtils::getInstance().toUpperCase(str),"ZAŻÓŁĆ GĘŚLĄ JAŹŃ");
|
BOOST_CHECK_EQUAL(TextUtils::getInstance().toUpperCase(str),"ZAŻÓŁĆ GĘŚLĄ JAŹŃ");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3,8 +3,6 @@
|
|||||||
#include "concordia/tm_matches.hpp"
|
#include "concordia/tm_matches.hpp"
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE(tm_matches)
|
BOOST_AUTO_TEST_SUITE(tm_matches)
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( TmMatchesSimpleScore1 )
|
BOOST_AUTO_TEST_CASE( TmMatchesSimpleScore1 )
|
||||||
|
@ -5,23 +5,20 @@
|
|||||||
#include <boost/filesystem.hpp>
|
#include <boost/filesystem.hpp>
|
||||||
|
|
||||||
#include "divsufsort.h"
|
#include "divsufsort.h"
|
||||||
#include <string>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE(utils)
|
BOOST_AUTO_TEST_SUITE(utils)
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( WriteReadSingleCharacter )
|
BOOST_AUTO_TEST_CASE( WriteReadSingleCharacter )
|
||||||
{
|
{
|
||||||
ofstream testFileOutput;
|
std::ofstream testFileOutput;
|
||||||
testFileOutput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),
|
testFileOutput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),
|
||||||
ios::out|ios::binary);
|
std::ios::out|std::ios::binary);
|
||||||
INDEX_CHARACTER_TYPE testCharacter = 123456789; //in hex: 75BCD15
|
INDEX_CHARACTER_TYPE testCharacter = 123456789; //in hex: 75BCD15
|
||||||
Utils::writeIndexCharacter(testFileOutput,testCharacter);
|
Utils::writeIndexCharacter(testFileOutput,testCharacter);
|
||||||
testFileOutput.close();
|
testFileOutput.close();
|
||||||
|
|
||||||
ifstream testFileInput;
|
std::ifstream testFileInput;
|
||||||
testFileInput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),ios::in|ios::binary);
|
testFileInput.open(TestResourcesManager::getTestFilePath("temp","temp_file.bin").c_str(),std::ios::in|std::ios::binary);
|
||||||
INDEX_CHARACTER_TYPE retrievedCharacter = Utils::readIndexCharacter(testFileInput);
|
INDEX_CHARACTER_TYPE retrievedCharacter = Utils::readIndexCharacter(testFileInput);
|
||||||
BOOST_CHECK_EQUAL(retrievedCharacter, testCharacter);
|
BOOST_CHECK_EQUAL(retrievedCharacter, testCharacter);
|
||||||
testFileInput.close();
|
testFileInput.close();
|
||||||
@ -31,7 +28,7 @@ BOOST_AUTO_TEST_CASE( WriteReadSingleCharacter )
|
|||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( IndexVectorToSaucharArray )
|
BOOST_AUTO_TEST_CASE( IndexVectorToSaucharArray )
|
||||||
{
|
{
|
||||||
vector<INDEX_CHARACTER_TYPE> hash;
|
std::vector<INDEX_CHARACTER_TYPE> hash;
|
||||||
hash.push_back(123456789); // in hex: 75BCD15
|
hash.push_back(123456789); // in hex: 75BCD15
|
||||||
// in memory: 15 cd 5b 07
|
// in memory: 15 cd 5b 07
|
||||||
// in memory DEC: 21 205 91 7
|
// in memory DEC: 21 205 91 7
|
||||||
@ -41,13 +38,13 @@ BOOST_AUTO_TEST_CASE( IndexVectorToSaucharArray )
|
|||||||
// in memory DEC: 177 104 222 58
|
// in memory DEC: 177 104 222 58
|
||||||
sauchar_t * dataArray = Utils::indexVectorToSaucharArray(hash);
|
sauchar_t * dataArray = Utils::indexVectorToSaucharArray(hash);
|
||||||
|
|
||||||
vector<INDEX_CHARACTER_TYPE> result;
|
std::vector<INDEX_CHARACTER_TYPE> result;
|
||||||
for (int i=0;i<8;i++) {
|
for (int i=0;i<8;i++) {
|
||||||
INDEX_CHARACTER_TYPE a = dataArray[i];
|
INDEX_CHARACTER_TYPE a = dataArray[i];
|
||||||
result.push_back(a);
|
result.push_back(a);
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<INDEX_CHARACTER_TYPE> expected;
|
std::vector<INDEX_CHARACTER_TYPE> expected;
|
||||||
expected.push_back(21);
|
expected.push_back(21);
|
||||||
expected.push_back(205);
|
expected.push_back(205);
|
||||||
expected.push_back(91);
|
expected.push_back(91);
|
||||||
@ -62,7 +59,7 @@ BOOST_AUTO_TEST_CASE( IndexVectorToSaucharArray )
|
|||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( IndexVectorToSaucharVector )
|
BOOST_AUTO_TEST_CASE( IndexVectorToSaucharVector )
|
||||||
{
|
{
|
||||||
vector<INDEX_CHARACTER_TYPE> hash;
|
std::vector<INDEX_CHARACTER_TYPE> hash;
|
||||||
hash.push_back(123456789); // in hex: 75BCD15
|
hash.push_back(123456789); // in hex: 75BCD15
|
||||||
// in memory: 15 cd 5b 07
|
// in memory: 15 cd 5b 07
|
||||||
// in memory DEC: 21 205 91 7
|
// in memory DEC: 21 205 91 7
|
||||||
@ -70,9 +67,9 @@ BOOST_AUTO_TEST_CASE( IndexVectorToSaucharVector )
|
|||||||
hash.push_back(987654321); // in hex: 3ADE68B1
|
hash.push_back(987654321); // in hex: 3ADE68B1
|
||||||
// in memory: b1 68 de 3a
|
// in memory: b1 68 de 3a
|
||||||
// in memory DEC: 177 104 222 58
|
// in memory DEC: 177 104 222 58
|
||||||
vector<sauchar_t> result = Utils::indexVectorToSaucharVector(hash);
|
std::vector<sauchar_t> result = Utils::indexVectorToSaucharVector(hash);
|
||||||
|
|
||||||
vector<sauchar_t> expected;
|
std::vector<sauchar_t> expected;
|
||||||
expected.push_back(21);
|
expected.push_back(21);
|
||||||
expected.push_back(205);
|
expected.push_back(205);
|
||||||
expected.push_back(91);
|
expected.push_back(91);
|
||||||
|
@ -1,10 +1,6 @@
|
|||||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||||
#include "concordia/word_map.hpp"
|
#include "concordia/word_map.hpp"
|
||||||
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE(word_map)
|
BOOST_AUTO_TEST_SUITE(word_map)
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( WordCodeTest )
|
BOOST_AUTO_TEST_CASE( WordCodeTest )
|
||||||
|
@ -54,7 +54,7 @@ void TmMatches::addPatternInterval(int start, int end) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool TmMatches::_alreadyIntersects(
|
bool TmMatches::_alreadyIntersects(
|
||||||
const vector<Interval> & intervalList,
|
const std::vector<Interval> & intervalList,
|
||||||
int start, int end) {
|
int start, int end) {
|
||||||
Interval tempInterval(start, end);
|
Interval tempInterval(start, end);
|
||||||
BOOST_FOREACH(Interval oldInterval, intervalList) {
|
BOOST_FOREACH(Interval oldInterval, intervalList) {
|
||||||
@ -66,7 +66,7 @@ bool TmMatches::_alreadyIntersects(
|
|||||||
}
|
}
|
||||||
|
|
||||||
double TmMatches::_getLogarithmicOverlay(
|
double TmMatches::_getLogarithmicOverlay(
|
||||||
const vector<Interval> & intervalList,
|
const std::vector<Interval> & intervalList,
|
||||||
SUFFIX_MARKER_TYPE sentenceSize,
|
SUFFIX_MARKER_TYPE sentenceSize,
|
||||||
double k) {
|
double k) {
|
||||||
double overlayScore = 0;
|
double overlayScore = 0;
|
||||||
|
@ -14,8 +14,6 @@
|
|||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
class TmMatches {
|
class TmMatches {
|
||||||
public:
|
public:
|
||||||
TmMatches();
|
TmMatches();
|
||||||
@ -32,11 +30,11 @@ public:
|
|||||||
return _score;
|
return _score;
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<Interval> getExampleIntervals() const {
|
std::vector<Interval> getExampleIntervals() const {
|
||||||
return _exampleMatchedRegions;
|
return _exampleMatchedRegions;
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<Interval> getPatternIntervals() const {
|
std::vector<Interval> getPatternIntervals() const {
|
||||||
return _patternMatchedRegions;
|
return _patternMatchedRegions;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -53,18 +51,18 @@ public:
|
|||||||
void addPatternInterval(int start, int end);
|
void addPatternInterval(int start, int end);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool _alreadyIntersects(const vector<Interval> & intervalList,
|
bool _alreadyIntersects(const std::vector<Interval> & intervalList,
|
||||||
int start, int end);
|
int start, int end);
|
||||||
|
|
||||||
double _getLogarithmicOverlay(const vector<Interval> & intervalList,
|
double _getLogarithmicOverlay(const std::vector<Interval> & intervalList,
|
||||||
SUFFIX_MARKER_TYPE sentenceSize,
|
SUFFIX_MARKER_TYPE sentenceSize,
|
||||||
double k);
|
double k);
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE _exampleId;
|
SUFFIX_MARKER_TYPE _exampleId;
|
||||||
|
|
||||||
vector<Interval> _exampleMatchedRegions;
|
std::vector<Interval> _exampleMatchedRegions;
|
||||||
|
|
||||||
vector<Interval> _patternMatchedRegions;
|
std::vector<Interval> _patternMatchedRegions;
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE _patternSize;
|
SUFFIX_MARKER_TYPE _patternSize;
|
||||||
|
|
||||||
|
@ -9,7 +9,7 @@ WordMap::WordMap() throw(ConcordiaException) {
|
|||||||
WordMap::~WordMap() {
|
WordMap::~WordMap() {
|
||||||
}
|
}
|
||||||
|
|
||||||
INDEX_CHARACTER_TYPE WordMap::getWordCode(const string & word)
|
INDEX_CHARACTER_TYPE WordMap::getWordCode(const std::string & word)
|
||||||
throw(ConcordiaException) {
|
throw(ConcordiaException) {
|
||||||
if (_map.find(word) == _map.end()) {
|
if (_map.find(word) == _map.end()) {
|
||||||
if (_nextFree == INDEX_CHARACTER_TYPE_MAX_VALUE) {
|
if (_nextFree == INDEX_CHARACTER_TYPE_MAX_VALUE) {
|
||||||
|
@ -14,8 +14,6 @@
|
|||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
class WordMap {
|
class WordMap {
|
||||||
public:
|
public:
|
||||||
explicit WordMap() throw(ConcordiaException);
|
explicit WordMap() throw(ConcordiaException);
|
||||||
@ -24,7 +22,7 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~WordMap();
|
virtual ~WordMap();
|
||||||
|
|
||||||
INDEX_CHARACTER_TYPE getWordCode(const string & word)
|
INDEX_CHARACTER_TYPE getWordCode(const std::string & word)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -37,7 +35,7 @@ private:
|
|||||||
ar & _nextFree;
|
ar & _nextFree;
|
||||||
}
|
}
|
||||||
|
|
||||||
map<string, INDEX_CHARACTER_TYPE> _map;
|
std::map<std::string, INDEX_CHARACTER_TYPE> _map;
|
||||||
|
|
||||||
INDEX_CHARACTER_TYPE _nextFree;
|
INDEX_CHARACTER_TYPE _nextFree;
|
||||||
};
|
};
|
||||||
|
@ -4,24 +4,24 @@
|
|||||||
#define CONCORDIA_TAGSET_DIRECTORY "concordia-tagset"
|
#define CONCORDIA_TAGSET_DIRECTORY "concordia-tagset"
|
||||||
#define CONCORDIA_CONFIG_DIRECTORY "concordia-config"
|
#define CONCORDIA_CONFIG_DIRECTORY "concordia-config"
|
||||||
|
|
||||||
string TestResourcesManager::getPuddleFilePath(const string & filename) {
|
std::string TestResourcesManager::getPuddleFilePath(const std::string & filename) {
|
||||||
string result = string(TEST_RESOURCES_DIRECTORY);
|
std::string result = std::string(TEST_RESOURCES_DIRECTORY);
|
||||||
return result + "/" + PUDDLE_TEST_DIRECTORY + "/" + filename;
|
return result + "/" + PUDDLE_TEST_DIRECTORY + "/" + filename;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
string TestResourcesManager::getTestConcordiaConfigFilePath(const string & filename) {
|
std::string TestResourcesManager::getTestConcordiaConfigFilePath(const std::string & filename) {
|
||||||
string result = string(TEST_RESOURCES_DIRECTORY);
|
std::string result = std::string(TEST_RESOURCES_DIRECTORY);
|
||||||
return result + "/" + CONCORDIA_CONFIG_DIRECTORY + "/" + filename;
|
return result + "/" + CONCORDIA_CONFIG_DIRECTORY + "/" + filename;
|
||||||
}
|
}
|
||||||
|
|
||||||
string TestResourcesManager::getProdConcordiaConfigFilePath(const string & filename) {
|
std::string TestResourcesManager::getProdConcordiaConfigFilePath(const std::string & filename) {
|
||||||
string result = string(PROD_RESOURCES_DIRECTORY);
|
std::string result = std::string(PROD_RESOURCES_DIRECTORY);
|
||||||
return result + "/" + CONCORDIA_CONFIG_DIRECTORY + "/" + filename;
|
return result + "/" + CONCORDIA_CONFIG_DIRECTORY + "/" + filename;
|
||||||
}
|
}
|
||||||
|
|
||||||
string TestResourcesManager::getTestFilePath(const string & module, const string & filename) {
|
std::string TestResourcesManager::getTestFilePath(const std::string & module, const std::string & filename) {
|
||||||
string result = string(TEST_RESOURCES_DIRECTORY);
|
std::string result = std::string(TEST_RESOURCES_DIRECTORY);
|
||||||
return result + "/" + module + "/" + filename;
|
return result + "/" + module + "/" + filename;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6,17 +6,15 @@
|
|||||||
|
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
class TestResourcesManager {
|
class TestResourcesManager {
|
||||||
public:
|
public:
|
||||||
static string getPuddleFilePath(const string & filename);
|
static std::string getPuddleFilePath(const std::string & filename);
|
||||||
|
|
||||||
static string getTestConcordiaConfigFilePath(const string & filename);
|
static std::string getTestConcordiaConfigFilePath(const std::string & filename);
|
||||||
|
|
||||||
static string getProdConcordiaConfigFilePath(const string & filename);
|
static std::string getProdConcordiaConfigFilePath(const std::string & filename);
|
||||||
|
|
||||||
static string getTestFilePath(const string & module, const string & filename);
|
static std::string getTestFilePath(const std::string & module, const std::string & filename);
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user