sentence anonymizer stub, regex replacement
Former-commit-id: edb1247f7b29fd62913114be84d3391507a0890e
This commit is contained in:
parent
5d56065e93
commit
13c97f572d
@ -74,7 +74,7 @@ endif(WITH_PCRE)
|
|||||||
set(Boost_USE_STATIC_LIBS OFF)
|
set(Boost_USE_STATIC_LIBS OFF)
|
||||||
set(Boost_USE_STATIC_RUNTIME OFF)
|
set(Boost_USE_STATIC_RUNTIME OFF)
|
||||||
find_package(Boost COMPONENTS
|
find_package(Boost COMPONENTS
|
||||||
serialization unit_test_framework system filesystem program_options iostreams REQUIRED)
|
serialization unit_test_framework system filesystem program_options iostreams regex REQUIRED)
|
||||||
|
|
||||||
# ----------------------------------------------------
|
# ----------------------------------------------------
|
||||||
# libconfig
|
# libconfig
|
||||||
|
@ -6,6 +6,8 @@ foreach(dir ${ALL_DIRECTORIES})
|
|||||||
endforeach(dir)
|
endforeach(dir)
|
||||||
|
|
||||||
add_library(concordia SHARED
|
add_library(concordia SHARED
|
||||||
|
regex_replacement.cpp
|
||||||
|
sentence_anonymizer.cpp
|
||||||
interval.cpp
|
interval.cpp
|
||||||
tm_matches.cpp
|
tm_matches.cpp
|
||||||
anubis_search_result.cpp
|
anubis_search_result.cpp
|
||||||
|
@ -100,20 +100,20 @@ void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) {
|
|||||||
throw ConcordiaException("Index corrupt: empty markers file");
|
throw ConcordiaException("Index corrupt: empty markers file");
|
||||||
}
|
}
|
||||||
// generating suffix array
|
// generating suffix array
|
||||||
_SA = _index->generateSuffixArray(_hashGenerator, _T);
|
_SA = _index->generateSuffixArray(_T);
|
||||||
} else {
|
} else {
|
||||||
throw ConcordiaException("Index corrupt: missing files");
|
throw ConcordiaException("Index corrupt: missing files");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Concordia::refreshSAfromRAM() throw(ConcordiaException) {
|
void Concordia::refreshSAfromRAM() throw(ConcordiaException) {
|
||||||
_SA = _index->generateSuffixArray(_hashGenerator, _T);
|
_SA = _index->generateSuffixArray(_T);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void Concordia::_initializeIndex() throw(ConcordiaException) {
|
void Concordia::_initializeIndex() throw(ConcordiaException) {
|
||||||
_hashGenerator = boost::shared_ptr<HashGenerator>(
|
_hashGenerator = boost::shared_ptr<HashGenerator>(
|
||||||
new HashGenerator(_config->getWordMapFilePath()));
|
new HashGenerator(_config));
|
||||||
_T = boost::shared_ptr<std::vector<sauchar_t> >(
|
_T = boost::shared_ptr<std::vector<sauchar_t> >(
|
||||||
new std::vector<sauchar_t>);
|
new std::vector<sauchar_t>);
|
||||||
_markers = boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> >(
|
_markers = boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> >(
|
||||||
|
@ -17,7 +17,6 @@ ConcordiaIndex::~ConcordiaIndex() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
boost::shared_ptr<vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
|
boost::shared_ptr<vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
|
||||||
boost::shared_ptr<vector<sauchar_t> > T) {
|
boost::shared_ptr<vector<sauchar_t> > T) {
|
||||||
saidx_t * SA_array = new saidx_t[T->size()];
|
saidx_t * SA_array = new saidx_t[T->size()];
|
||||||
if (divsufsort(T->data(), SA_array, (saidx_t) T->size()) != 0) {
|
if (divsufsort(T->data(), SA_array, (saidx_t) T->size()) != 0) {
|
||||||
|
@ -43,7 +43,6 @@ public:
|
|||||||
const boost::ptr_vector<Example > & examples);
|
const boost::ptr_vector<Example > & examples);
|
||||||
|
|
||||||
boost::shared_ptr<vector<saidx_t> > generateSuffixArray(
|
boost::shared_ptr<vector<saidx_t> > generateSuffixArray(
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
|
||||||
boost::shared_ptr<vector<sauchar_t> > T);
|
boost::shared_ptr<vector<sauchar_t> > T);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -5,10 +5,11 @@
|
|||||||
#include <boost/algorithm/string.hpp>
|
#include <boost/algorithm/string.hpp>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
|
|
||||||
HashGenerator::HashGenerator(const string & wordMapFilePath)
|
HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
|
||||||
throw(ConcordiaException) :
|
throw(ConcordiaException) :
|
||||||
_wordMapFilePath(wordMapFilePath),
|
_wordMapFilePath(config->getWordMapFilePath()),
|
||||||
_wordMap(boost::shared_ptr<WordMap>(new WordMap)) {
|
_wordMap(boost::shared_ptr<WordMap>(new WordMap)),
|
||||||
|
_sentenceAnonymizer(boost::shared_ptr<SentenceAnonymizer>(new SentenceAnonymizer(config))) {
|
||||||
if (boost::filesystem::exists(_wordMapFilePath)) {
|
if (boost::filesystem::exists(_wordMapFilePath)) {
|
||||||
ifstream ifs(_wordMapFilePath.c_str(), std::ios::binary);
|
ifstream ifs(_wordMapFilePath.c_str(), std::ios::binary);
|
||||||
boost::archive::binary_iarchive ia(ifs);
|
boost::archive::binary_iarchive ia(ifs);
|
||||||
|
@ -8,6 +8,8 @@
|
|||||||
#include <boost/algorithm/string/predicate.hpp>
|
#include <boost/algorithm/string/predicate.hpp>
|
||||||
#include "concordia/word_map.hpp"
|
#include "concordia/word_map.hpp"
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
|
#include "concordia/sentence_anonymizer.hpp"
|
||||||
|
#include "concordia/concordia_config.hpp"
|
||||||
#include "concordia/concordia_exception.hpp"
|
#include "concordia/concordia_exception.hpp"
|
||||||
|
|
||||||
|
|
||||||
@ -20,8 +22,8 @@ using namespace std;
|
|||||||
|
|
||||||
class HashGenerator {
|
class HashGenerator {
|
||||||
public:
|
public:
|
||||||
explicit HashGenerator(const string & wordMapFilePath)
|
explicit HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
@ -35,6 +37,8 @@ public:
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
boost::shared_ptr<WordMap> _wordMap;
|
boost::shared_ptr<WordMap> _wordMap;
|
||||||
|
|
||||||
|
boost::shared_ptr<SentenceAnonymizer> _sentenceAnonymizer;
|
||||||
|
|
||||||
string _wordMapFilePath;
|
string _wordMapFilePath;
|
||||||
};
|
};
|
||||||
|
31
concordia/regex_replacement.cpp
Normal file
31
concordia/regex_replacement.cpp
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
#include "concordia/regex_replacement.hpp"
|
||||||
|
#include <sstream>
|
||||||
|
#include <boost/exception/all.hpp>
|
||||||
|
#include <boost/throw_exception.hpp>
|
||||||
|
|
||||||
|
/*!
  Compiles patternString into the stored regular expression and keeps
  replacement as the text later substituted by apply().

  \param patternString regular expression source handed to boost::regex
  \param replacement   format string used for every match
  \throws ConcordiaException when compiling the pattern raises a
          std::exception; the message starts with "Bad regex pattern: ".
*/
RegexReplacement::RegexReplacement(string patternString, string replacement)
                                         throw(ConcordiaException):
                                         _replacement(replacement) {
    try {
        _pattern = boost::regex(patternString);
    } catch (const std::exception & compileError) {
        // Translate the library error into the project's exception type,
        // keeping the offending pattern and the original description.
        stringstream message;
        message << "Bad regex pattern: " << patternString;
        message << " Detailed info: " << compileError.what();

        // Append the extra text when the exception carries my_tag_error_info.
        std::string const * const extraInfo =
            boost::get_error_info<my_tag_error_info>(compileError);
        if (extraInfo != 0) {
            message << *extraInfo;
        }
        throw ConcordiaException(message.str());
    }
}
|
||||||
|
|
||||||
|
/*! Destructor. Performs no explicit cleanup. */
RegexReplacement::~RegexReplacement() {}
|
||||||
|
|
||||||
|
/*!
  Runs boost::regex_replace over text with the stored pattern and the
  stored replacement.

  \param text input string; it is not modified
  \returns the string produced by boost::regex_replace
*/
string RegexReplacement::apply(const string & text) {
    // Default matching rules, with all format-string extensions enabled
    // for the replacement text.
    const boost::match_flag_type flags =
        boost::match_default | boost::format_all;
    return boost::regex_replace(text, _pattern, _replacement, flags);
}
|
||||||
|
|
37
concordia/regex_replacement.hpp
Normal file
37
concordia/regex_replacement.hpp
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
#ifndef REGEX_REPLACEMENT_HDR
|
||||||
|
#define REGEX_REPLACEMENT_HDR
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include "concordia/common/config.hpp"
|
||||||
|
#include "concordia/concordia_exception.hpp"
|
||||||
|
#include <boost/shared_ptr.hpp>
|
||||||
|
#include <boost/regex.hpp>
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Class for replacing occurrences of a regular-expression pattern in a string.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
typedef boost::error_info<struct my_tag,std::string> my_tag_error_info;
|
||||||
|
|
||||||
|
class RegexReplacement {
|
||||||
|
public:
|
||||||
|
explicit RegexReplacement(string patternString, string replacement)
|
||||||
|
throw(ConcordiaException);
|
||||||
|
|
||||||
|
/*! Destructor.
|
||||||
|
*/
|
||||||
|
virtual ~RegexReplacement();
|
||||||
|
|
||||||
|
string apply(const string & text);
|
||||||
|
|
||||||
|
private:
|
||||||
|
boost::regex _pattern;
|
||||||
|
|
||||||
|
string _replacement;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
13
concordia/sentence_anonymizer.cpp
Normal file
13
concordia/sentence_anonymizer.cpp
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
#include "concordia/sentence_anonymizer.hpp"
|
||||||
|
|
||||||
|
/*!
  Constructor stub: accepts the configuration but does not read it yet.

  \param config shared Concordia configuration (currently unused)
*/
SentenceAnonymizer::SentenceAnonymizer(
                        boost::shared_ptr<ConcordiaConfig> config)
                                         throw(ConcordiaException) {
    // Intentionally empty.
}
|
||||||
|
|
||||||
|
/*! Destructor. Performs no explicit cleanup. */
SentenceAnonymizer::~SentenceAnonymizer() {}
|
||||||
|
|
||||||
|
/*!
  Stub of the anonymization step.

  The previous body was empty, so control flowed off the end of a function
  returning string, which is undefined behaviour for any caller. Until the
  anonymization rules are implemented, the sentence is returned unchanged.

  \param sentence sentence to anonymize
  \returns a copy of sentence (no anonymization is applied yet)
*/
string SentenceAnonymizer::anonymize(const string & sentence) {
    // TODO: apply the configured replacements once they exist.
    return sentence;
}
|
||||||
|
|
32
concordia/sentence_anonymizer.hpp
Normal file
32
concordia/sentence_anonymizer.hpp
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
#ifndef SENTENCE_ANONYMIZER_HDR
|
||||||
|
#define SENTENCE_ANONYMIZER_HDR
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include "concordia/common/config.hpp"
|
||||||
|
#include "concordia/concordia_config.hpp"
|
||||||
|
#include "concordia/concordia_exception.hpp"
|
||||||
|
#include <boost/shared_ptr.hpp>
|
||||||
|
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Class for anonymizing a sentence before it is added to the index.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
class SentenceAnonymizer {
|
||||||
|
public:
|
||||||
|
explicit SentenceAnonymizer(boost::shared_ptr<ConcordiaConfig> config)
|
||||||
|
throw(ConcordiaException);
|
||||||
|
|
||||||
|
/*! Destructor.
|
||||||
|
*/
|
||||||
|
virtual ~SentenceAnonymizer();
|
||||||
|
|
||||||
|
string anonymize(const string & sentence);
|
||||||
|
|
||||||
|
private:
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
@ -1,4 +1,5 @@
|
|||||||
add_library(concordia-tests
|
add_library(concordia-tests
|
||||||
|
test_regex_replacement.cpp
|
||||||
test_example.cpp
|
test_example.cpp
|
||||||
test_tm_matches.cpp
|
test_tm_matches.cpp
|
||||||
test_interval.cpp
|
test_interval.cpp
|
||||||
|
@ -15,10 +15,10 @@ BOOST_AUTO_TEST_CASE( ConfigParameters )
|
|||||||
{
|
{
|
||||||
ConcordiaConfig config(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-mock.cfg"));
|
ConcordiaConfig config(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-mock.cfg"));
|
||||||
BOOST_CHECK_EQUAL( config.getPuddleTagsetFilePath() , "puddle/tagset.txt" );
|
BOOST_CHECK_EQUAL( config.getPuddleTagsetFilePath() , "puddle/tagset.txt" );
|
||||||
BOOST_CHECK_EQUAL( config.getWordMapFilePath() , "tmp/wm.bin" );
|
BOOST_CHECK_EQUAL( config.getWordMapFilePath() , "/tmp/wm.bin" );
|
||||||
BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "tmp/hi.bin" );
|
BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "/tmp/hi.bin" );
|
||||||
BOOST_CHECK_EQUAL( config.getMarkersFilePath() , "tmp/ma.bin" );
|
BOOST_CHECK_EQUAL( config.getMarkersFilePath() , "/tmp/ma.bin" );
|
||||||
BOOST_CHECK_EQUAL( config.getSuffixArrayFilePath() , "tmp/sa.bin" );
|
BOOST_CHECK_EQUAL( config.getSuffixArrayFilePath() , "/tmp/sa.bin" );
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( NonexistentConfigTest )
|
BOOST_AUTO_TEST_CASE( NonexistentConfigTest )
|
||||||
|
@ -14,8 +14,6 @@ BOOST_AUTO_TEST_SUITE(concordia_index)
|
|||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
|
BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
|
||||||
{
|
{
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator (new HashGenerator("nonexistent"));
|
|
||||||
|
|
||||||
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
|
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
|
||||||
TestResourcesManager::getTestFilePath("temp","test_markers.bin"));
|
TestResourcesManager::getTestFilePath("temp","test_markers.bin"));
|
||||||
boost::shared_ptr<vector<sauchar_t> > T = boost::shared_ptr<vector<sauchar_t> >(new vector<sauchar_t>());
|
boost::shared_ptr<vector<sauchar_t> > T = boost::shared_ptr<vector<sauchar_t> >(new vector<sauchar_t>());
|
||||||
@ -36,7 +34,7 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
|
|||||||
// n: 0 1 2 3 4 5 6 7 8
|
// n: 0 1 2 3 4 5 6 7 8
|
||||||
//SA[n]: 0 3 1 7 4 2 8 5 6
|
//SA[n]: 0 3 1 7 4 2 8 5 6
|
||||||
|
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(hashGenerator, T);
|
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(T);
|
||||||
|
|
||||||
boost::shared_ptr<vector<saidx_t> > expectedSA = boost::shared_ptr<vector<saidx_t> >(new vector<saidx_t>());
|
boost::shared_ptr<vector<saidx_t> > expectedSA = boost::shared_ptr<vector<saidx_t> >(new vector<saidx_t>());
|
||||||
expectedSA->push_back(0);
|
expectedSA->push_back(0);
|
||||||
@ -53,8 +51,6 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
|
|||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest2 )
|
BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest2 )
|
||||||
{
|
{
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator (new HashGenerator("nonexistent"));
|
|
||||||
|
|
||||||
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
|
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
|
||||||
TestResourcesManager::getTestFilePath("temp","test_markers.bin"));
|
TestResourcesManager::getTestFilePath("temp","test_markers.bin"));
|
||||||
boost::shared_ptr<vector<sauchar_t> > T = boost::shared_ptr<vector<sauchar_t> >(new vector<sauchar_t>());
|
boost::shared_ptr<vector<sauchar_t> > T = boost::shared_ptr<vector<sauchar_t> >(new vector<sauchar_t>());
|
||||||
@ -79,7 +75,7 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest2 )
|
|||||||
// n: 0 1 2 3 4 5 6 7 8 9 10 11
|
// n: 0 1 2 3 4 5 6 7 8 9 10 11
|
||||||
//SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
|
//SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
|
||||||
|
|
||||||
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(hashGenerator, T);
|
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(T);
|
||||||
|
|
||||||
boost::shared_ptr<vector<saidx_t> > expectedSA = boost::shared_ptr<vector<saidx_t> >(new vector<saidx_t>());
|
boost::shared_ptr<vector<saidx_t> > expectedSA = boost::shared_ptr<vector<saidx_t> >(new vector<saidx_t>());
|
||||||
expectedSA->push_back(0);
|
expectedSA->push_back(0);
|
||||||
|
@ -3,10 +3,11 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
|
|
||||||
|
#include <boost/shared_ptr.hpp>
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
#include "concordia/hash_generator.hpp"
|
#include "concordia/hash_generator.hpp"
|
||||||
|
#include "tests/common/test_resources_manager.hpp"
|
||||||
|
|
||||||
#define TEST_WORD_MAP_PATH "/tmp/test_word_map.bin"
|
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
@ -14,11 +15,13 @@ BOOST_AUTO_TEST_SUITE(hash_generator)
|
|||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( SimpleHashTest )
|
BOOST_AUTO_TEST_CASE( SimpleHashTest )
|
||||||
{
|
{
|
||||||
if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) {
|
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-mock.cfg")));
|
||||||
boost::filesystem::remove(TEST_WORD_MAP_PATH);
|
|
||||||
|
if (boost::filesystem::exists(config->getWordMapFilePath())) {
|
||||||
|
boost::filesystem::remove(config->getWordMapFilePath());
|
||||||
}
|
}
|
||||||
|
|
||||||
HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH);
|
HashGenerator hashGenerator = HashGenerator(config);
|
||||||
|
|
||||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash = hashGenerator.generateHash("Ala ma kota");
|
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash = hashGenerator.generateHash("Ala ma kota");
|
||||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected(new vector<INDEX_CHARACTER_TYPE>());
|
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected(new vector<INDEX_CHARACTER_TYPE>());
|
||||||
@ -31,11 +34,13 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
|
|||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( TooLongHashTest )
|
BOOST_AUTO_TEST_CASE( TooLongHashTest )
|
||||||
{
|
{
|
||||||
if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) {
|
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-mock.cfg")));
|
||||||
boost::filesystem::remove(TEST_WORD_MAP_PATH);
|
|
||||||
|
if (boost::filesystem::exists(config->getWordMapFilePath())) {
|
||||||
|
boost::filesystem::remove(config->getWordMapFilePath());
|
||||||
}
|
}
|
||||||
|
|
||||||
HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH);
|
HashGenerator hashGenerator = HashGenerator(config);
|
||||||
|
|
||||||
stringstream ss;
|
stringstream ss;
|
||||||
for (int i=0;i<256;i++) {
|
for (int i=0;i<256;i++) {
|
||||||
@ -60,11 +65,14 @@ BOOST_AUTO_TEST_CASE( TooLongHashTest )
|
|||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
||||||
{
|
{
|
||||||
if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) {
|
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-mock.cfg")));
|
||||||
boost::filesystem::remove(TEST_WORD_MAP_PATH);
|
|
||||||
|
if (boost::filesystem::exists(config->getWordMapFilePath())) {
|
||||||
|
boost::filesystem::remove(config->getWordMapFilePath());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
HashGenerator hashGenerator1 = HashGenerator(config);
|
||||||
|
|
||||||
HashGenerator hashGenerator1 = HashGenerator(TEST_WORD_MAP_PATH);
|
|
||||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash1 = hashGenerator1.generateHash("Ala ma kota");
|
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash1 = hashGenerator1.generateHash("Ala ma kota");
|
||||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected1(new vector<INDEX_CHARACTER_TYPE>());
|
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected1(new vector<INDEX_CHARACTER_TYPE>());
|
||||||
expected1->push_back(0);
|
expected1->push_back(0);
|
||||||
@ -74,7 +82,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
|||||||
|
|
||||||
hashGenerator1.serializeWordMap();
|
hashGenerator1.serializeWordMap();
|
||||||
|
|
||||||
HashGenerator hashGenerator2 = HashGenerator(TEST_WORD_MAP_PATH);
|
HashGenerator hashGenerator2 = HashGenerator(config);
|
||||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash2 = hashGenerator2.generateHash("Ala ma psa");
|
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash2 = hashGenerator2.generateHash("Ala ma psa");
|
||||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected2(new vector<INDEX_CHARACTER_TYPE>());
|
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected2(new vector<INDEX_CHARACTER_TYPE>());
|
||||||
expected2->push_back(0);
|
expected2->push_back(0);
|
||||||
@ -82,7 +90,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
|||||||
expected2->push_back(3);
|
expected2->push_back(3);
|
||||||
BOOST_CHECK_EQUAL_COLLECTIONS(hash2->begin(), hash2->end(), expected2->begin(), expected2->end());
|
BOOST_CHECK_EQUAL_COLLECTIONS(hash2->begin(), hash2->end(), expected2->begin(), expected2->end());
|
||||||
|
|
||||||
boost::filesystem::remove(TEST_WORD_MAP_PATH);
|
boost::filesystem::remove(config->getWordMapFilePath());
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE_END()
|
BOOST_AUTO_TEST_SUITE_END()
|
||||||
|
42
concordia/t/test_regex_replacement.cpp
Normal file
42
concordia/t/test_regex_replacement.cpp
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||||
|
#include "concordia/regex_replacement.hpp"
|
||||||
|
#include "concordia/common/config.hpp"
|
||||||
|
#include <boost/algorithm/string/predicate.hpp>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_SUITE(regex_replacement)
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( SimpleReplacement )
{
    // Every literal "a" is expected to become "b"; all other characters
    // must come through untouched.
    RegexReplacement aToB("a", "b");
    const string input = "xxxxxxxaxxxaxxaxaxa";
    const string expected = "xxxxxxxbxxxbxxbxbxb";
    BOOST_CHECK_EQUAL(aToB.apply(input), expected);
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( BadRegex )
{
    // "+a" starts with a quantifier that has nothing to repeat, so the
    // constructor is expected to reject it with a ConcordiaException.
    bool constructionFailed = false;
    string errorText = "";
    try {
        RegexReplacement invalid("+a", "b");
    } catch (ConcordiaException & error) {
        errorText = error.what();
        constructionFailed = true;
    }

    BOOST_CHECK_EQUAL(constructionFailed, true);

    // The message must begin with the prefix written by the constructor.
    const bool hasExpectedPrefix =
        boost::starts_with(errorText, "Bad regex pattern");
    BOOST_CHECK_EQUAL(hasExpectedPrefix, true);
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
{
    // The character class matches an apostrophe, a double quote, a
    // backslash or a dot; each match is replaced with the empty string.
    RegexReplacement rr("['\"\\\\.]","");

    // Deleting the backslash from "believin' \ Hold" leaves the spaces on
    // both sides of it, so the expected text has TWO spaces between
    // "believin" and "Hold".
    BOOST_CHECK_EQUAL(
        rr.apply("Don't stop believin' \\ Hold on to the feelin'."),
        "Dont stop believin  Hold on to the feelin");
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( BackrefReplacement )
{
    // "\1" in the replacement refers to the first capture group, i.e. the
    // run of digits that was matched.
    RegexReplacement labelNumbers("(\\d+)", "the number: \\1");
    const string input = "This is 12 and this is 812.";
    const string expected =
        "This is the number: 12 and this is the number: 812.";
    BOOST_CHECK_EQUAL(labelNumbers.apply(input), expected);
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_SUITE_END()
|
@ -6,12 +6,12 @@
|
|||||||
#Path to the Puddle tagset
|
#Path to the Puddle tagset
|
||||||
puddle_tagset_path = "puddle/tagset.txt";
|
puddle_tagset_path = "puddle/tagset.txt";
|
||||||
|
|
||||||
word_map_path = "tmp/wm.bin"
|
word_map_path = "/tmp/wm.bin"
|
||||||
|
|
||||||
hashed_index_path = "tmp/hi.bin"
|
hashed_index_path = "/tmp/hi.bin"
|
||||||
|
|
||||||
markers_path = "tmp/ma.bin"
|
markers_path = "/tmp/ma.bin"
|
||||||
|
|
||||||
suffix_array_path = "tmp/sa.bin"
|
suffix_array_path = "/tmp/sa.bin"
|
||||||
|
|
||||||
### eof
|
### eof
|
||||||
|
Loading…
Reference in New Issue
Block a user