sentence anonymizer stub, regex replacement
Former-commit-id: edb1247f7b29fd62913114be84d3391507a0890e
This commit is contained in:
parent
5d56065e93
commit
13c97f572d
@ -74,7 +74,7 @@ endif(WITH_PCRE)
|
||||
set(Boost_USE_STATIC_LIBS OFF)
|
||||
set(Boost_USE_STATIC_RUNTIME OFF)
|
||||
find_package(Boost COMPONENTS
|
||||
serialization unit_test_framework system filesystem program_options iostreams REQUIRED)
|
||||
serialization unit_test_framework system filesystem program_options iostreams regex REQUIRED)
|
||||
|
||||
# ----------------------------------------------------
|
||||
# libconfig
|
||||
|
@ -6,6 +6,8 @@ foreach(dir ${ALL_DIRECTORIES})
|
||||
endforeach(dir)
|
||||
|
||||
add_library(concordia SHARED
|
||||
regex_replacement.cpp
|
||||
sentence_anonymizer.cpp
|
||||
interval.cpp
|
||||
tm_matches.cpp
|
||||
anubis_search_result.cpp
|
||||
|
@ -100,20 +100,20 @@ void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) {
|
||||
throw ConcordiaException("Index corrupt: empty markers file");
|
||||
}
|
||||
// generating suffix array
|
||||
_SA = _index->generateSuffixArray(_hashGenerator, _T);
|
||||
_SA = _index->generateSuffixArray(_T);
|
||||
} else {
|
||||
throw ConcordiaException("Index corrupt: missing files");
|
||||
}
|
||||
}
|
||||
|
||||
void Concordia::refreshSAfromRAM() throw(ConcordiaException) {
|
||||
_SA = _index->generateSuffixArray(_hashGenerator, _T);
|
||||
_SA = _index->generateSuffixArray(_T);
|
||||
}
|
||||
|
||||
|
||||
void Concordia::_initializeIndex() throw(ConcordiaException) {
|
||||
_hashGenerator = boost::shared_ptr<HashGenerator>(
|
||||
new HashGenerator(_config->getWordMapFilePath()));
|
||||
new HashGenerator(_config));
|
||||
_T = boost::shared_ptr<std::vector<sauchar_t> >(
|
||||
new std::vector<sauchar_t>);
|
||||
_markers = boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> >(
|
||||
|
@ -17,7 +17,6 @@ ConcordiaIndex::~ConcordiaIndex() {
|
||||
}
|
||||
|
||||
boost::shared_ptr<vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<vector<sauchar_t> > T) {
|
||||
saidx_t * SA_array = new saidx_t[T->size()];
|
||||
if (divsufsort(T->data(), SA_array, (saidx_t) T->size()) != 0) {
|
||||
|
@ -43,7 +43,6 @@ public:
|
||||
const boost::ptr_vector<Example > & examples);
|
||||
|
||||
boost::shared_ptr<vector<saidx_t> > generateSuffixArray(
|
||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||
boost::shared_ptr<vector<sauchar_t> > T);
|
||||
|
||||
private:
|
||||
|
@ -5,10 +5,11 @@
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <fstream>
|
||||
|
||||
HashGenerator::HashGenerator(const string & wordMapFilePath)
|
||||
HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
|
||||
throw(ConcordiaException) :
|
||||
_wordMapFilePath(wordMapFilePath),
|
||||
_wordMap(boost::shared_ptr<WordMap>(new WordMap)) {
|
||||
_wordMapFilePath(config->getWordMapFilePath()),
|
||||
_wordMap(boost::shared_ptr<WordMap>(new WordMap)),
|
||||
_sentenceAnonymizer(boost::shared_ptr<SentenceAnonymizer>(new SentenceAnonymizer(config))) {
|
||||
if (boost::filesystem::exists(_wordMapFilePath)) {
|
||||
ifstream ifs(_wordMapFilePath.c_str(), std::ios::binary);
|
||||
boost::archive::binary_iarchive ia(ifs);
|
||||
|
@ -8,6 +8,8 @@
|
||||
#include <boost/algorithm/string/predicate.hpp>
|
||||
#include "concordia/word_map.hpp"
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/sentence_anonymizer.hpp"
|
||||
#include "concordia/concordia_config.hpp"
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
|
||||
|
||||
@ -20,8 +22,8 @@ using namespace std;
|
||||
|
||||
class HashGenerator {
|
||||
public:
|
||||
explicit HashGenerator(const string & wordMapFilePath)
|
||||
throw(ConcordiaException);
|
||||
explicit HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*! Destructor.
|
||||
*/
|
||||
@ -35,6 +37,8 @@ public:
|
||||
|
||||
private:
|
||||
boost::shared_ptr<WordMap> _wordMap;
|
||||
|
||||
boost::shared_ptr<SentenceAnonymizer> _sentenceAnonymizer;
|
||||
|
||||
string _wordMapFilePath;
|
||||
};
|
||||
|
31
concordia/regex_replacement.cpp
Normal file
31
concordia/regex_replacement.cpp
Normal file
@ -0,0 +1,31 @@
|
||||
#include "concordia/regex_replacement.hpp"
|
||||
#include <sstream>
|
||||
#include <boost/exception/all.hpp>
|
||||
#include <boost/throw_exception.hpp>
|
||||
|
||||
/*!
  Constructor. Stores the replacement text and compiles the pattern.

  \param patternString regular expression source, compiled into a boost::regex
  \param replacement text later passed to boost::regex_replace as the format
  \throws ConcordiaException when compiling the pattern raises any
          std::exception; the message starts with "Bad regex pattern: "
*/
RegexReplacement::RegexReplacement(string patternString, string replacement)
                                         throw(ConcordiaException):
                                         _replacement(replacement) {
    try {
        _pattern = boost::regex(patternString);
    } catch (const std::exception & error) {
        // Translate the library failure into the project's exception type,
        // keeping the offending pattern and the library's own description.
        stringstream message;
        message << "Bad regex pattern: " << patternString;
        message << " Detailed info: " << error.what();

        // Append the optional boost error_info payload when one is attached.
        std::string const * const extraInfo =
            boost::get_error_info<my_tag_error_info>(error);
        if (extraInfo != 0) {
            message << *extraInfo;
        }

        throw ConcordiaException(message.str());
    }
}
|
||||
|
||||
/*!
  Destructor. The body is empty; members are released by their own destructors.
*/
RegexReplacement::~RegexReplacement() {}
|
||||
|
||||
/*!
  Applies the stored pattern and replacement to a text.

  \param text input text; it is not modified
  \returns the result of boost::regex_replace on text, called with
           boost::match_default | boost::format_all
*/
string RegexReplacement::apply(const string & text) {
    const boost::match_flag_type flags =
        boost::match_default | boost::format_all;
    return boost::regex_replace(text, _pattern, _replacement, flags);
}
|
||||
|
37
concordia/regex_replacement.hpp
Normal file
37
concordia/regex_replacement.hpp
Normal file
@ -0,0 +1,37 @@
|
||||
#ifndef REGEX_REPLACEMENT_HDR
|
||||
#define REGEX_REPLACEMENT_HDR
|
||||
|
||||
#include <string>
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/regex.hpp>
|
||||
|
||||
|
||||
/*!
|
||||
Class for replacing occurrences of a regular-expression pattern in a string.
|
||||
|
||||
*/
|
||||
|
||||
using namespace std;
|
||||
|
||||
typedef boost::error_info<struct my_tag,std::string> my_tag_error_info;
|
||||
|
||||
class RegexReplacement {
|
||||
public:
|
||||
explicit RegexReplacement(string patternString, string replacement)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*! Destructor.
|
||||
*/
|
||||
virtual ~RegexReplacement();
|
||||
|
||||
string apply(const string & text);
|
||||
|
||||
private:
|
||||
boost::regex _pattern;
|
||||
|
||||
string _replacement;
|
||||
};
|
||||
|
||||
#endif
|
13
concordia/sentence_anonymizer.cpp
Normal file
13
concordia/sentence_anonymizer.cpp
Normal file
@ -0,0 +1,13 @@
|
||||
#include "concordia/sentence_anonymizer.hpp"
|
||||
|
||||
/*!
  Constructor. Stub: the body is empty, so the configuration is accepted but
  not read.

  \param config Concordia configuration (currently unused)
*/
SentenceAnonymizer::SentenceAnonymizer(
                                boost::shared_ptr<ConcordiaConfig> config)
                                         throw(ConcordiaException) {
    // intentionally empty
}
|
||||
|
||||
/*!
  Destructor. The body is empty.
*/
SentenceAnonymizer::~SentenceAnonymizer() {}
|
||||
|
||||
/*!
  Anonymizes a sentence.

  Stub: no anonymization is performed, so the sentence is returned unchanged.
  The explicit return is required, because flowing off the end of a function
  that returns a value is undefined behaviour.

  \param sentence sentence to anonymize
  \returns a copy of the input sentence
*/
string SentenceAnonymizer::anonymize(const string & sentence) {
    return sentence;
}
|
||||
|
32
concordia/sentence_anonymizer.hpp
Normal file
32
concordia/sentence_anonymizer.hpp
Normal file
@ -0,0 +1,32 @@
|
||||
#ifndef SENTENCE_ANONYMIZER_HDR
|
||||
#define SENTENCE_ANONYMIZER_HDR
|
||||
|
||||
#include <string>
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/concordia_config.hpp"
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
#include <boost/shared_ptr.hpp>
|
||||
|
||||
|
||||
/*!
|
||||
Class for anonymizing a sentence before it is added to the index.
|
||||
|
||||
*/
|
||||
|
||||
using namespace std;
|
||||
|
||||
class SentenceAnonymizer {
|
||||
public:
|
||||
explicit SentenceAnonymizer(boost::shared_ptr<ConcordiaConfig> config)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*! Destructor.
|
||||
*/
|
||||
virtual ~SentenceAnonymizer();
|
||||
|
||||
string anonymize(const string & sentence);
|
||||
|
||||
private:
|
||||
};
|
||||
|
||||
#endif
|
@ -1,4 +1,5 @@
|
||||
add_library(concordia-tests
|
||||
test_regex_replacement.cpp
|
||||
test_example.cpp
|
||||
test_tm_matches.cpp
|
||||
test_interval.cpp
|
||||
|
@ -15,10 +15,10 @@ BOOST_AUTO_TEST_CASE( ConfigParameters )
|
||||
{
|
||||
ConcordiaConfig config(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-mock.cfg"));
|
||||
BOOST_CHECK_EQUAL( config.getPuddleTagsetFilePath() , "puddle/tagset.txt" );
|
||||
BOOST_CHECK_EQUAL( config.getWordMapFilePath() , "tmp/wm.bin" );
|
||||
BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "tmp/hi.bin" );
|
||||
BOOST_CHECK_EQUAL( config.getMarkersFilePath() , "tmp/ma.bin" );
|
||||
BOOST_CHECK_EQUAL( config.getSuffixArrayFilePath() , "tmp/sa.bin" );
|
||||
BOOST_CHECK_EQUAL( config.getWordMapFilePath() , "/tmp/wm.bin" );
|
||||
BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "/tmp/hi.bin" );
|
||||
BOOST_CHECK_EQUAL( config.getMarkersFilePath() , "/tmp/ma.bin" );
|
||||
BOOST_CHECK_EQUAL( config.getSuffixArrayFilePath() , "/tmp/sa.bin" );
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( NonexistentConfigTest )
|
||||
|
@ -14,8 +14,6 @@ BOOST_AUTO_TEST_SUITE(concordia_index)
|
||||
|
||||
BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
|
||||
{
|
||||
boost::shared_ptr<HashGenerator> hashGenerator (new HashGenerator("nonexistent"));
|
||||
|
||||
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
|
||||
TestResourcesManager::getTestFilePath("temp","test_markers.bin"));
|
||||
boost::shared_ptr<vector<sauchar_t> > T = boost::shared_ptr<vector<sauchar_t> >(new vector<sauchar_t>());
|
||||
@ -36,7 +34,7 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
|
||||
// n: 0 1 2 3 4 5 6 7 8
|
||||
//SA[n]: 0 3 1 7 4 2 8 5 6
|
||||
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(hashGenerator, T);
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(T);
|
||||
|
||||
boost::shared_ptr<vector<saidx_t> > expectedSA = boost::shared_ptr<vector<saidx_t> >(new vector<saidx_t>());
|
||||
expectedSA->push_back(0);
|
||||
@ -53,8 +51,6 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
|
||||
|
||||
BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest2 )
|
||||
{
|
||||
boost::shared_ptr<HashGenerator> hashGenerator (new HashGenerator("nonexistent"));
|
||||
|
||||
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
|
||||
TestResourcesManager::getTestFilePath("temp","test_markers.bin"));
|
||||
boost::shared_ptr<vector<sauchar_t> > T = boost::shared_ptr<vector<sauchar_t> >(new vector<sauchar_t>());
|
||||
@ -79,7 +75,7 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest2 )
|
||||
// n: 0 1 2 3 4 5 6 7 8 9 10 11
|
||||
//SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
|
||||
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(hashGenerator, T);
|
||||
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(T);
|
||||
|
||||
boost::shared_ptr<vector<saidx_t> > expectedSA = boost::shared_ptr<vector<saidx_t> >(new vector<saidx_t>());
|
||||
expectedSA->push_back(0);
|
||||
|
@ -3,10 +3,11 @@
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/hash_generator.hpp"
|
||||
#include "tests/common/test_resources_manager.hpp"
|
||||
|
||||
#define TEST_WORD_MAP_PATH "/tmp/test_word_map.bin"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -14,11 +15,13 @@ BOOST_AUTO_TEST_SUITE(hash_generator)
|
||||
|
||||
BOOST_AUTO_TEST_CASE( SimpleHashTest )
|
||||
{
|
||||
if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) {
|
||||
boost::filesystem::remove(TEST_WORD_MAP_PATH);
|
||||
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-mock.cfg")));
|
||||
|
||||
if (boost::filesystem::exists(config->getWordMapFilePath())) {
|
||||
boost::filesystem::remove(config->getWordMapFilePath());
|
||||
}
|
||||
|
||||
HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH);
|
||||
HashGenerator hashGenerator = HashGenerator(config);
|
||||
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash = hashGenerator.generateHash("Ala ma kota");
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected(new vector<INDEX_CHARACTER_TYPE>());
|
||||
@ -31,11 +34,13 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
|
||||
|
||||
BOOST_AUTO_TEST_CASE( TooLongHashTest )
|
||||
{
|
||||
if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) {
|
||||
boost::filesystem::remove(TEST_WORD_MAP_PATH);
|
||||
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-mock.cfg")));
|
||||
|
||||
if (boost::filesystem::exists(config->getWordMapFilePath())) {
|
||||
boost::filesystem::remove(config->getWordMapFilePath());
|
||||
}
|
||||
|
||||
HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH);
|
||||
HashGenerator hashGenerator = HashGenerator(config);
|
||||
|
||||
stringstream ss;
|
||||
for (int i=0;i<256;i++) {
|
||||
@ -60,11 +65,14 @@ BOOST_AUTO_TEST_CASE( TooLongHashTest )
|
||||
|
||||
BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
||||
{
|
||||
if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) {
|
||||
boost::filesystem::remove(TEST_WORD_MAP_PATH);
|
||||
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-mock.cfg")));
|
||||
|
||||
if (boost::filesystem::exists(config->getWordMapFilePath())) {
|
||||
boost::filesystem::remove(config->getWordMapFilePath());
|
||||
}
|
||||
|
||||
HashGenerator hashGenerator1 = HashGenerator(config);
|
||||
|
||||
HashGenerator hashGenerator1 = HashGenerator(TEST_WORD_MAP_PATH);
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash1 = hashGenerator1.generateHash("Ala ma kota");
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected1(new vector<INDEX_CHARACTER_TYPE>());
|
||||
expected1->push_back(0);
|
||||
@ -74,7 +82,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
||||
|
||||
hashGenerator1.serializeWordMap();
|
||||
|
||||
HashGenerator hashGenerator2 = HashGenerator(TEST_WORD_MAP_PATH);
|
||||
HashGenerator hashGenerator2 = HashGenerator(config);
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash2 = hashGenerator2.generateHash("Ala ma psa");
|
||||
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected2(new vector<INDEX_CHARACTER_TYPE>());
|
||||
expected2->push_back(0);
|
||||
@ -82,7 +90,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
|
||||
expected2->push_back(3);
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(hash2->begin(), hash2->end(), expected2->begin(), expected2->end());
|
||||
|
||||
boost::filesystem::remove(TEST_WORD_MAP_PATH);
|
||||
boost::filesystem::remove(config->getWordMapFilePath());
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
|
42
concordia/t/test_regex_replacement.cpp
Normal file
42
concordia/t/test_regex_replacement.cpp
Normal file
@ -0,0 +1,42 @@
|
||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||
#include "concordia/regex_replacement.hpp"
|
||||
#include "concordia/common/config.hpp"
|
||||
#include <boost/algorithm/string/predicate.hpp>
|
||||
|
||||
using namespace std;
|
||||
|
||||
BOOST_AUTO_TEST_SUITE(regex_replacement)
|
||||
|
||||
// Every match of the literal pattern "a" is expected to be rewritten to "b".
BOOST_AUTO_TEST_CASE( SimpleReplacement )
{
    RegexReplacement rr("a","b");

    const string input = "xxxxxxxaxxxaxxaxaxa";
    const string expected = "xxxxxxxbxxxbxxbxbxb";

    BOOST_CHECK_EQUAL(rr.apply(input), expected);
}
|
||||
|
||||
// Constructing a RegexReplacement from the pattern "+a" is expected to raise
// a ConcordiaException whose message starts with "Bad regex pattern".
BOOST_AUTO_TEST_CASE( BadRegex )
{
    string caughtMessage;
    bool caught = false;

    try {
        RegexReplacement rr("+a","b");
    } catch (ConcordiaException & ex) {
        caughtMessage = ex.what();
        caught = true;
    }

    BOOST_CHECK_EQUAL(caught, true);

    const bool hasExpectedPrefix =
        boost::starts_with(caughtMessage, "Bad regex pattern");
    BOOST_CHECK_EQUAL(hasExpectedPrefix, true);
}
|
||||
|
||||
// The character class removes apostrophes, double quotes, backslashes and
// dots. Deleting the backslash from "believin' \ Hold" leaves the two spaces
// that surrounded it side by side, so the expected text contains a double
// space at that position.
BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
{
    RegexReplacement rr("['\"\\\\.]","");
    BOOST_CHECK_EQUAL(rr.apply("Don't stop believin' \\ Hold on to the feelin'."),
                      "Dont stop believin  Hold on to the feelin");
}
|
||||
|
||||
// The replacement text refers to the first capture group (\1), so each run of
// digits is expected to be kept and prefixed with "the number: ".
BOOST_AUTO_TEST_CASE( BackrefReplacement )
{
    RegexReplacement rr("(\\d+)","the number: \\1");

    const string input = "This is 12 and this is 812.";
    const string expected =
        "This is the number: 12 and this is the number: 812.";

    BOOST_CHECK_EQUAL(rr.apply(input), expected);
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
@ -6,12 +6,12 @@
|
||||
#Path to the Puddle tagset
|
||||
puddle_tagset_path = "puddle/tagset.txt";
|
||||
|
||||
word_map_path = "tmp/wm.bin"
|
||||
word_map_path = "/tmp/wm.bin"
|
||||
|
||||
hashed_index_path = "tmp/hi.bin"
|
||||
hashed_index_path = "/tmp/hi.bin"
|
||||
|
||||
markers_path = "tmp/ma.bin"
|
||||
markers_path = "/tmp/ma.bin"
|
||||
|
||||
suffix_array_path = "tmp/sa.bin"
|
||||
suffix_array_path = "/tmp/sa.bin"
|
||||
|
||||
### eof
|
||||
|
Loading…
Reference in New Issue
Block a user