sentence anonymizer stub, regex replacement

Former-commit-id: edb1247f7b29fd62913114be84d3391507a0890e
This commit is contained in:
rjawor 2014-04-13 12:21:30 +02:00
parent 5d56065e93
commit 13c97f572d
17 changed files with 202 additions and 37 deletions

View File

@ -74,7 +74,7 @@ endif(WITH_PCRE)
set(Boost_USE_STATIC_LIBS OFF)
set(Boost_USE_STATIC_RUNTIME OFF)
find_package(Boost COMPONENTS
serialization unit_test_framework system filesystem program_options iostreams REQUIRED)
serialization unit_test_framework system filesystem program_options iostreams regex REQUIRED)
# ----------------------------------------------------
# libconfig

View File

@ -6,6 +6,8 @@ foreach(dir ${ALL_DIRECTORIES})
endforeach(dir)
add_library(concordia SHARED
regex_replacement.cpp
sentence_anonymizer.cpp
interval.cpp
tm_matches.cpp
anubis_search_result.cpp

View File

@ -100,20 +100,20 @@ void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) {
throw ConcordiaException("Index corrupt: empty markers file");
}
// generating suffix array
_SA = _index->generateSuffixArray(_hashGenerator, _T);
_SA = _index->generateSuffixArray(_T);
} else {
throw ConcordiaException("Index corrupt: missing files");
}
}
void Concordia::refreshSAfromRAM() throw(ConcordiaException) {
_SA = _index->generateSuffixArray(_hashGenerator, _T);
_SA = _index->generateSuffixArray(_T);
}
void Concordia::_initializeIndex() throw(ConcordiaException) {
_hashGenerator = boost::shared_ptr<HashGenerator>(
new HashGenerator(_config->getWordMapFilePath()));
new HashGenerator(_config));
_T = boost::shared_ptr<std::vector<sauchar_t> >(
new std::vector<sauchar_t>);
_markers = boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> >(

View File

@ -17,7 +17,6 @@ ConcordiaIndex::~ConcordiaIndex() {
}
boost::shared_ptr<vector<saidx_t> > ConcordiaIndex::generateSuffixArray(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<vector<sauchar_t> > T) {
saidx_t * SA_array = new saidx_t[T->size()];
if (divsufsort(T->data(), SA_array, (saidx_t) T->size()) != 0) {

View File

@ -43,7 +43,6 @@ public:
const boost::ptr_vector<Example > & examples);
boost::shared_ptr<vector<saidx_t> > generateSuffixArray(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<vector<sauchar_t> > T);
private:

View File

@ -5,10 +5,11 @@
#include <boost/algorithm/string.hpp>
#include <fstream>
HashGenerator::HashGenerator(const string & wordMapFilePath)
HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
throw(ConcordiaException) :
_wordMapFilePath(wordMapFilePath),
_wordMap(boost::shared_ptr<WordMap>(new WordMap)) {
_wordMapFilePath(config->getWordMapFilePath()),
_wordMap(boost::shared_ptr<WordMap>(new WordMap)),
_sentenceAnonymizer(boost::shared_ptr<SentenceAnonymizer>(new SentenceAnonymizer(config))) {
if (boost::filesystem::exists(_wordMapFilePath)) {
ifstream ifs(_wordMapFilePath.c_str(), std::ios::binary);
boost::archive::binary_iarchive ia(ifs);

View File

@ -8,6 +8,8 @@
#include <boost/algorithm/string/predicate.hpp>
#include "concordia/word_map.hpp"
#include "concordia/common/config.hpp"
#include "concordia/sentence_anonymizer.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_exception.hpp"
@ -20,8 +22,8 @@ using namespace std;
class HashGenerator {
public:
explicit HashGenerator(const string & wordMapFilePath)
throw(ConcordiaException);
explicit HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
throw(ConcordiaException);
/*! Destructor.
*/
@ -36,6 +38,8 @@ public:
private:
boost::shared_ptr<WordMap> _wordMap;
boost::shared_ptr<SentenceAnonymizer> _sentenceAnonymizer;
string _wordMapFilePath;
};

View File

@ -0,0 +1,31 @@
#include "concordia/regex_replacement.hpp"
#include <sstream>
#include <boost/exception/all.hpp>
#include <boost/throw_exception.hpp>
/*!
  Compiles patternString into the stored boost::regex and keeps the
  replacement format string for later calls to apply().

  \param patternString regular expression to search for
  \param replacement format string substituted for every match
  \throws ConcordiaException when the pattern cannot be compiled; the
          message starts with "Bad regex pattern: "
*/
RegexReplacement::RegexReplacement(string patternString, string replacement)
                                         throw(ConcordiaException):
                                         _replacement(replacement) {
    try {
        const boost::regex compiledPattern(patternString);
        _pattern = compiledPattern;
    } catch (const std::exception & error) {
        stringstream message;
        message << "Bad regex pattern: " << patternString;
        message << " Detailed info: " << error.what();
        // Append the extra diagnostic string when the exception carries one.
        std::string const * const details =
            boost::get_error_info<my_tag_error_info>(error);
        if (details) {
            message << *details;
        }
        throw ConcordiaException(message.str());
    }
}
/*! Destructor. Has an empty body. */
RegexReplacement::~RegexReplacement() {}
/*!
  Replaces every match of the stored pattern in text with the stored
  replacement format string.

  \param text input string; it is not modified
  \returns a new string with the replacements applied
*/
string RegexReplacement::apply(const string & text) {
    // format_all enables the full format-string syntax in the replacement.
    const boost::match_flag_type replaceFlags =
        boost::match_default | boost::format_all;
    string replaced =
        boost::regex_replace(text, _pattern, _replacement, replaceFlags);
    return replaced;
}

View File

@ -0,0 +1,37 @@
#ifndef REGEX_REPLACEMENT_HDR
#define REGEX_REPLACEMENT_HDR
#include <string>
#include "concordia/common/config.hpp"
#include "concordia/concordia_exception.hpp"
#include <boost/shared_ptr.hpp>
#include <boost/regex.hpp>
/*!
Class for replacing occurrences of a regular expression in a string.
*/
using namespace std;
typedef boost::error_info<struct my_tag,std::string> my_tag_error_info;
/*!
  Pairs a compiled regular expression with a replacement format string
  and applies that substitution to input strings.
*/
class RegexReplacement {
public:
/*! Constructor.
  \param patternString regular expression to compile
  \param replacement format string substituted for each match
  \throws ConcordiaException if patternString cannot be compiled
*/
explicit RegexReplacement(string patternString, string replacement)
throw(ConcordiaException);
/*! Destructor.
*/
virtual ~RegexReplacement();
/*! Returns a copy of text in which the matches of the pattern are
  replaced with the replacement string.
  \param text input string
  \returns the string after replacement
*/
string apply(const string & text);
private:
// Compiled form of the pattern passed to the constructor.
boost::regex _pattern;
// Replacement format string passed to the constructor.
string _replacement;
};
#endif

View File

@ -0,0 +1,13 @@
#include "concordia/sentence_anonymizer.hpp"
/*!
  Stub constructor: accepts the configuration but does not read it yet.

  \param config Concordia configuration (currently unused)
*/
SentenceAnonymizer::SentenceAnonymizer(
    boost::shared_ptr<ConcordiaConfig> config) throw(ConcordiaException) {}
/*! Destructor. Has an empty body. */
SentenceAnonymizer::~SentenceAnonymizer() {}
/*!
  Stub: no anonymization rules are implemented yet, so the sentence is
  returned unchanged. The explicit return is required because flowing off
  the end of a function that returns a value is undefined behaviour.

  \param sentence sentence to anonymize
  \returns the sentence, currently unmodified
*/
string SentenceAnonymizer::anonymize(const string & sentence) {
    return sentence;
}

View File

@ -0,0 +1,32 @@
#ifndef SENTENCE_ANONYMIZER_HDR
#define SENTENCE_ANONYMIZER_HDR
#include <string>
#include "concordia/common/config.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_exception.hpp"
#include <boost/shared_ptr.hpp>
/*!
Class for anonymizing a sentence before it is added to the index.
*/
using namespace std;
/*!
  Anonymizer of sentences. This is a stub: the class declares its
  interface but holds no data members yet.
*/
class SentenceAnonymizer {
public:
/*! Constructor.
  \param config Concordia configuration
  \throws ConcordiaException declared in the exception specification
*/
explicit SentenceAnonymizer(boost::shared_ptr<ConcordiaConfig> config)
throw(ConcordiaException);
/*! Destructor.
*/
virtual ~SentenceAnonymizer();
/*! Anonymizes a sentence.
  \param sentence sentence to anonymize
  \returns the anonymized sentence
*/
string anonymize(const string & sentence);
private:
};
#endif

View File

@ -1,4 +1,5 @@
add_library(concordia-tests
test_regex_replacement.cpp
test_example.cpp
test_tm_matches.cpp
test_interval.cpp

View File

@ -15,10 +15,10 @@ BOOST_AUTO_TEST_CASE( ConfigParameters )
{
ConcordiaConfig config(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-mock.cfg"));
BOOST_CHECK_EQUAL( config.getPuddleTagsetFilePath() , "puddle/tagset.txt" );
BOOST_CHECK_EQUAL( config.getWordMapFilePath() , "tmp/wm.bin" );
BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "tmp/hi.bin" );
BOOST_CHECK_EQUAL( config.getMarkersFilePath() , "tmp/ma.bin" );
BOOST_CHECK_EQUAL( config.getSuffixArrayFilePath() , "tmp/sa.bin" );
BOOST_CHECK_EQUAL( config.getWordMapFilePath() , "/tmp/wm.bin" );
BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "/tmp/hi.bin" );
BOOST_CHECK_EQUAL( config.getMarkersFilePath() , "/tmp/ma.bin" );
BOOST_CHECK_EQUAL( config.getSuffixArrayFilePath() , "/tmp/sa.bin" );
}
BOOST_AUTO_TEST_CASE( NonexistentConfigTest )

View File

@ -14,8 +14,6 @@ BOOST_AUTO_TEST_SUITE(concordia_index)
BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
{
boost::shared_ptr<HashGenerator> hashGenerator (new HashGenerator("nonexistent"));
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
TestResourcesManager::getTestFilePath("temp","test_markers.bin"));
boost::shared_ptr<vector<sauchar_t> > T = boost::shared_ptr<vector<sauchar_t> >(new vector<sauchar_t>());
@ -36,7 +34,7 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
// n: 0 1 2 3 4 5 6 7 8
//SA[n]: 0 3 1 7 4 2 8 5 6
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(hashGenerator, T);
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(T);
boost::shared_ptr<vector<saidx_t> > expectedSA = boost::shared_ptr<vector<saidx_t> >(new vector<saidx_t>());
expectedSA->push_back(0);
@ -53,8 +51,6 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest )
BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest2 )
{
boost::shared_ptr<HashGenerator> hashGenerator (new HashGenerator("nonexistent"));
ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"),
TestResourcesManager::getTestFilePath("temp","test_markers.bin"));
boost::shared_ptr<vector<sauchar_t> > T = boost::shared_ptr<vector<sauchar_t> >(new vector<sauchar_t>());
@ -79,7 +75,7 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest2 )
// n: 0 1 2 3 4 5 6 7 8 9 10 11
//SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(hashGenerator, T);
boost::shared_ptr<std::vector<saidx_t> > SA = index.generateSuffixArray(T);
boost::shared_ptr<vector<saidx_t> > expectedSA = boost::shared_ptr<vector<saidx_t> >(new vector<saidx_t>());
expectedSA->push_back(0);

View File

@ -3,10 +3,11 @@
#include <string>
#include <sstream>
#include <boost/shared_ptr.hpp>
#include "concordia/common/config.hpp"
#include "concordia/hash_generator.hpp"
#include "tests/common/test_resources_manager.hpp"
#define TEST_WORD_MAP_PATH "/tmp/test_word_map.bin"
using namespace std;
@ -14,11 +15,13 @@ BOOST_AUTO_TEST_SUITE(hash_generator)
BOOST_AUTO_TEST_CASE( SimpleHashTest )
{
if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) {
boost::filesystem::remove(TEST_WORD_MAP_PATH);
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-mock.cfg")));
if (boost::filesystem::exists(config->getWordMapFilePath())) {
boost::filesystem::remove(config->getWordMapFilePath());
}
HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH);
HashGenerator hashGenerator = HashGenerator(config);
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash = hashGenerator.generateHash("Ala ma kota");
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected(new vector<INDEX_CHARACTER_TYPE>());
@ -31,11 +34,13 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest )
BOOST_AUTO_TEST_CASE( TooLongHashTest )
{
if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) {
boost::filesystem::remove(TEST_WORD_MAP_PATH);
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-mock.cfg")));
if (boost::filesystem::exists(config->getWordMapFilePath())) {
boost::filesystem::remove(config->getWordMapFilePath());
}
HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH);
HashGenerator hashGenerator = HashGenerator(config);
stringstream ss;
for (int i=0;i<256;i++) {
@ -60,11 +65,14 @@ BOOST_AUTO_TEST_CASE( TooLongHashTest )
BOOST_AUTO_TEST_CASE( HashSerializationTest )
{
if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) {
boost::filesystem::remove(TEST_WORD_MAP_PATH);
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-mock.cfg")));
if (boost::filesystem::exists(config->getWordMapFilePath())) {
boost::filesystem::remove(config->getWordMapFilePath());
}
HashGenerator hashGenerator1 = HashGenerator(TEST_WORD_MAP_PATH);
HashGenerator hashGenerator1 = HashGenerator(config);
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash1 = hashGenerator1.generateHash("Ala ma kota");
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected1(new vector<INDEX_CHARACTER_TYPE>());
expected1->push_back(0);
@ -74,7 +82,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
hashGenerator1.serializeWordMap();
HashGenerator hashGenerator2 = HashGenerator(TEST_WORD_MAP_PATH);
HashGenerator hashGenerator2 = HashGenerator(config);
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > hash2 = hashGenerator2.generateHash("Ala ma psa");
boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > expected2(new vector<INDEX_CHARACTER_TYPE>());
expected2->push_back(0);
@ -82,7 +90,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest )
expected2->push_back(3);
BOOST_CHECK_EQUAL_COLLECTIONS(hash2->begin(), hash2->end(), expected2->begin(), expected2->end());
boost::filesystem::remove(TEST_WORD_MAP_PATH);
boost::filesystem::remove(config->getWordMapFilePath());
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -0,0 +1,42 @@
#include "tests/unit-tests/unit_tests_globals.hpp"
#include "concordia/regex_replacement.hpp"
#include "concordia/common/config.hpp"
#include <boost/algorithm/string/predicate.hpp>
using namespace std;
BOOST_AUTO_TEST_SUITE(regex_replacement)
// Every "a" in the input must be rewritten to "b".
BOOST_AUTO_TEST_CASE( SimpleReplacement )
{
    RegexReplacement replacer("a","b");
    const string replaced = replacer.apply("xxxxxxxaxxxaxxaxaxa");
    BOOST_CHECK_EQUAL(replaced,"xxxxxxxbxxxbxxbxbxb");
}
// An invalid pattern must be reported as a ConcordiaException whose
// message starts with "Bad regex pattern".
BOOST_AUTO_TEST_CASE( BadRegex )
{
    string message = "";
    bool exceptionThrown = false;
    try {
        RegexReplacement rr("+a","b");
    } catch (ConcordiaException & error) {
        message = error.what();
        exceptionThrown = true;
    }
    BOOST_CHECK_EQUAL(exceptionThrown, true);
    const bool hasExpectedPrefix =
        boost::starts_with(message, "Bad regex pattern");
    BOOST_CHECK_EQUAL(hasExpectedPrefix, true);
}
// Apostrophes, double quotes, backslashes and dots are removed from the text.
BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
{
    RegexReplacement rr("['\"\\\\.]","");
    // Deleting the backslash from "believin' \ Hold" leaves the two spaces
    // that surrounded it, so the expected text has a double space there.
    BOOST_CHECK_EQUAL(rr.apply("Don't stop believin' \\ Hold on to the feelin'."),"Dont stop believin  Hold on to the feelin");
}
// The replacement string may refer to a captured group with \1.
BOOST_AUTO_TEST_CASE( BackrefReplacement )
{
    RegexReplacement replacer("(\\d+)","the number: \\1");
    const string replaced = replacer.apply("This is 12 and this is 812.");
    BOOST_CHECK_EQUAL(replaced,"This is the number: 12 and this is the number: 812.");
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -6,12 +6,12 @@
#Path to the Puddle tagset
puddle_tagset_path = "puddle/tagset.txt";
word_map_path = "tmp/wm.bin"
word_map_path = "/tmp/wm.bin"
hashed_index_path = "tmp/hi.bin"
hashed_index_path = "/tmp/hi.bin"
markers_path = "tmp/ma.bin"
markers_path = "/tmp/ma.bin"
suffix_array_path = "tmp/sa.bin"
suffix_array_path = "/tmp/sa.bin"
### eof