From 13c97f572d8cea9a74ff4f29fd09c7b50547192d Mon Sep 17 00:00:00 2001 From: rjawor Date: Sun, 13 Apr 2014 12:21:30 +0200 Subject: [PATCH] sentence anonymizer stub, regex replacement Former-commit-id: edb1247f7b29fd62913114be84d3391507a0890e --- CMakeLists.txt | 2 +- concordia/CMakeLists.txt | 2 + concordia/concordia.cpp | 6 +-- concordia/concordia_index.cpp | 1 - concordia/concordia_index.hpp | 1 - concordia/hash_generator.cpp | 7 ++-- concordia/hash_generator.hpp | 8 +++- concordia/regex_replacement.cpp | 31 ++++++++++++++ concordia/regex_replacement.hpp | 37 ++++++++++++++++ concordia/sentence_anonymizer.cpp | 13 ++++++ concordia/sentence_anonymizer.hpp | 32 ++++++++++++++ concordia/t/CMakeLists.txt | 1 + concordia/t/test_concordia_config.cpp | 8 ++-- concordia/t/test_concordia_index.cpp | 8 +--- concordia/t/test_hash_generator.cpp | 32 ++++++++------ concordia/t/test_regex_replacement.cpp | 42 +++++++++++++++++++ .../concordia-config/concordia-mock.cfg | 8 ++-- 17 files changed, 202 insertions(+), 37 deletions(-) create mode 100644 concordia/regex_replacement.cpp create mode 100644 concordia/regex_replacement.hpp create mode 100644 concordia/sentence_anonymizer.cpp create mode 100644 concordia/sentence_anonymizer.hpp create mode 100644 concordia/t/test_regex_replacement.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b39b11..f4f27d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,7 +74,7 @@ endif(WITH_PCRE) set(Boost_USE_STATIC_LIBS OFF) set(Boost_USE_STATIC_RUNTIME OFF) find_package(Boost COMPONENTS - serialization unit_test_framework system filesystem program_options iostreams REQUIRED) + serialization unit_test_framework system filesystem program_options iostreams regex REQUIRED) # ---------------------------------------------------- # libconfig diff --git a/concordia/CMakeLists.txt b/concordia/CMakeLists.txt index 727f9a1..6a971c7 100644 --- a/concordia/CMakeLists.txt +++ b/concordia/CMakeLists.txt @@ -6,6 +6,8 @@ foreach(dir ${ALL_DIRECTORIES}) endforeach(dir) add_library(concordia SHARED + regex_replacement.cpp + sentence_anonymizer.cpp interval.cpp tm_matches.cpp anubis_search_result.cpp diff --git a/concordia/concordia.cpp b/concordia/concordia.cpp index 7d18b3c..30cac80 100644 --- a/concordia/concordia.cpp +++ b/concordia/concordia.cpp @@ -100,20 +100,20 @@ void Concordia::loadRAMIndexFromDisk() throw(ConcordiaException) { throw ConcordiaException("Index corrupt: empty markers file"); } // generating suffix array - _SA = _index->generateSuffixArray(_hashGenerator, _T); + _SA = _index->generateSuffixArray(_T); } else { throw ConcordiaException("Index corrupt: missing files"); } } void Concordia::refreshSAfromRAM() throw(ConcordiaException) { - _SA = _index->generateSuffixArray(_hashGenerator, _T); + _SA = _index->generateSuffixArray(_T); } void Concordia::_initializeIndex() throw(ConcordiaException) { _hashGenerator = boost::shared_ptr( - new HashGenerator(_config->getWordMapFilePath())); + new HashGenerator(_config)); _T = boost::shared_ptr >( new std::vector); _markers = boost::shared_ptr >( diff --git a/concordia/concordia_index.cpp b/concordia/concordia_index.cpp index 6f6ac42..011da29 100644 --- a/concordia/concordia_index.cpp +++ b/concordia/concordia_index.cpp @@ -17,7 +17,6 @@ ConcordiaIndex::~ConcordiaIndex() { } boost::shared_ptr > ConcordiaIndex::generateSuffixArray( - boost::shared_ptr hashGenerator, boost::shared_ptr > T) { saidx_t * SA_array = new saidx_t[T->size()]; if (divsufsort(T->data(), SA_array, (saidx_t) T->size()) != 0) { diff --git a/concordia/concordia_index.hpp b/concordia/concordia_index.hpp index 6d71d98..41d9d9e 100644 --- a/concordia/concordia_index.hpp +++ b/concordia/concordia_index.hpp @@ -43,7 +43,6 @@ public: const boost::ptr_vector & examples); boost::shared_ptr > generateSuffixArray( - boost::shared_ptr hashGenerator, boost::shared_ptr > T); private: diff --git a/concordia/hash_generator.cpp b/concordia/hash_generator.cpp index fc736f3..6014d01 100644 --- a/concordia/hash_generator.cpp +++ b/concordia/hash_generator.cpp @@ -5,10 +5,11 @@ #include #include -HashGenerator::HashGenerator(const string & wordMapFilePath) +HashGenerator::HashGenerator(boost::shared_ptr config) throw(ConcordiaException) : - _wordMapFilePath(wordMapFilePath), - _wordMap(boost::shared_ptr(new WordMap)) { + _wordMapFilePath(config->getWordMapFilePath()), + _wordMap(boost::shared_ptr(new WordMap)), + _sentenceAnonymizer(boost::shared_ptr(new SentenceAnonymizer(config))) { if (boost::filesystem::exists(_wordMapFilePath)) { ifstream ifs(_wordMapFilePath.c_str(), std::ios::binary); boost::archive::binary_iarchive ia(ifs); diff --git a/concordia/hash_generator.hpp b/concordia/hash_generator.hpp index 450ce6d..982df2d 100644 --- a/concordia/hash_generator.hpp +++ b/concordia/hash_generator.hpp @@ -8,6 +8,8 @@ #include #include "concordia/word_map.hpp" #include "concordia/common/config.hpp" +#include "concordia/sentence_anonymizer.hpp" +#include "concordia/concordia_config.hpp" #include "concordia/concordia_exception.hpp" @@ -20,8 +22,8 @@ using namespace std; class HashGenerator { public: - explicit HashGenerator(const string & wordMapFilePath) - throw(ConcordiaException); + explicit HashGenerator(boost::shared_ptr config) + throw(ConcordiaException); /*! Destructor. */ @@ -35,6 +37,8 @@ public: private: boost::shared_ptr _wordMap; + + boost::shared_ptr _sentenceAnonymizer; string _wordMapFilePath; }; diff --git a/concordia/regex_replacement.cpp b/concordia/regex_replacement.cpp new file mode 100644 index 0000000..cc09964 --- /dev/null +++ b/concordia/regex_replacement.cpp @@ -0,0 +1,31 @@ +#include "concordia/regex_replacement.hpp" +#include +#include +#include + +RegexReplacement::RegexReplacement(string patternString, string replacement) + throw(ConcordiaException): + _replacement(replacement) { + try { + _pattern = boost::regex(patternString); + } catch ( const std::exception & e ) { + stringstream ss; + + ss << "Bad regex pattern: " << patternString << + " Detailed info: " << e.what(); + + if ( std::string const * extra = boost::get_error_info(e) ) { + ss << *extra; + } + throw ConcordiaException(ss.str()); + } +} + +RegexReplacement::~RegexReplacement() { +} + +string RegexReplacement::apply(const string & text) { + return boost::regex_replace(text, _pattern, _replacement, + boost::match_default | boost::format_all); +} + diff --git a/concordia/regex_replacement.hpp b/concordia/regex_replacement.hpp new file mode 100644 index 0000000..88c33ce --- /dev/null +++ b/concordia/regex_replacement.hpp @@ -0,0 +1,37 @@ +#ifndef REGEX_REPLACEMENT_HDR +#define REGEX_REPLACEMENT_HDR + +#include +#include "concordia/common/config.hpp" +#include "concordia/concordia_exception.hpp" +#include +#include + + +/*! + Class for replacing string occurences. + +*/ + +using namespace std; + +typedef boost::error_info my_tag_error_info; + +class RegexReplacement { +public: + explicit RegexReplacement(string patternString, string replacement) + throw(ConcordiaException); + + /*! Destructor. + */ + virtual ~RegexReplacement(); + + string apply(const string & text); + +private: + boost::regex _pattern; + + string _replacement; +}; + +#endif diff --git a/concordia/sentence_anonymizer.cpp b/concordia/sentence_anonymizer.cpp new file mode 100644 index 0000000..9b530bc --- /dev/null +++ b/concordia/sentence_anonymizer.cpp @@ -0,0 +1,13 @@ +#include "concordia/sentence_anonymizer.hpp" + +SentenceAnonymizer::SentenceAnonymizer(boost::shared_ptr config) + throw(ConcordiaException) { +} + +SentenceAnonymizer::~SentenceAnonymizer() { +} + +string SentenceAnonymizer::anonymize(const string & sentence) { + +} + diff --git a/concordia/sentence_anonymizer.hpp b/concordia/sentence_anonymizer.hpp new file mode 100644 index 0000000..6d0ded3 --- /dev/null +++ b/concordia/sentence_anonymizer.hpp @@ -0,0 +1,32 @@ +#ifndef SENTENCE_ANONYMIZER_HDR +#define SENTENCE_ANONYMIZER_HDR + +#include +#include "concordia/common/config.hpp" +#include "concordia/concordia_config.hpp" +#include "concordia/concordia_exception.hpp" +#include + + +/*! + Class for anonymizing sentence before adding to index. + +*/ + +using namespace std; + +class SentenceAnonymizer { +public: + explicit SentenceAnonymizer(boost::shared_ptr config) + throw(ConcordiaException); + + /*! Destructor. + */ + virtual ~SentenceAnonymizer(); + + string anonymize(const string & sentence); + +private: +}; + +#endif diff --git a/concordia/t/CMakeLists.txt b/concordia/t/CMakeLists.txt index bdd3c92..054178c 100644 --- a/concordia/t/CMakeLists.txt +++ b/concordia/t/CMakeLists.txt @@ -1,4 +1,5 @@ add_library(concordia-tests + test_regex_replacement.cpp test_example.cpp test_tm_matches.cpp test_interval.cpp diff --git a/concordia/t/test_concordia_config.cpp b/concordia/t/test_concordia_config.cpp index 4af8602..5fc66c0 100644 --- a/concordia/t/test_concordia_config.cpp +++ b/concordia/t/test_concordia_config.cpp @@ -15,10 +15,10 @@ BOOST_AUTO_TEST_CASE( ConfigParameters ) { ConcordiaConfig config(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-mock.cfg")); BOOST_CHECK_EQUAL( config.getPuddleTagsetFilePath() , "puddle/tagset.txt" ); - BOOST_CHECK_EQUAL( config.getWordMapFilePath() , "tmp/wm.bin" ); - BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "tmp/hi.bin" ); - BOOST_CHECK_EQUAL( config.getMarkersFilePath() , "tmp/ma.bin" ); - BOOST_CHECK_EQUAL( config.getSuffixArrayFilePath() , "tmp/sa.bin" ); + BOOST_CHECK_EQUAL( config.getWordMapFilePath() , "/tmp/wm.bin" ); + BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "/tmp/hi.bin" ); + BOOST_CHECK_EQUAL( config.getMarkersFilePath() , "/tmp/ma.bin" ); + BOOST_CHECK_EQUAL( config.getSuffixArrayFilePath() , "/tmp/sa.bin" ); } BOOST_AUTO_TEST_CASE( NonexistentConfigTest ) diff --git a/concordia/t/test_concordia_index.cpp b/concordia/t/test_concordia_index.cpp index 3efeb64..afdcf34 100644 --- a/concordia/t/test_concordia_index.cpp +++ b/concordia/t/test_concordia_index.cpp @@ -14,8 +14,6 @@ BOOST_AUTO_TEST_SUITE(concordia_index) BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest ) { - boost::shared_ptr hashGenerator (new HashGenerator("nonexistent")); - ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"), TestResourcesManager::getTestFilePath("temp","test_markers.bin")); boost::shared_ptr > T = boost::shared_ptr >(new vector()); @@ -36,7 +34,7 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest ) // n: 0 1 2 3 4 5 6 7 8 //SA[n]: 0 3 1 7 4 2 8 5 6 - boost::shared_ptr > SA = index.generateSuffixArray(hashGenerator, T); + boost::shared_ptr > SA = index.generateSuffixArray(T); boost::shared_ptr > expectedSA = boost::shared_ptr >(new vector()); expectedSA->push_back(0); @@ -53,8 +51,6 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest ) BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest2 ) { - boost::shared_ptr hashGenerator (new HashGenerator("nonexistent")); - ConcordiaIndex index(TestResourcesManager::getTestFilePath("temp","test_hash_index.bin"), TestResourcesManager::getTestFilePath("temp","test_markers.bin")); boost::shared_ptr > T = boost::shared_ptr >(new vector()); @@ -79,7 +75,7 @@ BOOST_AUTO_TEST_CASE( SuffixArrayGenerationTest2 ) // n: 0 1 2 3 4 5 6 7 8 9 10 11 //SA[n]: 0 4 1 9 5 2 10 6 8 11 3 7 - boost::shared_ptr > SA = index.generateSuffixArray(hashGenerator, T); + boost::shared_ptr > SA = index.generateSuffixArray(T); boost::shared_ptr > expectedSA = boost::shared_ptr >(new vector()); expectedSA->push_back(0); diff --git a/concordia/t/test_hash_generator.cpp b/concordia/t/test_hash_generator.cpp index 6df7cfe..3fc4e44 100644 --- a/concordia/t/test_hash_generator.cpp +++ b/concordia/t/test_hash_generator.cpp @@ -3,10 +3,11 @@ #include #include +#include #include "concordia/common/config.hpp" #include "concordia/hash_generator.hpp" +#include "tests/common/test_resources_manager.hpp" -#define TEST_WORD_MAP_PATH "/tmp/test_word_map.bin" using namespace std; @@ -14,11 +15,13 @@ BOOST_AUTO_TEST_SUITE(hash_generator) BOOST_AUTO_TEST_CASE( SimpleHashTest ) { - if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) { - boost::filesystem::remove(TEST_WORD_MAP_PATH); + boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-mock.cfg"))); + + if (boost::filesystem::exists(config->getWordMapFilePath())) { + boost::filesystem::remove(config->getWordMapFilePath()); } - HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH); + HashGenerator hashGenerator = HashGenerator(config); boost::shared_ptr > hash = hashGenerator.generateHash("Ala ma kota"); boost::shared_ptr > expected(new vector()); @@ -31,11 +34,13 @@ BOOST_AUTO_TEST_CASE( SimpleHashTest ) BOOST_AUTO_TEST_CASE( TooLongHashTest ) { - if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) { - boost::filesystem::remove(TEST_WORD_MAP_PATH); + boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-mock.cfg"))); + + if (boost::filesystem::exists(config->getWordMapFilePath())) { + boost::filesystem::remove(config->getWordMapFilePath()); } - HashGenerator hashGenerator = HashGenerator(TEST_WORD_MAP_PATH); + HashGenerator hashGenerator = HashGenerator(config); stringstream ss; for (int i=0;i<256;i++) { @@ -60,11 +65,14 @@ BOOST_AUTO_TEST_CASE( TooLongHashTest ) BOOST_AUTO_TEST_CASE( HashSerializationTest ) { - if (boost::filesystem::exists(TEST_WORD_MAP_PATH)) { - boost::filesystem::remove(TEST_WORD_MAP_PATH); + boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia-mock.cfg"))); + + if (boost::filesystem::exists(config->getWordMapFilePath())) { + boost::filesystem::remove(config->getWordMapFilePath()); } + + HashGenerator hashGenerator1 = HashGenerator(config); - HashGenerator hashGenerator1 = HashGenerator(TEST_WORD_MAP_PATH); boost::shared_ptr > hash1 = hashGenerator1.generateHash("Ala ma kota"); boost::shared_ptr > expected1(new vector()); expected1->push_back(0); @@ -74,7 +82,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest ) hashGenerator1.serializeWordMap(); - HashGenerator hashGenerator2 = HashGenerator(TEST_WORD_MAP_PATH); + HashGenerator hashGenerator2 = HashGenerator(config); boost::shared_ptr > hash2 = hashGenerator2.generateHash("Ala ma psa"); boost::shared_ptr > expected2(new vector()); expected2->push_back(0); @@ -82,7 +90,7 @@ BOOST_AUTO_TEST_CASE( HashSerializationTest ) expected2->push_back(3); BOOST_CHECK_EQUAL_COLLECTIONS(hash2->begin(), hash2->end(), expected2->begin(), expected2->end()); - boost::filesystem::remove(TEST_WORD_MAP_PATH); + boost::filesystem::remove(config->getWordMapFilePath()); } BOOST_AUTO_TEST_SUITE_END() diff --git a/concordia/t/test_regex_replacement.cpp b/concordia/t/test_regex_replacement.cpp new file mode 100644 index 0000000..02a6065 --- /dev/null +++ b/concordia/t/test_regex_replacement.cpp @@ -0,0 +1,42 @@ +#include "tests/unit-tests/unit_tests_globals.hpp" +#include "concordia/regex_replacement.hpp" +#include "concordia/common/config.hpp" +#include + +using namespace std; + +BOOST_AUTO_TEST_SUITE(regex_replacement) + +BOOST_AUTO_TEST_CASE( SimpleReplacement ) +{ + RegexReplacement rr("a","b"); + BOOST_CHECK_EQUAL(rr.apply("xxxxxxxaxxxaxxaxaxa"),"xxxxxxxbxxxbxxbxbxb"); +} + +BOOST_AUTO_TEST_CASE( BadRegex ) +{ + bool exceptionThrown = false; + string message = ""; + try { + RegexReplacement rr("+a","b"); + } catch (ConcordiaException & e) { + exceptionThrown = true; + message = e.what(); + } + BOOST_CHECK_EQUAL(exceptionThrown, true); + BOOST_CHECK_EQUAL(boost::starts_with(message, "Bad regex pattern"), true); +} + +BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement ) +{ + RegexReplacement rr("['\"\\\\.]",""); + BOOST_CHECK_EQUAL(rr.apply("Don't stop believin' \\ Hold on to the feelin'."),"Dont stop believin Hold on to the feelin"); +} + +BOOST_AUTO_TEST_CASE( BackrefReplacement ) +{ + RegexReplacement rr("(\\d+)","the number: \\1"); + BOOST_CHECK_EQUAL(rr.apply("This is 12 and this is 812."),"This is the number: 12 and this is the number: 812."); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/resources/concordia-config/concordia-mock.cfg b/tests/resources/concordia-config/concordia-mock.cfg index 37bd27b..8170e3f 100644 --- a/tests/resources/concordia-config/concordia-mock.cfg +++ b/tests/resources/concordia-config/concordia-mock.cfg @@ -6,12 +6,12 @@ #Path to the Puddle tagset puddle_tagset_path = "puddle/tagset.txt"; -word_map_path = "tmp/wm.bin" +word_map_path = "/tmp/wm.bin" -hashed_index_path = "tmp/hi.bin" +hashed_index_path = "/tmp/hi.bin" -markers_path = "tmp/ma.bin" +markers_path = "/tmp/ma.bin" -suffix_array_path = "tmp/sa.bin" +suffix_array_path = "/tmp/sa.bin" ### eof