diff --git a/CMakeLists.txt b/CMakeLists.txt index a304441..8cff576 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,26 +103,43 @@ find_package(Boost COMPONENTS # ---------------------------------------------------- # libconfig # ---------------------------------------------------- -find_library(LIBCONFIG_LIB NAMES config++ REQUIRED) +find_library(LIBCONFIG_LIB NAMES config++) find_path(LIBCONFIG_INCLUDE libconfig.h++) if(EXISTS ${LIBCONFIG_LIB} AND EXISTS ${LIBCONFIG_INCLUDE}) message(STATUS "Found Libconfig") include_directories(${LIBCONFIG_INCLUDE}) link_directories(${LIBCONFIG_LIB}) +else() + message(FATAL_ERROR "Libconfig not found") endif(EXISTS ${LIBCONFIG_LIB} AND EXISTS ${LIBCONFIG_INCLUDE}) +# ---------------------------------------------------- +# ICU (I feeeeel youuuuu...) +# ---------------------------------------------------- +find_library(ICU_LIB NAMES icui18n) +find_path(ICU_INCLUDE unicode) + +if(EXISTS ${ICU_LIB} AND EXISTS ${ICU_INCLUDE}) + message(STATUS "Found ICU") + include_directories(${ICU_INCLUDE}) + link_directories(${ICU_LIB}) +else() + message(FATAL_ERROR "ICU not found") +endif(EXISTS ${ICU_LIB} AND EXISTS ${ICU_INCLUDE}) # ---------------------------------------------------- # Logging # ---------------------------------------------------- -find_library(LOG4CPP_LIB NAMES log4cpp REQUIRED) +find_library(LOG4CPP_LIB NAMES log4cpp) find_path(LOG4CPP_INCLUDE log4cpp/Appender.hh) if(EXISTS ${LOG4CPP_LIB} AND EXISTS ${LOG4CPP_INCLUDE}) message(STATUS "Found Log4cpp") include_directories(${LOG4CPP_INCLUDE}) link_directories(${LOG4CPP_LIB}) +else() + message(FATAL_ERROR "Log4cpp not found") endif(EXISTS ${LOG4CPP_LIB} AND EXISTS ${LOG4CPP_INCLUDE}) # ================================================ diff --git a/TODO.txt b/TODO.txt index 7f5ebc0..e06f0be 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,6 +1,8 @@ ---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) ----------------------------- +- 
implement tokenAnnotations vector as interval tree IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? Należy przechowywać oryginalne pozycje tokenów w bazie danych concordia-server. Pozycje te będą obliczane przez funkcję generateTokenVector (przy użyciu listy oryginalnych pozycji, która będzie modyfikowana synchronicznie z każdą anonimizacją) +- concordia_search_result nie musi przechowywać całego tokenVector (bo używa go tylko do odczytania size()). - wiele pamięci tłumaczeń: można je przechowywać w jednym indeksie, ale trzeba dodać tm_id jako metadane zdania (np. zamiast example length). Przy wyszukiwaniu należy filtrować wyniki, aby pochodziły z odpowiedniej pamięci tłumaczeń. - testy zużycia pamięci - Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła. diff --git a/concordia/CMakeLists.txt b/concordia/CMakeLists.txt index 70ac20d..f59f12e 100644 --- a/concordia/CMakeLists.txt +++ b/concordia/CMakeLists.txt @@ -6,10 +6,13 @@ foreach(dir ${ALL_DIRECTORIES}) endforeach(dir) add_library(concordia SHARED + token_annotation.cpp + anonymized_sentence.cpp + hashed_sentence.cpp concordia_search_result.cpp matched_pattern_fragment.cpp concordia_searcher.cpp - regex_replacement.cpp + regex_rule.cpp sentence_anonymizer.cpp interval.cpp tm_matches.cpp @@ -33,10 +36,13 @@ add_subdirectory(t) install(TARGETS concordia DESTINATION lib/) install(FILES + token_annotation.hpp + anonymized_sentence.hpp + hashed_sentence.hpp concordia_search_result.hpp matched_pattern_fragment.hpp concordia_searcher.hpp - regex_replacement.hpp + regex_rule.hpp sentence_anonymizer.hpp interval.hpp tm_matches.hpp diff --git a/concordia/anonymized_sentence.cpp b/concordia/anonymized_sentence.cpp new file mode 100644 index 0000000..6f7c687 --- /dev/null +++ b/concordia/anonymized_sentence.cpp @@ -0,0 +1,48 @@ 
+#include "concordia/anonymized_sentence.hpp" +#include "concordia/common/text_utils.hpp" + +#include + +AnonymizedSentence::AnonymizedSentence(std::string sentence): + _sentence(sentence) { +} + +AnonymizedSentence::~AnonymizedSentence() { +} + +/* Merges the given annotations into _tokenAnnotations by walking both sequences in step: a new annotation that intersects the current existing one is dropped, one that starts before it is inserted in front of it, and once the existing ones run out the rest are appended. */ +void AnonymizedSentence::addAnnotations(std::vector annotations) { + std::vector::iterator newAnnotation = annotations.begin(); + std::list::iterator existingAnnotation = _tokenAnnotations.begin(); + + while(newAnnotation != annotations.end()) { + if (existingAnnotation != _tokenAnnotations.end()) { + // there are still some existing annotations, so perform checks + if (newAnnotation->intersects(*existingAnnotation)) { + // The new annotation intersects with the existing. + // We can not add it, so let us just move on to the + // next new annotation. + newAnnotation++; + } else { + // it is now important whether the new interval is before + // or after existing + if (newAnnotation->getStart() < existingAnnotation->getStart()) { + // New interval does not intersect and is before existing. We add it. + _tokenAnnotations.insert(existingAnnotation, *newAnnotation); + newAnnotation++; + } else { + // If the new interval is after existing we move to the next existing annotation. + existingAnnotation++; + } + } + } else { + // no more existing annotations, so just add the new annotation + _tokenAnnotations.push_back(*newAnnotation); + newAnnotation++; + } + } + +} + +void AnonymizedSentence::toLowerCase() { + _sentence = TextUtils::getInstance().toLowerCase(_sentence); +} diff --git a/concordia/anonymized_sentence.hpp b/concordia/anonymized_sentence.hpp new file mode 100644 index 0000000..e805be0 --- /dev/null +++ b/concordia/anonymized_sentence.hpp @@ -0,0 +1,64 @@ +#ifndef ANONYMIZED_SENTENCE_HDR +#define ANONYMIZED_SENTENCE_HDR + +#include "concordia/common/config.hpp" +#include "concordia/token_annotation.hpp" +#include +#include +#include + +/*! + A sentence after anonymization operations. 
The class + holds the current string representation of the sentence + along with the annotations list. +*/ + +class AnonymizedSentence { +public: + /*! + Constructor. + + */ + AnonymizedSentence(std::string sentence); + + /*! Destructor. + */ + virtual ~AnonymizedSentence(); + + /*! Getter for sentence + \returns sentence + */ + std::string getSentence() const { + return _sentence; + } + + /*! Getter for annotations list + \returns annotations list + */ + std::list getAnnotations() const { + return _tokenAnnotations; + } + + /*! + Transform the sentence to lower case. + */ + void toLowerCase(); + + /*! + Add new annotations to the existing annotations list. Assumptions: + 1. the existing _tokenAnnotations list contains disjoint, sorted intervals; + 2. the list of annotations to be added also has the above properties. + The below algorithm will only add the annotations that do not + intersect with any of the existing ones. + + \param annotations list of annotations to be added + */ + void addAnnotations(std::vector annotations); + +private: + std::string _sentence; + + std::list _tokenAnnotations; +}; + +#endif diff --git a/concordia/compilation.dox b/concordia/compilation.dox index c40141e..27c834b 100644 --- a/concordia/compilation.dox +++ b/concordia/compilation.dox @@ -30,6 +30,7 @@ On Ubuntu 14.04, the above software comes in standard packages. 
Here is the comp - libconfig++-dev - libconfig-dev - libpcre3-dev +- libicu-dev - doxygen - texlive-font-utils @@ -39,7 +40,7 @@ If you want to install all the above packages at once, simply use the below comm \verbatim -sudo apt-get install g++ cmake libboost-dev libboost-serialization-dev libboost-test-dev libboost-filesystem-dev libboost-system-dev libboost-program-options-dev libboost-iostreams-dev libboost-regex-dev libboost-locale-dev liblog4cpp5-dev libconfig++-dev libconfig-dev libpcre3-dev doxygen texlive-font-utils +sudo apt-get install g++ cmake libboost-dev libboost-serialization-dev libboost-test-dev libboost-filesystem-dev libboost-system-dev libboost-program-options-dev libboost-iostreams-dev libboost-regex-dev libboost-locale-dev liblog4cpp5-dev libconfig++-dev libconfig-dev libpcre3-dev libicu-dev doxygen texlive-font-utils \endverbatim diff --git a/concordia/concordia_index.cpp b/concordia/concordia_index.cpp index 339d275..2519c03 100644 --- a/concordia/concordia_index.cpp +++ b/concordia/concordia_index.cpp @@ -91,7 +91,6 @@ void ConcordiaIndex::_addSingleExample( Utils::appendCharToSaucharVector(T, character); // append to markersFile - SUFFIX_MARKER_TYPE marker = Utils::createMarker( example.getId(), offset, diff --git a/concordia/concordia_search_result.hpp b/concordia/concordia_search_result.hpp index 64a6a43..6a7069f 100644 --- a/concordia/concordia_search_result.hpp +++ b/concordia/concordia_search_result.hpp @@ -22,7 +22,7 @@ class ConcordiaSearchResult { public: /*! Constructor. 
- \param tokenVector tokenized patter which was used for searching + \param tokenVector tokenized pattern which was used for searching */ explicit ConcordiaSearchResult( const std::vector & tokenVector); diff --git a/concordia/hash_generator.cpp b/concordia/hash_generator.cpp index 0385652..a004f60 100644 --- a/concordia/hash_generator.cpp +++ b/concordia/hash_generator.cpp @@ -44,7 +44,8 @@ std::vector HashGenerator::generateHash( std::vector HashGenerator::generateTokenVector( const std::string & sentence) { - std::string anonymizedSentence = _sentenceAnonymizer->anonymize(sentence); + boost::shared_ptr as = _sentenceAnonymizer->anonymize(sentence); + std::string anonymizedSentence = as->getSentence(); boost::trim(anonymizedSentence); std::vector tokenTexts; boost::split(tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"), diff --git a/concordia/hashed_sentence.cpp b/concordia/hashed_sentence.cpp new file mode 100644 index 0000000..93c1147 --- /dev/null +++ b/concordia/hashed_sentence.cpp @@ -0,0 +1,7 @@ +#include "concordia/hashed_sentence.hpp" + +HashedSentence::HashedSentence() { +} + +HashedSentence::~HashedSentence() { +} diff --git a/concordia/hashed_sentence.hpp b/concordia/hashed_sentence.hpp new file mode 100644 index 0000000..59ebd3e --- /dev/null +++ b/concordia/hashed_sentence.hpp @@ -0,0 +1,61 @@ +#ifndef HASHED_SENTENCE_HDR +#define HASHED_SENTENCE_HDR + +#include "concordia/common/config.hpp" +#include "concordia/interval.hpp" +#include +#include + +/*! + A sentence after hashing by the HashGenerator. The class holds + the list of word codes and intervals representing original + word positions in the sentence (char-based). +*/ + +class HashedSentence { +public: + /*! + Constructor. + + */ + HashedSentence(); + + /*! Destructor. + */ + virtual ~HashedSentence(); + + /*! Getter for original word positions list. 
+ \returns original word positions list + */ + std::vector getOriginalWordPositions() const { + return _originalWordPositions; + } + + /*! Getter for word codes list. + \returns word codes list + */ + std::vector getWordCodes() const { + return _wordCodes; + } + + /*! Method for adding a word code to the list + \param wordCode word code to be added + */ + void addWordCode(INDEX_CHARACTER_TYPE wordCode) { + _wordCodes.push_back(wordCode); + } + + /*! Method for adding an original word position to the list. + The interval is copied into the list, so it is taken by const + reference and const objects or temporaries may be passed. + \param originalWordPosition original word position + */ + void addWordOriginalWordPosition(const Interval & originalWordPosition) { + _originalWordPositions.push_back(originalWordPosition); + } + +private: + std::vector _originalWordPositions; + + std::vector _wordCodes; +}; + +#endif diff --git a/concordia/interval.hpp b/concordia/interval.hpp index cf63c7d..c06dfec 100644 --- a/concordia/interval.hpp +++ b/concordia/interval.hpp @@ -2,13 +2,14 @@ #define INTERVAL_HDR #include "concordia/common/config.hpp" +#include /*! - Class representing interval of a sentence, i.e. a sequence of words + Class representing interval of a sentence, i.e. a sequence of words or chars coming from that sentence. An interval only has its start and end indexes, where the start index is inclusive and end index is exclusive. For example, - an interval [2,5] of the sentence "This is just for testing purposes" is: - "just for testing". + an interval [2,5) of words of the sentence "This is just for + testing purposes" is: "just for testing". 
*/ @@ -50,6 +51,9 @@ public: return _end; } + friend std::ostream & operator << (std::ostream & o, const Interval & interval) { + return o << "[" << interval.getStart() << "," << interval.getEnd() << ")"; + } protected: SUFFIX_MARKER_TYPE _start; diff --git a/concordia/regex_replacement.cpp b/concordia/regex_replacement.cpp deleted file mode 100644 index 37f5914..0000000 --- a/concordia/regex_replacement.cpp +++ /dev/null @@ -1,44 +0,0 @@ -#include "concordia/regex_replacement.hpp" -#include -#include -#include - -RegexReplacement::RegexReplacement(std::string patternString, - std::string replacement, - bool caseSensitive) - throw(ConcordiaException): - _replacement(replacement) { - try { - if (caseSensitive) { - _pattern = boost::make_u32regex(patternString); - } else { - _pattern = boost::make_u32regex(patternString, - boost::regex::icase); - } - } catch(const std::exception & e) { - std::stringstream ss; - - ss << "Bad regex pattern: " << patternString << - " Detailed info: " << e.what(); - - if (std::string const * extra = - boost::get_error_info(e) ) { - ss << *extra; - } - throw ConcordiaException(ss.str()); - } -} - -RegexReplacement::~RegexReplacement() { -} - -std::string RegexReplacement::apply(const std::string & text) { - try { - return boost::u32regex_replace(text, _pattern, _replacement, - boost::match_default | boost::format_all); - } catch(...) 
{ - throw ConcordiaException("Exception while applying replacement rule: " - +_replacement+" to text: "+text); - } -} - diff --git a/concordia/regex_rule.cpp b/concordia/regex_rule.cpp new file mode 100644 index 0000000..83ae20f --- /dev/null +++ b/concordia/regex_rule.cpp @@ -0,0 +1,56 @@ +#include "concordia/regex_rule.hpp" +#include +#include +#include +#include + +/* Compiles patternString into a Unicode-aware regex (case-insensitive when caseSensitive is false) and stores value for later annotation; a pattern that fails to compile is reported as ConcordiaException. */ +RegexRule::RegexRule(std::string patternString, + std::string value, + bool caseSensitive) + throw(ConcordiaException): + _value(value) { + try { + /* Decode the pattern bytes explicitly as UTF-8: the UnicodeString(const char *) constructor would use ICU's default codepage, which depends on the platform and locale. */ + if (caseSensitive) { + _pattern = boost::make_u32regex(UnicodeString::fromUTF8(patternString)); + } else { + _pattern = boost::make_u32regex(UnicodeString::fromUTF8(patternString), boost::regex::icase); + } + } catch(const std::exception & e) { + std::stringstream ss; + + ss << "Bad regex pattern: " << patternString << + " Detailed info: " << e.what(); + + if (std::string const * extra = + boost::get_error_info(e) ) { + ss << *extra; + } + throw ConcordiaException(ss.str()); + } +} + +RegexRule::~RegexRule() { +} + +/* Finds every match of _pattern in the sentence text and hands the matches to sentence->addAnnotations() as TokenAnnotation objects spanning [position, position + length) and carrying _value. Positions are offsets into the decoded UnicodeString, not byte offsets into the std::string. Any std::exception is rethrown as ConcordiaException. */ +void RegexRule::apply(boost::shared_ptr sentence) { + try { + /* Same explicit UTF-8 decoding as in the constructor, so pattern and text are interpreted consistently regardless of locale. */ + UnicodeString s = UnicodeString::fromUTF8(sentence->getSentence()); + boost::u32regex_iterator begin(boost::make_u32regex_iterator(s, _pattern)); + boost::u32regex_iterator end; + std::vector annotations; + for (; begin != end; ++begin) { + SUFFIX_MARKER_TYPE matchBegin = begin->position(); + SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length(); + TokenAnnotation annotation(matchBegin, matchEnd, 'a', _value); + annotations.push_back(annotation); + } + sentence->addAnnotations(annotations); + } catch(const std::exception & e) { + std::stringstream ss; + ss << "Exception while applying regex rule: " + << _value << " to text: " << sentence->getSentence(); + ss << ", message: " << e.what(); + throw ConcordiaException(ss.str()); + } +} + diff --git a/concordia/regex_replacement.hpp b/concordia/regex_rule.hpp similarity index 50% rename from concordia/regex_replacement.hpp rename to 
concordia/regex_rule.hpp index d118100..2f74c30 100644 --- a/concordia/regex_replacement.hpp +++ b/concordia/regex_rule.hpp @@ -1,24 +1,25 @@ -#ifndef REGEX_REPLACEMENT_HDR -#define REGEX_REPLACEMENT_HDR +#ifndef REGEX_ANNOTATION_HDR +#define REGEX_ANNOTATION_HDR #include #include "concordia/common/config.hpp" +#include "concordia/anonymized_sentence.hpp" #include "concordia/concordia_exception.hpp" #include #include #include - +#include typedef boost::error_info my_tag_error_info; /*! - Class for representing a regular expression replacement operation. + Class for representing a regular expression annotation rule. Holds regex pattern string for matching and replacement string for - replacing found matches. + annotating found matches. */ -class RegexReplacement { +class RegexRule { public: /*! Constructor. @@ -26,24 +27,23 @@ public: \param replacement string to substitute the found match \param caseSensitive case sensitivity of the pattern */ - RegexReplacement(std::string patternString, std::string replacement, - bool caseSensitive = true) - throw(ConcordiaException); + RegexRule(std::string patternString, std::string value, + bool caseSensitive = true) + throw(ConcordiaException); /*! Destructor. */ - virtual ~RegexReplacement(); + virtual ~RegexRule(); - /*! Applies the operation on input string. - \param text the input string - \returns altered version of the input string + /*! Applies the operation on anonymized sentence. 
+ \param sentence the input sentence */ - std::string apply(const std::string & text); + void apply(boost::shared_ptr sentence); private: boost::u32regex _pattern; - std::string _replacement; + std::string _value; }; #endif diff --git a/concordia/sentence_anonymizer.cpp b/concordia/sentence_anonymizer.cpp index 85598d5..e0715f3 100644 --- a/concordia/sentence_anonymizer.cpp +++ b/concordia/sentence_anonymizer.cpp @@ -1,6 +1,5 @@ #include "concordia/sentence_anonymizer.hpp" -#include "concordia/common/text_utils.hpp" #include #include #include @@ -26,22 +25,24 @@ SentenceAnonymizer::SentenceAnonymizer( SentenceAnonymizer::~SentenceAnonymizer() { } -std::string SentenceAnonymizer::anonymize(const std::string & sentence) { - std::string result = sentence; +boost::shared_ptr + SentenceAnonymizer::anonymize(const std::string & sentence) { + boost::shared_ptr + result(new AnonymizedSentence(sentence)); - result = _htmlTags->apply(result); + _htmlTags->apply(result); - BOOST_FOREACH(RegexReplacement & neRule, _namedEntities) { - result = neRule.apply(result); + BOOST_FOREACH(RegexRule & neRule, _namedEntities) { + neRule.apply(result); } - result = TextUtils::getInstance().toLowerCase(result); + result->toLowerCase(); if (_stopWordsEnabled) { - result = _stopWords->apply(result); + _stopWords->apply(result); } - result = _stopSymbols->apply(result); - result = _spaceSymbols->apply(result); + _stopSymbols->apply(result); + _spaceSymbols->apply(result); return result; } @@ -64,7 +65,7 @@ void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) { << " in NE file: " << namedEntitiesPath; throw ConcordiaException(ss.str()); } else { - _namedEntities.push_back(RegexReplacement( + _namedEntities.push_back(RegexRule( tokenTexts->at(0), tokenTexts->at(1))); } } @@ -95,11 +96,11 @@ void SentenceAnonymizer::_createHtmlTagsRule(std::string & htmlTagsPath) { } tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1); tagsExpression += "br).*?>"; - _htmlTags = 
boost::shared_ptr( - new RegexReplacement(tagsExpression, "", false)); + _htmlTags = boost::shared_ptr( + new RegexRule(tagsExpression, "", false)); } -boost::shared_ptr +boost::shared_ptr SentenceAnonymizer::_getMultipleReplacementRule( std::string & filePath, std::string replacement, bool wholeWord) { std::string expression = "("; @@ -126,7 +127,7 @@ boost::shared_ptr } expression = expression.substr(0, expression.size()-1); expression += ")"; - return boost::shared_ptr( - new RegexReplacement(expression, replacement, false)); + return boost::shared_ptr( + new RegexRule(expression, replacement, false)); } diff --git a/concordia/sentence_anonymizer.hpp b/concordia/sentence_anonymizer.hpp index 99715d8..db8e102 100644 --- a/concordia/sentence_anonymizer.hpp +++ b/concordia/sentence_anonymizer.hpp @@ -4,7 +4,8 @@ #include #include #include "concordia/common/config.hpp" -#include "concordia/regex_replacement.hpp" +#include "concordia/anonymized_sentence.hpp" +#include "concordia/regex_rule.hpp" #include "concordia/concordia_config.hpp" #include "concordia/concordia_exception.hpp" #include @@ -37,29 +38,30 @@ public: \param sentence input sentence \returns altered version of the input sentence */ - std::string anonymize(const std::string & sentence); + boost::shared_ptr + anonymize(const std::string & sentence); private: void _createNeRules(std::string & namedEntitiesPath); void _createHtmlTagsRule(std::string & htmlTagsPath); - boost::shared_ptr _getMultipleReplacementRule( + boost::shared_ptr _getMultipleReplacementRule( std::string & filePath, std::string replacement, bool wholeWord = false); - std::vector _namedEntities; + std::vector _namedEntities; - boost::shared_ptr _htmlTags; + boost::shared_ptr _htmlTags; bool _stopWordsEnabled; - boost::shared_ptr _stopWords; + boost::shared_ptr _stopWords; - boost::shared_ptr _stopSymbols; + boost::shared_ptr _stopSymbols; - boost::shared_ptr _spaceSymbols; + boost::shared_ptr _spaceSymbols; }; #endif diff --git 
a/concordia/t/CMakeLists.txt b/concordia/t/CMakeLists.txt index 5886596..9020c3b 100644 --- a/concordia/t/CMakeLists.txt +++ b/concordia/t/CMakeLists.txt @@ -1,8 +1,9 @@ add_library(concordia-tests + test_regex_rule.cpp + test_anonymized_sentence.cpp test_concordia_searcher.cpp test_sentence_anonymizer.cpp test_text_utils.cpp - test_regex_replacement.cpp test_example.cpp test_tm_matches.cpp test_interval.cpp diff --git a/concordia/t/test_anonymized_sentence.cpp b/concordia/t/test_anonymized_sentence.cpp new file mode 100644 index 0000000..334cbda --- /dev/null +++ b/concordia/t/test_anonymized_sentence.cpp @@ -0,0 +1,86 @@ +#include "tests/unit-tests/unit_tests_globals.hpp" +#include "concordia/anonymized_sentence.hpp" +#include "concordia/token_annotation.hpp" +#include "concordia/common/config.hpp" +#include + +BOOST_AUTO_TEST_SUITE(anonymized_sentence) + +BOOST_AUTO_TEST_CASE( AnnotationsTrivial ) +{ + AnonymizedSentence as("This is a test sentence"); + + std::vector annotations; + annotations.push_back(TokenAnnotation(0,1,'a',"val")); + annotations.push_back(TokenAnnotation(4,6,'a',"val")); + annotations.push_back(TokenAnnotation(7,10,'a',"val")); + annotations.push_back(TokenAnnotation(12,14,'a',"val")); + + as.addAnnotations(annotations); + + BOOST_CHECK_EQUAL(as.getAnnotations().size(), 4); + +} + +BOOST_AUTO_TEST_CASE( AnnotationsIntersecting ) +{ + AnonymizedSentence as("This is a test sentence"); + + std::vector annotations1; + annotations1.push_back(TokenAnnotation(0,1,'a',"val")); + annotations1.push_back(TokenAnnotation(4,6,'a',"val")); + annotations1.push_back(TokenAnnotation(7,10,'a',"val")); + annotations1.push_back(TokenAnnotation(12,14,'a',"val")); + as.addAnnotations(annotations1); + /* annotation + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 + - ---- ------- ----- + + */ + + std::vector annotations2; + annotations2.push_back(TokenAnnotation(1,4,'a',"val")); + annotations2.push_back(TokenAnnotation(4,7,'a',"val")); + 
annotations2.push_back(TokenAnnotation(10,11,'a',"val")); + annotations2.push_back(TokenAnnotation(11,13,'a',"val")); + as.addAnnotations(annotations2); + /* annotations2 + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 + ------- ------- -- ----- + + expecting: + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 + - ------- ---- ------- -- ----- + + */ + BOOST_CHECK_EQUAL(as.getAnnotations().size(), 6); + std::list annotations = as.getAnnotations(); + std::list::iterator iter = annotations.begin(); + + BOOST_CHECK_EQUAL(iter->getStart(),0); + BOOST_CHECK_EQUAL(iter->getEnd(),1); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),1); + BOOST_CHECK_EQUAL(iter->getEnd(),4); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),4); + BOOST_CHECK_EQUAL(iter->getEnd(),6); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),7); + BOOST_CHECK_EQUAL(iter->getEnd(),10); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),10); + BOOST_CHECK_EQUAL(iter->getEnd(),11); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),12); + BOOST_CHECK_EQUAL(iter->getEnd(),14); + +} + + +BOOST_AUTO_TEST_SUITE_END() diff --git a/concordia/t/test_regex_replacement.cpp b/concordia/t/test_regex_replacement.cpp deleted file mode 100644 index 7311e10..0000000 --- a/concordia/t/test_regex_replacement.cpp +++ /dev/null @@ -1,66 +0,0 @@ -#include "tests/unit-tests/unit_tests_globals.hpp" -#include "concordia/regex_replacement.hpp" -#include "concordia/common/config.hpp" -#include -#include -#include - -BOOST_AUTO_TEST_SUITE(regex_replacement) - -BOOST_AUTO_TEST_CASE( SimpleReplacement ) -{ - RegexReplacement rr("a","b"); - BOOST_CHECK_EQUAL(rr.apply("xxxxxxxaxxxaxxaxaxa"),"xxxxxxxbxxxbxxbxbxb"); -} - -BOOST_AUTO_TEST_CASE( BadRegex ) -{ - bool exceptionThrown = false; - std::string message = ""; - try { - RegexReplacement rr("+a","b"); - } catch (ConcordiaException & e) { - exceptionThrown = true; - message = e.what(); - } - BOOST_CHECK_EQUAL(exceptionThrown, true); - BOOST_CHECK_EQUAL(boost::starts_with(message, "Bad regex pattern"), true); 
-} - -BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement ) -{ - RegexReplacement rr("['\"\\\\.]",""); - BOOST_CHECK_EQUAL(rr.apply("Don't stop believin' \\ Hold on to the feelin'."),"Dont stop believin Hold on to the feelin"); -} - -BOOST_AUTO_TEST_CASE( BackrefReplacement ) -{ - RegexReplacement rr("(\\d+)","the number: \\1"); - BOOST_CHECK_EQUAL(rr.apply("This is 12 and this is 812."),"This is the number: 12 and this is the number: 812."); -} - -BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement ) -{ - RegexReplacement rr("abc","xxx", false); - BOOST_CHECK_EQUAL(rr.apply("This is AbC and ABC and abc and aBC."),"This is xxx and xxx and xxx and xxx."); -} - -BOOST_AUTO_TEST_CASE( UnicodeReplacement ) -{ - RegexReplacement rr("ą","x"); - BOOST_CHECK_EQUAL(rr.apply("zażółć gęślą jaźń"),"zażółć gęślx jaźń"); -} - -BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement ) -{ - RegexReplacement rr("ą","x", false); - BOOST_CHECK_EQUAL(rr.apply("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"),"zażółć gęślx jaźń ZAŻÓŁĆ GĘŚLx JAŹŃ"); -} - -BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement ) -{ - RegexReplacement rr("[ąćęłńóśżź]","x", false); - BOOST_CHECK_EQUAL(rr.apply("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"),"zaxxxx gxxlx jaxx ZAxxxx GxxLx JAxx"); -} - -BOOST_AUTO_TEST_SUITE_END() diff --git a/concordia/t/test_regex_rule.cpp b/concordia/t/test_regex_rule.cpp new file mode 100644 index 0000000..7922452 --- /dev/null +++ b/concordia/t/test_regex_rule.cpp @@ -0,0 +1,221 @@ +#include "tests/unit-tests/unit_tests_globals.hpp" +#include "concordia/regex_rule.hpp" +#include "concordia/anonymized_sentence.hpp" +#include "concordia/common/config.hpp" +#include +#include +#include +#include + +BOOST_AUTO_TEST_SUITE(regex_rule) + +BOOST_AUTO_TEST_CASE( SimpleReplacement ) +{ + RegexRule rr("a","b"); + boost::shared_ptr as(new AnonymizedSentence("xxxxxxxaxxxaxxaxaxa")); + rr.apply(as); + BOOST_CHECK_EQUAL(as->getAnnotations().size(),5); + std::list annotations = as->getAnnotations(); + 
std::list::iterator iter = annotations.begin(); + + BOOST_CHECK_EQUAL(iter->getStart(),7); + BOOST_CHECK_EQUAL(iter->getEnd(),8); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),11); + BOOST_CHECK_EQUAL(iter->getEnd(),12); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),14); + BOOST_CHECK_EQUAL(iter->getEnd(),15); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),16); + BOOST_CHECK_EQUAL(iter->getEnd(),17); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),18); + BOOST_CHECK_EQUAL(iter->getEnd(),19); +} + +BOOST_AUTO_TEST_CASE( BadRegex ) +{ + bool exceptionThrown = false; + std::string message = ""; + try { + RegexRule rr("+a","b"); + } catch (ConcordiaException & e) { + exceptionThrown = true; + message = e.what(); + } + BOOST_CHECK_EQUAL(exceptionThrown, true); + BOOST_CHECK_EQUAL(boost::starts_with(message, "Bad regex pattern"), true); +} + +BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement ) +{ + RegexRule rr("['\"\\\\.]",""); + boost::shared_ptr as(new AnonymizedSentence("Don't stop believin' \\ Hold on to the feelin'.")); + rr.apply(as); + BOOST_CHECK_EQUAL(as->getAnnotations().size(),5); + std::list annotations = as->getAnnotations(); + std::list::iterator iter = annotations.begin(); + + BOOST_CHECK_EQUAL(iter->getStart(),3); + BOOST_CHECK_EQUAL(iter->getEnd(),4); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),19); + BOOST_CHECK_EQUAL(iter->getEnd(),20); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),21); + BOOST_CHECK_EQUAL(iter->getEnd(),22); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),44); + BOOST_CHECK_EQUAL(iter->getEnd(),45); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),45); + BOOST_CHECK_EQUAL(iter->getEnd(),46); +} + + +BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement ) +{ + RegexRule rr("abc","xxx", false); + boost::shared_ptr as(new AnonymizedSentence("This is AbC and ABC and abc and aBC.")); + rr.apply(as); + BOOST_CHECK_EQUAL(as->getAnnotations().size(),4); + std::list annotations = as->getAnnotations(); + std::list::iterator iter = 
annotations.begin(); + + BOOST_CHECK_EQUAL(iter->getStart(),8); + BOOST_CHECK_EQUAL(iter->getEnd(),11); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),16); + BOOST_CHECK_EQUAL(iter->getEnd(),19); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),24); + BOOST_CHECK_EQUAL(iter->getEnd(),27); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),32); + BOOST_CHECK_EQUAL(iter->getEnd(),35); +} + +BOOST_AUTO_TEST_CASE( UnicodeReplacement ) +{ + RegexRule rr("ą","x"); + boost::shared_ptr as(new AnonymizedSentence("zażółć gęślą jaźń")); + rr.apply(as); + BOOST_CHECK_EQUAL(as->getAnnotations().size(),1); + std::list annotations = as->getAnnotations(); + std::list::iterator iter = annotations.begin(); + + BOOST_CHECK_EQUAL(iter->getStart(),11); + BOOST_CHECK_EQUAL(iter->getEnd(),12); +} + +BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement ) +{ + RegexRule rr("ą","x", false); + boost::shared_ptr as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ")); + rr.apply(as); + BOOST_CHECK_EQUAL(as->getAnnotations().size(),2); + std::list annotations = as->getAnnotations(); + std::list::iterator iter = annotations.begin(); + + BOOST_CHECK_EQUAL(iter->getStart(),11); + BOOST_CHECK_EQUAL(iter->getEnd(),12); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),29); + BOOST_CHECK_EQUAL(iter->getEnd(),30); +} + +BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement ) +{ + RegexRule rr("[ąćęłńóśżź]","x", false); + boost::shared_ptr as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ")); + rr.apply(as); + BOOST_CHECK_EQUAL(as->getAnnotations().size(),18); + std::list annotations = as->getAnnotations(); + std::list::iterator iter = annotations.begin(); + + BOOST_CHECK_EQUAL(iter->getStart(),2); + BOOST_CHECK_EQUAL(iter->getEnd(),3); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),3); + BOOST_CHECK_EQUAL(iter->getEnd(),4); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),4); + BOOST_CHECK_EQUAL(iter->getEnd(),5); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),5); 
+ BOOST_CHECK_EQUAL(iter->getEnd(),6); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),8); + BOOST_CHECK_EQUAL(iter->getEnd(),9); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),9); + BOOST_CHECK_EQUAL(iter->getEnd(),10); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),11); + BOOST_CHECK_EQUAL(iter->getEnd(),12); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),15); + BOOST_CHECK_EQUAL(iter->getEnd(),16); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),16); + BOOST_CHECK_EQUAL(iter->getEnd(),17); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),20); + BOOST_CHECK_EQUAL(iter->getEnd(),21); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),21); + BOOST_CHECK_EQUAL(iter->getEnd(),22); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),22); + BOOST_CHECK_EQUAL(iter->getEnd(),23); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),23); + BOOST_CHECK_EQUAL(iter->getEnd(),24); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),26); + BOOST_CHECK_EQUAL(iter->getEnd(),27); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),27); + BOOST_CHECK_EQUAL(iter->getEnd(),28); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),29); + BOOST_CHECK_EQUAL(iter->getEnd(),30); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),33); + BOOST_CHECK_EQUAL(iter->getEnd(),34); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),34); + BOOST_CHECK_EQUAL(iter->getEnd(),35); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/concordia/t/test_sentence_anonymizer.cpp b/concordia/t/test_sentence_anonymizer.cpp index 932552c..a712059 100644 --- a/concordia/t/test_sentence_anonymizer.cpp +++ b/concordia/t/test_sentence_anonymizer.cpp @@ -17,7 +17,7 @@ BOOST_AUTO_TEST_CASE( NETest ) std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34"; - BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"date ne_date mail ne_email number ne_number"); + BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"date ne_date mail ne_email number ne_number"); } BOOST_AUTO_TEST_CASE( HtmlTagsTest ) @@ -27,7 
+27,7 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest ) std::string sentence = "link and bold and newline
"; - BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"link and bold and newline "); + BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"link and bold and newline "); } @@ -37,7 +37,7 @@ BOOST_AUTO_TEST_CASE( StopWordsTest ) if (config->isStopWordsEnabled()) { SentenceAnonymizer anonymizer(config); std::string sentence = "Aczkolwiek nie wiem, czy to konieczne"; - BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)," wiem konieczne"); + BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence()," wiem konieczne"); } } @@ -48,7 +48,7 @@ BOOST_AUTO_TEST_CASE( StopSymbolsTest ) std::string sentence = "xxx, . xxx # xx $xx@ xx"; - BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"xxx xxx xx xx xx"); + BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx xxx xx xx xx"); } @@ -59,7 +59,7 @@ BOOST_AUTO_TEST_CASE( SpaceSymbolsTest ) std::string sentence = "xxx-xxx xx|xx"; - BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"xxx xxx xx xx"); + BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx xxx xx xx"); } @@ -69,7 +69,7 @@ BOOST_AUTO_TEST_CASE( WeirdSentenceTest ) SentenceAnonymizer anonymizer(config); std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |"; - BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"sony dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk 
ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap rm mne_numbergrm mne_numbere — ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number "); + BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"sony dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf 
mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap rm mne_numbergrm mne_numbere — ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number "); } diff --git a/concordia/token_annotation.cpp b/concordia/token_annotation.cpp new file mode 100644 index 0000000..a0b7c03 --- /dev/null +++ b/concordia/token_annotation.cpp @@ -0,0 +1,15 @@ +#include "concordia/token_annotation.hpp" + + +TokenAnnotation::TokenAnnotation(const SUFFIX_MARKER_TYPE start, + const SUFFIX_MARKER_TYPE end, + const char annotationType, + const std::string & value): + Interval(start, end), + _annotationType(annotationType), + _value(value) { +} + +TokenAnnotation::~TokenAnnotation() { +} + diff --git a/concordia/token_annotation.hpp b/concordia/token_annotation.hpp new file mode 100644 index 0000000..0c805bb --- /dev/null +++ b/concordia/token_annotation.hpp @@ -0,0 +1,53 @@ +#ifndef TOKEN_ANNOTATION_HDR +#define TOKEN_ANNOTATION_HDR + +#include "concordia/common/config.hpp" +#include "concordia/interval.hpp" + +#include + +/*! + Class representing the annotation of a char sequence as a token. + It is a type of interval that also stores information + about the annotation type and value. + +*/ + +class TokenAnnotation : public Interval { +public: + /*! Constructor. + \param start start index of the annotation (char-level, 0-based) + \param end end index of the annotation (char-level, 0-based) + \param annotationType annotation type + \param value annotation value + */ + TokenAnnotation(const SUFFIX_MARKER_TYPE start, + const SUFFIX_MARKER_TYPE end, + const char annotationType, + const std::string & value); + + /*! Destructor. + */ + virtual ~TokenAnnotation(); + + /*! Getter for annotation type.
+ \returns annotation type + */ + char getType() const { + return _annotationType; + } + + /*! Getter for annotation value. + \returns annotation value + */ + std::string getValue() const { + return _value; + } + +protected: + char _annotationType; + + std::string _value; +}; + +#endif diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index c0033a7..78beef7 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -17,6 +17,7 @@ add_executable(first first.cpp) target_link_libraries(first concordia) target_link_libraries(first config++) target_link_libraries(first log4cpp) +target_link_libraries(first icui18n) target_link_libraries(first ${Boost_LIBRARIES}) target_link_libraries(first divsufsort) target_link_libraries(first utf8case) @@ -27,6 +28,7 @@ add_executable(simple_search simple_search.cpp) target_link_libraries(simple_search concordia) target_link_libraries(simple_search config++) target_link_libraries(simple_search log4cpp) +target_link_libraries(simple_search icui18n) target_link_libraries(simple_search ${Boost_LIBRARIES}) target_link_libraries(simple_search divsufsort) target_link_libraries(simple_search utf8case) @@ -38,6 +40,7 @@ add_executable(concordia_search concordia_search.cpp) target_link_libraries(concordia_search concordia) target_link_libraries(concordia_search config++) target_link_libraries(concordia_search log4cpp) +target_link_libraries(concordia_search icui18n) target_link_libraries(concordia_search ${Boost_LIBRARIES}) target_link_libraries(concordia_search divsufsort) target_link_libraries(concordia_search utf8case)