character intervals in progress

This commit is contained in:
rjawor 2015-06-22 13:52:56 +02:00
parent 4c0f2fd08d
commit 0baf3e4ef2
25 changed files with 705 additions and 167 deletions

View File

@ -103,26 +103,43 @@ find_package(Boost COMPONENTS
# ----------------------------------------------------
# libconfig
# ----------------------------------------------------
find_library(LIBCONFIG_LIB NAMES config++ REQUIRED)
find_library(LIBCONFIG_LIB NAMES config++)
find_path(LIBCONFIG_INCLUDE libconfig.h++)
if(EXISTS "${LIBCONFIG_LIB}" AND EXISTS "${LIBCONFIG_INCLUDE}")
  message(STATUS "Found Libconfig")
  include_directories(${LIBCONFIG_INCLUDE})
  # find_library() yields the full path of the library file, whereas
  # link_directories() expects a directory: pass the containing directory.
  get_filename_component(LIBCONFIG_LIB_DIR "${LIBCONFIG_LIB}" PATH)
  link_directories(${LIBCONFIG_LIB_DIR})
else()
  message(FATAL_ERROR "Libconfig not found")
endif()
# ----------------------------------------------------
# ICU (I feeeeel youuuuu...)
# ----------------------------------------------------
find_library(ICU_LIB NAMES icui18n)
find_path(ICU_INCLUDE unicode)
if(EXISTS "${ICU_LIB}" AND EXISTS "${ICU_INCLUDE}")
  message(STATUS "Found ICU")
  include_directories(${ICU_INCLUDE})
  # find_library() yields the full path of the library file, whereas
  # link_directories() expects a directory: pass the containing directory.
  get_filename_component(ICU_LIB_DIR "${ICU_LIB}" PATH)
  link_directories(${ICU_LIB_DIR})
else()
  message(FATAL_ERROR "ICU not found")
endif()
# ----------------------------------------------------
# Logging
# ----------------------------------------------------
find_library(LOG4CPP_LIB NAMES log4cpp REQUIRED)
find_library(LOG4CPP_LIB NAMES log4cpp)
find_path(LOG4CPP_INCLUDE log4cpp/Appender.hh)
if(EXISTS "${LOG4CPP_LIB}" AND EXISTS "${LOG4CPP_INCLUDE}")
  message(STATUS "Found Log4cpp")
  include_directories(${LOG4CPP_INCLUDE})
  # find_library() yields the full path of the library file, whereas
  # link_directories() expects a directory: pass the containing directory.
  get_filename_component(LOG4CPP_LIB_DIR "${LOG4CPP_LIB}" PATH)
  link_directories(${LOG4CPP_LIB_DIR})
else()
  message(FATAL_ERROR "Log4cpp not found")
endif()
# ================================================

View File

@ -1,6 +1,8 @@
---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) -----------------------------
- implement tokenAnnotations vector as interval tree
IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? Należy przechowywać oryginalne pozycje tokenów w bazie danych concordia-server. Pozycje te będą obliczane przez funkcję generateTokenVector (przy użyciu listy oryginalnych pozycji, która będzie modyfikowana synchronicznie z każdą anonimizacją)
- concordia_search_result nie musi przechowywać całego tokenVector (bo używa go tylko do odczytania size()).
- wiele pamięci tłumaczeń: można je przechowywać w jednym indeksie, ale trzeba dodać tm_id jako metadane zdania (np. zamiast example length). Przy wyszukiwaniu należy filtrować wyniki, aby pochodziły z odpowiedniej pamięci tłumaczeń.
- testy zużycia pamięci
- Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła.

View File

@ -6,10 +6,13 @@ foreach(dir ${ALL_DIRECTORIES})
endforeach(dir)
add_library(concordia SHARED
token_annotation.cpp
anonymized_sentence.cpp
hashed_sentence.cpp
concordia_search_result.cpp
matched_pattern_fragment.cpp
concordia_searcher.cpp
regex_replacement.cpp
regex_rule.cpp
sentence_anonymizer.cpp
interval.cpp
tm_matches.cpp
@ -33,10 +36,13 @@ add_subdirectory(t)
install(TARGETS concordia DESTINATION lib/)
install(FILES
token_annotation.hpp
anonymized_sentence.hpp
hashed_sentence.hpp
concordia_search_result.hpp
matched_pattern_fragment.hpp
concordia_searcher.hpp
regex_replacement.hpp
regex_rule.hpp
sentence_anonymizer.hpp
interval.hpp
tm_matches.hpp

View File

@ -0,0 +1,48 @@
#include "concordia/anonymized_sentence.hpp"
#include "concordia/common/text_utils.hpp"
#include <iostream>
// Keeps its own copy of the sentence text; no annotations are added here.
AnonymizedSentence::AnonymizedSentence(std::string sentence)
    : _sentence(sentence)
{}
// Nothing to do explicitly in the destructor body.
AnonymizedSentence::~AnonymizedSentence() {}
void AnonymizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) {
std::vector<TokenAnnotation>::iterator newAnnotation = annotations.begin();
std::list<TokenAnnotation>::iterator existingAnnotation = _tokenAnnotations.begin();
while(newAnnotation != annotations.end()) {
if (existingAnnotation != _tokenAnnotations.end()) {
// there are still some existing annotations, so perform checks
if (newAnnotation->intersects(*existingAnnotation)) {
// The new annotation intersects with the existing.
// We can not add it, so let us just move on to the
// next new annoation.
newAnnotation++;
} else {
// it is now important whether the new interval is before
// or after existing
if (newAnnotation->getStart() < existingAnnotation->getStart()) {
// New interval does not intersect and is before existing. We add it.
_tokenAnnotations.insert(existingAnnotation, *newAnnotation);
newAnnotation++;
} else {
// If the new interval is after existing we move to the next existing annoation.
existingAnnotation++;
}
}
} else {
// no more existing annotations, so just add the new annotation
_tokenAnnotations.push_back(*newAnnotation);
newAnnotation++;
}
}
}
void AnonymizedSentence::toLowerCase() {
_sentence = TextUtils::getInstance().toLowerCase(_sentence);
}

View File

@ -0,0 +1,64 @@
#ifndef ANONYMIZED_SENTENCE_HDR
#define ANONYMIZED_SENTENCE_HDR
#include "concordia/common/config.hpp"
#include "concordia/token_annotation.hpp"
#include <string>
#include <vector>
#include <list>
/*!
  A sentence after anonymization operations. The class
  holds the current string representation of the sentence
  along with the list of token annotations attached to it.
*/
class AnonymizedSentence {
public:
/*!
  Constructor.
  \param sentence text of the sentence; a copy is stored
                  as the current string representation
*/
AnonymizedSentence(std::string sentence);
/*! Destructor.
*/
virtual ~AnonymizedSentence();
/*! Getter for sentence
  \returns copy of the current string representation of the sentence
*/
std::string getSentence() const {
return _sentence;
}
/*! Getter for annotations list
  \returns copy of the annotations list
*/
std::list<TokenAnnotation> getAnnotations() const {
return _tokenAnnotations;
}
/*!
  Transform the sentence to lower case.
*/
void toLowerCase();
/*!
  Add new annotations to the existing annotations list. Assumptions:
  1. the existing _tokenAnnotations list contains disjoint, sorted intervals;
  2. the vector of annotations to be added also has the above properties.
  The algorithm only adds the annotations that do not
  intersect with any of the existing ones; intersecting
  annotations are skipped.
  \param annotations vector of annotations to be added
*/
void addAnnotations(std::vector<TokenAnnotation> annotations);
private:
// Current string representation of the sentence.
std::string _sentence;
// Annotations of the sentence (see addAnnotations for the expected ordering).
std::list<TokenAnnotation> _tokenAnnotations;
};
#endif

View File

@ -30,6 +30,7 @@ On Ubuntu 14.04, the above software comes in standard packages. Here is the comp
- libconfig++-dev
- libconfig-dev
- libpcre3-dev
- libicu-dev
- doxygen
- texlive-font-utils
@ -39,7 +40,7 @@ If you want to install all the above packages at once, simply use the below comm
\verbatim
sudo apt-get install g++ cmake libboost-dev libboost-serialization-dev libboost-test-dev libboost-filesystem-dev libboost-system-dev libboost-program-options-dev libboost-iostreams-dev libboost-regex-dev libboost-locale-dev liblog4cpp5-dev libconfig++-dev libconfig-dev libpcre3-dev doxygen texlive-font-utils
sudo apt-get install g++ cmake libboost-dev libboost-serialization-dev libboost-test-dev libboost-filesystem-dev libboost-system-dev libboost-program-options-dev libboost-iostreams-dev libboost-regex-dev libboost-locale-dev liblog4cpp5-dev libconfig++-dev libconfig-dev libpcre3-dev libicu-dev doxygen texlive-font-utils
\endverbatim

View File

@ -91,7 +91,6 @@ void ConcordiaIndex::_addSingleExample(
Utils::appendCharToSaucharVector(T, character);
// append to markersFile
SUFFIX_MARKER_TYPE marker = Utils::createMarker(
example.getId(),
offset,

View File

@ -22,7 +22,7 @@
class ConcordiaSearchResult {
public:
/*! Constructor.
\param tokenVector tokenized patter which was used for searching
\param tokenVector tokenized pattern which was used for searching
*/
explicit ConcordiaSearchResult(
const std::vector<std::string> & tokenVector);

View File

@ -44,7 +44,8 @@ std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
std::vector<std::string> HashGenerator::generateTokenVector(
const std::string & sentence) {
std::string anonymizedSentence = _sentenceAnonymizer->anonymize(sentence);
boost::shared_ptr<AnonymizedSentence> as = _sentenceAnonymizer->anonymize(sentence);
std::string anonymizedSentence = as->getSentence();
boost::trim(anonymizedSentence);
std::vector<std::string> tokenTexts;
boost::split(tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"),

View File

@ -0,0 +1,7 @@
#include "concordia/hashed_sentence.hpp"
// No explicit initialization is performed in the constructor body.
HashedSentence::HashedSentence() {}
// Nothing to do explicitly in the destructor body.
HashedSentence::~HashedSentence() {}

View File

@ -0,0 +1,61 @@
#ifndef HASHED_SENTENCE_HDR
#define HASHED_SENTENCE_HDR
#include "concordia/common/config.hpp"
#include "concordia/interval.hpp"
#include <vector>
#include <string>
/*!
A sentence after hashing by the HashGenerator. The class holds
the list of word codes and intervals representing original
word positions in the sentence (char-based).
*/
class HashedSentence {
public:
/*!
Constructor.
*/
HashedSentence();
/*! Destructor.
*/
virtual ~HashedSentence();
/*! Getter for original word positions list.
\returns original word positions list
*/
std::vector<Interval> getOriginalWordPositions() const {
return _originalWordPositions;
}
/*! Getter for word codes list.
\returns word codes list
*/
std::vector<INDEX_CHARACTER_TYPE> getWordCodes() const {
return _wordCodes;
}
/*! Method for adding a word code to the list
\param word code to be added
*/
void addWordCode(INDEX_CHARACTER_TYPE wordCode) {
_wordCodes.push_back(wordCode);
}
/*! Method for adding an original word position to the list.
\param original word position
*/
void addWordOriginalWordPosition(Interval & originalWordPosition) {
_originalWordPositions.push_back(originalWordPosition);
}
private:
std::vector<Interval> _originalWordPositions;
std::vector<INDEX_CHARACTER_TYPE> _wordCodes;
};
#endif

View File

@ -2,13 +2,14 @@
#define INTERVAL_HDR
#include "concordia/common/config.hpp"
#include <iostream>
/*!
Class representing interval of a sentence, i.e. a sequence of words
Class representing interval of a sentence, i.e. a sequence of words or chars
coming from that sentence. An interval only has its start and end indexes,
where the start index is inclusive and end index is exclusive. For example,
an interval [2,5] of the sentence "This is just for testing purposes" is:
"just for testing".
an interval [2,5) of words of the sentence "This is just for
testing purposes" is: "just for testing".
*/
@ -50,6 +51,9 @@ public:
return _end;
}
// Writes the interval as "[start,end)" to the stream and returns the stream.
friend std::ostream & operator << (std::ostream & o, const Interval & interval) {
    o << "[" << interval.getStart();
    o << "," << interval.getEnd() << ")";
    return o;
}
protected:
SUFFIX_MARKER_TYPE _start;

View File

@ -1,44 +0,0 @@
#include "concordia/regex_replacement.hpp"
#include <sstream>
#include <boost/exception/all.hpp>
#include <boost/throw_exception.hpp>
// Compiles patternString into the stored pattern (case-insensitively when
// caseSensitive is false) and remembers the replacement string.
// Any std::exception raised during compilation is converted into a
// ConcordiaException whose message starts with "Bad regex pattern".
RegexReplacement::RegexReplacement(std::string patternString,
                                   std::string replacement,
                                   bool caseSensitive)
                                         throw(ConcordiaException):
                                         _replacement(replacement) {
    try {
        _pattern = caseSensitive
            ? boost::make_u32regex(patternString)
            : boost::make_u32regex(patternString, boost::regex::icase);
    } catch(const std::exception & error) {
        std::stringstream message;
        message << "Bad regex pattern: " << patternString
                << " Detailed info: " << error.what();
        // Append the extra error info attached to the exception, if any.
        std::string const * extraInfo =
            boost::get_error_info<my_tag_error_info>(error);
        if (extraInfo) {
            message << *extraInfo;
        }
        throw ConcordiaException(message.str());
    }
}
// Nothing to do explicitly in the destructor body.
RegexReplacement::~RegexReplacement() {}
// Replaces every match of the stored pattern in text with the stored
// replacement string and returns the result.
// \param text the input string
// \returns altered version of the input string
// \throws ConcordiaException when the replacement fails; for failures
//         derived from std::exception the message carries the original
//         reason, so that the cause is not lost.
std::string RegexReplacement::apply(const std::string & text) {
    try {
        return boost::u32regex_replace(text, _pattern, _replacement,
                              boost::match_default | boost::format_all);
    } catch(const std::exception & e) {
        throw ConcordiaException("Exception while applying replacement rule: "
                      +_replacement+" to text: "+text
                      +", message: "+e.what());
    } catch(...) {
        // Unknown exception type: no further detail is available.
        throw ConcordiaException("Exception while applying replacement rule: "
                      +_replacement+" to text: "+text);
    }
}

56
concordia/regex_rule.cpp Normal file
View File

@ -0,0 +1,56 @@
#include "concordia/regex_rule.hpp"
#include <sstream>
#include <iostream>
#include <boost/exception/all.hpp>
#include <boost/throw_exception.hpp>
// Compiles patternString into the stored pattern (case-insensitively when
// caseSensitive is false) and remembers the annotation value.
// \param patternString regex pattern, decoded as UTF-8
// \param value value stored with the rule and used when annotating matches
// \param caseSensitive case sensitivity of the pattern
// \throws ConcordiaException with a message starting with
//         "Bad regex pattern" when the pattern can not be compiled
RegexRule::RegexRule(std::string patternString,
                     std::string value,
                     bool caseSensitive)
                                 throw(ConcordiaException):
                                 _value(value) {
    try {
        // Decode the pattern explicitly as UTF-8. The UnicodeString(const char *)
        // constructor converts using ICU's default codepage, which depends
        // on the runtime environment and may mangle non-ASCII patterns.
        const icu::UnicodeString pattern = icu::UnicodeString::fromUTF8(
            icu::StringPiece(patternString.data(),
                             static_cast<int32_t>(patternString.size())));
        if (caseSensitive) {
            _pattern = boost::make_u32regex(pattern);
        } else {
            _pattern = boost::make_u32regex(pattern, boost::regex::icase);
        }
    } catch(const std::exception & e) {
        std::stringstream ss;
        ss << "Bad regex pattern: " << patternString <<
            " Detailed info: " << e.what();
        // Append the extra error info attached to the exception, if any.
        if (std::string const * extra =
            boost::get_error_info<my_tag_error_info>(e) ) {
            ss << *extra;
        }
        throw ConcordiaException(ss.str());
    }
}
// Nothing to do explicitly in the destructor body.
RegexRule::~RegexRule() {}
// Finds every match of the stored pattern in the sentence text and hands
// the matches over to the sentence as annotations carrying this rule's value.
// Match positions are offsets into the ICU (UTF-16) representation of the
// text, with an exclusive end.
// \param sentence the sentence to be annotated
// \throws ConcordiaException when matching fails with a std::exception
void RegexRule::apply(boost::shared_ptr<AnonymizedSentence> sentence) {
    try {
        // Decode the text explicitly as UTF-8. The UnicodeString(const char *)
        // constructor converts using ICU's default codepage, which depends
        // on the runtime environment and may yield wrong match offsets
        // for non-ASCII text.
        const std::string text = sentence->getSentence();
        const icu::UnicodeString s = icu::UnicodeString::fromUTF8(
            icu::StringPiece(text.data(),
                             static_cast<int32_t>(text.size())));
        boost::u32regex_iterator<const UChar*> begin(
            boost::make_u32regex_iterator(s, _pattern));
        boost::u32regex_iterator<const UChar*> end;
        std::vector<TokenAnnotation> annotations;
        for (; begin != end; ++begin) {
            SUFFIX_MARKER_TYPE matchBegin = begin->position();
            SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length();
            TokenAnnotation annotation(matchBegin, matchEnd, 'a', _value);
            annotations.push_back(annotation);
        }
        // The sentence itself decides which of the matches it accepts.
        sentence->addAnnotations(annotations);
    } catch(const std::exception & e) {
        std::stringstream ss;
        ss << "Exception while applying regex rule: "
           << _value << " to text: " << sentence->getSentence();
        ss << ", message: " << e.what();
        throw ConcordiaException(ss.str());
    }
}

View File

@ -1,24 +1,25 @@
#ifndef REGEX_REPLACEMENT_HDR
#define REGEX_REPLACEMENT_HDR
#ifndef REGEX_ANNOTATION_HDR
#define REGEX_ANNOTATION_HDR
#include <string>
#include "concordia/common/config.hpp"
#include "concordia/anonymized_sentence.hpp"
#include "concordia/concordia_exception.hpp"
#include <boost/shared_ptr.hpp>
#include <boost/regex.hpp>
#include <boost/regex/icu.hpp>
#include <unicode/unistr.h>
typedef boost::error_info<struct my_tag, std::string> my_tag_error_info;
/*!
Class for representing a regular expression replacement operation.
Class for representing a regular expression annotation rule.
Holds regex pattern string for matching and replacement string for
replacing found matches.
annotating found matches.
*/
class RegexReplacement {
class RegexRule {
public:
/*!
Constructor.
@ -26,24 +27,23 @@ public:
\param value value with which the found matches are annotated
\param caseSensitive case sensitivity of the pattern
*/
RegexReplacement(std::string patternString, std::string replacement,
RegexRule(std::string patternString, std::string value,
bool caseSensitive = true)
throw(ConcordiaException);
/*! Destructor.
*/
virtual ~RegexReplacement();
virtual ~RegexRule();
/*! Applies the operation on input string.
\param text the input string
\returns altered version of the input string
/*! Applies the operation on anonymized sentence.
\param sentence the input sentence
*/
std::string apply(const std::string & text);
void apply(boost::shared_ptr<AnonymizedSentence> sentence);
private:
boost::u32regex _pattern;
std::string _replacement;
std::string _value;
};
#endif

View File

@ -1,6 +1,5 @@
#include "concordia/sentence_anonymizer.hpp"
#include "concordia/common/text_utils.hpp"
#include <boost/foreach.hpp>
#include <fstream>
#include <sstream>
@ -26,22 +25,24 @@ SentenceAnonymizer::SentenceAnonymizer(
SentenceAnonymizer::~SentenceAnonymizer() {
}
std::string SentenceAnonymizer::anonymize(const std::string & sentence) {
std::string result = sentence;
boost::shared_ptr<AnonymizedSentence>
SentenceAnonymizer::anonymize(const std::string & sentence) {
boost::shared_ptr<AnonymizedSentence>
result(new AnonymizedSentence(sentence));
result = _htmlTags->apply(result);
_htmlTags->apply(result);
BOOST_FOREACH(RegexReplacement & neRule, _namedEntities) {
result = neRule.apply(result);
BOOST_FOREACH(RegexRule & neRule, _namedEntities) {
neRule.apply(result);
}
result = TextUtils::getInstance().toLowerCase(result);
result->toLowerCase();
if (_stopWordsEnabled) {
result = _stopWords->apply(result);
_stopWords->apply(result);
}
result = _stopSymbols->apply(result);
result = _spaceSymbols->apply(result);
_stopSymbols->apply(result);
_spaceSymbols->apply(result);
return result;
}
@ -64,7 +65,7 @@ void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) {
<< " in NE file: " << namedEntitiesPath;
throw ConcordiaException(ss.str());
} else {
_namedEntities.push_back(RegexReplacement(
_namedEntities.push_back(RegexRule(
tokenTexts->at(0), tokenTexts->at(1)));
}
}
@ -95,11 +96,11 @@ void SentenceAnonymizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
}
tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
tagsExpression += "br).*?>";
_htmlTags = boost::shared_ptr<RegexReplacement>(
new RegexReplacement(tagsExpression, "", false));
_htmlTags = boost::shared_ptr<RegexRule>(
new RegexRule(tagsExpression, "", false));
}
boost::shared_ptr<RegexReplacement>
boost::shared_ptr<RegexRule>
SentenceAnonymizer::_getMultipleReplacementRule(
std::string & filePath, std::string replacement, bool wholeWord) {
std::string expression = "(";
@ -126,7 +127,7 @@ boost::shared_ptr<RegexReplacement>
}
expression = expression.substr(0, expression.size()-1);
expression += ")";
return boost::shared_ptr<RegexReplacement>(
new RegexReplacement(expression, replacement, false));
return boost::shared_ptr<RegexRule>(
new RegexRule(expression, replacement, false));
}

View File

@ -4,7 +4,8 @@
#include <string>
#include <vector>
#include "concordia/common/config.hpp"
#include "concordia/regex_replacement.hpp"
#include "concordia/anonymized_sentence.hpp"
#include "concordia/regex_rule.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_exception.hpp"
#include <boost/shared_ptr.hpp>
@ -37,29 +38,30 @@ public:
\param sentence input sentence
\returns altered version of the input sentence
*/
std::string anonymize(const std::string & sentence);
boost::shared_ptr<AnonymizedSentence>
anonymize(const std::string & sentence);
private:
void _createNeRules(std::string & namedEntitiesPath);
void _createHtmlTagsRule(std::string & htmlTagsPath);
boost::shared_ptr<RegexReplacement> _getMultipleReplacementRule(
boost::shared_ptr<RegexRule> _getMultipleReplacementRule(
std::string & filePath,
std::string replacement,
bool wholeWord = false);
std::vector<RegexReplacement> _namedEntities;
std::vector<RegexRule> _namedEntities;
boost::shared_ptr<RegexReplacement> _htmlTags;
boost::shared_ptr<RegexRule> _htmlTags;
bool _stopWordsEnabled;
boost::shared_ptr<RegexReplacement> _stopWords;
boost::shared_ptr<RegexRule> _stopWords;
boost::shared_ptr<RegexReplacement> _stopSymbols;
boost::shared_ptr<RegexRule> _stopSymbols;
boost::shared_ptr<RegexReplacement> _spaceSymbols;
boost::shared_ptr<RegexRule> _spaceSymbols;
};
#endif

View File

@ -1,8 +1,9 @@
add_library(concordia-tests
test_regex_rule.cpp
test_anonymized_sentence.cpp
test_concordia_searcher.cpp
test_sentence_anonymizer.cpp
test_text_utils.cpp
test_regex_replacement.cpp
test_example.cpp
test_tm_matches.cpp
test_interval.cpp

View File

@ -0,0 +1,86 @@
#include "tests/unit-tests/unit_tests_globals.hpp"
#include "concordia/anonymized_sentence.hpp"
#include "concordia/token_annotation.hpp"
#include "concordia/common/config.hpp"
#include <iostream>
BOOST_AUTO_TEST_SUITE(anonymized_sentence)
// Adding four disjoint, sorted annotations to a sentence without
// annotations must store all four of them.
BOOST_AUTO_TEST_CASE( AnnotationsTrivial )
{
    AnonymizedSentence as("This is a test sentence");

    const int bounds[4][2] = { {0,1}, {4,6}, {7,10}, {12,14} };
    std::vector<TokenAnnotation> annotations;
    for (int i = 0; i < 4; ++i) {
        annotations.push_back(
            TokenAnnotation(bounds[i][0], bounds[i][1], 'a', "val"));
    }
    as.addAnnotations(annotations);

    BOOST_CHECK_EQUAL(as.getAnnotations().size(), 4);
}
// Adds a second batch of annotations, some of which intersect the first
// batch; only the non-intersecting ones may be stored, in sorted order.
BOOST_AUTO_TEST_CASE( AnnotationsIntersecting )
{
    AnonymizedSentence as("This is a test sentence");
    std::vector<TokenAnnotation> annotations1;
    annotations1.push_back(TokenAnnotation(0,1,'a',"val"));
    annotations1.push_back(TokenAnnotation(4,6,'a',"val"));
    annotations1.push_back(TokenAnnotation(7,10,'a',"val"));
    annotations1.push_back(TokenAnnotation(12,14,'a',"val"));
    as.addAnnotations(annotations1);
    /* annotations1
    0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
    - ---- ------- -----
    */
    std::vector<TokenAnnotation> annotations2;
    annotations2.push_back(TokenAnnotation(1,4,'a',"val"));
    annotations2.push_back(TokenAnnotation(4,7,'a',"val"));
    annotations2.push_back(TokenAnnotation(10,11,'a',"val"));
    annotations2.push_back(TokenAnnotation(11,13,'a',"val"));
    as.addAnnotations(annotations2);
    /* annotations2
    0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
    ------- ------- -- -----
    expecting:
    0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
    - ------- ---- ------- -- -----
    */
    const int expectedCount = 6;
    const int expected[expectedCount][2] = {
        {0,1}, {1,4}, {4,6}, {7,10}, {10,11}, {12,14}
    };

    std::list<TokenAnnotation> annotations = as.getAnnotations();
    // REQUIRE aborts the test case on a size mismatch, so the loop below
    // can never dereference an iterator past the end of the list.
    BOOST_REQUIRE_EQUAL(annotations.size(), expectedCount);

    std::list<TokenAnnotation>::iterator iter = annotations.begin();
    for (int i = 0; i < expectedCount; ++i, ++iter) {
        BOOST_CHECK_EQUAL(iter->getStart(), expected[i][0]);
        BOOST_CHECK_EQUAL(iter->getEnd(), expected[i][1]);
    }
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -1,66 +0,0 @@
#include "tests/unit-tests/unit_tests_globals.hpp"
#include "concordia/regex_replacement.hpp"
#include "concordia/common/config.hpp"
#include <boost/algorithm/string/predicate.hpp>
#include <boost/locale.hpp>
#include <boost/algorithm/string/case_conv.hpp>
// Unit tests of RegexReplacement: each case builds a rule from a pattern and
// a replacement string and compares the output of apply() with the expected text.
BOOST_AUTO_TEST_SUITE(regex_replacement)
// Every occurrence of a single character is replaced.
BOOST_AUTO_TEST_CASE( SimpleReplacement )
{
RegexReplacement rr("a","b");
BOOST_CHECK_EQUAL(rr.apply("xxxxxxxaxxxaxxaxaxa"),"xxxxxxxbxxxbxxbxbxb");
}
// An invalid pattern must raise ConcordiaException with the expected message prefix.
BOOST_AUTO_TEST_CASE( BadRegex )
{
bool exceptionThrown = false;
std::string message = "";
try {
RegexReplacement rr("+a","b");
} catch (ConcordiaException & e) {
exceptionThrown = true;
message = e.what();
}
BOOST_CHECK_EQUAL(exceptionThrown, true);
BOOST_CHECK_EQUAL(boost::starts_with(message, "Bad regex pattern"), true);
}
// A character class of quotes, backslash and dot is removed from the text.
BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
{
RegexReplacement rr("['\"\\\\.]","");
BOOST_CHECK_EQUAL(rr.apply("Don't stop believin' \\ Hold on to the feelin'."),"Dont stop believin Hold on to the feelin");
}
// The replacement string may refer to a captured group (\1).
BOOST_AUTO_TEST_CASE( BackrefReplacement )
{
RegexReplacement rr("(\\d+)","the number: \\1");
BOOST_CHECK_EQUAL(rr.apply("This is 12 and this is 812."),"This is the number: 12 and this is the number: 812.");
}
// With caseSensitive == false all case variants of the pattern are replaced.
BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
{
RegexReplacement rr("abc","xxx", false);
BOOST_CHECK_EQUAL(rr.apply("This is AbC and ABC and abc and aBC."),"This is xxx and xxx and xxx and xxx.");
}
// A non-ASCII character is matched as a single character.
BOOST_AUTO_TEST_CASE( UnicodeReplacement )
{
RegexReplacement rr("ą","x");
BOOST_CHECK_EQUAL(rr.apply("zażółć gęślą jaźń"),"zażółć gęślx jaźń");
}
// Case-insensitive matching also covers the upper-case form of a non-ASCII character.
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
{
RegexReplacement rr("ą","x", false);
BOOST_CHECK_EQUAL(rr.apply("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"),"zażółć gęślx jaźń ZAŻÓŁĆ GĘŚLx JAŹŃ");
}
// Case-insensitive matching of a character class built from non-ASCII characters.
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
{
RegexReplacement rr("[ąćęłńóśżź]","x", false);
BOOST_CHECK_EQUAL(rr.apply("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"),"zaxxxx gxxlx jaxx ZAxxxx GxxLx JAxx");
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -0,0 +1,221 @@
#include "tests/unit-tests/unit_tests_globals.hpp"
#include "concordia/regex_rule.hpp"
#include "concordia/anonymized_sentence.hpp"
#include "concordia/common/config.hpp"
#include <boost/shared_ptr.hpp>
#include <boost/algorithm/string/predicate.hpp>
#include <boost/locale.hpp>
#include <boost/algorithm/string/case_conv.hpp>
BOOST_AUTO_TEST_SUITE(regex_rule)
// Every occurrence of "a" must be annotated with its [start, end) position.
BOOST_AUTO_TEST_CASE( SimpleReplacement )
{
    RegexRule rr("a","b");
    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("xxxxxxxaxxxaxxaxaxa"));
    rr.apply(as);

    const int expectedCount = 5;
    const int expected[expectedCount][2] = {
        {7,8}, {11,12}, {14,15}, {16,17}, {18,19}
    };

    std::list<TokenAnnotation> annotations = as->getAnnotations();
    // REQUIRE aborts the test case on a size mismatch, so the loop below
    // can never dereference an iterator past the end of the list.
    BOOST_REQUIRE_EQUAL(annotations.size(), expectedCount);

    std::list<TokenAnnotation>::iterator iter = annotations.begin();
    for (int i = 0; i < expectedCount; ++i, ++iter) {
        BOOST_CHECK_EQUAL(iter->getStart(), expected[i][0]);
        BOOST_CHECK_EQUAL(iter->getEnd(), expected[i][1]);
    }
}
// Constructing a rule from an invalid pattern must raise a
// ConcordiaException whose message starts with "Bad regex pattern".
BOOST_AUTO_TEST_CASE( BadRegex )
{
    std::string message = "";
    bool exceptionThrown = false;
    try {
        RegexRule rr("+a","b");
    } catch (ConcordiaException & e) {
        message = e.what();
        exceptionThrown = true;
    }

    const bool hasExpectedPrefix =
        boost::starts_with(message, "Bad regex pattern");
    BOOST_CHECK_EQUAL(exceptionThrown, true);
    BOOST_CHECK_EQUAL(hasExpectedPrefix, true);
}
// Quotes, backslash and dot must each be annotated as one-character matches.
BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
{
    RegexRule rr("['\"\\\\.]","");
    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("Don't stop believin' \\ Hold on to the feelin'."));
    rr.apply(as);

    const int expectedCount = 5;
    const int expected[expectedCount][2] = {
        {3,4}, {19,20}, {21,22}, {44,45}, {45,46}
    };

    std::list<TokenAnnotation> annotations = as->getAnnotations();
    // REQUIRE aborts the test case on a size mismatch, so the loop below
    // can never dereference an iterator past the end of the list.
    BOOST_REQUIRE_EQUAL(annotations.size(), expectedCount);

    std::list<TokenAnnotation>::iterator iter = annotations.begin();
    for (int i = 0; i < expectedCount; ++i, ++iter) {
        BOOST_CHECK_EQUAL(iter->getStart(), expected[i][0]);
        BOOST_CHECK_EQUAL(iter->getEnd(), expected[i][1]);
    }
}
// With caseSensitive == false all case variants of "abc" must be annotated.
BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
{
    RegexRule rr("abc","xxx", false);
    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("This is AbC and ABC and abc and aBC."));
    rr.apply(as);

    const int expectedCount = 4;
    const int expected[expectedCount][2] = {
        {8,11}, {16,19}, {24,27}, {32,35}
    };

    std::list<TokenAnnotation> annotations = as->getAnnotations();
    // REQUIRE aborts the test case on a size mismatch, so the loop below
    // can never dereference an iterator past the end of the list.
    BOOST_REQUIRE_EQUAL(annotations.size(), expectedCount);

    std::list<TokenAnnotation>::iterator iter = annotations.begin();
    for (int i = 0; i < expectedCount; ++i, ++iter) {
        BOOST_CHECK_EQUAL(iter->getStart(), expected[i][0]);
        BOOST_CHECK_EQUAL(iter->getEnd(), expected[i][1]);
    }
}
// A non-ASCII character must be annotated at its character position,
// not at its byte position.
BOOST_AUTO_TEST_CASE( UnicodeReplacement )
{
    RegexRule rr("ą","x");
    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń"));
    rr.apply(as);

    std::list<TokenAnnotation> annotations = as->getAnnotations();
    // REQUIRE aborts the test case on a size mismatch, so begin() is
    // guaranteed to point at an element when it is dereferenced.
    BOOST_REQUIRE_EQUAL(annotations.size(), 1);

    std::list<TokenAnnotation>::iterator iter = annotations.begin();
    BOOST_CHECK_EQUAL(iter->getStart(),11);
    BOOST_CHECK_EQUAL(iter->getEnd(),12);
}
// Case-insensitive matching must also annotate the upper-case form
// of a non-ASCII character.
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
{
    RegexRule rr("ą","x", false);
    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
    rr.apply(as);

    const int expectedCount = 2;
    const int expected[expectedCount][2] = { {11,12}, {29,30} };

    std::list<TokenAnnotation> annotations = as->getAnnotations();
    // REQUIRE aborts the test case on a size mismatch, so the loop below
    // can never dereference an iterator past the end of the list.
    BOOST_REQUIRE_EQUAL(annotations.size(), expectedCount);

    std::list<TokenAnnotation>::iterator iter = annotations.begin();
    for (int i = 0; i < expectedCount; ++i, ++iter) {
        BOOST_CHECK_EQUAL(iter->getStart(), expected[i][0]);
        BOOST_CHECK_EQUAL(iter->getEnd(), expected[i][1]);
    }
}
// Case-insensitive matching of a character class built from non-ASCII
// characters: every lower- and upper-case occurrence must be annotated
// as a one-character match.
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
{
    RegexRule rr("[ąćęłńóśżź]","x", false);
    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
    rr.apply(as);

    const int expectedCount = 18;
    const int expected[expectedCount][2] = {
        // lower-case part of the sentence
        {2,3}, {3,4}, {4,5}, {5,6}, {8,9}, {9,10}, {11,12}, {15,16}, {16,17},
        // upper-case part of the sentence
        {20,21}, {21,22}, {22,23}, {23,24}, {26,27}, {27,28}, {29,30},
        {33,34}, {34,35}
    };

    std::list<TokenAnnotation> annotations = as->getAnnotations();
    // REQUIRE aborts the test case on a size mismatch, so the loop below
    // can never dereference an iterator past the end of the list.
    BOOST_REQUIRE_EQUAL(annotations.size(), expectedCount);

    std::list<TokenAnnotation>::iterator iter = annotations.begin();
    for (int i = 0; i < expectedCount; ++i, ++iter) {
        BOOST_CHECK_EQUAL(iter->getStart(), expected[i][0]);
        BOOST_CHECK_EQUAL(iter->getEnd(), expected[i][1]);
    }
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -17,7 +17,7 @@ BOOST_AUTO_TEST_CASE( NETest )
std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34";
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"date ne_date mail ne_email number ne_number");
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"date ne_date mail ne_email number ne_number");
}
BOOST_AUTO_TEST_CASE( HtmlTagsTest )
@ -27,7 +27,7 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"link and bold and newline ");
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"link and bold and newline ");
}
@ -37,7 +37,7 @@ BOOST_AUTO_TEST_CASE( StopWordsTest )
if (config->isStopWordsEnabled()) {
SentenceAnonymizer anonymizer(config);
std::string sentence = "Aczkolwiek nie wiem, czy to konieczne";
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)," wiem konieczne");
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence()," wiem konieczne");
}
}
@ -48,7 +48,7 @@ BOOST_AUTO_TEST_CASE( StopSymbolsTest )
std::string sentence = "xxx, . xxx # xx $xx@ xx";
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"xxx xxx xx xx xx");
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx xxx xx xx xx");
}
@ -59,7 +59,7 @@ BOOST_AUTO_TEST_CASE( SpaceSymbolsTest )
std::string sentence = "xxx-xxx xx|xx";
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"xxx xxx xx xx");
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx xxx xx xx");
}
@ -69,7 +69,7 @@ BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
SentenceAnonymizer anonymizer(config);
std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"sony dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap rm mne_numbergrm mne_numbere — ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number ");
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"sony dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap rm mne_numbergrm mne_numbere — ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number ");
}

View File

@ -0,0 +1,15 @@
#include "concordia/token_annotation.hpp"
// Forwards the start/end bounds to the Interval base class and stores
// the annotation type and value in this object's own members.
TokenAnnotation::TokenAnnotation(const SUFFIX_MARKER_TYPE start,
                                 const SUFFIX_MARKER_TYPE end,
                                 const char annotationType,
                                 const std::string & value)
    : Interval(start, end),
      _annotationType(annotationType),
      _value(value) {}
// Empty destructor body: no explicit cleanup is performed here.
TokenAnnotation::~TokenAnnotation() {}

View File

@ -0,0 +1,53 @@
#ifndef TOKEN_ANNOTATION_HDR
#define TOKEN_ANNOTATION_HDR
#include "concordia/common/config.hpp"
#include "concordia/interval.hpp"
#include <string>
/*!
Class representing the annotation of a char sequence as a token.
It is a type of interval that also stores information
about the annotation type and value.
*/
class TokenAnnotation : public Interval {
public:
/*! Constructor.
\param start start index of the annotation (char-level, 0-based)
\param end end index of the annotation (char-level, 0-based)
\param annotationType annotation type
\param value annotation value
*/
TokenAnnotation(const SUFFIX_MARKER_TYPE start,
const SUFFIX_MARKER_TYPE end,
const char annotationType,
const std::string & value);
/*! Destructor.
*/
virtual ~TokenAnnotation();
/*! Getter for annotation type.
\returns annotation type
*/
char getType() const {
return _annotationType;
}
/*! Getter for annotation value.
\returns annotation value (a copy of the stored string)
*/
std::string getValue() const {
return _value;
}
protected:
/*! Annotation type, returned by getType(). */
char _annotationType;
/*! Annotation value, returned by getValue(). */
std::string _value;
};
#endif

View File

@ -17,6 +17,7 @@ add_executable(first first.cpp)
# Libraries linked into the `first` executable, listed in link order.
target_link_libraries(first
    concordia
    config++
    log4cpp
    icui18n
    ${Boost_LIBRARIES}
    divsufsort
    utf8case)
@ -27,6 +28,7 @@ add_executable(simple_search simple_search.cpp)
# Libraries linked into the `simple_search` executable, listed in link order.
target_link_libraries(simple_search
    concordia
    config++
    log4cpp
    icui18n
    ${Boost_LIBRARIES}
    divsufsort
    utf8case)
@ -38,6 +40,7 @@ add_executable(concordia_search concordia_search.cpp)
# Libraries linked into the `concordia_search` executable, listed in link order.
target_link_libraries(concordia_search
    concordia
    config++
    log4cpp
    icui18n
    ${Boost_LIBRARIES}
    divsufsort
    utf8case)