character intervals in progress
This commit is contained in:
parent
4c0f2fd08d
commit
0baf3e4ef2
@ -103,26 +103,43 @@ find_package(Boost COMPONENTS
|
||||
# ----------------------------------------------------
|
||||
# libconfig
|
||||
# ----------------------------------------------------
|
||||
find_library(LIBCONFIG_LIB NAMES config++ REQUIRED)
|
||||
find_library(LIBCONFIG_LIB NAMES config++)
|
||||
find_path(LIBCONFIG_INCLUDE libconfig.h++)
|
||||
|
||||
if(EXISTS ${LIBCONFIG_LIB} AND EXISTS ${LIBCONFIG_INCLUDE})
|
||||
message(STATUS "Found Libconfig")
|
||||
include_directories(${LIBCONFIG_INCLUDE})
|
||||
link_directories(${LIBCONFIG_LIB})
|
||||
else()
|
||||
message(FATAL_ERROR "Libconfig not found")
|
||||
endif(EXISTS ${LIBCONFIG_LIB} AND EXISTS ${LIBCONFIG_INCLUDE})
|
||||
|
||||
# ----------------------------------------------------
|
||||
# ICU (I feeeeel youuuuu...)
|
||||
# ----------------------------------------------------
|
||||
find_library(ICU_LIB NAMES icui18n)
|
||||
find_path(ICU_INCLUDE unicode)
|
||||
|
||||
if(EXISTS ${ICU_LIB} AND EXISTS ${ICU_INCLUDE})
|
||||
message(STATUS "Found ICU")
|
||||
include_directories(${ICU_INCLUDE})
|
||||
link_directories(${ICU_LIB})
|
||||
else()
|
||||
message(FATAL_ERROR "ICU not found")
|
||||
endif(EXISTS ${ICU_LIB} AND EXISTS ${ICU_INCLUDE})
|
||||
|
||||
# ----------------------------------------------------
|
||||
# Logging
|
||||
# ----------------------------------------------------
|
||||
find_library(LOG4CPP_LIB NAMES log4cpp REQUIRED)
|
||||
find_library(LOG4CPP_LIB NAMES log4cpp)
|
||||
find_path(LOG4CPP_INCLUDE log4cpp/Appender.hh)
|
||||
|
||||
if(EXISTS ${LOG4CPP_LIB} AND EXISTS ${LOG4CPP_INCLUDE})
|
||||
message(STATUS "Found Log4cpp")
|
||||
include_directories(${LOG4CPP_INCLUDE})
|
||||
link_directories(${LOG4CPP_LIB})
|
||||
else()
|
||||
message(FATAL_ERROR "Log4cpp not found")
|
||||
endif(EXISTS ${LOG4CPP_LIB} AND EXISTS ${LOG4CPP_INCLUDE})
|
||||
|
||||
# ================================================
|
||||
|
2
TODO.txt
2
TODO.txt
@ -1,6 +1,8 @@
|
||||
---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) -----------------------------
|
||||
|
||||
- implement tokenAnnotations vector as interval tree
|
||||
IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? Należy przechowywać oryginalne pozycje tokenów w bazie danych concordia-server. Pozycje te będą obliczane przez funkcję generateTokenVector (przy użyciu listy oryginalnych pozycji, która będzie modyfikowana synchronicznie z każdą anonimizacją)
|
||||
- concordia_search_result nie musi przechowywać całego tokenVector (bo używa go tylko do odczytania size()).
|
||||
- wiele pamięci tłumaczeń: można je przechowywać w jednym indeksie, ale trzeba dodać tm_id jako metadane zdania (np. zamiast example length). Przy wyszukiwaniu należy filtrować wyniki, aby pochodziły z odpowiedniej pamięci tłumaczeń.
|
||||
- testy zużycia pamięci
|
||||
- Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła.
|
||||
|
@ -6,10 +6,13 @@ foreach(dir ${ALL_DIRECTORIES})
|
||||
endforeach(dir)
|
||||
|
||||
add_library(concordia SHARED
|
||||
token_annotation.cpp
|
||||
anonymized_sentence.cpp
|
||||
hashed_sentence.cpp
|
||||
concordia_search_result.cpp
|
||||
matched_pattern_fragment.cpp
|
||||
concordia_searcher.cpp
|
||||
regex_replacement.cpp
|
||||
regex_rule.cpp
|
||||
sentence_anonymizer.cpp
|
||||
interval.cpp
|
||||
tm_matches.cpp
|
||||
@ -33,10 +36,13 @@ add_subdirectory(t)
|
||||
|
||||
install(TARGETS concordia DESTINATION lib/)
|
||||
install(FILES
|
||||
token_annotation.hpp
|
||||
anonymized_sentence.hpp
|
||||
hashed_sentence.hpp
|
||||
concordia_search_result.hpp
|
||||
matched_pattern_fragment.hpp
|
||||
concordia_searcher.hpp
|
||||
regex_replacement.hpp
|
||||
regex_rule.hpp
|
||||
sentence_anonymizer.hpp
|
||||
interval.hpp
|
||||
tm_matches.hpp
|
||||
|
48
concordia/anonymized_sentence.cpp
Normal file
48
concordia/anonymized_sentence.cpp
Normal file
@ -0,0 +1,48 @@
|
||||
#include "concordia/anonymized_sentence.hpp"
|
||||
#include "concordia/common/text_utils.hpp"
|
||||
|
||||
#include <iostream>
|
||||
|
||||
/*!
  Creates an anonymized sentence that stores its own copy of the
  given text. No annotations are added here.

  \param sentence text of the sentence
*/
AnonymizedSentence::AnonymizedSentence(std::string sentence)
    : _sentence(sentence) {
}
|
||||
|
||||
/*! Destructor. Nothing to release explicitly; members clean up after themselves. */
AnonymizedSentence::~AnonymizedSentence() {}
|
||||
|
||||
void AnonymizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) {
|
||||
std::vector<TokenAnnotation>::iterator newAnnotation = annotations.begin();
|
||||
std::list<TokenAnnotation>::iterator existingAnnotation = _tokenAnnotations.begin();
|
||||
|
||||
while(newAnnotation != annotations.end()) {
|
||||
if (existingAnnotation != _tokenAnnotations.end()) {
|
||||
// there are still some existing annotations, so perform checks
|
||||
if (newAnnotation->intersects(*existingAnnotation)) {
|
||||
// The new annotation intersects with the existing.
|
||||
// We can not add it, so let us just move on to the
|
||||
// next new annoation.
|
||||
newAnnotation++;
|
||||
} else {
|
||||
// it is now important whether the new interval is before
|
||||
// or after existing
|
||||
if (newAnnotation->getStart() < existingAnnotation->getStart()) {
|
||||
// New interval does not intersect and is before existing. We add it.
|
||||
_tokenAnnotations.insert(existingAnnotation, *newAnnotation);
|
||||
newAnnotation++;
|
||||
} else {
|
||||
// If the new interval is after existing we move to the next existing annoation.
|
||||
existingAnnotation++;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// no more existing annotations, so just add the new annotation
|
||||
_tokenAnnotations.push_back(*newAnnotation);
|
||||
newAnnotation++;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/*!
  Replaces the stored sentence text with its lower-cased form, as
  produced by TextUtils. The annotations list is not modified.
*/
void AnonymizedSentence::toLowerCase() {
    const std::string lowered = TextUtils::getInstance().toLowerCase(_sentence);
    _sentence = lowered;
}
|
64
concordia/anonymized_sentence.hpp
Normal file
64
concordia/anonymized_sentence.hpp
Normal file
@ -0,0 +1,64 @@
|
||||
#ifndef ANONYMIZED_SENTENCE_HDR
|
||||
#define ANONYMIZED_SENTENCE_HDR
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/token_annotation.hpp"
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <list>
|
||||
|
||||
/*!
|
||||
A sentence after anonymization operations. The class
|
||||
holds the current string represenation of the sentence
|
||||
along with the annotations list.
|
||||
*/
|
||||
|
||||
class AnonymizedSentence {
|
||||
public:
|
||||
/*!
|
||||
Constructor.
|
||||
|
||||
*/
|
||||
AnonymizedSentence(std::string sentence);
|
||||
|
||||
/*! Destructor.
|
||||
*/
|
||||
virtual ~AnonymizedSentence();
|
||||
|
||||
/*! Getter for sentence
|
||||
\returns sentence
|
||||
*/
|
||||
std::string getSentence() const {
|
||||
return _sentence;
|
||||
}
|
||||
|
||||
/*! Getter for annotations list
|
||||
\returns annotations list
|
||||
*/
|
||||
std::list<TokenAnnotation> getAnnotations() const {
|
||||
return _tokenAnnotations;
|
||||
}
|
||||
|
||||
/*!
|
||||
Transform the sentence to lower case.
|
||||
*/
|
||||
void toLowerCase();
|
||||
|
||||
/*!
|
||||
Add new annotations to the existing annotations list. Assumptions:
|
||||
1. existing _tokenAnnotations vector contains disjoint, sorted intervals;
|
||||
2. the annotations to be added list also has the above properties.
|
||||
The below algorithm will only add the annotations that do not
|
||||
intersect with any of the existing ones.
|
||||
|
||||
\param annotations list of annotations to be added
|
||||
*/
|
||||
void addAnnotations(std::vector<TokenAnnotation> annotations);
|
||||
|
||||
private:
|
||||
std::string _sentence;
|
||||
|
||||
std::list<TokenAnnotation> _tokenAnnotations;
|
||||
};
|
||||
|
||||
#endif
|
@ -30,6 +30,7 @@ On Ubuntu 14.04, the above software comes in standard packages. Here is the comp
|
||||
- libconfig++-dev
|
||||
- libconfig-dev
|
||||
- libpcre3-dev
|
||||
- libicu-dev
|
||||
- doxygen
|
||||
- texlive-font-utils
|
||||
|
||||
@ -39,7 +40,7 @@ If you want to install all the above packages at once, simply use the below comm
|
||||
|
||||
\verbatim
|
||||
|
||||
sudo apt-get install g++ cmake libboost-dev libboost-serialization-dev libboost-test-dev libboost-filesystem-dev libboost-system-dev libboost-program-options-dev libboost-iostreams-dev libboost-regex-dev libboost-locale-dev liblog4cpp5-dev libconfig++-dev libconfig-dev libpcre3-dev doxygen texlive-font-utils
|
||||
sudo apt-get install g++ cmake libboost-dev libboost-serialization-dev libboost-test-dev libboost-filesystem-dev libboost-system-dev libboost-program-options-dev libboost-iostreams-dev libboost-regex-dev libboost-locale-dev liblog4cpp5-dev libconfig++-dev libconfig-dev libpcre3-dev libicu-dev doxygen texlive-font-utils
|
||||
|
||||
\endverbatim
|
||||
|
||||
|
@ -91,7 +91,6 @@ void ConcordiaIndex::_addSingleExample(
|
||||
Utils::appendCharToSaucharVector(T, character);
|
||||
|
||||
// append to markersFile
|
||||
|
||||
SUFFIX_MARKER_TYPE marker = Utils::createMarker(
|
||||
example.getId(),
|
||||
offset,
|
||||
|
@ -22,7 +22,7 @@
|
||||
class ConcordiaSearchResult {
|
||||
public:
|
||||
/*! Constructor.
|
||||
\param tokenVector tokenized patter which was used for searching
|
||||
\param tokenVector tokenized pattern which was used for searching
|
||||
*/
|
||||
explicit ConcordiaSearchResult(
|
||||
const std::vector<std::string> & tokenVector);
|
||||
|
@ -44,7 +44,8 @@ std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
|
||||
|
||||
std::vector<std::string> HashGenerator::generateTokenVector(
|
||||
const std::string & sentence) {
|
||||
std::string anonymizedSentence = _sentenceAnonymizer->anonymize(sentence);
|
||||
boost::shared_ptr<AnonymizedSentence> as = _sentenceAnonymizer->anonymize(sentence);
|
||||
std::string anonymizedSentence = as->getSentence();
|
||||
boost::trim(anonymizedSentence);
|
||||
std::vector<std::string> tokenTexts;
|
||||
boost::split(tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"),
|
||||
|
7
concordia/hashed_sentence.cpp
Normal file
7
concordia/hashed_sentence.cpp
Normal file
@ -0,0 +1,7 @@
|
||||
#include "concordia/hashed_sentence.hpp"
|
||||
|
||||
/*! Constructor. Both internal lists start out empty. */
HashedSentence::HashedSentence() {}
|
||||
|
||||
/*! Destructor. Nothing to release explicitly; members clean up after themselves. */
HashedSentence::~HashedSentence() {}
|
61
concordia/hashed_sentence.hpp
Normal file
61
concordia/hashed_sentence.hpp
Normal file
@ -0,0 +1,61 @@
|
||||
#ifndef HASHED_SENTENCE_HDR
|
||||
#define HASHED_SENTENCE_HDR
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/interval.hpp"
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
/*!
|
||||
A sentence after hashing by the HashGenerator. The class holds
|
||||
the list of word codes and intervals representing original
|
||||
word positions in the sentence (char-based).
|
||||
*/
|
||||
|
||||
class HashedSentence {
|
||||
public:
|
||||
/*!
|
||||
Constructor.
|
||||
|
||||
*/
|
||||
HashedSentence();
|
||||
|
||||
/*! Destructor.
|
||||
*/
|
||||
virtual ~HashedSentence();
|
||||
|
||||
/*! Getter for original word positions list.
|
||||
\returns original word positions list
|
||||
*/
|
||||
std::vector<Interval> getOriginalWordPositions() const {
|
||||
return _originalWordPositions;
|
||||
}
|
||||
|
||||
/*! Getter for word codes list.
|
||||
\returns word codes list
|
||||
*/
|
||||
std::vector<INDEX_CHARACTER_TYPE> getWordCodes() const {
|
||||
return _wordCodes;
|
||||
}
|
||||
|
||||
/*! Method for adding a word code to the list
|
||||
\param word code to be added
|
||||
*/
|
||||
void addWordCode(INDEX_CHARACTER_TYPE wordCode) {
|
||||
_wordCodes.push_back(wordCode);
|
||||
}
|
||||
|
||||
/*! Method for adding an original word position to the list.
|
||||
\param original word position
|
||||
*/
|
||||
void addWordOriginalWordPosition(Interval & originalWordPosition) {
|
||||
_originalWordPositions.push_back(originalWordPosition);
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<Interval> _originalWordPositions;
|
||||
|
||||
std::vector<INDEX_CHARACTER_TYPE> _wordCodes;
|
||||
};
|
||||
|
||||
#endif
|
@ -2,13 +2,14 @@
|
||||
#define INTERVAL_HDR
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include <iostream>
|
||||
|
||||
/*!
|
||||
Class representing interval of a sentence, i.e. a sequence of words
|
||||
Class representing interval of a sentence, i.e. a sequence of words or chars
|
||||
coming from that sentence. An interval only has its start and end indexes,
|
||||
where the start index is inclusive and end index is exclusive. For example,
|
||||
an interval [2,5] of the sentence "This is just for testing purposes" is:
|
||||
"just for testing".
|
||||
an interval [2,5) of words of the sentence "This is just for
|
||||
testing purposes" is: "just for testing".
|
||||
|
||||
*/
|
||||
|
||||
@ -50,6 +51,9 @@ public:
|
||||
return _end;
|
||||
}
|
||||
|
||||
friend std::ostream & operator << (std::ostream & o, const Interval & interval) {
|
||||
return o << "[" << interval.getStart() << "," << interval.getEnd() << ")";
|
||||
}
|
||||
protected:
|
||||
SUFFIX_MARKER_TYPE _start;
|
||||
|
||||
|
@ -1,44 +0,0 @@
|
||||
#include "concordia/regex_replacement.hpp"
|
||||
#include <sstream>
|
||||
#include <boost/exception/all.hpp>
|
||||
#include <boost/throw_exception.hpp>
|
||||
|
||||
RegexReplacement::RegexReplacement(std::string patternString,
|
||||
std::string replacement,
|
||||
bool caseSensitive)
|
||||
throw(ConcordiaException):
|
||||
_replacement(replacement) {
|
||||
try {
|
||||
if (caseSensitive) {
|
||||
_pattern = boost::make_u32regex(patternString);
|
||||
} else {
|
||||
_pattern = boost::make_u32regex(patternString,
|
||||
boost::regex::icase);
|
||||
}
|
||||
} catch(const std::exception & e) {
|
||||
std::stringstream ss;
|
||||
|
||||
ss << "Bad regex pattern: " << patternString <<
|
||||
" Detailed info: " << e.what();
|
||||
|
||||
if (std::string const * extra =
|
||||
boost::get_error_info<my_tag_error_info>(e) ) {
|
||||
ss << *extra;
|
||||
}
|
||||
throw ConcordiaException(ss.str());
|
||||
}
|
||||
}
|
||||
|
||||
RegexReplacement::~RegexReplacement() {
|
||||
}
|
||||
|
||||
std::string RegexReplacement::apply(const std::string & text) {
|
||||
try {
|
||||
return boost::u32regex_replace(text, _pattern, _replacement,
|
||||
boost::match_default | boost::format_all);
|
||||
} catch(...) {
|
||||
throw ConcordiaException("Exception while applying replacement rule: "
|
||||
+_replacement+" to text: "+text);
|
||||
}
|
||||
}
|
||||
|
56
concordia/regex_rule.cpp
Normal file
56
concordia/regex_rule.cpp
Normal file
@ -0,0 +1,56 @@
|
||||
#include "concordia/regex_rule.hpp"
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
#include <boost/exception/all.hpp>
|
||||
#include <boost/throw_exception.hpp>
|
||||
|
||||
/*!
  Constructor. Compiles the pattern into a Unicode-aware regex.

  \param patternString regex pattern, UTF-8 encoded
  \param value value attached to every annotation created by this rule
  \param caseSensitive case sensitivity of the pattern
  \throws ConcordiaException when the pattern can not be compiled;
          the message starts with "Bad regex pattern"
*/
RegexRule::RegexRule(std::string patternString,
                     std::string value,
                     bool caseSensitive)
                                         throw(ConcordiaException):
                         _value(value) {
    try {
        // Decode the pattern explicitly as UTF-8. The
        // UnicodeString(const char *) constructor would use ICU's
        // default converter instead, which depends on the platform
        // configuration and may mangle non-ASCII patterns.
        const UnicodeString pattern = UnicodeString::fromUTF8(patternString);
        if (caseSensitive) {
            _pattern = boost::make_u32regex(pattern);
        } else {
            _pattern = boost::make_u32regex(pattern, boost::regex::icase);
        }
    } catch(const std::exception & e) {
        std::stringstream ss;

        ss << "Bad regex pattern: " << patternString <<
            " Detailed info: " << e.what();

        // Append the extra error info if the exception carries any.
        if (std::string const * extra =
            boost::get_error_info<my_tag_error_info>(e) ) {
            ss << *extra;
        }
        throw ConcordiaException(ss.str());
    }
}
|
||||
|
||||
/*! Destructor. Nothing to release explicitly; members clean up after themselves. */
RegexRule::~RegexRule() {}
|
||||
|
||||
/*!
  Applies the rule to an anonymized sentence: every match of the
  pattern in the sentence text becomes a token annotation carrying
  this rule's value, and the collected annotations are handed to
  AnonymizedSentence::addAnnotations.

  \param sentence the sentence to be annotated
  \throws ConcordiaException when a std::exception is raised
          while matching or adding the annotations
*/
void RegexRule::apply(boost::shared_ptr<AnonymizedSentence> sentence) {
    try {
        // Decode the text explicitly as UTF-8. The
        // UnicodeString(const char *) constructor would use ICU's
        // default converter instead, which depends on the platform
        // configuration and may mangle non-ASCII text.
        // The string has to outlive the iterators created below.
        const UnicodeString s = UnicodeString::fromUTF8(sentence->getSentence());
        boost::u32regex_iterator<const UChar*> begin(boost::make_u32regex_iterator(s, _pattern));
        boost::u32regex_iterator<const UChar*> end;
        std::vector<TokenAnnotation> annotations;
        for (; begin != end; ++begin) {
            // Match offsets refer to the UChar buffer of s.
            SUFFIX_MARKER_TYPE matchBegin = begin->position();
            SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length();
            TokenAnnotation annotation(matchBegin, matchEnd, 'a', _value);
            annotations.push_back(annotation);
        }
        sentence->addAnnotations(annotations);
    } catch(const std::exception & e) {
        std::stringstream ss;
        ss << "Exception while applying regex rule: "
           << _value << " to text: " << sentence->getSentence();
        ss << ", message: " << e.what();
        throw ConcordiaException(ss.str());
    }
}
|
||||
|
@ -1,24 +1,25 @@
|
||||
#ifndef REGEX_REPLACEMENT_HDR
|
||||
#define REGEX_REPLACEMENT_HDR
|
||||
#ifndef REGEX_ANNOTATION_HDR
|
||||
#define REGEX_ANNOTATION_HDR
|
||||
|
||||
#include <string>
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/anonymized_sentence.hpp"
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/regex.hpp>
|
||||
#include <boost/regex/icu.hpp>
|
||||
|
||||
#include <unicode/unistr.h>
|
||||
|
||||
|
||||
typedef boost::error_info<struct my_tag, std::string> my_tag_error_info;
|
||||
|
||||
/*!
|
||||
Class for representing a regular expression replacement operation.
|
||||
Class for representing a regular expression annotation rule.
|
||||
Holds regex pattern string for matching and replacement string for
|
||||
replacing found matches.
|
||||
annotating found matches.
|
||||
|
||||
*/
|
||||
class RegexReplacement {
|
||||
class RegexRule {
|
||||
public:
|
||||
/*!
|
||||
Constructor.
|
||||
@ -26,24 +27,23 @@ public:
|
||||
\param replacement string to substitute the found match
|
||||
\param caseSensitive case sensitivity of the pattern
|
||||
*/
|
||||
RegexReplacement(std::string patternString, std::string replacement,
|
||||
bool caseSensitive = true)
|
||||
throw(ConcordiaException);
|
||||
RegexRule(std::string patternString, std::string value,
|
||||
bool caseSensitive = true)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*! Destructor.
|
||||
*/
|
||||
virtual ~RegexReplacement();
|
||||
virtual ~RegexRule();
|
||||
|
||||
/*! Applies the operation on input string.
|
||||
\param text the input string
|
||||
\returns altered version of the input string
|
||||
/*! Applies the operation on anonymized sentence.
|
||||
\param sentence the input sentence
|
||||
*/
|
||||
std::string apply(const std::string & text);
|
||||
void apply(boost::shared_ptr<AnonymizedSentence> sentence);
|
||||
|
||||
private:
|
||||
boost::u32regex _pattern;
|
||||
|
||||
std::string _replacement;
|
||||
std::string _value;
|
||||
};
|
||||
|
||||
#endif
|
@ -1,6 +1,5 @@
|
||||
#include "concordia/sentence_anonymizer.hpp"
|
||||
|
||||
#include "concordia/common/text_utils.hpp"
|
||||
#include <boost/foreach.hpp>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
@ -26,22 +25,24 @@ SentenceAnonymizer::SentenceAnonymizer(
|
||||
SentenceAnonymizer::~SentenceAnonymizer() {
|
||||
}
|
||||
|
||||
std::string SentenceAnonymizer::anonymize(const std::string & sentence) {
|
||||
std::string result = sentence;
|
||||
boost::shared_ptr<AnonymizedSentence>
|
||||
SentenceAnonymizer::anonymize(const std::string & sentence) {
|
||||
boost::shared_ptr<AnonymizedSentence>
|
||||
result(new AnonymizedSentence(sentence));
|
||||
|
||||
result = _htmlTags->apply(result);
|
||||
_htmlTags->apply(result);
|
||||
|
||||
BOOST_FOREACH(RegexReplacement & neRule, _namedEntities) {
|
||||
result = neRule.apply(result);
|
||||
BOOST_FOREACH(RegexRule & neRule, _namedEntities) {
|
||||
neRule.apply(result);
|
||||
}
|
||||
|
||||
result = TextUtils::getInstance().toLowerCase(result);
|
||||
result->toLowerCase();
|
||||
|
||||
if (_stopWordsEnabled) {
|
||||
result = _stopWords->apply(result);
|
||||
_stopWords->apply(result);
|
||||
}
|
||||
result = _stopSymbols->apply(result);
|
||||
result = _spaceSymbols->apply(result);
|
||||
_stopSymbols->apply(result);
|
||||
_spaceSymbols->apply(result);
|
||||
|
||||
return result;
|
||||
}
|
||||
@ -64,7 +65,7 @@ void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) {
|
||||
<< " in NE file: " << namedEntitiesPath;
|
||||
throw ConcordiaException(ss.str());
|
||||
} else {
|
||||
_namedEntities.push_back(RegexReplacement(
|
||||
_namedEntities.push_back(RegexRule(
|
||||
tokenTexts->at(0), tokenTexts->at(1)));
|
||||
}
|
||||
}
|
||||
@ -95,11 +96,11 @@ void SentenceAnonymizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
|
||||
}
|
||||
tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
|
||||
tagsExpression += "br).*?>";
|
||||
_htmlTags = boost::shared_ptr<RegexReplacement>(
|
||||
new RegexReplacement(tagsExpression, "", false));
|
||||
_htmlTags = boost::shared_ptr<RegexRule>(
|
||||
new RegexRule(tagsExpression, "", false));
|
||||
}
|
||||
|
||||
boost::shared_ptr<RegexReplacement>
|
||||
boost::shared_ptr<RegexRule>
|
||||
SentenceAnonymizer::_getMultipleReplacementRule(
|
||||
std::string & filePath, std::string replacement, bool wholeWord) {
|
||||
std::string expression = "(";
|
||||
@ -126,7 +127,7 @@ boost::shared_ptr<RegexReplacement>
|
||||
}
|
||||
expression = expression.substr(0, expression.size()-1);
|
||||
expression += ")";
|
||||
return boost::shared_ptr<RegexReplacement>(
|
||||
new RegexReplacement(expression, replacement, false));
|
||||
return boost::shared_ptr<RegexRule>(
|
||||
new RegexRule(expression, replacement, false));
|
||||
}
|
||||
|
||||
|
@ -4,7 +4,8 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/regex_replacement.hpp"
|
||||
#include "concordia/anonymized_sentence.hpp"
|
||||
#include "concordia/regex_rule.hpp"
|
||||
#include "concordia/concordia_config.hpp"
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
#include <boost/shared_ptr.hpp>
|
||||
@ -37,29 +38,30 @@ public:
|
||||
\param sentence input sentence
|
||||
\returns altered version of the input sentence
|
||||
*/
|
||||
std::string anonymize(const std::string & sentence);
|
||||
boost::shared_ptr<AnonymizedSentence>
|
||||
anonymize(const std::string & sentence);
|
||||
|
||||
private:
|
||||
void _createNeRules(std::string & namedEntitiesPath);
|
||||
|
||||
void _createHtmlTagsRule(std::string & htmlTagsPath);
|
||||
|
||||
boost::shared_ptr<RegexReplacement> _getMultipleReplacementRule(
|
||||
boost::shared_ptr<RegexRule> _getMultipleReplacementRule(
|
||||
std::string & filePath,
|
||||
std::string replacement,
|
||||
bool wholeWord = false);
|
||||
|
||||
std::vector<RegexReplacement> _namedEntities;
|
||||
std::vector<RegexRule> _namedEntities;
|
||||
|
||||
boost::shared_ptr<RegexReplacement> _htmlTags;
|
||||
boost::shared_ptr<RegexRule> _htmlTags;
|
||||
|
||||
bool _stopWordsEnabled;
|
||||
|
||||
boost::shared_ptr<RegexReplacement> _stopWords;
|
||||
boost::shared_ptr<RegexRule> _stopWords;
|
||||
|
||||
boost::shared_ptr<RegexReplacement> _stopSymbols;
|
||||
boost::shared_ptr<RegexRule> _stopSymbols;
|
||||
|
||||
boost::shared_ptr<RegexReplacement> _spaceSymbols;
|
||||
boost::shared_ptr<RegexRule> _spaceSymbols;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -1,8 +1,9 @@
|
||||
add_library(concordia-tests
|
||||
test_regex_rule.cpp
|
||||
test_anonymized_sentence.cpp
|
||||
test_concordia_searcher.cpp
|
||||
test_sentence_anonymizer.cpp
|
||||
test_text_utils.cpp
|
||||
test_regex_replacement.cpp
|
||||
test_example.cpp
|
||||
test_tm_matches.cpp
|
||||
test_interval.cpp
|
||||
|
86
concordia/t/test_anonymized_sentence.cpp
Normal file
86
concordia/t/test_anonymized_sentence.cpp
Normal file
@ -0,0 +1,86 @@
|
||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||
#include "concordia/anonymized_sentence.hpp"
|
||||
#include "concordia/token_annotation.hpp"
|
||||
#include "concordia/common/config.hpp"
|
||||
#include <iostream>
|
||||
|
||||
BOOST_AUTO_TEST_SUITE(anonymized_sentence)
|
||||
|
||||
// Four sorted, disjoint annotations added to a sentence without
// annotations must all be stored.
BOOST_AUTO_TEST_CASE( AnnotationsTrivial )
{
    AnonymizedSentence sentence("This is a test sentence");

    std::vector<TokenAnnotation> toAdd;
    toAdd.push_back(TokenAnnotation(0,1,'a',"val"));
    toAdd.push_back(TokenAnnotation(4,6,'a',"val"));
    toAdd.push_back(TokenAnnotation(7,10,'a',"val"));
    toAdd.push_back(TokenAnnotation(12,14,'a',"val"));

    sentence.addAnnotations(toAdd);

    BOOST_CHECK_EQUAL(sentence.getAnnotations().size(), 4);
}
|
||||
|
||||
// A second batch of annotations is merged into an existing one;
// only the annotations that do not intersect with the first batch
// may be added, and the result has to stay sorted.
BOOST_AUTO_TEST_CASE( AnnotationsIntersecting )
{
    AnonymizedSentence sentence("This is a test sentence");

    // First batch: [0,1) [4,6) [7,10) [12,14)
    std::vector<TokenAnnotation> firstBatch;
    firstBatch.push_back(TokenAnnotation(0,1,'a',"val"));
    firstBatch.push_back(TokenAnnotation(4,6,'a',"val"));
    firstBatch.push_back(TokenAnnotation(7,10,'a',"val"));
    firstBatch.push_back(TokenAnnotation(12,14,'a',"val"));
    sentence.addAnnotations(firstBatch);

    // Second batch: [1,4) [4,7) [10,11) [11,13)
    // Expected to be dropped: [4,7) and [11,13).
    std::vector<TokenAnnotation> secondBatch;
    secondBatch.push_back(TokenAnnotation(1,4,'a',"val"));
    secondBatch.push_back(TokenAnnotation(4,7,'a',"val"));
    secondBatch.push_back(TokenAnnotation(10,11,'a',"val"));
    secondBatch.push_back(TokenAnnotation(11,13,'a',"val"));
    sentence.addAnnotations(secondBatch);

    // Expected result: [0,1) [1,4) [4,6) [7,10) [10,11) [12,14)
    const int expectedCount = 6;
    const int expectedStart[expectedCount] = {0, 1, 4, 7, 10, 12};
    const int expectedEnd[expectedCount] = {1, 4, 6, 10, 11, 14};

    BOOST_CHECK_EQUAL(sentence.getAnnotations().size(), 6);
    std::list<TokenAnnotation> stored = sentence.getAnnotations();
    std::list<TokenAnnotation>::iterator it = stored.begin();

    for (int i = 0; i < expectedCount; ++i) {
        if (i > 0) {
            ++it;
        }
        BOOST_CHECK_EQUAL(it->getStart(), expectedStart[i]);
        BOOST_CHECK_EQUAL(it->getEnd(), expectedEnd[i]);
    }
}
|
||||
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
@ -1,66 +0,0 @@
|
||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||
#include "concordia/regex_replacement.hpp"
|
||||
#include "concordia/common/config.hpp"
|
||||
#include <boost/algorithm/string/predicate.hpp>
|
||||
#include <boost/locale.hpp>
|
||||
#include <boost/algorithm/string/case_conv.hpp>
|
||||
|
||||
BOOST_AUTO_TEST_SUITE(regex_replacement)
|
||||
|
||||
BOOST_AUTO_TEST_CASE( SimpleReplacement )
|
||||
{
|
||||
RegexReplacement rr("a","b");
|
||||
BOOST_CHECK_EQUAL(rr.apply("xxxxxxxaxxxaxxaxaxa"),"xxxxxxxbxxxbxxbxbxb");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( BadRegex )
|
||||
{
|
||||
bool exceptionThrown = false;
|
||||
std::string message = "";
|
||||
try {
|
||||
RegexReplacement rr("+a","b");
|
||||
} catch (ConcordiaException & e) {
|
||||
exceptionThrown = true;
|
||||
message = e.what();
|
||||
}
|
||||
BOOST_CHECK_EQUAL(exceptionThrown, true);
|
||||
BOOST_CHECK_EQUAL(boost::starts_with(message, "Bad regex pattern"), true);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
|
||||
{
|
||||
RegexReplacement rr("['\"\\\\.]","");
|
||||
BOOST_CHECK_EQUAL(rr.apply("Don't stop believin' \\ Hold on to the feelin'."),"Dont stop believin Hold on to the feelin");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( BackrefReplacement )
|
||||
{
|
||||
RegexReplacement rr("(\\d+)","the number: \\1");
|
||||
BOOST_CHECK_EQUAL(rr.apply("This is 12 and this is 812."),"This is the number: 12 and this is the number: 812.");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
|
||||
{
|
||||
RegexReplacement rr("abc","xxx", false);
|
||||
BOOST_CHECK_EQUAL(rr.apply("This is AbC and ABC and abc and aBC."),"This is xxx and xxx and xxx and xxx.");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( UnicodeReplacement )
|
||||
{
|
||||
RegexReplacement rr("ą","x");
|
||||
BOOST_CHECK_EQUAL(rr.apply("zażółć gęślą jaźń"),"zażółć gęślx jaźń");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
|
||||
{
|
||||
RegexReplacement rr("ą","x", false);
|
||||
BOOST_CHECK_EQUAL(rr.apply("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"),"zażółć gęślx jaźń ZAŻÓŁĆ GĘŚLx JAŹŃ");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
|
||||
{
|
||||
RegexReplacement rr("[ąćęłńóśżź]","x", false);
|
||||
BOOST_CHECK_EQUAL(rr.apply("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"),"zaxxxx gxxlx jaxx ZAxxxx GxxLx JAxx");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
221
concordia/t/test_regex_rule.cpp
Normal file
221
concordia/t/test_regex_rule.cpp
Normal file
@ -0,0 +1,221 @@
|
||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||
#include "concordia/regex_rule.hpp"
|
||||
#include "concordia/anonymized_sentence.hpp"
|
||||
#include "concordia/common/config.hpp"
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/algorithm/string/predicate.hpp>
|
||||
#include <boost/locale.hpp>
|
||||
#include <boost/algorithm/string/case_conv.hpp>
|
||||
|
||||
BOOST_AUTO_TEST_SUITE(regex_rule)
|
||||
|
||||
// Every occurrence of the single-letter pattern has to be
// annotated with a one-character interval.
BOOST_AUTO_TEST_CASE( SimpleReplacement )
{
    RegexRule rule("a","b");
    boost::shared_ptr<AnonymizedSentence> sentence(new AnonymizedSentence("xxxxxxxaxxxaxxaxaxa"));
    rule.apply(sentence);

    const int expectedCount = 5;
    const int expectedStart[expectedCount] = {7, 11, 14, 16, 18};
    const int expectedEnd[expectedCount] = {8, 12, 15, 17, 19};

    BOOST_CHECK_EQUAL(sentence->getAnnotations().size(),5);
    std::list<TokenAnnotation> found = sentence->getAnnotations();
    std::list<TokenAnnotation>::iterator it = found.begin();

    for (int i = 0; i < expectedCount; ++i) {
        if (i > 0) {
            ++it;
        }
        BOOST_CHECK_EQUAL(it->getStart(), expectedStart[i]);
        BOOST_CHECK_EQUAL(it->getEnd(), expectedEnd[i]);
    }
}
|
||||
|
||||
// Constructing a rule from an invalid pattern has to throw
// a ConcordiaException with a "Bad regex pattern" message.
BOOST_AUTO_TEST_CASE( BadRegex )
{
    std::string message = "";
    bool exceptionThrown = false;

    try {
        RegexRule rule("+a","b");
    } catch (ConcordiaException & e) {
        message = e.what();
        exceptionThrown = true;
    }

    BOOST_CHECK_EQUAL(exceptionThrown, true);
    BOOST_CHECK_EQUAL(boost::starts_with(message, "Bad regex pattern"), true);
}
|
||||
|
||||
// A character class of quote, double quote, backslash and dot must annotate
// each of the five such symbols present in the sentence.
BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
{
    RegexRule rr("['\"\\\\.]","");
    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("Don't stop believin' \\ Hold on to the feelin'."));
    rr.apply(as);

    // Expected {start, end} of every annotation, in list order.
    const int expected[][2] = {
        {3, 4}, {19, 20}, {21, 22}, {44, 45}, {45, 46}
    };
    const int expectedCount = sizeof(expected) / sizeof(expected[0]);

    std::list<TokenAnnotation> annotations = as->getAnnotations();
    // REQUIRE rather than CHECK: a wrong count must abort the test here,
    // otherwise the loop below could advance the iterator past the end.
    BOOST_REQUIRE_EQUAL(annotations.size(), expectedCount);

    std::list<TokenAnnotation>::iterator iter = annotations.begin();
    for (int i = 0; i < expectedCount; ++i, ++iter) {
        BOOST_CHECK_EQUAL(iter->getStart(), expected[i][0]);
        BOOST_CHECK_EQUAL(iter->getEnd(), expected[i][1]);
    }
}
|
||||
|
||||
|
||||
// With `false` as the third constructor argument, the pattern "abc" must
// annotate all four case variants (AbC, ABC, abc, aBC) in the sentence.
BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
{
    RegexRule rr("abc","xxx", false);
    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("This is AbC and ABC and abc and aBC."));
    rr.apply(as);

    // Expected {start, end} of every annotation, in list order.
    const int expected[][2] = {
        {8, 11}, {16, 19}, {24, 27}, {32, 35}
    };
    const int expectedCount = sizeof(expected) / sizeof(expected[0]);

    std::list<TokenAnnotation> annotations = as->getAnnotations();
    // REQUIRE rather than CHECK: a wrong count must abort the test here,
    // otherwise the loop below could advance the iterator past the end.
    BOOST_REQUIRE_EQUAL(annotations.size(), expectedCount);

    std::list<TokenAnnotation>::iterator iter = annotations.begin();
    for (int i = 0; i < expectedCount; ++i, ++iter) {
        BOOST_CHECK_EQUAL(iter->getStart(), expected[i][0]);
        BOOST_CHECK_EQUAL(iter->getEnd(), expected[i][1]);
    }
}
|
||||
|
||||
// A single non-ASCII pattern character must yield exactly one annotation.
// The expected position (11) is counted in characters, not UTF-8 bytes.
BOOST_AUTO_TEST_CASE( UnicodeReplacement )
{
    RegexRule rr("ą","x");
    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń"));
    rr.apply(as);

    std::list<TokenAnnotation> annotations = as->getAnnotations();
    // REQUIRE rather than CHECK: dereferencing begin() of an empty list
    // would be undefined behaviour, so stop the test if the count is wrong.
    BOOST_REQUIRE_EQUAL(annotations.size(), 1);

    std::list<TokenAnnotation>::iterator iter = annotations.begin();
    BOOST_CHECK_EQUAL(iter->getStart(), 11);
    BOOST_CHECK_EQUAL(iter->getEnd(), 12);
}
|
||||
|
||||
// With `false` as the third constructor argument, the pattern "ą" must
// annotate both the lower-case and the upper-case occurrence. Positions are
// counted in characters, not UTF-8 bytes.
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
{
    RegexRule rr("ą","x", false);
    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
    rr.apply(as);

    // Expected {start, end} of every annotation, in list order.
    const int expected[][2] = {
        {11, 12}, {29, 30}
    };
    const int expectedCount = sizeof(expected) / sizeof(expected[0]);

    std::list<TokenAnnotation> annotations = as->getAnnotations();
    // REQUIRE rather than CHECK: a wrong count must abort the test here,
    // otherwise the loop below could advance the iterator past the end.
    BOOST_REQUIRE_EQUAL(annotations.size(), expectedCount);

    std::list<TokenAnnotation>::iterator iter = annotations.begin();
    for (int i = 0; i < expectedCount; ++i, ++iter) {
        BOOST_CHECK_EQUAL(iter->getStart(), expected[i][0]);
        BOOST_CHECK_EQUAL(iter->getEnd(), expected[i][1]);
    }
}
|
||||
|
||||
// A character class of Polish diacritics, built with `false` as the third
// constructor argument, must annotate every diacritic in the sentence in
// both cases: 18 annotations in total. Positions are counted in characters,
// not UTF-8 bytes.
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
{
    RegexRule rr("[ąćęłńóśżź]","x", false);
    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
    rr.apply(as);

    // Expected {start, end} of every annotation, in list order.
    const int expected[][2] = {
        // "zażółć gęślą jaźń"
        {2, 3}, {3, 4}, {4, 5}, {5, 6},
        {8, 9}, {9, 10}, {11, 12},
        {15, 16}, {16, 17},
        // "ZAŻÓŁĆ GĘŚLĄ JAŹŃ"
        {20, 21}, {21, 22}, {22, 23}, {23, 24},
        {26, 27}, {27, 28}, {29, 30},
        {33, 34}, {34, 35}
    };
    const int expectedCount = sizeof(expected) / sizeof(expected[0]);

    std::list<TokenAnnotation> annotations = as->getAnnotations();
    // REQUIRE rather than CHECK: a wrong count must abort the test here,
    // otherwise the loop below could advance the iterator past the end.
    BOOST_REQUIRE_EQUAL(annotations.size(), expectedCount);

    std::list<TokenAnnotation>::iterator iter = annotations.begin();
    for (int i = 0; i < expectedCount; ++i, ++iter) {
        BOOST_CHECK_EQUAL(iter->getStart(), expected[i][0]);
        BOOST_CHECK_EQUAL(iter->getEnd(), expected[i][1]);
    }
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
@ -17,7 +17,7 @@ BOOST_AUTO_TEST_CASE( NETest )
|
||||
|
||||
|
||||
std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34";
|
||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"date ne_date mail ne_email number ne_number");
|
||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"date ne_date mail ne_email number ne_number");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( HtmlTagsTest )
|
||||
@ -27,7 +27,7 @@ BOOST_AUTO_TEST_CASE( HtmlTagsTest )
|
||||
|
||||
|
||||
std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
|
||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"link and bold and newline ");
|
||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"link and bold and newline ");
|
||||
|
||||
}
|
||||
|
||||
@ -37,7 +37,7 @@ BOOST_AUTO_TEST_CASE( StopWordsTest )
|
||||
if (config->isStopWordsEnabled()) {
|
||||
SentenceAnonymizer anonymizer(config);
|
||||
std::string sentence = "Aczkolwiek nie wiem, czy to konieczne";
|
||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)," wiem konieczne");
|
||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence()," wiem konieczne");
|
||||
}
|
||||
}
|
||||
|
||||
@ -48,7 +48,7 @@ BOOST_AUTO_TEST_CASE( StopSymbolsTest )
|
||||
|
||||
|
||||
std::string sentence = "xxx, . xxx # xx $xx@ xx";
|
||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"xxx xxx xx xx xx");
|
||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx xxx xx xx xx");
|
||||
|
||||
}
|
||||
|
||||
@ -59,7 +59,7 @@ BOOST_AUTO_TEST_CASE( SpaceSymbolsTest )
|
||||
|
||||
|
||||
std::string sentence = "xxx-xxx xx|xx";
|
||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"xxx xxx xx xx");
|
||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx xxx xx xx");
|
||||
|
||||
}
|
||||
|
||||
@ -69,7 +69,7 @@ BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
|
||||
SentenceAnonymizer anonymizer(config);
|
||||
|
||||
std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
|
||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence),"sony dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap rm mne_numbergrm mne_numbere — ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number ");
|
||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"sony dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap rm mne_numbergrm mne_numbere — ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number ");
|
||||
|
||||
}
|
||||
|
||||
|
15
concordia/token_annotation.cpp
Normal file
15
concordia/token_annotation.cpp
Normal file
@ -0,0 +1,15 @@
|
||||
#include "concordia/token_annotation.hpp"
|
||||
|
||||
|
||||
// Forwards start and end to the Interval base class and stores the
// annotation type and value in this object.
TokenAnnotation::TokenAnnotation(const SUFFIX_MARKER_TYPE start,
                                 const SUFFIX_MARKER_TYPE end,
                                 const char annotationType,
                                 const std::string & value)
    : Interval(start, end)
    , _annotationType(annotationType)
    , _value(value)
{
}
|
||||
|
||||
// Nothing to release explicitly; the destructor body is intentionally empty.
TokenAnnotation::~TokenAnnotation()
{
}
|
||||
|
53
concordia/token_annotation.hpp
Normal file
53
concordia/token_annotation.hpp
Normal file
@ -0,0 +1,53 @@
|
||||
#ifndef TOKEN_ANNOTATION_HDR
|
||||
#define TOKEN_ANNOTATION_HDR
|
||||
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/interval.hpp"
|
||||
|
||||
#include <string>
|
||||
|
||||
/*!
|
||||
Class representing an annotation of a char sequence as a token.
|
||||
It is a type of interval that is also storing information
|
||||
about the annotation type and value.
|
||||
|
||||
*/
|
||||
|
||||
class TokenAnnotation : public Interval {
|
||||
public:
|
||||
/*! Constructor.
|
||||
\param start start index of the annotation (char-level, 0-based)
|
||||
\param end end index of the annotation (char-level, 0-based)
|
||||
\param type annotation type
|
||||
\param value annotation value
|
||||
*/
|
||||
TokenAnnotation(const SUFFIX_MARKER_TYPE start,
|
||||
const SUFFIX_MARKER_TYPE end,
|
||||
const char annotationType,
|
||||
const std::string & value);
|
||||
|
||||
/*! Destructor.
|
||||
*/
|
||||
virtual ~TokenAnnotation();
|
||||
|
||||
/*! Getter for annotation type.
|
||||
\returns annotation type
|
||||
*/
|
||||
char getType() const {
|
||||
return _annotationType;
|
||||
}
|
||||
|
||||
/*! Getter for annotation value.
|
||||
\returns annotation value
|
||||
*/
|
||||
std::string getValue() const {
|
||||
return _value;
|
||||
}
|
||||
|
||||
protected:
|
||||
char _annotationType;
|
||||
|
||||
std::string _value;
|
||||
};
|
||||
|
||||
#endif
|
@ -17,6 +17,7 @@ add_executable(first first.cpp)
|
||||
target_link_libraries(first concordia)
|
||||
target_link_libraries(first config++)
|
||||
target_link_libraries(first log4cpp)
|
||||
target_link_libraries(first icui18n)
|
||||
target_link_libraries(first ${Boost_LIBRARIES})
|
||||
target_link_libraries(first divsufsort)
|
||||
target_link_libraries(first utf8case)
|
||||
@ -27,6 +28,7 @@ add_executable(simple_search simple_search.cpp)
|
||||
target_link_libraries(simple_search concordia)
|
||||
target_link_libraries(simple_search config++)
|
||||
target_link_libraries(simple_search log4cpp)
|
||||
target_link_libraries(simple_search icui18n)
|
||||
target_link_libraries(simple_search ${Boost_LIBRARIES})
|
||||
target_link_libraries(simple_search divsufsort)
|
||||
target_link_libraries(simple_search utf8case)
|
||||
@ -38,6 +40,7 @@ add_executable(concordia_search concordia_search.cpp)
|
||||
target_link_libraries(concordia_search concordia)
|
||||
target_link_libraries(concordia_search config++)
|
||||
target_link_libraries(concordia_search log4cpp)
|
||||
target_link_libraries(concordia_search icui18n)
|
||||
target_link_libraries(concordia_search ${Boost_LIBRARIES})
|
||||
target_link_libraries(concordia_search divsufsort)
|
||||
target_link_libraries(concordia_search utf8case)
|
||||
|
Loading…
Reference in New Issue
Block a user