tokenizer in progress
This commit is contained in:
parent
0baf3e4ef2
commit
8432dd321f
4
TODO.txt
4
TODO.txt
@ -1,6 +1,7 @@
|
|||||||
---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) -----------------------------
|
---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) -----------------------------
|
||||||
|
|
||||||
- implement tokenAnnotations vector as interval tree
|
- work on word regex pattern (allow for some symbols and digits within the word)
|
||||||
|
- document the code (classes, cfg files) and update tutorial
|
||||||
IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? Należy przechowywać oryginalne pozycje tokenów w bazie danych concordia-server. Pozycje te będą obliczane przez funkcję generateTokenVector (przy użyciu listy oryginalnych pozycji, która będzie modyfikowana synchronicznie z każdą anonimizacją)
|
IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? Należy przechowywać oryginalne pozycje tokenów w bazie danych concordia-server. Pozycje te będą obliczane przez funkcję generateTokenVector (przy użyciu listy oryginalnych pozycji, która będzie modyfikowana synchronicznie z każdą anonimizacją)
|
||||||
- concordia_search_result nie musi przechowywać całego tokenVector (bo używa go tylko do odczytania size()).
|
- concordia_search_result nie musi przechowywać całego tokenVector (bo używa go tylko do odczytania size()).
|
||||||
- wiele pamięci tłumaczeń: można je przechowywać w jednym indeksie, ale trzeba dodać tm_id jako metadane zdania (np. zamiast example length). Przy wyszukiwaniu należy filtrować wyniki, aby pochodziły z odpowiedniej pamięci tłumaczeń.
|
- wiele pamięci tłumaczeń: można je przechowywać w jednym indeksie, ale trzeba dodać tm_id jako metadane zdania (np. zamiast example length). Przy wyszukiwaniu należy filtrować wyniki, aby pochodziły z odpowiedniej pamięci tłumaczeń.
|
||||||
@ -11,6 +12,7 @@ IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieś
|
|||||||
|
|
||||||
|
|
||||||
---------------------------- Archive -----------------------------
|
---------------------------- Archive -----------------------------
|
||||||
|
DONE - implement tokenAnnotations vector as interval tree (not interval tree, but list, which is even better)
|
||||||
DONE (IT IS GOOD AS IT IS) - mess with gcc performance optimization options (https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html)
|
DONE (IT IS GOOD AS IT IS) - mess with gcc performance optimization options (https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html)
|
||||||
DONE - concordia-server (zastanowić się, czy nie napisać CAT-a oraz nad tym, czy nie oddzielić projektu concordia-server).
|
DONE - concordia-server (zastanowić się, czy nie napisać CAT-a oraz nad tym, czy nie oddzielić projektu concordia-server).
|
||||||
DONE - document the code
|
DONE - document the code
|
||||||
|
@ -7,13 +7,13 @@ endforeach(dir)
|
|||||||
|
|
||||||
add_library(concordia SHARED
|
add_library(concordia SHARED
|
||||||
token_annotation.cpp
|
token_annotation.cpp
|
||||||
anonymized_sentence.cpp
|
tokenized_sentence.cpp
|
||||||
hashed_sentence.cpp
|
hashed_sentence.cpp
|
||||||
concordia_search_result.cpp
|
concordia_search_result.cpp
|
||||||
matched_pattern_fragment.cpp
|
matched_pattern_fragment.cpp
|
||||||
concordia_searcher.cpp
|
concordia_searcher.cpp
|
||||||
regex_rule.cpp
|
regex_rule.cpp
|
||||||
sentence_anonymizer.cpp
|
sentence_tokenizer.cpp
|
||||||
interval.cpp
|
interval.cpp
|
||||||
tm_matches.cpp
|
tm_matches.cpp
|
||||||
anubis_search_result.cpp
|
anubis_search_result.cpp
|
||||||
@ -37,13 +37,13 @@ add_subdirectory(t)
|
|||||||
install(TARGETS concordia DESTINATION lib/)
|
install(TARGETS concordia DESTINATION lib/)
|
||||||
install(FILES
|
install(FILES
|
||||||
token_annotation.hpp
|
token_annotation.hpp
|
||||||
anonymized_sentence.hpp
|
tokenized_sentence.hpp
|
||||||
hashed_sentence.hpp
|
hashed_sentence.hpp
|
||||||
concordia_search_result.hpp
|
concordia_search_result.hpp
|
||||||
matched_pattern_fragment.hpp
|
matched_pattern_fragment.hpp
|
||||||
concordia_searcher.hpp
|
concordia_searcher.hpp
|
||||||
regex_rule.hpp
|
regex_rule.hpp
|
||||||
sentence_anonymizer.hpp
|
sentence_tokenizer.hpp
|
||||||
interval.hpp
|
interval.hpp
|
||||||
tm_matches.hpp
|
tm_matches.hpp
|
||||||
anubis_search_result.hpp
|
anubis_search_result.hpp
|
||||||
|
@ -9,11 +9,9 @@
|
|||||||
#define MARKERS_PARAM "markers_path"
|
#define MARKERS_PARAM "markers_path"
|
||||||
#define SUFFIX_ARRAY_PARAM "suffix_array_path"
|
#define SUFFIX_ARRAY_PARAM "suffix_array_path"
|
||||||
#define HTML_TAGS_PARAM "html_tags_path"
|
#define HTML_TAGS_PARAM "html_tags_path"
|
||||||
#define SPACE_SYMBOLS_PARAM "space_symbols_path"
|
|
||||||
#define STOP_WORDS_ENABLED_PARAM "stop_words_enabled"
|
#define STOP_WORDS_ENABLED_PARAM "stop_words_enabled"
|
||||||
#define STOP_WORDS_PARAM "stop_words_path"
|
#define STOP_WORDS_PARAM "stop_words_path"
|
||||||
#define NAMED_ENTITIES_PARAM "named_entities_path"
|
#define NAMED_ENTITIES_PARAM "named_entities_path"
|
||||||
#define STOP_SYMBOLS_PARAM "stop_symbols_path"
|
|
||||||
#define ANUBIS_THRESHOLD_PARAM "anubis_threshold"
|
#define ANUBIS_THRESHOLD_PARAM "anubis_threshold"
|
||||||
|
|
||||||
ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
|
ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
|
||||||
@ -35,8 +33,6 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
|
|||||||
ConcordiaConfig::_readConfigParameterStr(MARKERS_PARAM);
|
ConcordiaConfig::_readConfigParameterStr(MARKERS_PARAM);
|
||||||
_htmlTagsFilePath =
|
_htmlTagsFilePath =
|
||||||
ConcordiaConfig::_readConfigParameterStr(HTML_TAGS_PARAM);
|
ConcordiaConfig::_readConfigParameterStr(HTML_TAGS_PARAM);
|
||||||
_spaceSymbolsFilePath =
|
|
||||||
ConcordiaConfig::_readConfigParameterStr(SPACE_SYMBOLS_PARAM);
|
|
||||||
_stopWordsEnabled =
|
_stopWordsEnabled =
|
||||||
ConcordiaConfig::_readConfigParameterStr(
|
ConcordiaConfig::_readConfigParameterStr(
|
||||||
STOP_WORDS_ENABLED_PARAM) != "false";
|
STOP_WORDS_ENABLED_PARAM) != "false";
|
||||||
@ -44,8 +40,6 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
|
|||||||
ConcordiaConfig::_readConfigParameterStr(STOP_WORDS_PARAM, "");
|
ConcordiaConfig::_readConfigParameterStr(STOP_WORDS_PARAM, "");
|
||||||
_namedEntitiesFilePath =
|
_namedEntitiesFilePath =
|
||||||
ConcordiaConfig::_readConfigParameterStr(NAMED_ENTITIES_PARAM);
|
ConcordiaConfig::_readConfigParameterStr(NAMED_ENTITIES_PARAM);
|
||||||
_stopSymbolsFilePath =
|
|
||||||
ConcordiaConfig::_readConfigParameterStr(STOP_SYMBOLS_PARAM);
|
|
||||||
_anubisThreshold = atof(ConcordiaConfig::_readConfigParameterStr(
|
_anubisThreshold = atof(ConcordiaConfig::_readConfigParameterStr(
|
||||||
ANUBIS_THRESHOLD_PARAM,
|
ANUBIS_THRESHOLD_PARAM,
|
||||||
"0.3").c_str());
|
"0.3").c_str());
|
||||||
|
@ -56,14 +56,6 @@ public:
|
|||||||
return _htmlTagsFilePath;
|
return _htmlTagsFilePath;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*! Getter for space symbols file path.
|
|
||||||
For more information see \ref tutorial3.
|
|
||||||
\returns space symbols file path
|
|
||||||
*/
|
|
||||||
std::string & getSpaceSymbolsFilePath() {
|
|
||||||
return _spaceSymbolsFilePath;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*! Getter for stop words enabled parameter.
|
/*! Getter for stop words enabled parameter.
|
||||||
For more information see \ref tutorial3.
|
For more information see \ref tutorial3.
|
||||||
\returns true if stop words are enabled
|
\returns true if stop words are enabled
|
||||||
@ -88,14 +80,6 @@ public:
|
|||||||
return _namedEntitiesFilePath;
|
return _namedEntitiesFilePath;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*! Getter for stop symbols file path.
|
|
||||||
For more information see \ref tutorial3.
|
|
||||||
\returns stop symbols file path
|
|
||||||
*/
|
|
||||||
std::string & getStopSymbolsFilePath() {
|
|
||||||
return _stopSymbolsFilePath;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*! Getter for anubis threshold. Anubis search results with
|
/*! Getter for anubis threshold. Anubis search results with
|
||||||
scores below that threshold will be discarded.
|
scores below that threshold will be discarded.
|
||||||
\returns anubis threshold
|
\returns anubis threshold
|
||||||
@ -115,16 +99,12 @@ private:
|
|||||||
|
|
||||||
std::string _htmlTagsFilePath;
|
std::string _htmlTagsFilePath;
|
||||||
|
|
||||||
std::string _spaceSymbolsFilePath;
|
|
||||||
|
|
||||||
bool _stopWordsEnabled;
|
bool _stopWordsEnabled;
|
||||||
|
|
||||||
std::string _stopWordsFilePath;
|
std::string _stopWordsFilePath;
|
||||||
|
|
||||||
std::string _namedEntitiesFilePath;
|
std::string _namedEntitiesFilePath;
|
||||||
|
|
||||||
std::string _stopSymbolsFilePath;
|
|
||||||
|
|
||||||
double _anubisThreshold;
|
double _anubisThreshold;
|
||||||
|
|
||||||
std::string _readConfigParameterStr(const std::string & name)
|
std::string _readConfigParameterStr(const std::string & name)
|
||||||
|
@ -12,8 +12,8 @@ HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
|
|||||||
throw(ConcordiaException) :
|
throw(ConcordiaException) :
|
||||||
_wordMapFilePath(config->getWordMapFilePath()),
|
_wordMapFilePath(config->getWordMapFilePath()),
|
||||||
_wordMap(boost::shared_ptr<WordMap>(new WordMap)),
|
_wordMap(boost::shared_ptr<WordMap>(new WordMap)),
|
||||||
_sentenceAnonymizer(boost::shared_ptr<SentenceAnonymizer>(
|
_sentenceTokenizer(boost::shared_ptr<SentenceTokenizer>(
|
||||||
new SentenceAnonymizer(config))) {
|
new SentenceTokenizer(config))) {
|
||||||
if (boost::filesystem::exists(_wordMapFilePath)) {
|
if (boost::filesystem::exists(_wordMapFilePath)) {
|
||||||
std::ifstream ifs(_wordMapFilePath.c_str(), std::ios::binary);
|
std::ifstream ifs(_wordMapFilePath.c_str(), std::ios::binary);
|
||||||
boost::archive::binary_iarchive ia(ifs);
|
boost::archive::binary_iarchive ia(ifs);
|
||||||
@ -44,11 +44,11 @@ std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
|
|||||||
|
|
||||||
std::vector<std::string> HashGenerator::generateTokenVector(
|
std::vector<std::string> HashGenerator::generateTokenVector(
|
||||||
const std::string & sentence) {
|
const std::string & sentence) {
|
||||||
boost::shared_ptr<AnonymizedSentence> as = _sentenceAnonymizer->anonymize(sentence);
|
boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
|
||||||
std::string anonymizedSentence = as->getSentence();
|
std::string tokenizedSentence = ts->getSentence();
|
||||||
boost::trim(anonymizedSentence);
|
boost::trim(tokenizedSentence);
|
||||||
std::vector<std::string> tokenTexts;
|
std::vector<std::string> tokenTexts;
|
||||||
boost::split(tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"),
|
boost::split(tokenTexts, tokenizedSentence, boost::is_any_of(" \t\r\n"),
|
||||||
boost::algorithm::token_compress_on);
|
boost::algorithm::token_compress_on);
|
||||||
return tokenTexts;
|
return tokenTexts;
|
||||||
}
|
}
|
||||||
|
@ -8,7 +8,7 @@
|
|||||||
#include <boost/algorithm/string/predicate.hpp>
|
#include <boost/algorithm/string/predicate.hpp>
|
||||||
#include "concordia/word_map.hpp"
|
#include "concordia/word_map.hpp"
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
#include "concordia/sentence_anonymizer.hpp"
|
#include "concordia/sentence_tokenizer.hpp"
|
||||||
#include "concordia/concordia_config.hpp"
|
#include "concordia/concordia_config.hpp"
|
||||||
#include "concordia/concordia_exception.hpp"
|
#include "concordia/concordia_exception.hpp"
|
||||||
|
|
||||||
@ -71,7 +71,7 @@ public:
|
|||||||
private:
|
private:
|
||||||
boost::shared_ptr<WordMap> _wordMap;
|
boost::shared_ptr<WordMap> _wordMap;
|
||||||
|
|
||||||
boost::shared_ptr<SentenceAnonymizer> _sentenceAnonymizer;
|
boost::shared_ptr<SentenceTokenizer> _sentenceTokenizer;
|
||||||
|
|
||||||
std::string _wordMapFilePath;
|
std::string _wordMapFilePath;
|
||||||
};
|
};
|
||||||
|
@ -48,7 +48,7 @@ public:
|
|||||||
/*! Method for adding an original word position to the list.
|
/*! Method for adding an original word position to the list.
|
||||||
\param originalWordPosition original word position
|
\param originalWordPosition original word position
|
||||||
*/
|
*/
|
||||||
void addWordOriginalWordPosition(Interval & originalWordPosition) {
|
void addOriginalWordPosition(Interval & originalWordPosition) {
|
||||||
_originalWordPositions.push_back(originalWordPosition);
|
_originalWordPositions.push_back(originalWordPosition);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5,10 +5,12 @@
|
|||||||
#include <boost/throw_exception.hpp>
|
#include <boost/throw_exception.hpp>
|
||||||
|
|
||||||
RegexRule::RegexRule(std::string patternString,
|
RegexRule::RegexRule(std::string patternString,
|
||||||
std::string value,
|
char annotationType,
|
||||||
bool caseSensitive)
|
std::string value,
|
||||||
throw(ConcordiaException):
|
bool caseSensitive)
|
||||||
_value(value) {
|
throw(ConcordiaException):
|
||||||
|
_annotationType(annotationType),
|
||||||
|
_value(value) {
|
||||||
try {
|
try {
|
||||||
if (caseSensitive) {
|
if (caseSensitive) {
|
||||||
_pattern = boost::make_u32regex(UnicodeString(patternString.c_str()));
|
_pattern = boost::make_u32regex(UnicodeString(patternString.c_str()));
|
||||||
@ -32,7 +34,7 @@ RegexRule::RegexRule(std::string patternString,
|
|||||||
RegexRule::~RegexRule() {
|
RegexRule::~RegexRule() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void RegexRule::apply(boost::shared_ptr<AnonymizedSentence> sentence) {
|
void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
|
||||||
try {
|
try {
|
||||||
UnicodeString s(sentence->getSentence().c_str());
|
UnicodeString s(sentence->getSentence().c_str());
|
||||||
boost::u32regex_iterator<const UChar*> begin(boost::make_u32regex_iterator(s, _pattern));
|
boost::u32regex_iterator<const UChar*> begin(boost::make_u32regex_iterator(s, _pattern));
|
||||||
@ -41,14 +43,14 @@ void RegexRule::apply(boost::shared_ptr<AnonymizedSentence> sentence) {
|
|||||||
for (; begin != end; ++begin) {
|
for (; begin != end; ++begin) {
|
||||||
SUFFIX_MARKER_TYPE matchBegin = begin->position();
|
SUFFIX_MARKER_TYPE matchBegin = begin->position();
|
||||||
SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length();
|
SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length();
|
||||||
TokenAnnotation annotation(matchBegin, matchEnd, 'a', _value);
|
TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, "");
|
||||||
annotations.push_back(annotation);
|
annotations.push_back(annotation);
|
||||||
}
|
}
|
||||||
sentence->addAnnotations(annotations);
|
sentence->addAnnotations(annotations);
|
||||||
} catch(const std::exception & e) {
|
} catch(const std::exception & e) {
|
||||||
std::stringstream ss;
|
std::stringstream ss;
|
||||||
ss << "Exception while applying regex rule: "
|
ss << "Exception while applying regex rule: "
|
||||||
<< _value << " to text: " << sentence->getSentence();
|
<< _annotationType << " to text: " << sentence->getSentence();
|
||||||
ss << ", message: " << e.what();
|
ss << ", message: " << e.what();
|
||||||
throw ConcordiaException(ss.str());
|
throw ConcordiaException(ss.str());
|
||||||
}
|
}
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
#include "concordia/anonymized_sentence.hpp"
|
#include "concordia/tokenized_sentence.hpp"
|
||||||
#include "concordia/concordia_exception.hpp"
|
#include "concordia/concordia_exception.hpp"
|
||||||
#include <boost/shared_ptr.hpp>
|
#include <boost/shared_ptr.hpp>
|
||||||
#include <boost/regex.hpp>
|
#include <boost/regex.hpp>
|
||||||
@ -24,12 +24,14 @@ public:
|
|||||||
/*!
|
/*!
|
||||||
Constructor.
|
Constructor.
|
||||||
\param patternString regex pattern to match
|
\param patternString regex pattern to match
|
||||||
\param replacement string to substitute the found match
|
\param annotationType type of annotation
|
||||||
\param caseSensitive case sensitivity of the pattern
|
\param caseSensitive case sensitivity of the pattern
|
||||||
*/
|
*/
|
||||||
RegexRule(std::string patternString, std::string value,
|
RegexRule(std::string patternString,
|
||||||
bool caseSensitive = true)
|
char annotationType,
|
||||||
throw(ConcordiaException);
|
std::string value,
|
||||||
|
bool caseSensitive = true)
|
||||||
|
throw(ConcordiaException);
|
||||||
|
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
@ -38,12 +40,14 @@ public:
|
|||||||
/*! Applies the operation on anonymized sentence.
|
/*! Applies the operation on anonymized sentence.
|
||||||
\param sentence the input sentence
|
\param sentence the input sentence
|
||||||
*/
|
*/
|
||||||
void apply(boost::shared_ptr<AnonymizedSentence> sentence);
|
void apply(boost::shared_ptr<TokenizedSentence> sentence);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
boost::u32regex _pattern;
|
char _annotationType;
|
||||||
|
|
||||||
std::string _value;
|
std::string _value;
|
||||||
|
|
||||||
|
boost::u32regex _pattern;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
#include "concordia/sentence_anonymizer.hpp"
|
#include "concordia/sentence_tokenizer.hpp"
|
||||||
|
#include "concordia/token_annotation.hpp"
|
||||||
|
|
||||||
#include <boost/foreach.hpp>
|
#include <boost/foreach.hpp>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
@ -6,29 +7,27 @@
|
|||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <boost/algorithm/string.hpp>
|
#include <boost/algorithm/string.hpp>
|
||||||
|
|
||||||
SentenceAnonymizer::SentenceAnonymizer(
|
SentenceTokenizer::SentenceTokenizer(
|
||||||
boost::shared_ptr<ConcordiaConfig> config)
|
boost::shared_ptr<ConcordiaConfig> config)
|
||||||
throw(ConcordiaException) {
|
throw(ConcordiaException) {
|
||||||
_createNeRules(config->getNamedEntitiesFilePath());
|
_createNeRules(config->getNamedEntitiesFilePath());
|
||||||
_createHtmlTagsRule(config->getHtmlTagsFilePath());
|
_createHtmlTagsRule(config->getHtmlTagsFilePath());
|
||||||
_stopWordsEnabled = config->isStopWordsEnabled();
|
_stopWordsEnabled = config->isStopWordsEnabled();
|
||||||
if (_stopWordsEnabled) {
|
if (_stopWordsEnabled) {
|
||||||
_stopWords = _getMultipleReplacementRule(
|
_stopWords = _getMultipleRegexRule(
|
||||||
config->getStopWordsFilePath(), "", true);
|
config->getStopWordsFilePath(),
|
||||||
|
TokenAnnotation::STOP_WORD_TYPE,
|
||||||
|
"", true);
|
||||||
}
|
}
|
||||||
_stopSymbols = _getMultipleReplacementRule(
|
|
||||||
config->getStopSymbolsFilePath(), "");
|
|
||||||
_spaceSymbols = _getMultipleReplacementRule(
|
|
||||||
config->getSpaceSymbolsFilePath(), " ");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SentenceAnonymizer::~SentenceAnonymizer() {
|
SentenceTokenizer::~SentenceTokenizer() {
|
||||||
}
|
}
|
||||||
|
|
||||||
boost::shared_ptr<AnonymizedSentence>
|
boost::shared_ptr<TokenizedSentence>
|
||||||
SentenceAnonymizer::anonymize(const std::string & sentence) {
|
SentenceTokenizer::tokenize(const std::string & sentence) {
|
||||||
boost::shared_ptr<AnonymizedSentence>
|
boost::shared_ptr<TokenizedSentence>
|
||||||
result(new AnonymizedSentence(sentence));
|
result(new TokenizedSentence(sentence));
|
||||||
|
|
||||||
_htmlTags->apply(result);
|
_htmlTags->apply(result);
|
||||||
|
|
||||||
@ -41,13 +40,14 @@ boost::shared_ptr<AnonymizedSentence>
|
|||||||
if (_stopWordsEnabled) {
|
if (_stopWordsEnabled) {
|
||||||
_stopWords->apply(result);
|
_stopWords->apply(result);
|
||||||
}
|
}
|
||||||
_stopSymbols->apply(result);
|
|
||||||
_spaceSymbols->apply(result);
|
boost::shared_ptr<RegexRule> wordsRule(
|
||||||
|
new RegexRule("\\w+", TokenAnnotation::WORD_TYPE, "word"));
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) {
|
void SentenceTokenizer::_createNeRules(std::string & namedEntitiesPath) {
|
||||||
if (boost::filesystem::exists(namedEntitiesPath)) {
|
if (boost::filesystem::exists(namedEntitiesPath)) {
|
||||||
std::string line;
|
std::string line;
|
||||||
std::ifstream neFile(namedEntitiesPath.c_str());
|
std::ifstream neFile(namedEntitiesPath.c_str());
|
||||||
@ -66,7 +66,9 @@ void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) {
|
|||||||
throw ConcordiaException(ss.str());
|
throw ConcordiaException(ss.str());
|
||||||
} else {
|
} else {
|
||||||
_namedEntities.push_back(RegexRule(
|
_namedEntities.push_back(RegexRule(
|
||||||
tokenTexts->at(0), tokenTexts->at(1)));
|
tokenTexts->at(0),
|
||||||
|
TokenAnnotation::NE_TYPE,
|
||||||
|
tokenTexts->at(1)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
neFile.close();
|
neFile.close();
|
||||||
@ -78,7 +80,7 @@ void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void SentenceAnonymizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
|
void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
|
||||||
std::string tagsExpression = "<\\/?(";
|
std::string tagsExpression = "<\\/?(";
|
||||||
if (boost::filesystem::exists(htmlTagsPath)) {
|
if (boost::filesystem::exists(htmlTagsPath)) {
|
||||||
std::string line;
|
std::string line;
|
||||||
@ -97,12 +99,15 @@ void SentenceAnonymizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
|
|||||||
tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
|
tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
|
||||||
tagsExpression += "br).*?>";
|
tagsExpression += "br).*?>";
|
||||||
_htmlTags = boost::shared_ptr<RegexRule>(
|
_htmlTags = boost::shared_ptr<RegexRule>(
|
||||||
new RegexRule(tagsExpression, "", false));
|
new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG_TYPE, "", false));
|
||||||
}
|
}
|
||||||
|
|
||||||
boost::shared_ptr<RegexRule>
|
boost::shared_ptr<RegexRule>
|
||||||
SentenceAnonymizer::_getMultipleReplacementRule(
|
SentenceTokenizer::_getMultipleRegexRule(
|
||||||
std::string & filePath, std::string replacement, bool wholeWord) {
|
std::string filePath,
|
||||||
|
char annotationType,
|
||||||
|
std::string value,
|
||||||
|
bool wholeWord) {
|
||||||
std::string expression = "(";
|
std::string expression = "(";
|
||||||
if (boost::filesystem::exists(filePath)) {
|
if (boost::filesystem::exists(filePath)) {
|
||||||
std::string line;
|
std::string line;
|
||||||
@ -128,6 +133,6 @@ boost::shared_ptr<RegexRule>
|
|||||||
expression = expression.substr(0, expression.size()-1);
|
expression = expression.substr(0, expression.size()-1);
|
||||||
expression += ")";
|
expression += ")";
|
||||||
return boost::shared_ptr<RegexRule>(
|
return boost::shared_ptr<RegexRule>(
|
||||||
new RegexRule(expression, replacement, false));
|
new RegexRule(expression, annotationType, value, false));
|
||||||
}
|
}
|
||||||
|
|
@ -1,10 +1,10 @@
|
|||||||
#ifndef SENTENCE_ANONYMIZER_HDR
|
#ifndef SENTENCE_TOKENIZER_HDR
|
||||||
#define SENTENCE_ANONYMIZER_HDR
|
#define SENTENCE_TOKENIZER_HDR
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
#include "concordia/anonymized_sentence.hpp"
|
#include "concordia/tokenized_sentence.hpp"
|
||||||
#include "concordia/regex_rule.hpp"
|
#include "concordia/regex_rule.hpp"
|
||||||
#include "concordia/concordia_config.hpp"
|
#include "concordia/concordia_config.hpp"
|
||||||
#include "concordia/concordia_exception.hpp"
|
#include "concordia/concordia_exception.hpp"
|
||||||
@ -13,42 +13,42 @@
|
|||||||
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
Class for anonymizing sentence before generating hash.
|
Class for tokenizing sentence before generating hash.
|
||||||
This operation is used to
|
This operation is used to
|
||||||
remove unnecessary symbols and possibly words from sentences added to index
|
remove unnecessary symbols and possibly words from sentences added to index
|
||||||
and search patterns. Anonymizer removes html tags, substitutes predefined symbols
|
and search patterns. Tokenizer annotates html tags, removes stop words (if the option is enabled),
|
||||||
with a single space, removes stop words (if the option is enabled), as well as
|
as well as annotates named entities and special symbols. All these have to be listed in files
|
||||||
named entities and special symbols. All these have to be listed in files
|
|
||||||
(see \ref tutorial3).
|
(see \ref tutorial3).
|
||||||
*/
|
*/
|
||||||
|
|
||||||
class SentenceAnonymizer {
|
class SentenceTokenizer {
|
||||||
public:
|
public:
|
||||||
/*! Constructor.
|
/*! Constructor.
|
||||||
\param config config object, holding paths to necessary files
|
\param config config object, holding paths to necessary files
|
||||||
*/
|
*/
|
||||||
explicit SentenceAnonymizer(boost::shared_ptr<ConcordiaConfig> config)
|
explicit SentenceTokenizer(boost::shared_ptr<ConcordiaConfig> config)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
virtual ~SentenceAnonymizer();
|
virtual ~SentenceTokenizer();
|
||||||
|
|
||||||
/*! Anonymizes the sentence.
|
/*! Tokenizes the sentence.
|
||||||
\param sentence input sentence
|
\param sentence input sentence
|
||||||
\returns altered version of the input sentence
|
\returns altered version of the input sentence
|
||||||
*/
|
*/
|
||||||
boost::shared_ptr<AnonymizedSentence>
|
boost::shared_ptr<TokenizedSentence>
|
||||||
anonymize(const std::string & sentence);
|
tokenize(const std::string & sentence);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void _createNeRules(std::string & namedEntitiesPath);
|
void _createNeRules(std::string & namedEntitiesPath);
|
||||||
|
|
||||||
void _createHtmlTagsRule(std::string & htmlTagsPath);
|
void _createHtmlTagsRule(std::string & htmlTagsPath);
|
||||||
|
|
||||||
boost::shared_ptr<RegexRule> _getMultipleReplacementRule(
|
boost::shared_ptr<RegexRule> _getMultipleRegexRule(
|
||||||
std::string & filePath,
|
std::string filePath,
|
||||||
std::string replacement,
|
char annotationType,
|
||||||
|
std::string value,
|
||||||
bool wholeWord = false);
|
bool wholeWord = false);
|
||||||
|
|
||||||
std::vector<RegexRule> _namedEntities;
|
std::vector<RegexRule> _namedEntities;
|
||||||
@ -59,9 +59,6 @@ private:
|
|||||||
|
|
||||||
boost::shared_ptr<RegexRule> _stopWords;
|
boost::shared_ptr<RegexRule> _stopWords;
|
||||||
|
|
||||||
boost::shared_ptr<RegexRule> _stopSymbols;
|
|
||||||
|
|
||||||
boost::shared_ptr<RegexRule> _spaceSymbols;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
@ -1,8 +1,8 @@
|
|||||||
add_library(concordia-tests
|
add_library(concordia-tests
|
||||||
test_regex_rule.cpp
|
test_regex_rule.cpp
|
||||||
test_anonymized_sentence.cpp
|
test_tokenized_sentence.cpp
|
||||||
test_concordia_searcher.cpp
|
test_concordia_searcher.cpp
|
||||||
test_sentence_anonymizer.cpp
|
test_sentence_tokenizer.cpp
|
||||||
test_text_utils.cpp
|
test_text_utils.cpp
|
||||||
test_example.cpp
|
test_example.cpp
|
||||||
test_tm_matches.cpp
|
test_tm_matches.cpp
|
||||||
|
@ -16,10 +16,8 @@ BOOST_AUTO_TEST_CASE( ConfigParameters )
|
|||||||
BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "/tmp/hi.bin" );
|
BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "/tmp/hi.bin" );
|
||||||
BOOST_CHECK_EQUAL( config.getMarkersFilePath() , "/tmp/ma.bin" );
|
BOOST_CHECK_EQUAL( config.getMarkersFilePath() , "/tmp/ma.bin" );
|
||||||
BOOST_CHECK_EQUAL( config.getHtmlTagsFilePath() , "/tmp/html_tags.txt" );
|
BOOST_CHECK_EQUAL( config.getHtmlTagsFilePath() , "/tmp/html_tags.txt" );
|
||||||
BOOST_CHECK_EQUAL( config.getSpaceSymbolsFilePath() , "/tmp/space_symbols.txt" );
|
|
||||||
BOOST_CHECK_EQUAL( config.getStopWordsFilePath() , "/tmp/stop_words.txt" );
|
BOOST_CHECK_EQUAL( config.getStopWordsFilePath() , "/tmp/stop_words.txt" );
|
||||||
BOOST_CHECK_EQUAL( config.getNamedEntitiesFilePath() , "/tmp/named_entities.txt" );
|
BOOST_CHECK_EQUAL( config.getNamedEntitiesFilePath() , "/tmp/named_entities.txt" );
|
||||||
BOOST_CHECK_EQUAL( config.getStopSymbolsFilePath() , "/tmp/stop_symbols.txt" );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( NonexistentConfigTest )
|
BOOST_AUTO_TEST_CASE( NonexistentConfigTest )
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||||
#include "concordia/regex_rule.hpp"
|
#include "concordia/regex_rule.hpp"
|
||||||
#include "concordia/anonymized_sentence.hpp"
|
#include "concordia/tokenized_sentence.hpp"
|
||||||
|
#include "concordia/token_annotation.hpp"
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
#include <boost/shared_ptr.hpp>
|
#include <boost/shared_ptr.hpp>
|
||||||
#include <boost/algorithm/string/predicate.hpp>
|
#include <boost/algorithm/string/predicate.hpp>
|
||||||
@ -11,11 +12,11 @@ BOOST_AUTO_TEST_SUITE(regex_rule)
|
|||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( SimpleReplacement )
|
BOOST_AUTO_TEST_CASE( SimpleReplacement )
|
||||||
{
|
{
|
||||||
RegexRule rr("a","b");
|
RegexRule rr("a", TokenAnnotation::WORD_TYPE, "b");
|
||||||
boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("xxxxxxxaxxxaxxaxaxa"));
|
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("xxxxxxxaxxxaxxaxaxa"));
|
||||||
rr.apply(as);
|
rr.apply(ts);
|
||||||
BOOST_CHECK_EQUAL(as->getAnnotations().size(),5);
|
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
|
||||||
std::list<TokenAnnotation> annotations = as->getAnnotations();
|
std::list<TokenAnnotation> annotations = ts->getAnnotations();
|
||||||
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),7);
|
BOOST_CHECK_EQUAL(iter->getStart(),7);
|
||||||
@ -43,7 +44,7 @@ BOOST_AUTO_TEST_CASE( BadRegex )
|
|||||||
bool exceptionThrown = false;
|
bool exceptionThrown = false;
|
||||||
std::string message = "";
|
std::string message = "";
|
||||||
try {
|
try {
|
||||||
RegexRule rr("+a","b");
|
RegexRule rr("+a",TokenAnnotation::WORD_TYPE, "b");
|
||||||
} catch (ConcordiaException & e) {
|
} catch (ConcordiaException & e) {
|
||||||
exceptionThrown = true;
|
exceptionThrown = true;
|
||||||
message = e.what();
|
message = e.what();
|
||||||
@ -54,11 +55,11 @@ BOOST_AUTO_TEST_CASE( BadRegex )
|
|||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
|
BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
|
||||||
{
|
{
|
||||||
RegexRule rr("['\"\\\\.]","");
|
RegexRule rr("['\"\\\\.]", TokenAnnotation::WORD_TYPE, "");
|
||||||
boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("Don't stop believin' \\ Hold on to the feelin'."));
|
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("Don't stop believin' \\ Hold on to the feelin'."));
|
||||||
rr.apply(as);
|
rr.apply(ts);
|
||||||
BOOST_CHECK_EQUAL(as->getAnnotations().size(),5);
|
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
|
||||||
std::list<TokenAnnotation> annotations = as->getAnnotations();
|
std::list<TokenAnnotation> annotations = ts->getAnnotations();
|
||||||
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),3);
|
BOOST_CHECK_EQUAL(iter->getStart(),3);
|
||||||
@ -84,11 +85,11 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
|
|||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
|
BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
|
||||||
{
|
{
|
||||||
RegexRule rr("abc","xxx", false);
|
RegexRule rr("abc", TokenAnnotation::WORD_TYPE, "xxx", false);
|
||||||
boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("This is AbC and ABC and abc and aBC."));
|
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("This is AbC and ABC and abc and aBC."));
|
||||||
rr.apply(as);
|
rr.apply(ts);
|
||||||
BOOST_CHECK_EQUAL(as->getAnnotations().size(),4);
|
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),4);
|
||||||
std::list<TokenAnnotation> annotations = as->getAnnotations();
|
std::list<TokenAnnotation> annotations = ts->getAnnotations();
|
||||||
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),8);
|
BOOST_CHECK_EQUAL(iter->getStart(),8);
|
||||||
@ -109,11 +110,11 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
|
|||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( UnicodeReplacement )
|
BOOST_AUTO_TEST_CASE( UnicodeReplacement )
|
||||||
{
|
{
|
||||||
RegexRule rr("ą","x");
|
RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x");
|
||||||
boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń"));
|
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń"));
|
||||||
rr.apply(as);
|
rr.apply(ts);
|
||||||
BOOST_CHECK_EQUAL(as->getAnnotations().size(),1);
|
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),1);
|
||||||
std::list<TokenAnnotation> annotations = as->getAnnotations();
|
std::list<TokenAnnotation> annotations = ts->getAnnotations();
|
||||||
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),11);
|
BOOST_CHECK_EQUAL(iter->getStart(),11);
|
||||||
@ -122,11 +123,11 @@ BOOST_AUTO_TEST_CASE( UnicodeReplacement )
|
|||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
|
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
|
||||||
{
|
{
|
||||||
RegexRule rr("ą","x", false);
|
RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x", false);
|
||||||
boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
|
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
|
||||||
rr.apply(as);
|
rr.apply(ts);
|
||||||
BOOST_CHECK_EQUAL(as->getAnnotations().size(),2);
|
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),2);
|
||||||
std::list<TokenAnnotation> annotations = as->getAnnotations();
|
std::list<TokenAnnotation> annotations = ts->getAnnotations();
|
||||||
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),11);
|
BOOST_CHECK_EQUAL(iter->getStart(),11);
|
||||||
@ -139,11 +140,11 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
|
|||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
|
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
|
||||||
{
|
{
|
||||||
RegexRule rr("[ąćęłńóśżź]","x", false);
|
RegexRule rr("[ąćęłńóśżź]", TokenAnnotation::WORD_TYPE, "x", false);
|
||||||
boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
|
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
|
||||||
rr.apply(as);
|
rr.apply(ts);
|
||||||
BOOST_CHECK_EQUAL(as->getAnnotations().size(),18);
|
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),18);
|
||||||
std::list<TokenAnnotation> annotations = as->getAnnotations();
|
std::list<TokenAnnotation> annotations = ts->getAnnotations();
|
||||||
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),2);
|
BOOST_CHECK_EQUAL(iter->getStart(),2);
|
||||||
|
@ -1,76 +0,0 @@
|
|||||||
#include <boost/filesystem.hpp>
|
|
||||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
|
||||||
#include <string>
|
|
||||||
#include <sstream>
|
|
||||||
|
|
||||||
#include <boost/shared_ptr.hpp>
|
|
||||||
#include "concordia/common/config.hpp"
|
|
||||||
#include "concordia/sentence_anonymizer.hpp"
|
|
||||||
#include "tests/common/test_resources_manager.hpp"
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE(sentence_anonymizer)
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( NETest )
|
|
||||||
{
|
|
||||||
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
|
||||||
SentenceAnonymizer anonymizer(config);
|
|
||||||
|
|
||||||
|
|
||||||
std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34";
|
|
||||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"date ne_date mail ne_email number ne_number");
|
|
||||||
}
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( HtmlTagsTest )
|
|
||||||
{
|
|
||||||
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
|
||||||
SentenceAnonymizer anonymizer(config);
|
|
||||||
|
|
||||||
|
|
||||||
std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
|
|
||||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"link and bold and newline ");
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( StopWordsTest )
|
|
||||||
{
|
|
||||||
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
|
||||||
if (config->isStopWordsEnabled()) {
|
|
||||||
SentenceAnonymizer anonymizer(config);
|
|
||||||
std::string sentence = "Aczkolwiek nie wiem, czy to konieczne";
|
|
||||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence()," wiem konieczne");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( StopSymbolsTest )
|
|
||||||
{
|
|
||||||
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
|
||||||
SentenceAnonymizer anonymizer(config);
|
|
||||||
|
|
||||||
|
|
||||||
std::string sentence = "xxx, . xxx # xx $xx@ xx";
|
|
||||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx xxx xx xx xx");
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( SpaceSymbolsTest )
|
|
||||||
{
|
|
||||||
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
|
||||||
SentenceAnonymizer anonymizer(config);
|
|
||||||
|
|
||||||
|
|
||||||
std::string sentence = "xxx-xxx xx|xx";
|
|
||||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx xxx xx xx");
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
|
|
||||||
{
|
|
||||||
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
|
||||||
SentenceAnonymizer anonymizer(config);
|
|
||||||
|
|
||||||
std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
|
|
||||||
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"sony dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap rm mne_numbergrm mne_numbere — ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number ");
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE_END()
|
|
89
concordia/t/test_sentence_tokenizer.cpp
Normal file
89
concordia/t/test_sentence_tokenizer.cpp
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
#include <boost/filesystem.hpp>
|
||||||
|
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||||
|
#include <string>
|
||||||
|
#include <sstream>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
#include <boost/shared_ptr.hpp>
|
||||||
|
#include <boost/foreach.hpp>
|
||||||
|
#include "concordia/common/config.hpp"
|
||||||
|
#include "concordia/sentence_tokenizer.hpp"
|
||||||
|
#include "concordia/tokenized_sentence.hpp"
|
||||||
|
#include "tests/common/test_resources_manager.hpp"
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_SUITE(sentence_tokenizer)
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( NETest )
|
||||||
|
{
|
||||||
|
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
||||||
|
SentenceTokenizer tokenizer(config);
|
||||||
|
|
||||||
|
|
||||||
|
std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34";
|
||||||
|
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
|
||||||
|
std::list<TokenAnnotation> annotations = ts->getAnnotations();
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(8,annotations.size());
|
||||||
|
BOOST_FOREACH(TokenAnnotation annotation, annotations) {
|
||||||
|
std::cout << annotation.getStart() << ","
|
||||||
|
<< annotation.getEnd() << " type: "
|
||||||
|
<< annotation.getType() << " value: "
|
||||||
|
<< annotation.getValue() << std::endl;
|
||||||
|
}
|
||||||
|
// BOOST_CHECK_EQUAL(,"date ne_date mail ne_email number ne_number");
|
||||||
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( HtmlTagsTest )
|
||||||
|
{
|
||||||
|
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
||||||
|
SentenceTokenizer tokenizer(config);
|
||||||
|
|
||||||
|
|
||||||
|
std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
|
||||||
|
BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"link and bold and newline ");
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( StopWordsTest )
|
||||||
|
{
|
||||||
|
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
||||||
|
if (config->isStopWordsEnabled()) {
|
||||||
|
SentenceTokenizer tokenizer(config);
|
||||||
|
std::string sentence = "Aczkolwiek nie wiem, czy to konieczne";
|
||||||
|
BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence()," wiem konieczne");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( StopSymbolsTest )
|
||||||
|
{
|
||||||
|
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
||||||
|
SentenceTokenizer tokenizer(config);
|
||||||
|
|
||||||
|
|
||||||
|
std::string sentence = "xxx, . xxx # xx $xx@ xx";
|
||||||
|
BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"xxx xxx xx xx xx");
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( SpaceSymbolsTest )
|
||||||
|
{
|
||||||
|
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
||||||
|
SentenceTokenizer tokenizer(config);
|
||||||
|
|
||||||
|
|
||||||
|
std::string sentence = "xxx-xxx xx|xx";
|
||||||
|
BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"xxx xxx xx xx");
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
|
||||||
|
{
|
||||||
|
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
||||||
|
SentenceTokenizer tokenizer(config);
|
||||||
|
|
||||||
|
std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
|
||||||
|
BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"sony dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap rm mne_numbergrm mne_numbere — ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number ");
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_SUITE_END()
|
@ -1,14 +1,14 @@
|
|||||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||||
#include "concordia/anonymized_sentence.hpp"
|
#include "concordia/tokenized_sentence.hpp"
|
||||||
#include "concordia/token_annotation.hpp"
|
#include "concordia/token_annotation.hpp"
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE(anonymized_sentence)
|
BOOST_AUTO_TEST_SUITE(tokenized_sentence)
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( AnnotationsTrivial )
|
BOOST_AUTO_TEST_CASE( AnnotationsTrivial )
|
||||||
{
|
{
|
||||||
AnonymizedSentence as("This is a test sentence");
|
TokenizedSentence ts("This is a test sentence");
|
||||||
|
|
||||||
std::vector<TokenAnnotation> annotations;
|
std::vector<TokenAnnotation> annotations;
|
||||||
annotations.push_back(TokenAnnotation(0,1,'a',"val"));
|
annotations.push_back(TokenAnnotation(0,1,'a',"val"));
|
||||||
@ -16,22 +16,22 @@ BOOST_AUTO_TEST_CASE( AnnotationsTrivial )
|
|||||||
annotations.push_back(TokenAnnotation(7,10,'a',"val"));
|
annotations.push_back(TokenAnnotation(7,10,'a',"val"));
|
||||||
annotations.push_back(TokenAnnotation(12,14,'a',"val"));
|
annotations.push_back(TokenAnnotation(12,14,'a',"val"));
|
||||||
|
|
||||||
as.addAnnotations(annotations);
|
ts.addAnnotations(annotations);
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(as.getAnnotations().size(), 4);
|
BOOST_CHECK_EQUAL(ts.getAnnotations().size(), 4);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( AnnotationsIntersecting )
|
BOOST_AUTO_TEST_CASE( AnnotationsIntersecting )
|
||||||
{
|
{
|
||||||
AnonymizedSentence as("This is a test sentence");
|
TokenizedSentence ts("This is a test sentence");
|
||||||
|
|
||||||
std::vector<TokenAnnotation> annotations1;
|
std::vector<TokenAnnotation> annotations1;
|
||||||
annotations1.push_back(TokenAnnotation(0,1,'a',"val"));
|
annotations1.push_back(TokenAnnotation(0,1,'a',"val"));
|
||||||
annotations1.push_back(TokenAnnotation(4,6,'a',"val"));
|
annotations1.push_back(TokenAnnotation(4,6,'a',"val"));
|
||||||
annotations1.push_back(TokenAnnotation(7,10,'a',"val"));
|
annotations1.push_back(TokenAnnotation(7,10,'a',"val"));
|
||||||
annotations1.push_back(TokenAnnotation(12,14,'a',"val"));
|
annotations1.push_back(TokenAnnotation(12,14,'a',"val"));
|
||||||
as.addAnnotations(annotations1);
|
ts.addAnnotations(annotations1);
|
||||||
/* annotation
|
/* annotation
|
||||||
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
||||||
- ---- ------- -----
|
- ---- ------- -----
|
||||||
@ -43,7 +43,7 @@ BOOST_AUTO_TEST_CASE( AnnotationsIntersecting )
|
|||||||
annotations2.push_back(TokenAnnotation(4,7,'a',"val"));
|
annotations2.push_back(TokenAnnotation(4,7,'a',"val"));
|
||||||
annotations2.push_back(TokenAnnotation(10,11,'a',"val"));
|
annotations2.push_back(TokenAnnotation(10,11,'a',"val"));
|
||||||
annotations2.push_back(TokenAnnotation(11,13,'a',"val"));
|
annotations2.push_back(TokenAnnotation(11,13,'a',"val"));
|
||||||
as.addAnnotations(annotations2);
|
ts.addAnnotations(annotations2);
|
||||||
/* annotations2
|
/* annotations2
|
||||||
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
||||||
------- ------- -- -----
|
------- ------- -- -----
|
||||||
@ -53,8 +53,8 @@ BOOST_AUTO_TEST_CASE( AnnotationsIntersecting )
|
|||||||
- ------- ---- ------- -- -----
|
- ------- ---- ------- -- -----
|
||||||
|
|
||||||
*/
|
*/
|
||||||
BOOST_CHECK_EQUAL(as.getAnnotations().size(), 6);
|
BOOST_CHECK_EQUAL(ts.getAnnotations().size(), 6);
|
||||||
std::list<TokenAnnotation> annotations = as.getAnnotations();
|
std::list<TokenAnnotation> annotations = ts.getAnnotations();
|
||||||
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(iter->getStart(),0);
|
BOOST_CHECK_EQUAL(iter->getStart(),0);
|
@ -13,3 +13,7 @@ TokenAnnotation::TokenAnnotation(const SUFFIX_MARKER_TYPE start,
|
|||||||
TokenAnnotation::~TokenAnnotation() {
|
TokenAnnotation::~TokenAnnotation() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
char TokenAnnotation::NE_TYPE = 0;
|
||||||
|
char TokenAnnotation::WORD_TYPE = 1;
|
||||||
|
char TokenAnnotation::HTML_TAG_TYPE = 2;
|
||||||
|
char TokenAnnotation::STOP_WORD_TYPE = 3;
|
||||||
|
@ -44,6 +44,14 @@ public:
|
|||||||
return _value;
|
return _value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static char NE_TYPE;
|
||||||
|
|
||||||
|
static char WORD_TYPE;
|
||||||
|
|
||||||
|
static char HTML_TAG_TYPE;
|
||||||
|
|
||||||
|
static char STOP_WORD_TYPE;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
char _annotationType;
|
char _annotationType;
|
||||||
|
|
||||||
|
@ -1,16 +1,16 @@
|
|||||||
#include "concordia/anonymized_sentence.hpp"
|
#include "concordia/tokenized_sentence.hpp"
|
||||||
#include "concordia/common/text_utils.hpp"
|
#include "concordia/common/text_utils.hpp"
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
AnonymizedSentence::AnonymizedSentence(std::string sentence):
|
TokenizedSentence::TokenizedSentence(std::string sentence):
|
||||||
_sentence(sentence) {
|
_sentence(sentence) {
|
||||||
}
|
}
|
||||||
|
|
||||||
AnonymizedSentence::~AnonymizedSentence() {
|
TokenizedSentence::~TokenizedSentence() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void AnonymizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) {
|
void TokenizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) {
|
||||||
std::vector<TokenAnnotation>::iterator newAnnotation = annotations.begin();
|
std::vector<TokenAnnotation>::iterator newAnnotation = annotations.begin();
|
||||||
std::list<TokenAnnotation>::iterator existingAnnotation = _tokenAnnotations.begin();
|
std::list<TokenAnnotation>::iterator existingAnnotation = _tokenAnnotations.begin();
|
||||||
|
|
||||||
@ -43,6 +43,6 @@ void AnonymizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void AnonymizedSentence::toLowerCase() {
|
void TokenizedSentence::toLowerCase() {
|
||||||
_sentence = TextUtils::getInstance().toLowerCase(_sentence);
|
_sentence = TextUtils::getInstance().toLowerCase(_sentence);
|
||||||
}
|
}
|
@ -1,5 +1,5 @@
|
|||||||
#ifndef ANONYMIZED_SENTENCE_HDR
|
#ifndef TOKENIZED_SENTENCE_HDR
|
||||||
#define ANONYMIZED_SENTENCE_HDR
|
#define TOKENIZED_SENTENCE_HDR
|
||||||
|
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
#include "concordia/token_annotation.hpp"
|
#include "concordia/token_annotation.hpp"
|
||||||
@ -13,17 +13,17 @@
|
|||||||
along with the annotations list.
|
along with the annotations list.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
class AnonymizedSentence {
|
class TokenizedSentence {
|
||||||
public:
|
public:
|
||||||
/*!
|
/*!
|
||||||
Constructor.
|
Constructor.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
AnonymizedSentence(std::string sentence);
|
TokenizedSentence(std::string sentence);
|
||||||
|
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
virtual ~AnonymizedSentence();
|
virtual ~TokenizedSentence();
|
||||||
|
|
||||||
/*! Getter for sentence
|
/*! Getter for sentence
|
||||||
\returns sentence
|
\returns sentence
|
@ -207,9 +207,8 @@ markers_path = "<CONCORDIA_HOME>/tests/resources/temp/temp_markers.bin"
|
|||||||
word_map_path = "<CONCORDIA_HOME>/tests/resources/temp/temp_word_map.bin"
|
word_map_path = "<CONCORDIA_HOME>/tests/resources/temp/temp_word_map.bin"
|
||||||
|
|
||||||
#-------------------------------------------------------------------------------
|
#-------------------------------------------------------------------------------
|
||||||
# The following settings control the sentence anonymizer mechanism. It is used to
|
# The following settings control the sentence tokenizer mechanism. Tokenizer
|
||||||
# remove unnecessary symbols and possibly words from sentences added to index
|
# takes into account html tags, substitutes predefined symbols
|
||||||
# and search patterns. Anonymizer removes html tags, substitutes predefined symbols
|
|
||||||
# with a single space, removes stop words (if the option is enabled), as well as
|
# with a single space, removes stop words (if the option is enabled), as well as
|
||||||
# named entities and special symbols. All these have to be listed in files.
|
# named entities and special symbols. All these have to be listed in files.
|
||||||
|
|
||||||
|
@ -1,6 +0,0 @@
|
|||||||
\|
|
|
||||||
\–
|
|
||||||
\-
|
|
||||||
\/
|
|
||||||
;
|
|
||||||
:
|
|
@ -1,37 +0,0 @@
|
|||||||
\\tab
|
|
||||||
\\emdash
|
|
||||||
\<
|
|
||||||
\>
|
|
||||||
\&
|
|
||||||
\"
|
|
||||||
\‐
|
|
||||||
\
|
|
||||||
<
|
|
||||||
>
|
|
||||||
=
|
|
||||||
\+
|
|
||||||
„
|
|
||||||
”
|
|
||||||
\"
|
|
||||||
…
|
|
||||||
\.
|
|
||||||
\,
|
|
||||||
\?
|
|
||||||
!
|
|
||||||
'
|
|
||||||
\(
|
|
||||||
\)
|
|
||||||
\{
|
|
||||||
\}
|
|
||||||
\@
|
|
||||||
\#
|
|
||||||
\$
|
|
||||||
\%
|
|
||||||
\^
|
|
||||||
\&
|
|
||||||
\*
|
|
||||||
\[
|
|
||||||
\]
|
|
||||||
\\
|
|
||||||
\~
|
|
||||||
&#\d+
|
|
@ -22,21 +22,15 @@ word_map_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
|
|||||||
# named entities and special symbols. All these have to be listed in files.
|
# named entities and special symbols. All these have to be listed in files.
|
||||||
|
|
||||||
# File containing all html tags (one per line)
|
# File containing all html tags (one per line)
|
||||||
html_tags_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt"
|
html_tags_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/html_tags.txt"
|
||||||
|
|
||||||
# File containing all symbols to be replaced by spaces
|
|
||||||
space_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt"
|
|
||||||
|
|
||||||
# If set to true, words from predefined list are removed
|
# If set to true, words from predefined list are removed
|
||||||
stop_words_enabled = "@STOP_WORDS_ENABLED@"
|
stop_words_enabled = "@STOP_WORDS_ENABLED@"
|
||||||
|
|
||||||
# If stop_words_enabled is true, set the path to the stop words file
|
# If stop_words_enabled is true, set the path to the stop words file
|
||||||
#stop_words_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt"
|
#stop_words_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/stop_words.txt"
|
||||||
|
|
||||||
# File containing regular expressions that match named entities
|
# File containing regular expressions that match named entities
|
||||||
named_entities_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt"
|
named_entities_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/named_entities.txt"
|
||||||
|
|
||||||
# File containing special symbols (one per line) to be removed
|
|
||||||
stop_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_symbols.txt"
|
|
||||||
|
|
||||||
### eof
|
### eof
|
||||||
|
@ -1,6 +0,0 @@
|
|||||||
\|
|
|
||||||
\–
|
|
||||||
\-
|
|
||||||
\/
|
|
||||||
;
|
|
||||||
:
|
|
@ -1,37 +0,0 @@
|
|||||||
\\tab
|
|
||||||
\\emdash
|
|
||||||
\<
|
|
||||||
\>
|
|
||||||
\&
|
|
||||||
\"
|
|
||||||
\‐
|
|
||||||
\
|
|
||||||
<
|
|
||||||
>
|
|
||||||
=
|
|
||||||
\+
|
|
||||||
„
|
|
||||||
”
|
|
||||||
\"
|
|
||||||
…
|
|
||||||
\.
|
|
||||||
\,
|
|
||||||
\?
|
|
||||||
!
|
|
||||||
'
|
|
||||||
\(
|
|
||||||
\)
|
|
||||||
\{
|
|
||||||
\}
|
|
||||||
\@
|
|
||||||
\#
|
|
||||||
\$
|
|
||||||
\%
|
|
||||||
\^
|
|
||||||
\&
|
|
||||||
\*
|
|
||||||
\[
|
|
||||||
\]
|
|
||||||
\\
|
|
||||||
\~
|
|
||||||
&#\d+
|
|
@ -14,15 +14,11 @@ markers_path = "/tmp/ma.bin"
|
|||||||
|
|
||||||
html_tags_path = "/tmp/html_tags.txt"
|
html_tags_path = "/tmp/html_tags.txt"
|
||||||
|
|
||||||
space_symbols_path = "/tmp/space_symbols.txt"
|
|
||||||
|
|
||||||
stop_words_enabled = "true"
|
stop_words_enabled = "true"
|
||||||
|
|
||||||
stop_words_path = "/tmp/stop_words.txt"
|
stop_words_path = "/tmp/stop_words.txt"
|
||||||
|
|
||||||
named_entities_path = "/tmp/named_entities.txt"
|
named_entities_path = "/tmp/named_entities.txt"
|
||||||
|
|
||||||
stop_symbols_path = "/tmp/stop_symbols.txt"
|
|
||||||
|
|
||||||
|
|
||||||
### eof
|
### eof
|
||||||
|
@ -22,21 +22,15 @@ word_map_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
|
|||||||
# named entities and special symbols. All these have to be listed in files.
|
# named entities and special symbols. All these have to be listed in files.
|
||||||
|
|
||||||
# File containing all html tags (one per line)
|
# File containing all html tags (one per line)
|
||||||
html_tags_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt"
|
html_tags_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/html_tags.txt"
|
||||||
|
|
||||||
# File containing all symbols to be replaced by spaces
|
|
||||||
space_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt"
|
|
||||||
|
|
||||||
# If set to true, words from predefined list are removed
|
# If set to true, words from predefined list are removed
|
||||||
stop_words_enabled = "@STOP_WORDS_ENABLED@"
|
stop_words_enabled = "@STOP_WORDS_ENABLED@"
|
||||||
|
|
||||||
# If stop_words_enabled is true, set the path to the stop words file
|
# If stop_words_enabled is true, set the path to the stop words file
|
||||||
#stop_words_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt"
|
#stop_words_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/stop_words.txt"
|
||||||
|
|
||||||
# File containing regular expressions that match named entities
|
# File containing regular expressions that match named entities
|
||||||
named_entities_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt"
|
named_entities_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/named_entities.txt"
|
||||||
|
|
||||||
# File containing special symbols (one per line) to be removed
|
|
||||||
stop_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_symbols.txt"
|
|
||||||
|
|
||||||
### eof
|
### eof
|
||||||
|
Loading…
Reference in New Issue
Block a user