tokenizer in progress

This commit is contained in:
rjawor 2015-06-25 10:12:51 +02:00
parent 0baf3e4ef2
commit 8432dd321f
35 changed files with 243 additions and 338 deletions

View File

@ -1,6 +1,7 @@
---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) -----------------------------
- implement tokenAnnotations vector as interval tree
- work on word regex pattern (allow for some symbols and digits within the word)
- document the code (classes, cfg files) and update tutorial
IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? Należy przechowywać oryginalne pozycje tokenów w bazie danych concordia-server. Pozycje te będą obliczane przez funkcję generateTokenVector (przy użyciu listy oryginalnych pozycji, która będzie modyfikowana synchronicznie z każdą anonimizacją)
- concordia_search_result nie musi przechowywać całego tokenVector (bo używa go tylko do odczytania size()).
- wiele pamięci tłumaczeń: można je przechowywać w jednym indeksie, ale trzeba dodać tm_id jako metadane zdania (np. zamiast example length). Przy wyszukiwaniu należy filtrować wyniki, aby pochodziły z odpowiedniej pamięci tłumaczeń.
@ -11,6 +12,7 @@ IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieś
---------------------------- Archive -----------------------------
DONE - implement tokenAnnotations vector as interval tree (not interval tree, but list, which is even better)
DONE (IT IS GOOD AS IT IS) - mess with gcc performance optimization options (https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html)
DONE - concordia-server (zastanowić się, czy nie napisać CAT-a oraz nad tym, czy nie oddzielić projektu concordia-server).
DONE - document the code

View File

@ -7,13 +7,13 @@ endforeach(dir)
add_library(concordia SHARED
token_annotation.cpp
anonymized_sentence.cpp
tokenized_sentence.cpp
hashed_sentence.cpp
concordia_search_result.cpp
matched_pattern_fragment.cpp
concordia_searcher.cpp
regex_rule.cpp
sentence_anonymizer.cpp
sentence_tokenizer.cpp
interval.cpp
tm_matches.cpp
anubis_search_result.cpp
@ -37,13 +37,13 @@ add_subdirectory(t)
install(TARGETS concordia DESTINATION lib/)
install(FILES
token_annotation.hpp
anonymized_sentence.hpp
tokenized_sentence.hpp
hashed_sentence.hpp
concordia_search_result.hpp
matched_pattern_fragment.hpp
concordia_searcher.hpp
regex_rule.hpp
sentence_anonymizer.hpp
sentence_tokenizer.hpp
interval.hpp
tm_matches.hpp
anubis_search_result.hpp

View File

@ -9,11 +9,9 @@
#define MARKERS_PARAM "markers_path"
#define SUFFIX_ARRAY_PARAM "suffix_array_path"
#define HTML_TAGS_PARAM "html_tags_path"
#define SPACE_SYMBOLS_PARAM "space_symbols_path"
#define STOP_WORDS_ENABLED_PARAM "stop_words_enabled"
#define STOP_WORDS_PARAM "stop_words_path"
#define NAMED_ENTITIES_PARAM "named_entities_path"
#define STOP_SYMBOLS_PARAM "stop_symbols_path"
#define ANUBIS_THRESHOLD_PARAM "anubis_threshold"
ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
@ -35,8 +33,6 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
ConcordiaConfig::_readConfigParameterStr(MARKERS_PARAM);
_htmlTagsFilePath =
ConcordiaConfig::_readConfigParameterStr(HTML_TAGS_PARAM);
_spaceSymbolsFilePath =
ConcordiaConfig::_readConfigParameterStr(SPACE_SYMBOLS_PARAM);
_stopWordsEnabled =
ConcordiaConfig::_readConfigParameterStr(
STOP_WORDS_ENABLED_PARAM) != "false";
@ -44,8 +40,6 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
ConcordiaConfig::_readConfigParameterStr(STOP_WORDS_PARAM, "");
_namedEntitiesFilePath =
ConcordiaConfig::_readConfigParameterStr(NAMED_ENTITIES_PARAM);
_stopSymbolsFilePath =
ConcordiaConfig::_readConfigParameterStr(STOP_SYMBOLS_PARAM);
_anubisThreshold = atof(ConcordiaConfig::_readConfigParameterStr(
ANUBIS_THRESHOLD_PARAM,
"0.3").c_str());

View File

@ -56,14 +56,6 @@ public:
return _htmlTagsFilePath;
}
/*! Getter for space symbols file path.
For more information see \ref tutorial3.
\returns space symbols file path
*/
std::string & getSpaceSymbolsFilePath() {
return _spaceSymbolsFilePath;
}
/*! Getter for stop words enabled parameter.
For more information see \ref tutorial3.
\returns true if stop words are enabled
@ -88,14 +80,6 @@ public:
return _namedEntitiesFilePath;
}
/*! Getter for stop symbols file path.
For more information see \ref tutorial3.
\returns stop symbols file path
*/
std::string & getStopSymbolsFilePath() {
return _stopSymbolsFilePath;
}
/*! Getter for anubis threshold. Anubis search results with
scores below that threshold will be discarded.
\returns anubis threshold
@ -115,16 +99,12 @@ private:
std::string _htmlTagsFilePath;
std::string _spaceSymbolsFilePath;
bool _stopWordsEnabled;
std::string _stopWordsFilePath;
std::string _namedEntitiesFilePath;
std::string _stopSymbolsFilePath;
double _anubisThreshold;
std::string _readConfigParameterStr(const std::string & name)

View File

@ -12,8 +12,8 @@ HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
throw(ConcordiaException) :
_wordMapFilePath(config->getWordMapFilePath()),
_wordMap(boost::shared_ptr<WordMap>(new WordMap)),
_sentenceAnonymizer(boost::shared_ptr<SentenceAnonymizer>(
new SentenceAnonymizer(config))) {
_sentenceTokenizer(boost::shared_ptr<SentenceTokenizer>(
new SentenceTokenizer(config))) {
if (boost::filesystem::exists(_wordMapFilePath)) {
std::ifstream ifs(_wordMapFilePath.c_str(), std::ios::binary);
boost::archive::binary_iarchive ia(ifs);
@ -44,11 +44,11 @@ std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
std::vector<std::string> HashGenerator::generateTokenVector(
const std::string & sentence) {
boost::shared_ptr<AnonymizedSentence> as = _sentenceAnonymizer->anonymize(sentence);
std::string anonymizedSentence = as->getSentence();
boost::trim(anonymizedSentence);
boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
std::string tokenizedSentence = ts->getSentence();
boost::trim(tokenizedSentence);
std::vector<std::string> tokenTexts;
boost::split(tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"),
boost::split(tokenTexts, tokenizedSentence, boost::is_any_of(" \t\r\n"),
boost::algorithm::token_compress_on);
return tokenTexts;
}

View File

@ -8,7 +8,7 @@
#include <boost/algorithm/string/predicate.hpp>
#include "concordia/word_map.hpp"
#include "concordia/common/config.hpp"
#include "concordia/sentence_anonymizer.hpp"
#include "concordia/sentence_tokenizer.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_exception.hpp"
@ -71,7 +71,7 @@ public:
private:
boost::shared_ptr<WordMap> _wordMap;
boost::shared_ptr<SentenceAnonymizer> _sentenceAnonymizer;
boost::shared_ptr<SentenceTokenizer> _sentenceTokenizer;
std::string _wordMapFilePath;
};

View File

@ -48,7 +48,7 @@ public:
/*! Method for adding an original word position to the list.
\param originalWordPosition the original word position of the token
*/
void addWordOriginalWordPosition(Interval & originalWordPosition) {
void addOriginalWordPosition(Interval & originalWordPosition) {
_originalWordPositions.push_back(originalWordPosition);
}

View File

@ -5,10 +5,12 @@
#include <boost/throw_exception.hpp>
RegexRule::RegexRule(std::string patternString,
std::string value,
bool caseSensitive)
throw(ConcordiaException):
_value(value) {
char annotationType,
std::string value,
bool caseSensitive)
throw(ConcordiaException):
_annotationType(annotationType),
_value(value) {
try {
if (caseSensitive) {
_pattern = boost::make_u32regex(UnicodeString(patternString.c_str()));
@ -32,7 +34,7 @@ RegexRule::RegexRule(std::string patternString,
RegexRule::~RegexRule() {
}
void RegexRule::apply(boost::shared_ptr<AnonymizedSentence> sentence) {
void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
try {
UnicodeString s(sentence->getSentence().c_str());
boost::u32regex_iterator<const UChar*> begin(boost::make_u32regex_iterator(s, _pattern));
@ -41,14 +43,14 @@ void RegexRule::apply(boost::shared_ptr<AnonymizedSentence> sentence) {
for (; begin != end; ++begin) {
SUFFIX_MARKER_TYPE matchBegin = begin->position();
SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length();
TokenAnnotation annotation(matchBegin, matchEnd, 'a', _value);
TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, "");
annotations.push_back(annotation);
}
sentence->addAnnotations(annotations);
} catch(const std::exception & e) {
std::stringstream ss;
ss << "Exception while applying regex rule: "
<< _value << " to text: " << sentence->getSentence();
<< _annotationType << " to text: " << sentence->getSentence();
ss << ", message: " << e.what();
throw ConcordiaException(ss.str());
}

View File

@ -3,7 +3,7 @@
#include <string>
#include "concordia/common/config.hpp"
#include "concordia/anonymized_sentence.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "concordia/concordia_exception.hpp"
#include <boost/shared_ptr.hpp>
#include <boost/regex.hpp>
@ -24,12 +24,14 @@ public:
/*!
Constructor.
\param patternString regex pattern to match
\param replacement string to substitute the found match
\param annotationType type of annotation
\param caseSensitive case sensitivity of the pattern
*/
RegexRule(std::string patternString, std::string value,
bool caseSensitive = true)
throw(ConcordiaException);
RegexRule(std::string patternString,
char annotationType,
std::string value,
bool caseSensitive = true)
throw(ConcordiaException);
/*! Destructor.
*/
@ -38,12 +40,14 @@ public:
/*! Applies the operation on a tokenized sentence.
\param sentence the input sentence
*/
void apply(boost::shared_ptr<AnonymizedSentence> sentence);
void apply(boost::shared_ptr<TokenizedSentence> sentence);
private:
boost::u32regex _pattern;
char _annotationType;
std::string _value;
boost::u32regex _pattern;
};
#endif

View File

@ -1,4 +1,5 @@
#include "concordia/sentence_anonymizer.hpp"
#include "concordia/sentence_tokenizer.hpp"
#include "concordia/token_annotation.hpp"
#include <boost/foreach.hpp>
#include <fstream>
@ -6,29 +7,27 @@
#include <iostream>
#include <boost/algorithm/string.hpp>
SentenceAnonymizer::SentenceAnonymizer(
SentenceTokenizer::SentenceTokenizer(
boost::shared_ptr<ConcordiaConfig> config)
throw(ConcordiaException) {
_createNeRules(config->getNamedEntitiesFilePath());
_createHtmlTagsRule(config->getHtmlTagsFilePath());
_stopWordsEnabled = config->isStopWordsEnabled();
if (_stopWordsEnabled) {
_stopWords = _getMultipleReplacementRule(
config->getStopWordsFilePath(), "", true);
_stopWords = _getMultipleRegexRule(
config->getStopWordsFilePath(),
TokenAnnotation::STOP_WORD_TYPE,
"", true);
}
_stopSymbols = _getMultipleReplacementRule(
config->getStopSymbolsFilePath(), "");
_spaceSymbols = _getMultipleReplacementRule(
config->getSpaceSymbolsFilePath(), " ");
}
SentenceAnonymizer::~SentenceAnonymizer() {
SentenceTokenizer::~SentenceTokenizer() {
}
boost::shared_ptr<AnonymizedSentence>
SentenceAnonymizer::anonymize(const std::string & sentence) {
boost::shared_ptr<AnonymizedSentence>
result(new AnonymizedSentence(sentence));
boost::shared_ptr<TokenizedSentence>
SentenceTokenizer::tokenize(const std::string & sentence) {
boost::shared_ptr<TokenizedSentence>
result(new TokenizedSentence(sentence));
_htmlTags->apply(result);
@ -41,13 +40,14 @@ boost::shared_ptr<AnonymizedSentence>
if (_stopWordsEnabled) {
_stopWords->apply(result);
}
_stopSymbols->apply(result);
_spaceSymbols->apply(result);
boost::shared_ptr<RegexRule> wordsRule(
new RegexRule("\\w+", TokenAnnotation::WORD_TYPE, "word"));
return result;
}
void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) {
void SentenceTokenizer::_createNeRules(std::string & namedEntitiesPath) {
if (boost::filesystem::exists(namedEntitiesPath)) {
std::string line;
std::ifstream neFile(namedEntitiesPath.c_str());
@ -66,7 +66,9 @@ void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) {
throw ConcordiaException(ss.str());
} else {
_namedEntities.push_back(RegexRule(
tokenTexts->at(0), tokenTexts->at(1)));
tokenTexts->at(0),
TokenAnnotation::NE_TYPE,
tokenTexts->at(1)));
}
}
neFile.close();
@ -78,7 +80,7 @@ void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) {
}
}
void SentenceAnonymizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
std::string tagsExpression = "<\\/?(";
if (boost::filesystem::exists(htmlTagsPath)) {
std::string line;
@ -97,12 +99,15 @@ void SentenceAnonymizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
tagsExpression += "br).*?>";
_htmlTags = boost::shared_ptr<RegexRule>(
new RegexRule(tagsExpression, "", false));
new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG_TYPE, "", false));
}
boost::shared_ptr<RegexRule>
SentenceAnonymizer::_getMultipleReplacementRule(
std::string & filePath, std::string replacement, bool wholeWord) {
SentenceTokenizer::_getMultipleRegexRule(
std::string filePath,
char annotationType,
std::string value,
bool wholeWord) {
std::string expression = "(";
if (boost::filesystem::exists(filePath)) {
std::string line;
@ -128,6 +133,6 @@ boost::shared_ptr<RegexRule>
expression = expression.substr(0, expression.size()-1);
expression += ")";
return boost::shared_ptr<RegexRule>(
new RegexRule(expression, replacement, false));
new RegexRule(expression, annotationType, value, false));
}

View File

@ -1,10 +1,10 @@
#ifndef SENTENCE_ANONYMIZER_HDR
#define SENTENCE_ANONYMIZER_HDR
#ifndef SENTENCE_TOKENIZER_HDR
#define SENTENCE_TOKENIZER_HDR
#include <string>
#include <vector>
#include "concordia/common/config.hpp"
#include "concordia/anonymized_sentence.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "concordia/regex_rule.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_exception.hpp"
@ -13,42 +13,42 @@
/*!
Class for anonymizing sentence before generating hash.
Class for tokenizing sentence before generating hash.
This operation is used to
remove unnecessary symbols and possibly words from sentences added to index
and search patterns. Anonymizer removes html tags, substitutes predefined symbols
with a single space, removes stop words (if the option is enabled), as well as
named entities and special symbols. All these have to be listed in files
and search patterns. Tokenizer annotates html tags, removes stop words (if the option is enabled),
as well as annotates named entities and special symbols. All these have to be listed in files
(see \ref tutorial3).
*/
class SentenceAnonymizer {
class SentenceTokenizer {
public:
/*! Constructor.
\param config config object, holding paths to necessary files
*/
explicit SentenceAnonymizer(boost::shared_ptr<ConcordiaConfig> config)
explicit SentenceTokenizer(boost::shared_ptr<ConcordiaConfig> config)
throw(ConcordiaException);
/*! Destructor.
*/
virtual ~SentenceAnonymizer();
virtual ~SentenceTokenizer();
/*! Anonymizes the sentence.
/*! Tokenizes the sentence.
\param sentence input sentence
\returns altered version of the input sentence
*/
boost::shared_ptr<AnonymizedSentence>
anonymize(const std::string & sentence);
boost::shared_ptr<TokenizedSentence>
tokenize(const std::string & sentence);
private:
void _createNeRules(std::string & namedEntitiesPath);
void _createHtmlTagsRule(std::string & htmlTagsPath);
boost::shared_ptr<RegexRule> _getMultipleReplacementRule(
std::string & filePath,
std::string replacement,
boost::shared_ptr<RegexRule> _getMultipleRegexRule(
std::string filePath,
char annotationType,
std::string value,
bool wholeWord = false);
std::vector<RegexRule> _namedEntities;
@ -59,9 +59,6 @@ private:
boost::shared_ptr<RegexRule> _stopWords;
boost::shared_ptr<RegexRule> _stopSymbols;
boost::shared_ptr<RegexRule> _spaceSymbols;
};
#endif

View File

@ -1,8 +1,8 @@
add_library(concordia-tests
test_regex_rule.cpp
test_anonymized_sentence.cpp
test_tokenized_sentence.cpp
test_concordia_searcher.cpp
test_sentence_anonymizer.cpp
test_sentence_tokenizer.cpp
test_text_utils.cpp
test_example.cpp
test_tm_matches.cpp

View File

@ -16,10 +16,8 @@ BOOST_AUTO_TEST_CASE( ConfigParameters )
BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "/tmp/hi.bin" );
BOOST_CHECK_EQUAL( config.getMarkersFilePath() , "/tmp/ma.bin" );
BOOST_CHECK_EQUAL( config.getHtmlTagsFilePath() , "/tmp/html_tags.txt" );
BOOST_CHECK_EQUAL( config.getSpaceSymbolsFilePath() , "/tmp/space_symbols.txt" );
BOOST_CHECK_EQUAL( config.getStopWordsFilePath() , "/tmp/stop_words.txt" );
BOOST_CHECK_EQUAL( config.getNamedEntitiesFilePath() , "/tmp/named_entities.txt" );
BOOST_CHECK_EQUAL( config.getStopSymbolsFilePath() , "/tmp/stop_symbols.txt" );
}
BOOST_AUTO_TEST_CASE( NonexistentConfigTest )

View File

@ -1,6 +1,7 @@
#include "tests/unit-tests/unit_tests_globals.hpp"
#include "concordia/regex_rule.hpp"
#include "concordia/anonymized_sentence.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "concordia/token_annotation.hpp"
#include "concordia/common/config.hpp"
#include <boost/shared_ptr.hpp>
#include <boost/algorithm/string/predicate.hpp>
@ -11,11 +12,11 @@ BOOST_AUTO_TEST_SUITE(regex_rule)
BOOST_AUTO_TEST_CASE( SimpleReplacement )
{
RegexRule rr("a","b");
boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("xxxxxxxaxxxaxxaxaxa"));
rr.apply(as);
BOOST_CHECK_EQUAL(as->getAnnotations().size(),5);
std::list<TokenAnnotation> annotations = as->getAnnotations();
RegexRule rr("a", TokenAnnotation::WORD_TYPE, "b");
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("xxxxxxxaxxxaxxaxaxa"));
rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),7);
@ -43,7 +44,7 @@ BOOST_AUTO_TEST_CASE( BadRegex )
bool exceptionThrown = false;
std::string message = "";
try {
RegexRule rr("+a","b");
RegexRule rr("+a",TokenAnnotation::WORD_TYPE, "b");
} catch (ConcordiaException & e) {
exceptionThrown = true;
message = e.what();
@ -54,11 +55,11 @@ BOOST_AUTO_TEST_CASE( BadRegex )
BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
{
RegexRule rr("['\"\\\\.]","");
boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("Don't stop believin' \\ Hold on to the feelin'."));
rr.apply(as);
BOOST_CHECK_EQUAL(as->getAnnotations().size(),5);
std::list<TokenAnnotation> annotations = as->getAnnotations();
RegexRule rr("['\"\\\\.]", TokenAnnotation::WORD_TYPE, "");
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("Don't stop believin' \\ Hold on to the feelin'."));
rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),3);
@ -84,11 +85,11 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
{
RegexRule rr("abc","xxx", false);
boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("This is AbC and ABC and abc and aBC."));
rr.apply(as);
BOOST_CHECK_EQUAL(as->getAnnotations().size(),4);
std::list<TokenAnnotation> annotations = as->getAnnotations();
RegexRule rr("abc", TokenAnnotation::WORD_TYPE, "xxx", false);
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("This is AbC and ABC and abc and aBC."));
rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),4);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),8);
@ -109,11 +110,11 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
BOOST_AUTO_TEST_CASE( UnicodeReplacement )
{
RegexRule rr("ą","x");
boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń"));
rr.apply(as);
BOOST_CHECK_EQUAL(as->getAnnotations().size(),1);
std::list<TokenAnnotation> annotations = as->getAnnotations();
RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x");
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń"));
rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),1);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),11);
@ -122,11 +123,11 @@ BOOST_AUTO_TEST_CASE( UnicodeReplacement )
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
{
RegexRule rr("ą","x", false);
boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
rr.apply(as);
BOOST_CHECK_EQUAL(as->getAnnotations().size(),2);
std::list<TokenAnnotation> annotations = as->getAnnotations();
RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x", false);
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),2);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),11);
@ -139,11 +140,11 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
{
RegexRule rr("[ąćęłńóśżź]","x", false);
boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
rr.apply(as);
BOOST_CHECK_EQUAL(as->getAnnotations().size(),18);
std::list<TokenAnnotation> annotations = as->getAnnotations();
RegexRule rr("[ąćęłńóśżź]", TokenAnnotation::WORD_TYPE, "x", false);
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
rr.apply(ts);
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),18);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),2);

View File

@ -1,76 +0,0 @@
#include <boost/filesystem.hpp>
#include "tests/unit-tests/unit_tests_globals.hpp"
#include <string>
#include <sstream>
#include <boost/shared_ptr.hpp>
#include "concordia/common/config.hpp"
#include "concordia/sentence_anonymizer.hpp"
#include "tests/common/test_resources_manager.hpp"
BOOST_AUTO_TEST_SUITE(sentence_anonymizer)
BOOST_AUTO_TEST_CASE( NETest )
{
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceAnonymizer anonymizer(config);
std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34";
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"date ne_date mail ne_email number ne_number");
}
BOOST_AUTO_TEST_CASE( HtmlTagsTest )
{
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceAnonymizer anonymizer(config);
std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"link and bold and newline ");
}
BOOST_AUTO_TEST_CASE( StopWordsTest )
{
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
if (config->isStopWordsEnabled()) {
SentenceAnonymizer anonymizer(config);
std::string sentence = "Aczkolwiek nie wiem, czy to konieczne";
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence()," wiem konieczne");
}
}
BOOST_AUTO_TEST_CASE( StopSymbolsTest )
{
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceAnonymizer anonymizer(config);
std::string sentence = "xxx, . xxx # xx $xx@ xx";
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx xxx xx xx xx");
}
BOOST_AUTO_TEST_CASE( SpaceSymbolsTest )
{
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceAnonymizer anonymizer(config);
std::string sentence = "xxx-xxx xx|xx";
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx xxx xx xx");
}
BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
{
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceAnonymizer anonymizer(config);
std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"sony dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap rm mne_numbergrm mne_numbere — ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number ");
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -0,0 +1,89 @@
#include <boost/filesystem.hpp>
#include "tests/unit-tests/unit_tests_globals.hpp"
#include <string>
#include <sstream>
#include <iostream>
#include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp>
#include "concordia/common/config.hpp"
#include "concordia/sentence_tokenizer.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "tests/common/test_resources_manager.hpp"
BOOST_AUTO_TEST_SUITE(sentence_tokenizer)
BOOST_AUTO_TEST_CASE( NETest )
{
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceTokenizer tokenizer(config);
std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34";
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
BOOST_CHECK_EQUAL(8,annotations.size());
BOOST_FOREACH(TokenAnnotation annotation, annotations) {
std::cout << annotation.getStart() << ","
<< annotation.getEnd() << " type: "
<< annotation.getType() << " value: "
<< annotation.getValue() << std::endl;
}
// BOOST_CHECK_EQUAL(,"date ne_date mail ne_email number ne_number");
}
BOOST_AUTO_TEST_CASE( HtmlTagsTest )
{
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceTokenizer tokenizer(config);
std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"link and bold and newline ");
}
BOOST_AUTO_TEST_CASE( StopWordsTest )
{
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
if (config->isStopWordsEnabled()) {
SentenceTokenizer tokenizer(config);
std::string sentence = "Aczkolwiek nie wiem, czy to konieczne";
BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence()," wiem konieczne");
}
}
BOOST_AUTO_TEST_CASE( StopSymbolsTest )
{
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceTokenizer tokenizer(config);
std::string sentence = "xxx, . xxx # xx $xx@ xx";
BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"xxx xxx xx xx xx");
}
BOOST_AUTO_TEST_CASE( SpaceSymbolsTest )
{
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceTokenizer tokenizer(config);
std::string sentence = "xxx-xxx xx|xx";
BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"xxx xxx xx xx");
}
BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
{
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceTokenizer tokenizer(config);
std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"sony dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap rm mne_numbergrm mne_numbere — ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number ");
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -1,14 +1,14 @@
#include "tests/unit-tests/unit_tests_globals.hpp"
#include "concordia/anonymized_sentence.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "concordia/token_annotation.hpp"
#include "concordia/common/config.hpp"
#include <iostream>
BOOST_AUTO_TEST_SUITE(anonymized_sentence)
BOOST_AUTO_TEST_SUITE(tokenized_sentence)
BOOST_AUTO_TEST_CASE( AnnotationsTrivial )
{
AnonymizedSentence as("This is a test sentence");
TokenizedSentence ts("This is a test sentence");
std::vector<TokenAnnotation> annotations;
annotations.push_back(TokenAnnotation(0,1,'a',"val"));
@ -16,22 +16,22 @@ BOOST_AUTO_TEST_CASE( AnnotationsTrivial )
annotations.push_back(TokenAnnotation(7,10,'a',"val"));
annotations.push_back(TokenAnnotation(12,14,'a',"val"));
as.addAnnotations(annotations);
ts.addAnnotations(annotations);
BOOST_CHECK_EQUAL(as.getAnnotations().size(), 4);
BOOST_CHECK_EQUAL(ts.getAnnotations().size(), 4);
}
BOOST_AUTO_TEST_CASE( AnnotationsIntersecting )
{
AnonymizedSentence as("This is a test sentence");
TokenizedSentence ts("This is a test sentence");
std::vector<TokenAnnotation> annotations1;
annotations1.push_back(TokenAnnotation(0,1,'a',"val"));
annotations1.push_back(TokenAnnotation(4,6,'a',"val"));
annotations1.push_back(TokenAnnotation(7,10,'a',"val"));
annotations1.push_back(TokenAnnotation(12,14,'a',"val"));
as.addAnnotations(annotations1);
ts.addAnnotations(annotations1);
/* annotation
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
- ---- ------- -----
@ -43,7 +43,7 @@ BOOST_AUTO_TEST_CASE( AnnotationsIntersecting )
annotations2.push_back(TokenAnnotation(4,7,'a',"val"));
annotations2.push_back(TokenAnnotation(10,11,'a',"val"));
annotations2.push_back(TokenAnnotation(11,13,'a',"val"));
as.addAnnotations(annotations2);
ts.addAnnotations(annotations2);
/* annotations2
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
------- ------- -- -----
@ -53,8 +53,8 @@ BOOST_AUTO_TEST_CASE( AnnotationsIntersecting )
- ------- ---- ------- -- -----
*/
BOOST_CHECK_EQUAL(as.getAnnotations().size(), 6);
std::list<TokenAnnotation> annotations = as.getAnnotations();
BOOST_CHECK_EQUAL(ts.getAnnotations().size(), 6);
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),0);

View File

@ -13,3 +13,7 @@ TokenAnnotation::TokenAnnotation(const SUFFIX_MARKER_TYPE start,
TokenAnnotation::~TokenAnnotation() {
}
char TokenAnnotation::NE_TYPE = 0;
char TokenAnnotation::WORD_TYPE = 1;
char TokenAnnotation::HTML_TAG_TYPE = 2;
char TokenAnnotation::STOP_WORD_TYPE = 3;

View File

@ -44,6 +44,14 @@ public:
return _value;
}
static char NE_TYPE;
static char WORD_TYPE;
static char HTML_TAG_TYPE;
static char STOP_WORD_TYPE;
protected:
char _annotationType;

View File

@ -1,16 +1,16 @@
#include "concordia/anonymized_sentence.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "concordia/common/text_utils.hpp"
#include <iostream>
AnonymizedSentence::AnonymizedSentence(std::string sentence):
TokenizedSentence::TokenizedSentence(std::string sentence):
_sentence(sentence) {
}
AnonymizedSentence::~AnonymizedSentence() {
TokenizedSentence::~TokenizedSentence() {
}
void AnonymizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) {
void TokenizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) {
std::vector<TokenAnnotation>::iterator newAnnotation = annotations.begin();
std::list<TokenAnnotation>::iterator existingAnnotation = _tokenAnnotations.begin();
@ -43,6 +43,6 @@ void AnonymizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations
}
void AnonymizedSentence::toLowerCase() {
void TokenizedSentence::toLowerCase() {
_sentence = TextUtils::getInstance().toLowerCase(_sentence);
}

View File

@ -1,5 +1,5 @@
#ifndef ANONYMIZED_SENTENCE_HDR
#define ANONYMIZED_SENTENCE_HDR
#ifndef TOKENIZED_SENTENCE_HDR
#define TOKENIZED_SENTENCE_HDR
#include "concordia/common/config.hpp"
#include "concordia/token_annotation.hpp"
@ -13,17 +13,17 @@
along with the annotations list.
*/
class AnonymizedSentence {
class TokenizedSentence {
public:
/*!
Constructor.
*/
AnonymizedSentence(std::string sentence);
TokenizedSentence(std::string sentence);
/*! Destructor.
*/
virtual ~AnonymizedSentence();
virtual ~TokenizedSentence();
/*! Getter for sentence
\returns sentence

View File

@ -207,9 +207,8 @@ markers_path = "<CONCORDIA_HOME>/tests/resources/temp/temp_markers.bin"
word_map_path = "<CONCORDIA_HOME>/tests/resources/temp/temp_word_map.bin"
#-------------------------------------------------------------------------------
# The following settings control the sentence anonymizer mechanism. It is used to
# remove unnecessary symbols and possibly words from sentences added to index
# and search patterns. Anonymizer removes html tags, substitutes predefined symbols
# The following settings control the sentence tokenizer mechanism. Tokenizer
# takes into account html tags, substitutes predefined symbols
# with a single space, removes stop words (if the option is enabled), as well as
# named entities and special symbols. All these have to be listed in files.

View File

@ -1,6 +0,0 @@
\|
\
\-
\/
;
:

View File

@ -1,37 +0,0 @@
\\tab
\\emdash
\&lt;
\&gt;
\&amp;
\&quot;
\&dash;
\&nbsp;
<
>
=
\+
\"
\.
\,
\?
!
'
\(
\)
\{
\}
\@
\#
\$
\%
\^
\&
\*
\[
\]
\\
\~
&#\d+

View File

@ -22,21 +22,15 @@ word_map_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
# named entities and special symbols. All these have to be listed in files.
# File containing all html tags (one per line)
html_tags_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt"
# File containing all symbols to be replaced by spaces
space_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt"
html_tags_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/html_tags.txt"
# If set to true, words from predefined list are removed
stop_words_enabled = "@STOP_WORDS_ENABLED@"
# If stop_words_enabled is true, set the path to the stop words file
#stop_words_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt"
#stop_words_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/stop_words.txt"
# File containing regular expressions that match named entities
named_entities_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt"
# File containing special symbols (one per line) to be removed
stop_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_symbols.txt"
named_entities_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/named_entities.txt"
### eof

View File

@ -1,6 +0,0 @@
\|
\
\-
\/
;
:

View File

@ -1,37 +0,0 @@
\\tab
\\emdash
\&lt;
\&gt;
\&amp;
\&quot;
\&dash;
\&nbsp;
<
>
=
\+
\"
\.
\,
\?
!
'
\(
\)
\{
\}
\@
\#
\$
\%
\^
\&
\*
\[
\]
\\
\~
&#\d+

View File

@ -14,15 +14,11 @@ markers_path = "/tmp/ma.bin"
html_tags_path = "/tmp/html_tags.txt"
space_symbols_path = "/tmp/space_symbols.txt"
stop_words_enabled = "true"
stop_words_path = "/tmp/stop_words.txt"
named_entities_path = "/tmp/named_entities.txt"
stop_symbols_path = "/tmp/stop_symbols.txt"
### eof

View File

@ -22,21 +22,15 @@ word_map_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
# named entities and special symbols. All these have to be listed in files.
# File containing all html tags (one per line)
html_tags_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt"
# File containing all symbols to be replaced by spaces
space_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt"
html_tags_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/html_tags.txt"
# If set to true, words from predefined list are removed
stop_words_enabled = "@STOP_WORDS_ENABLED@"
# If stop_words_enabled is true, set the path to the stop words file
#stop_words_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt"
#stop_words_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/stop_words.txt"
# File containing regular expressions that match named entities
named_entities_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt"
# File containing special symbols (one per line) to be removed
stop_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_symbols.txt"
named_entities_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/named_entities.txt"
### eof