From 8432dd321f7316a0f725368aa424bff1f926543a Mon Sep 17 00:00:00 2001 From: rjawor Date: Thu, 25 Jun 2015 10:12:51 +0200 Subject: [PATCH] tokenizer in progress --- TODO.txt | 4 +- concordia/CMakeLists.txt | 8 +- concordia/concordia_config.cpp | 6 -- concordia/concordia_config.hpp | 20 ----- concordia/hash_generator.cpp | 12 +-- concordia/hash_generator.hpp | 4 +- concordia/hashed_sentence.hpp | 2 +- concordia/regex_rule.cpp | 16 ++-- concordia/regex_rule.hpp | 18 ++-- ..._anonymizer.cpp => sentence_tokenizer.cpp} | 49 +++++----- ..._anonymizer.hpp => sentence_tokenizer.hpp} | 35 ++++---- concordia/t/CMakeLists.txt | 4 +- concordia/t/test_concordia_config.cpp | 2 - concordia/t/test_regex_rule.cpp | 65 +++++++------- concordia/t/test_sentence_anonymizer.cpp | 76 ---------------- concordia/t/test_sentence_tokenizer.cpp | 89 +++++++++++++++++++ ...ntence.cpp => test_tokenized_sentence.cpp} | 20 ++--- concordia/token_annotation.cpp | 4 + concordia/token_annotation.hpp | 8 ++ ...ed_sentence.cpp => tokenized_sentence.cpp} | 10 +-- ...ed_sentence.hpp => tokenized_sentence.hpp} | 10 +-- concordia/tutorial.dox | 5 +- prod/resources/anonymizer/space_symbols.txt | 6 -- prod/resources/anonymizer/stop_symbols.txt | 37 -------- .../concordia-config/concordia.cfg.in | 12 +-- .../{anonymizer => tokenizer}/html_tags.txt | 0 .../named_entities.txt | 0 .../{anonymizer => tokenizer}/stop_words.txt | 0 tests/resources/anonymizer/space_symbols.txt | 6 -- tests/resources/anonymizer/stop_symbols.txt | 37 -------- .../concordia-config/concordia-mock.cfg | 4 - .../concordia-config/concordia.cfg.in | 12 +-- .../{anonymizer => tokenizer}/html_tags.txt | 0 .../named_entities.txt | 0 .../{anonymizer => tokenizer}/stop_words.txt | 0 35 files changed, 243 insertions(+), 338 deletions(-) rename concordia/{sentence_anonymizer.cpp => sentence_tokenizer.cpp} (72%) rename concordia/{sentence_anonymizer.hpp => sentence_tokenizer.hpp} (52%) delete mode 100644 concordia/t/test_sentence_anonymizer.cpp create mode 100644 concordia/t/test_sentence_tokenizer.cpp rename concordia/t/{test_anonymized_sentence.cpp => test_tokenized_sentence.cpp} (82%) rename concordia/{anonymized_sentence.cpp => tokenized_sentence.cpp} (85%) rename concordia/{anonymized_sentence.hpp => tokenized_sentence.hpp} (88%) delete mode 100644 prod/resources/anonymizer/space_symbols.txt delete mode 100644 prod/resources/anonymizer/stop_symbols.txt rename prod/resources/{anonymizer => tokenizer}/html_tags.txt (100%) rename prod/resources/{anonymizer => tokenizer}/named_entities.txt (100%) rename prod/resources/{anonymizer => tokenizer}/stop_words.txt (100%) delete mode 100644 tests/resources/anonymizer/space_symbols.txt delete mode 100644 tests/resources/anonymizer/stop_symbols.txt rename tests/resources/{anonymizer => tokenizer}/html_tags.txt (100%) rename tests/resources/{anonymizer => tokenizer}/named_entities.txt (100%) rename tests/resources/{anonymizer => tokenizer}/stop_words.txt (100%) diff --git a/TODO.txt b/TODO.txt index e06f0be..01c38d3 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,6 +1,7 @@ ---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) ----------------------------- -- implement tokenAnnotations vector as interval tree +- work on word regex pattern (allow for some symbols and digits within the word) +- document the code (classes, cfg files) and update tutorial IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? 
Należy przechowywać oryginalne pozycje tokenów w bazie danych concordia-server. Pozycje te będą obliczane przez funkcję generateTokenVector (przy użyciu listy oryginalnych pozycji, która będzie modyfikowana synchronicznie z każdą anonimizacją) - concordia_search_result nie musi przechowywać całego tokenVector (bo używa go tylko do odczytania size()). - wiele pamięci tłumaczeń: można je przechowywać w jednym indeksie, ale trzeba dodać tm_id jako metadane zdania (np. zamiast example length). Przy wyszukiwaniu należy filtrować wyniki, aby pochodziły z odpowiedniej pamięci tłumaczeń. @@ -11,6 +12,7 @@ IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieś ---------------------------- Archive ----------------------------- +DONE - implement tokenAnnotations vector as interval tree (not interval tree, but list, which is even better) DONE (IT IS GOOD AS IT IS) - mess with gcc performance optimization options (https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html) DONE - concordia-server (zastanowić się, czy nie napisać CAT-a oraz nad tym, czy nie oddzielić projektu concordia-server). DONE - document the code diff --git a/concordia/CMakeLists.txt b/concordia/CMakeLists.txt index f59f12e..43a33b5 100644 --- a/concordia/CMakeLists.txt +++ b/concordia/CMakeLists.txt @@ -7,13 +7,13 @@ endforeach(dir) add_library(concordia SHARED token_annotation.cpp - anonymized_sentence.cpp + tokenized_sentence.cpp hashed_sentence.cpp concordia_search_result.cpp matched_pattern_fragment.cpp concordia_searcher.cpp regex_rule.cpp - sentence_anonymizer.cpp + sentence_tokenizer.cpp interval.cpp tm_matches.cpp anubis_search_result.cpp @@ -37,13 +37,13 @@ add_subdirectory(t) install(TARGETS concordia DESTINATION lib/) install(FILES token_annotation.hpp - anonymized_sentence.hpp + tokenized_sentence.hpp hashed_sentence.hpp concordia_search_result.hpp matched_pattern_fragment.hpp concordia_searcher.hpp regex_rule.hpp - sentence_anonymizer.hpp + sentence_tokenizer.hpp interval.hpp tm_matches.hpp anubis_search_result.hpp diff --git a/concordia/concordia_config.cpp b/concordia/concordia_config.cpp index ff5120f..29e3080 100644 --- a/concordia/concordia_config.cpp +++ b/concordia/concordia_config.cpp @@ -9,11 +9,9 @@ #define MARKERS_PARAM "markers_path" #define SUFFIX_ARRAY_PARAM "suffix_array_path" #define HTML_TAGS_PARAM "html_tags_path" -#define SPACE_SYMBOLS_PARAM "space_symbols_path" #define STOP_WORDS_ENABLED_PARAM "stop_words_enabled" #define STOP_WORDS_PARAM "stop_words_path" #define NAMED_ENTITIES_PARAM "named_entities_path" -#define STOP_SYMBOLS_PARAM "stop_symbols_path" #define ANUBIS_THRESHOLD_PARAM "anubis_threshold" ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath) @@ -35,8 +33,6 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath) ConcordiaConfig::_readConfigParameterStr(MARKERS_PARAM); _htmlTagsFilePath = ConcordiaConfig::_readConfigParameterStr(HTML_TAGS_PARAM); - _spaceSymbolsFilePath = - ConcordiaConfig::_readConfigParameterStr(SPACE_SYMBOLS_PARAM); _stopWordsEnabled = ConcordiaConfig::_readConfigParameterStr( STOP_WORDS_ENABLED_PARAM) != "false"; @@ -44,8 +40,6 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath) ConcordiaConfig::_readConfigParameterStr(STOP_WORDS_PARAM, ""); _namedEntitiesFilePath = ConcordiaConfig::_readConfigParameterStr(NAMED_ENTITIES_PARAM); - _stopSymbolsFilePath = - ConcordiaConfig::_readConfigParameterStr(STOP_SYMBOLS_PARAM); _anubisThreshold = atof(ConcordiaConfig::_readConfigParameterStr( 
ANUBIS_THRESHOLD_PARAM, "0.3").c_str()); diff --git a/concordia/concordia_config.hpp b/concordia/concordia_config.hpp index 983665e..a7c12d6 100644 --- a/concordia/concordia_config.hpp +++ b/concordia/concordia_config.hpp @@ -56,14 +56,6 @@ public: return _htmlTagsFilePath; } - /*! Getter for space symbols file path. - For more information see \ref tutorial3. - \returns space symbols file path - */ - std::string & getSpaceSymbolsFilePath() { - return _spaceSymbolsFilePath; - } - /*! Getter for stop symbols enabled parameter. For more information see \ref tutorial3. \returns true if stop words are enabled @@ -88,14 +80,6 @@ public: return _namedEntitiesFilePath; } - /*! Getter for stop symbols file path. - For more information see \ref tutorial3. - \returns stop symbols file path - */ - std::string & getStopSymbolsFilePath() { - return _stopSymbolsFilePath; - } - /*! Getter for anubis threshold. Anubis search results with scores below that threshold will be discarded. \returns anubis threshold @@ -115,16 +99,12 @@ private: std::string _htmlTagsFilePath; - std::string _spaceSymbolsFilePath; - bool _stopWordsEnabled; std::string _stopWordsFilePath; std::string _namedEntitiesFilePath; - std::string _stopSymbolsFilePath; - double _anubisThreshold; std::string _readConfigParameterStr(const std::string & name) diff --git a/concordia/hash_generator.cpp b/concordia/hash_generator.cpp index a004f60..05e9afe 100644 --- a/concordia/hash_generator.cpp +++ b/concordia/hash_generator.cpp @@ -12,8 +12,8 @@ HashGenerator::HashGenerator(boost::shared_ptr config) throw(ConcordiaException) : _wordMapFilePath(config->getWordMapFilePath()), _wordMap(boost::shared_ptr(new WordMap)), - _sentenceAnonymizer(boost::shared_ptr( - new SentenceAnonymizer(config))) { + _sentenceTokenizer(boost::shared_ptr( + new SentenceTokenizer(config))) { if (boost::filesystem::exists(_wordMapFilePath)) { std::ifstream ifs(_wordMapFilePath.c_str(), std::ios::binary); boost::archive::binary_iarchive ia(ifs); @@ -44,11 +44,11 @@ std::vector HashGenerator::generateHash( std::vector HashGenerator::generateTokenVector( const std::string & sentence) { - boost::shared_ptr as = _sentenceAnonymizer->anonymize(sentence); - std::string anonymizedSentence = as->getSentence(); - boost::trim(anonymizedSentence); + boost::shared_ptr ts = _sentenceTokenizer->tokenize(sentence); + std::string tokenizedSentence = ts->getSentence(); + boost::trim(tokenizedSentence); std::vector tokenTexts; - boost::split(tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"), + boost::split(tokenTexts, tokenizedSentence, boost::is_any_of(" \t\r\n"), boost::algorithm::token_compress_on); return tokenTexts; } diff --git a/concordia/hash_generator.hpp b/concordia/hash_generator.hpp index 676abda..f9a4562 100644 --- a/concordia/hash_generator.hpp +++ b/concordia/hash_generator.hpp @@ -8,7 +8,7 @@ #include #include "concordia/word_map.hpp" #include "concordia/common/config.hpp" -#include "concordia/sentence_anonymizer.hpp" +#include "concordia/sentence_tokenizer.hpp" #include "concordia/concordia_config.hpp" #include "concordia/concordia_exception.hpp" @@ -71,7 +71,7 @@ public: private: boost::shared_ptr _wordMap; - boost::shared_ptr _sentenceAnonymizer; + boost::shared_ptr _sentenceTokenizer; std::string _wordMapFilePath; }; diff --git a/concordia/hashed_sentence.hpp b/concordia/hashed_sentence.hpp index 59ebd3e..85e234a 100644 --- a/concordia/hashed_sentence.hpp +++ b/concordia/hashed_sentence.hpp @@ -48,7 +48,7 @@ public: /*! 
Method for adding an original word position to the list. \param original word position */ - void addWordOriginalWordPosition(Interval & originalWordPosition) { + void addOriginalWordPosition(Interval & originalWordPosition) { _originalWordPositions.push_back(originalWordPosition); } diff --git a/concordia/regex_rule.cpp b/concordia/regex_rule.cpp index 83ae20f..636dfda 100644 --- a/concordia/regex_rule.cpp +++ b/concordia/regex_rule.cpp @@ -5,10 +5,12 @@ #include RegexRule::RegexRule(std::string patternString, - std::string value, - bool caseSensitive) - throw(ConcordiaException): - _value(value) { + char annotationType, + std::string value, + bool caseSensitive) + throw(ConcordiaException): + _annotationType(annotationType), + _value(value) { try { if (caseSensitive) { _pattern = boost::make_u32regex(UnicodeString(patternString.c_str())); @@ -32,7 +34,7 @@ RegexRule::RegexRule(std::string patternString, RegexRule::~RegexRule() { } -void RegexRule::apply(boost::shared_ptr sentence) { +void RegexRule::apply(boost::shared_ptr sentence) { try { UnicodeString s(sentence->getSentence().c_str()); boost::u32regex_iterator begin(boost::make_u32regex_iterator(s, _pattern)); @@ -41,14 +43,14 @@ void RegexRule::apply(boost::shared_ptr sentence) { for (; begin != end; ++begin) { SUFFIX_MARKER_TYPE matchBegin = begin->position(); SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length(); - TokenAnnotation annotation(matchBegin, matchEnd, 'a', _value); + TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, ""); annotations.push_back(annotation); } sentence->addAnnotations(annotations); } catch(const std::exception & e) { std::stringstream ss; ss << "Exception while applying regex rule: " - << _value << " to text: " << sentence->getSentence(); + << _annotationType << " to text: " << sentence->getSentence(); ss << ", message: " << e.what(); throw ConcordiaException(ss.str()); } diff --git a/concordia/regex_rule.hpp b/concordia/regex_rule.hpp index 2f74c30..2c40bb3 100644 --- a/concordia/regex_rule.hpp +++ b/concordia/regex_rule.hpp @@ -3,7 +3,7 @@ #include #include "concordia/common/config.hpp" -#include "concordia/anonymized_sentence.hpp" +#include "concordia/tokenized_sentence.hpp" #include "concordia/concordia_exception.hpp" #include #include @@ -24,12 +24,14 @@ public: /*! Constructor. \param patternString regex pattern to match - \param replacement string to substitute the found match + \param annoationType type of annotation \param caseSensitive case sensitivity of the pattern */ - RegexRule(std::string patternString, std::string value, - bool caseSensitive = true) - throw(ConcordiaException); + RegexRule(std::string patternString, + char annotationType, + std::string value, + bool caseSensitive = true) + throw(ConcordiaException); /*! Destructor. */ @@ -38,12 +40,14 @@ public: /*! Applies the operation on anonymized sentence. 
\param sentence the input sentence */ - void apply(boost::shared_ptr sentence); + void apply(boost::shared_ptr sentence); private: - boost::u32regex _pattern; + char _annotationType; std::string _value; + + boost::u32regex _pattern; }; #endif diff --git a/concordia/sentence_anonymizer.cpp b/concordia/sentence_tokenizer.cpp similarity index 72% rename from concordia/sentence_anonymizer.cpp rename to concordia/sentence_tokenizer.cpp index e0715f3..663ed80 100644 --- a/concordia/sentence_anonymizer.cpp +++ b/concordia/sentence_tokenizer.cpp @@ -1,4 +1,5 @@ -#include "concordia/sentence_anonymizer.hpp" +#include "concordia/sentence_tokenizer.hpp" +#include "concordia/token_annotation.hpp" #include #include @@ -6,29 +7,27 @@ #include #include -SentenceAnonymizer::SentenceAnonymizer( +SentenceTokenizer::SentenceTokenizer( boost::shared_ptr config) throw(ConcordiaException) { _createNeRules(config->getNamedEntitiesFilePath()); _createHtmlTagsRule(config->getHtmlTagsFilePath()); _stopWordsEnabled = config->isStopWordsEnabled(); if (_stopWordsEnabled) { - _stopWords = _getMultipleReplacementRule( - config->getStopWordsFilePath(), "", true); + _stopWords = _getMultipleRegexRule( + config->getStopWordsFilePath(), + TokenAnnotation::STOP_WORD_TYPE, + "", true); } - _stopSymbols = _getMultipleReplacementRule( - config->getStopSymbolsFilePath(), ""); - _spaceSymbols = _getMultipleReplacementRule( - config->getSpaceSymbolsFilePath(), " "); } -SentenceAnonymizer::~SentenceAnonymizer() { +SentenceTokenizer::~SentenceTokenizer() { } -boost::shared_ptr - SentenceAnonymizer::anonymize(const std::string & sentence) { - boost::shared_ptr - result(new AnonymizedSentence(sentence)); +boost::shared_ptr + SentenceTokenizer::tokenize(const std::string & sentence) { + boost::shared_ptr + result(new TokenizedSentence(sentence)); _htmlTags->apply(result); @@ -41,13 +40,14 @@ boost::shared_ptr if (_stopWordsEnabled) { _stopWords->apply(result); } - _stopSymbols->apply(result); - _spaceSymbols->apply(result); + + boost::shared_ptr wordsRule( + new RegexRule("\\w+", TokenAnnotation::WORD_TYPE, "word")); return result; } -void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) { +void SentenceTokenizer::_createNeRules(std::string & namedEntitiesPath) { if (boost::filesystem::exists(namedEntitiesPath)) { std::string line; std::ifstream neFile(namedEntitiesPath.c_str()); @@ -66,7 +66,9 @@ void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) { throw ConcordiaException(ss.str()); } else { _namedEntities.push_back(RegexRule( - tokenTexts->at(0), tokenTexts->at(1))); + tokenTexts->at(0), + TokenAnnotation::NE_TYPE, + tokenTexts->at(1))); } } neFile.close(); @@ -78,7 +80,7 @@ void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) { } } -void SentenceAnonymizer::_createHtmlTagsRule(std::string & htmlTagsPath) { +void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) { std::string tagsExpression = "<\\/?("; if (boost::filesystem::exists(htmlTagsPath)) { std::string line; @@ -97,12 +99,15 @@ void SentenceAnonymizer::_createHtmlTagsRule(std::string & htmlTagsPath) { tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1); tagsExpression += "br).*?>"; _htmlTags = boost::shared_ptr( - new RegexRule(tagsExpression, "", false)); + new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG_TYPE, "", false)); } boost::shared_ptr - SentenceAnonymizer::_getMultipleReplacementRule( - std::string & filePath, std::string replacement, bool wholeWord) { + 
SentenceTokenizer::_getMultipleRegexRule( + std::string filePath, + char annotationType, + std::string value, + bool wholeWord) { std::string expression = "("; if (boost::filesystem::exists(filePath)) { std::string line; @@ -128,6 +133,6 @@ boost::shared_ptr expression = expression.substr(0, expression.size()-1); expression += ")"; return boost::shared_ptr( - new RegexRule(expression, replacement, false)); + new RegexRule(expression, annotationType, value, false)); } diff --git a/concordia/sentence_anonymizer.hpp b/concordia/sentence_tokenizer.hpp similarity index 52% rename from concordia/sentence_anonymizer.hpp rename to concordia/sentence_tokenizer.hpp index db8e102..be60061 100644 --- a/concordia/sentence_anonymizer.hpp +++ b/concordia/sentence_tokenizer.hpp @@ -1,10 +1,10 @@ -#ifndef SENTENCE_ANONYMIZER_HDR -#define SENTENCE_ANONYMIZER_HDR +#ifndef SENTENCE_TOKENIZER_HDR +#define SENTENCE_TOKENIZER_HDR #include #include #include "concordia/common/config.hpp" -#include "concordia/anonymized_sentence.hpp" +#include "concordia/tokenized_sentence.hpp" #include "concordia/regex_rule.hpp" #include "concordia/concordia_config.hpp" #include "concordia/concordia_exception.hpp" @@ -13,42 +13,42 @@ /*! - Class for anonymizing sentence before generating hash. + Class for tokenizing sentence before generating hash. This operation is is used to remove unnecessary symbols and possibly words from sentences added to index - and search patterns. Anonymizer removes html tags, substitutes predefined symbols - with a single space, removes stop words (if the option is enabled), as well as - named entities and special symbols. All these have to be listed in files + and search patterns. Tokenizer annotates html tags, removes stop words (if the option is enabled), + as well as annotates named entities and special symbols. All these have to be listed in files (see \ref tutorial3). */ -class SentenceAnonymizer { +class SentenceTokenizer { public: /*! Constructor. \param config config object, holding paths to necessary files */ - explicit SentenceAnonymizer(boost::shared_ptr config) + explicit SentenceTokenizer(boost::shared_ptr config) throw(ConcordiaException); /*! Destructor. */ - virtual ~SentenceAnonymizer(); + virtual ~SentenceTokenizer(); - /*! Anonymizes the sentence. + /*! Tokenizes the sentence. 
\param sentence input sentence \returns altered version of the input sentence */ - boost::shared_ptr - anonymize(const std::string & sentence); + boost::shared_ptr + tokenize(const std::string & sentence); private: void _createNeRules(std::string & namedEntitiesPath); void _createHtmlTagsRule(std::string & htmlTagsPath); - boost::shared_ptr _getMultipleReplacementRule( - std::string & filePath, - std::string replacement, + boost::shared_ptr _getMultipleRegexRule( + std::string filePath, + char annotationType, + std::string value, bool wholeWord = false); std::vector _namedEntities; @@ -59,9 +59,6 @@ private: boost::shared_ptr _stopWords; - boost::shared_ptr _stopSymbols; - - boost::shared_ptr _spaceSymbols; }; #endif diff --git a/concordia/t/CMakeLists.txt b/concordia/t/CMakeLists.txt index 9020c3b..a143694 100644 --- a/concordia/t/CMakeLists.txt +++ b/concordia/t/CMakeLists.txt @@ -1,8 +1,8 @@ add_library(concordia-tests test_regex_rule.cpp - test_anonymized_sentence.cpp + test_tokenized_sentence.cpp test_concordia_searcher.cpp - test_sentence_anonymizer.cpp + test_sentence_tokenizer.cpp test_text_utils.cpp test_example.cpp test_tm_matches.cpp diff --git a/concordia/t/test_concordia_config.cpp b/concordia/t/test_concordia_config.cpp index 099759f..dc05ec1 100644 --- a/concordia/t/test_concordia_config.cpp +++ b/concordia/t/test_concordia_config.cpp @@ -16,10 +16,8 @@ BOOST_AUTO_TEST_CASE( ConfigParameters ) BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "/tmp/hi.bin" ); BOOST_CHECK_EQUAL( config.getMarkersFilePath() , "/tmp/ma.bin" ); BOOST_CHECK_EQUAL( config.getHtmlTagsFilePath() , "/tmp/html_tags.txt" ); - BOOST_CHECK_EQUAL( config.getSpaceSymbolsFilePath() , "/tmp/space_symbols.txt" ); BOOST_CHECK_EQUAL( config.getStopWordsFilePath() , "/tmp/stop_words.txt" ); BOOST_CHECK_EQUAL( config.getNamedEntitiesFilePath() , "/tmp/named_entities.txt" ); - BOOST_CHECK_EQUAL( config.getStopSymbolsFilePath() , "/tmp/stop_symbols.txt" ); } BOOST_AUTO_TEST_CASE( NonexistentConfigTest ) diff --git a/concordia/t/test_regex_rule.cpp b/concordia/t/test_regex_rule.cpp index 7922452..e650067 100644 --- a/concordia/t/test_regex_rule.cpp +++ b/concordia/t/test_regex_rule.cpp @@ -1,6 +1,7 @@ #include "tests/unit-tests/unit_tests_globals.hpp" #include "concordia/regex_rule.hpp" -#include "concordia/anonymized_sentence.hpp" +#include "concordia/tokenized_sentence.hpp" +#include "concordia/token_annotation.hpp" #include "concordia/common/config.hpp" #include #include @@ -11,11 +12,11 @@ BOOST_AUTO_TEST_SUITE(regex_rule) BOOST_AUTO_TEST_CASE( SimpleReplacement ) { - RegexRule rr("a","b"); - boost::shared_ptr as(new AnonymizedSentence("xxxxxxxaxxxaxxaxaxa")); - rr.apply(as); - BOOST_CHECK_EQUAL(as->getAnnotations().size(),5); - std::list annotations = as->getAnnotations(); + RegexRule rr("a", TokenAnnotation::WORD_TYPE, "b"); + boost::shared_ptr ts(new TokenizedSentence("xxxxxxxaxxxaxxaxaxa")); + rr.apply(ts); + BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5); + std::list annotations = ts->getAnnotations(); std::list::iterator iter = annotations.begin(); BOOST_CHECK_EQUAL(iter->getStart(),7); @@ -43,7 +44,7 @@ BOOST_AUTO_TEST_CASE( BadRegex ) bool exceptionThrown = false; std::string message = ""; try { - RegexRule rr("+a","b"); + RegexRule rr("+a",TokenAnnotation::WORD_TYPE, "b"); } catch (ConcordiaException & e) { exceptionThrown = true; message = e.what(); @@ -54,11 +55,11 @@ BOOST_AUTO_TEST_CASE( BadRegex ) BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement ) { - RegexRule rr("['\"\\\\.]",""); - 
boost::shared_ptr as(new AnonymizedSentence("Don't stop believin' \\ Hold on to the feelin'.")); - rr.apply(as); - BOOST_CHECK_EQUAL(as->getAnnotations().size(),5); - std::list annotations = as->getAnnotations(); + RegexRule rr("['\"\\\\.]", TokenAnnotation::WORD_TYPE, ""); + boost::shared_ptr ts(new TokenizedSentence("Don't stop believin' \\ Hold on to the feelin'.")); + rr.apply(ts); + BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5); + std::list annotations = ts->getAnnotations(); std::list::iterator iter = annotations.begin(); BOOST_CHECK_EQUAL(iter->getStart(),3); @@ -84,11 +85,11 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement ) BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement ) { - RegexRule rr("abc","xxx", false); - boost::shared_ptr as(new AnonymizedSentence("This is AbC and ABC and abc and aBC.")); - rr.apply(as); - BOOST_CHECK_EQUAL(as->getAnnotations().size(),4); - std::list annotations = as->getAnnotations(); + RegexRule rr("abc", TokenAnnotation::WORD_TYPE, "xxx", false); + boost::shared_ptr ts(new TokenizedSentence("This is AbC and ABC and abc and aBC.")); + rr.apply(ts); + BOOST_CHECK_EQUAL(ts->getAnnotations().size(),4); + std::list annotations = ts->getAnnotations(); std::list::iterator iter = annotations.begin(); BOOST_CHECK_EQUAL(iter->getStart(),8); @@ -109,11 +110,11 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement ) BOOST_AUTO_TEST_CASE( UnicodeReplacement ) { - RegexRule rr("ą","x"); - boost::shared_ptr as(new AnonymizedSentence("zażółć gęślą jaźń")); - rr.apply(as); - BOOST_CHECK_EQUAL(as->getAnnotations().size(),1); - std::list annotations = as->getAnnotations(); + RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x"); + boost::shared_ptr ts(new TokenizedSentence("zażółć gęślą jaźń")); + rr.apply(ts); + BOOST_CHECK_EQUAL(ts->getAnnotations().size(),1); + std::list annotations = ts->getAnnotations(); std::list::iterator iter = annotations.begin(); BOOST_CHECK_EQUAL(iter->getStart(),11); @@ -122,11 +123,11 @@ BOOST_AUTO_TEST_CASE( UnicodeReplacement ) BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement ) { - RegexRule rr("ą","x", false); - boost::shared_ptr as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ")); - rr.apply(as); - BOOST_CHECK_EQUAL(as->getAnnotations().size(),2); - std::list annotations = as->getAnnotations(); + RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x", false); + boost::shared_ptr ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ")); + rr.apply(ts); + BOOST_CHECK_EQUAL(ts->getAnnotations().size(),2); + std::list annotations = ts->getAnnotations(); std::list::iterator iter = annotations.begin(); BOOST_CHECK_EQUAL(iter->getStart(),11); @@ -139,11 +140,11 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement ) BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement ) { - RegexRule rr("[ąćęłńóśżź]","x", false); - boost::shared_ptr as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ")); - rr.apply(as); - BOOST_CHECK_EQUAL(as->getAnnotations().size(),18); - std::list annotations = as->getAnnotations(); + RegexRule rr("[ąćęłńóśżź]", TokenAnnotation::WORD_TYPE, "x", false); + boost::shared_ptr ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ")); + rr.apply(ts); + BOOST_CHECK_EQUAL(ts->getAnnotations().size(),18); + std::list annotations = ts->getAnnotations(); std::list::iterator iter = annotations.begin(); BOOST_CHECK_EQUAL(iter->getStart(),2); diff --git a/concordia/t/test_sentence_anonymizer.cpp b/concordia/t/test_sentence_anonymizer.cpp deleted file mode 100644 index a712059..0000000 
--- a/concordia/t/test_sentence_anonymizer.cpp +++ /dev/null @@ -1,76 +0,0 @@ -#include -#include "tests/unit-tests/unit_tests_globals.hpp" -#include -#include - -#include -#include "concordia/common/config.hpp" -#include "concordia/sentence_anonymizer.hpp" -#include "tests/common/test_resources_manager.hpp" - -BOOST_AUTO_TEST_SUITE(sentence_anonymizer) - -BOOST_AUTO_TEST_CASE( NETest ) -{ - boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); - SentenceAnonymizer anonymizer(config); - - - std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34"; - BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"date ne_date mail ne_email number ne_number"); -} - -BOOST_AUTO_TEST_CASE( HtmlTagsTest ) -{ - boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); - SentenceAnonymizer anonymizer(config); - - - std::string sentence = "link and bold and newline
"; - BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"link and bold and newline "); - -} - -BOOST_AUTO_TEST_CASE( StopWordsTest ) -{ - boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); - if (config->isStopWordsEnabled()) { - SentenceAnonymizer anonymizer(config); - std::string sentence = "Aczkolwiek nie wiem, czy to konieczne"; - BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence()," wiem konieczne"); - } -} - -BOOST_AUTO_TEST_CASE( StopSymbolsTest ) -{ - boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); - SentenceAnonymizer anonymizer(config); - - - std::string sentence = "xxx, . xxx # xx $xx@ xx"; - BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx xxx xx xx xx"); - -} - -BOOST_AUTO_TEST_CASE( SpaceSymbolsTest ) -{ - boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); - SentenceAnonymizer anonymizer(config); - - - std::string sentence = "xxx-xxx xx|xx"; - BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx xxx xx xx"); - -} - -BOOST_AUTO_TEST_CASE( WeirdSentenceTest ) -{ - boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); - SentenceAnonymizer anonymizer(config); - - std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |"; - BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"sony dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap rm mne_numbergrm mne_numbere — ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number "); - -} - -BOOST_AUTO_TEST_SUITE_END() diff --git a/concordia/t/test_sentence_tokenizer.cpp b/concordia/t/test_sentence_tokenizer.cpp new file mode 100644 index 
0000000..49d7244 --- /dev/null +++ b/concordia/t/test_sentence_tokenizer.cpp @@ -0,0 +1,89 @@ +#include +#include "tests/unit-tests/unit_tests_globals.hpp" +#include +#include +#include + +#include +#include +#include "concordia/common/config.hpp" +#include "concordia/sentence_tokenizer.hpp" +#include "concordia/tokenized_sentence.hpp" +#include "tests/common/test_resources_manager.hpp" + +BOOST_AUTO_TEST_SUITE(sentence_tokenizer) + +BOOST_AUTO_TEST_CASE( NETest ) +{ + boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); + SentenceTokenizer tokenizer(config); + + + std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34"; + boost::shared_ptr ts = tokenizer.tokenize(sentence); + std::list annotations = ts->getAnnotations(); + + BOOST_CHECK_EQUAL(8,annotations.size()); + BOOST_FOREACH(TokenAnnotation annotation, annotations) { + std::cout << annotation.getStart() << "," + << annotation.getEnd() << " type: " + << annotation.getType() << " value: " + << annotation.getValue() << std::endl; + } +// BOOST_CHECK_EQUAL(,"date ne_date mail ne_email number ne_number"); +} + +BOOST_AUTO_TEST_CASE( HtmlTagsTest ) +{ + boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); + SentenceTokenizer tokenizer(config); + + + std::string sentence = "link and bold and newline
"; + BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"link and bold and newline "); + +} + +BOOST_AUTO_TEST_CASE( StopWordsTest ) +{ + boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); + if (config->isStopWordsEnabled()) { + SentenceTokenizer tokenizer(config); + std::string sentence = "Aczkolwiek nie wiem, czy to konieczne"; + BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence()," wiem konieczne"); + } +} + +BOOST_AUTO_TEST_CASE( StopSymbolsTest ) +{ + boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); + SentenceTokenizer tokenizer(config); + + + std::string sentence = "xxx, . xxx # xx $xx@ xx"; + BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"xxx xxx xx xx xx"); + +} + +BOOST_AUTO_TEST_CASE( SpaceSymbolsTest ) +{ + boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); + SentenceTokenizer tokenizer(config); + + + std::string sentence = "xxx-xxx xx|xx"; + BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"xxx xxx xx xx"); + +} + +BOOST_AUTO_TEST_CASE( WeirdSentenceTest ) +{ + boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); + SentenceTokenizer tokenizer(config); + + std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |"; + BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"sony dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap rm mne_numbergrm mne_numbere — ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number "); + +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/concordia/t/test_anonymized_sentence.cpp b/concordia/t/test_tokenized_sentence.cpp similarity index 82% rename from 
concordia/t/test_anonymized_sentence.cpp rename to concordia/t/test_tokenized_sentence.cpp index 334cbda..213a5f3 100644 --- a/concordia/t/test_anonymized_sentence.cpp +++ b/concordia/t/test_tokenized_sentence.cpp @@ -1,14 +1,14 @@ #include "tests/unit-tests/unit_tests_globals.hpp" -#include "concordia/anonymized_sentence.hpp" +#include "concordia/tokenized_sentence.hpp" #include "concordia/token_annotation.hpp" #include "concordia/common/config.hpp" #include -BOOST_AUTO_TEST_SUITE(anonymized_sentence) +BOOST_AUTO_TEST_SUITE(tokenized_sentence) BOOST_AUTO_TEST_CASE( AnnotationsTrivial ) { - AnonymizedSentence as("This is a test sentence"); + TokenizedSentence ts("This is a test sentence"); std::vector annotations; annotations.push_back(TokenAnnotation(0,1,'a',"val")); @@ -16,22 +16,22 @@ BOOST_AUTO_TEST_CASE( AnnotationsTrivial ) annotations.push_back(TokenAnnotation(7,10,'a',"val")); annotations.push_back(TokenAnnotation(12,14,'a',"val")); - as.addAnnotations(annotations); + ts.addAnnotations(annotations); - BOOST_CHECK_EQUAL(as.getAnnotations().size(), 4); + BOOST_CHECK_EQUAL(ts.getAnnotations().size(), 4); } BOOST_AUTO_TEST_CASE( AnnotationsIntersecting ) { - AnonymizedSentence as("This is a test sentence"); + TokenizedSentence ts("This is a test sentence"); std::vector annotations1; annotations1.push_back(TokenAnnotation(0,1,'a',"val")); annotations1.push_back(TokenAnnotation(4,6,'a',"val")); annotations1.push_back(TokenAnnotation(7,10,'a',"val")); annotations1.push_back(TokenAnnotation(12,14,'a',"val")); - as.addAnnotations(annotations1); + ts.addAnnotations(annotations1); /* annotation 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 - ---- ------- ----- @@ -43,7 +43,7 @@ BOOST_AUTO_TEST_CASE( AnnotationsIntersecting ) annotations2.push_back(TokenAnnotation(4,7,'a',"val")); annotations2.push_back(TokenAnnotation(10,11,'a',"val")); annotations2.push_back(TokenAnnotation(11,13,'a',"val")); - as.addAnnotations(annotations2); + ts.addAnnotations(annotations2); /* annotations2 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ------- ------- -- ----- @@ -53,8 +53,8 @@ BOOST_AUTO_TEST_CASE( AnnotationsIntersecting ) - ------- ---- ------- -- ----- */ - BOOST_CHECK_EQUAL(as.getAnnotations().size(), 6); - std::list annotations = as.getAnnotations(); + BOOST_CHECK_EQUAL(ts.getAnnotations().size(), 6); + std::list annotations = ts.getAnnotations(); std::list::iterator iter = annotations.begin(); BOOST_CHECK_EQUAL(iter->getStart(),0); diff --git a/concordia/token_annotation.cpp b/concordia/token_annotation.cpp index a0b7c03..a44f820 100644 --- a/concordia/token_annotation.cpp +++ b/concordia/token_annotation.cpp @@ -13,3 +13,7 @@ TokenAnnotation::TokenAnnotation(const SUFFIX_MARKER_TYPE start, TokenAnnotation::~TokenAnnotation() { } +char TokenAnnotation::NE_TYPE = 0; +char TokenAnnotation::WORD_TYPE = 1; +char TokenAnnotation::HTML_TAG_TYPE = 2; +char TokenAnnotation::STOP_WORD_TYPE = 3; diff --git a/concordia/token_annotation.hpp b/concordia/token_annotation.hpp index 0c805bb..d98af1a 100644 --- a/concordia/token_annotation.hpp +++ b/concordia/token_annotation.hpp @@ -44,6 +44,14 @@ public: return _value; } + static char NE_TYPE; + + static char WORD_TYPE; + + static char HTML_TAG_TYPE; + + static char STOP_WORD_TYPE; + protected: char _annotationType; diff --git a/concordia/anonymized_sentence.cpp b/concordia/tokenized_sentence.cpp similarity index 85% rename from concordia/anonymized_sentence.cpp rename to concordia/tokenized_sentence.cpp index 6f7c687..0c0c014 100644 --- a/concordia/anonymized_sentence.cpp +++ 
b/concordia/tokenized_sentence.cpp @@ -1,16 +1,16 @@ -#include "concordia/anonymized_sentence.hpp" +#include "concordia/tokenized_sentence.hpp" #include "concordia/common/text_utils.hpp" #include -AnonymizedSentence::AnonymizedSentence(std::string sentence): +TokenizedSentence::TokenizedSentence(std::string sentence): _sentence(sentence) { } -AnonymizedSentence::~AnonymizedSentence() { +TokenizedSentence::~TokenizedSentence() { } -void AnonymizedSentence::addAnnotations(std::vector annotations) { +void TokenizedSentence::addAnnotations(std::vector annotations) { std::vector::iterator newAnnotation = annotations.begin(); std::list::iterator existingAnnotation = _tokenAnnotations.begin(); @@ -43,6 +43,6 @@ void AnonymizedSentence::addAnnotations(std::vector annotations } -void AnonymizedSentence::toLowerCase() { +void TokenizedSentence::toLowerCase() { _sentence = TextUtils::getInstance().toLowerCase(_sentence); } diff --git a/concordia/anonymized_sentence.hpp b/concordia/tokenized_sentence.hpp similarity index 88% rename from concordia/anonymized_sentence.hpp rename to concordia/tokenized_sentence.hpp index e805be0..b1aa77e 100644 --- a/concordia/anonymized_sentence.hpp +++ b/concordia/tokenized_sentence.hpp @@ -1,5 +1,5 @@ -#ifndef ANONYMIZED_SENTENCE_HDR -#define ANONYMIZED_SENTENCE_HDR +#ifndef TOKENIZED_SENTENCE_HDR +#define TOKENIZED_SENTENCE_HDR #include "concordia/common/config.hpp" #include "concordia/token_annotation.hpp" @@ -13,17 +13,17 @@ along with the annotations list. */ -class AnonymizedSentence { +class TokenizedSentence { public: /*! Constructor. */ - AnonymizedSentence(std::string sentence); + TokenizedSentence(std::string sentence); /*! Destructor. */ - virtual ~AnonymizedSentence(); + virtual ~TokenizedSentence(); /*! Getter for sentence \returns sentence diff --git a/concordia/tutorial.dox b/concordia/tutorial.dox index 2e0be41..6438efc 100644 --- a/concordia/tutorial.dox +++ b/concordia/tutorial.dox @@ -207,9 +207,8 @@ markers_path = "/tests/resources/temp/temp_markers.bin" word_map_path = "/tests/resources/temp/temp_word_map.bin" #------------------------------------------------------------------------------- -# The following settings control the sentence anonymizer mechanism. It is used to -# remove unnecessary symbols and possibly words from sentences added to index -# and search patterns. Anonymizer removes html tags, substitutes predefined symbols +# The following settings control the sentence tokenizer mechanism. Tokenizer +# takes into account html tags, substitutes predefined symbols # with a single space, removes stop words (if the option is enabled), as well as # named entities and special symbols. All these have to be listed in files. diff --git a/prod/resources/anonymizer/space_symbols.txt b/prod/resources/anonymizer/space_symbols.txt deleted file mode 100644 index 5fc44e2..0000000 --- a/prod/resources/anonymizer/space_symbols.txt +++ /dev/null @@ -1,6 +0,0 @@ -\| -\– -\- -\/ -; -: diff --git a/prod/resources/anonymizer/stop_symbols.txt b/prod/resources/anonymizer/stop_symbols.txt deleted file mode 100644 index 46aa42d..0000000 --- a/prod/resources/anonymizer/stop_symbols.txt +++ /dev/null @@ -1,37 +0,0 @@ -\\tab -\\emdash -\< -\> -\& -\" -\‐ -\  -< -> -= -\+ -„ -” -\" -… -\. -\, -\? -! 
-' -\( -\) -\{ -\} -\@ -\# -\$ -\% -\^ -\& -\* -\[ -\] -\\ -\~ -&#\d+ diff --git a/prod/resources/concordia-config/concordia.cfg.in b/prod/resources/concordia-config/concordia.cfg.in index eaa68d5..c14cb97 100644 --- a/prod/resources/concordia-config/concordia.cfg.in +++ b/prod/resources/concordia-config/concordia.cfg.in @@ -22,21 +22,15 @@ word_map_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@" # named entities and special symbols. All these have to be listed in files. # File containing all html tags (one per line) -html_tags_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt" - -# File containing all symbols to be replaced by spaces -space_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt" +html_tags_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/html_tags.txt" # If set to true, words from predefined list are removed stop_words_enabled = "@STOP_WORDS_ENABLED@" # If stop_words_enabled is true, set the path to the stop words file -#stop_words_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt" +#stop_words_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/stop_words.txt" # File containing regular expressions that match named entities -named_entities_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt" - -# File containing special symbols (one per line) to be removed -stop_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_symbols.txt" +named_entities_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/named_entities.txt" ### eof diff --git a/prod/resources/anonymizer/html_tags.txt b/prod/resources/tokenizer/html_tags.txt similarity index 100% rename from prod/resources/anonymizer/html_tags.txt rename to prod/resources/tokenizer/html_tags.txt diff --git a/prod/resources/anonymizer/named_entities.txt b/prod/resources/tokenizer/named_entities.txt similarity index 100% rename from prod/resources/anonymizer/named_entities.txt rename to prod/resources/tokenizer/named_entities.txt diff --git a/prod/resources/anonymizer/stop_words.txt b/prod/resources/tokenizer/stop_words.txt similarity index 100% rename from prod/resources/anonymizer/stop_words.txt rename to prod/resources/tokenizer/stop_words.txt diff --git a/tests/resources/anonymizer/space_symbols.txt b/tests/resources/anonymizer/space_symbols.txt deleted file mode 100644 index 5fc44e2..0000000 --- a/tests/resources/anonymizer/space_symbols.txt +++ /dev/null @@ -1,6 +0,0 @@ -\| -\– -\- -\/ -; -: diff --git a/tests/resources/anonymizer/stop_symbols.txt b/tests/resources/anonymizer/stop_symbols.txt deleted file mode 100644 index 46aa42d..0000000 --- a/tests/resources/anonymizer/stop_symbols.txt +++ /dev/null @@ -1,37 +0,0 @@ -\\tab -\\emdash -\< -\> -\& -\" -\‐ -\  -< -> -= -\+ -„ -” -\" -… -\. -\, -\? -! 
-' -\( -\) -\{ -\} -\@ -\# -\$ -\% -\^ -\& -\* -\[ -\] -\\ -\~ -&#\d+ diff --git a/tests/resources/concordia-config/concordia-mock.cfg b/tests/resources/concordia-config/concordia-mock.cfg index 32edb82..6558a52 100644 --- a/tests/resources/concordia-config/concordia-mock.cfg +++ b/tests/resources/concordia-config/concordia-mock.cfg @@ -14,15 +14,11 @@ markers_path = "/tmp/ma.bin" html_tags_path = "/tmp/html_tags.txt" -space_symbols_path = "/tmp/space_symbols.txt" - stop_words_enabled = "true" stop_words_path = "/tmp/stop_words.txt" named_entities_path = "/tmp/named_entities.txt" -stop_symbols_path = "/tmp/stop_symbols.txt" - ### eof diff --git a/tests/resources/concordia-config/concordia.cfg.in b/tests/resources/concordia-config/concordia.cfg.in index b4f00ad..24df93c 100644 --- a/tests/resources/concordia-config/concordia.cfg.in +++ b/tests/resources/concordia-config/concordia.cfg.in @@ -22,21 +22,15 @@ word_map_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@" # named entities and special symbols. All these have to be listed in files. # File containing all html tags (one per line) -html_tags_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt" - -# File containing all symbols to be replaced by spaces -space_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt" +html_tags_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/html_tags.txt" # If set to true, words from predefined list are removed stop_words_enabled = "@STOP_WORDS_ENABLED@" # If stop_words_enabled is true, set the path to the stop words file -#stop_words_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt" +#stop_words_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/stop_words.txt" # File containing regular expressions that match named entities -named_entities_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt" - -# File containing special symbols (one per line) to be removed -stop_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_symbols.txt" +named_entities_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/named_entities.txt" ### eof diff --git a/tests/resources/anonymizer/html_tags.txt b/tests/resources/tokenizer/html_tags.txt similarity index 100% rename from tests/resources/anonymizer/html_tags.txt rename to tests/resources/tokenizer/html_tags.txt diff --git a/tests/resources/anonymizer/named_entities.txt b/tests/resources/tokenizer/named_entities.txt similarity index 100% rename from tests/resources/anonymizer/named_entities.txt rename to tests/resources/tokenizer/named_entities.txt diff --git a/tests/resources/anonymizer/stop_words.txt b/tests/resources/tokenizer/stop_words.txt similarity index 100% rename from tests/resources/anonymizer/stop_words.txt rename to tests/resources/tokenizer/stop_words.txt
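For reference, the annotation-based API introduced by this commit (SentenceTokenizer and RegexRule now record typed TokenAnnotation intervals on a TokenizedSentence instead of substituting text, using the types NE_TYPE, WORD_TYPE, HTML_TAG_TYPE and STOP_WORD_TYPE) can be exercised with the minimal sketch below, modelled on the SimpleReplacement case in concordia/t/test_regex_rule.cpp. The main() wrapper, the sample pattern "a" and the printed format are illustrative assumptions only; the class names, the constructor signature and the accessors come from this patch, and compiling the sketch assumes the concordia headers and library are available on the include and link paths.

#include <iostream>
#include <list>
#include <boost/shared_ptr.hpp>

#include "concordia/regex_rule.hpp"
#include "concordia/token_annotation.hpp"
#include "concordia/tokenized_sentence.hpp"

int main() {
    // A rule that annotates every match of "a" as a WORD_TYPE token.
    // Under the new API the sentence text is left untouched and only
    // the annotation list grows (the old anonymizer rewrote the text).
    RegexRule rule("a", TokenAnnotation::WORD_TYPE, "b");

    boost::shared_ptr<TokenizedSentence> ts(
        new TokenizedSentence("xxxxxxxaxxxaxxaxaxa"));
    rule.apply(ts);

    // apply() records one TokenAnnotation per regex match, carrying the
    // match interval and the rule's annotation type.
    std::list<TokenAnnotation> annotations = ts->getAnnotations();
    for (std::list<TokenAnnotation>::iterator it = annotations.begin();
         it != annotations.end(); ++it) {
        std::cout << it->getStart() << "," << it->getEnd()
                  << " type: " << static_cast<int>(it->getType())
                  << " value: " << it->getValue() << std::endl;
    }
    return 0;
}

TokenizedSentence::addAnnotations() keeps the annotations sorted by position and skips new ones that intersect annotations already present (see the AnnotationsIntersecting test), so rules applied earlier in SentenceTokenizer::tokenize(), namely HTML tags and named entities, take precedence over later, broader rules such as the "\w+" word rule this commit begins to introduce.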