tokenizer in progress

This commit is contained in:
rjawor 2015-06-25 10:12:51 +02:00
parent 0baf3e4ef2
commit 8432dd321f
35 changed files with 243 additions and 338 deletions

View File

@ -1,6 +1,7 @@
---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) ----------------------------- ---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) -----------------------------
- implement tokenAnnotations vector as interval tree - work on word regex pattern (allow for some symbols and digits within the word)
- document the code (classes, cfg files) and update tutorial
IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? Należy przechowywać oryginalne pozycje tokenów w bazie danych concordia-server. Pozycje te będą obliczane przez funkcję generateTokenVector (przy użyciu listy oryginalnych pozycji, która będzie modyfikowana synchronicznie z każdą anonimizacją) IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? Należy przechowywać oryginalne pozycje tokenów w bazie danych concordia-server. Pozycje te będą obliczane przez funkcję generateTokenVector (przy użyciu listy oryginalnych pozycji, która będzie modyfikowana synchronicznie z każdą anonimizacją)
- concordia_search_result nie musi przechowywać całego tokenVector (bo używa go tylko do odczytania size()). - concordia_search_result nie musi przechowywać całego tokenVector (bo używa go tylko do odczytania size()).
- wiele pamięci tłumaczeń: można je przechowywać w jednym indeksie, ale trzeba dodać tm_id jako metadane zdania (np. zamiast example length). Przy wyszukiwaniu należy filtrować wyniki, aby pochodziły z odpowiedniej pamięci tłumaczeń. - wiele pamięci tłumaczeń: można je przechowywać w jednym indeksie, ale trzeba dodać tm_id jako metadane zdania (np. zamiast example length). Przy wyszukiwaniu należy filtrować wyniki, aby pochodziły z odpowiedniej pamięci tłumaczeń.
@ -11,6 +12,7 @@ IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieś
---------------------------- Archive ----------------------------- ---------------------------- Archive -----------------------------
DONE - implement tokenAnnotations vector as interval tree (not interval tree, but list, which is even better)
DONE (IT IS GOOD AS IT IS) - mess with gcc performance optimization options (https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html) DONE (IT IS GOOD AS IT IS) - mess with gcc performance optimization options (https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html)
DONE - concordia-server (zastanowić się, czy nie napisać CAT-a oraz nad tym, czy nie oddzielić projektu concordia-server). DONE - concordia-server (zastanowić się, czy nie napisać CAT-a oraz nad tym, czy nie oddzielić projektu concordia-server).
DONE - document the code DONE - document the code

View File

@ -7,13 +7,13 @@ endforeach(dir)
add_library(concordia SHARED add_library(concordia SHARED
token_annotation.cpp token_annotation.cpp
anonymized_sentence.cpp tokenized_sentence.cpp
hashed_sentence.cpp hashed_sentence.cpp
concordia_search_result.cpp concordia_search_result.cpp
matched_pattern_fragment.cpp matched_pattern_fragment.cpp
concordia_searcher.cpp concordia_searcher.cpp
regex_rule.cpp regex_rule.cpp
sentence_anonymizer.cpp sentence_tokenizer.cpp
interval.cpp interval.cpp
tm_matches.cpp tm_matches.cpp
anubis_search_result.cpp anubis_search_result.cpp
@ -37,13 +37,13 @@ add_subdirectory(t)
install(TARGETS concordia DESTINATION lib/) install(TARGETS concordia DESTINATION lib/)
install(FILES install(FILES
token_annotation.hpp token_annotation.hpp
anonymized_sentence.hpp tokenized_sentence.hpp
hashed_sentence.hpp hashed_sentence.hpp
concordia_search_result.hpp concordia_search_result.hpp
matched_pattern_fragment.hpp matched_pattern_fragment.hpp
concordia_searcher.hpp concordia_searcher.hpp
regex_rule.hpp regex_rule.hpp
sentence_anonymizer.hpp sentence_tokenizer.hpp
interval.hpp interval.hpp
tm_matches.hpp tm_matches.hpp
anubis_search_result.hpp anubis_search_result.hpp

View File

@ -9,11 +9,9 @@
#define MARKERS_PARAM "markers_path" #define MARKERS_PARAM "markers_path"
#define SUFFIX_ARRAY_PARAM "suffix_array_path" #define SUFFIX_ARRAY_PARAM "suffix_array_path"
#define HTML_TAGS_PARAM "html_tags_path" #define HTML_TAGS_PARAM "html_tags_path"
#define SPACE_SYMBOLS_PARAM "space_symbols_path"
#define STOP_WORDS_ENABLED_PARAM "stop_words_enabled" #define STOP_WORDS_ENABLED_PARAM "stop_words_enabled"
#define STOP_WORDS_PARAM "stop_words_path" #define STOP_WORDS_PARAM "stop_words_path"
#define NAMED_ENTITIES_PARAM "named_entities_path" #define NAMED_ENTITIES_PARAM "named_entities_path"
#define STOP_SYMBOLS_PARAM "stop_symbols_path"
#define ANUBIS_THRESHOLD_PARAM "anubis_threshold" #define ANUBIS_THRESHOLD_PARAM "anubis_threshold"
ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath) ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
@ -35,8 +33,6 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
ConcordiaConfig::_readConfigParameterStr(MARKERS_PARAM); ConcordiaConfig::_readConfigParameterStr(MARKERS_PARAM);
_htmlTagsFilePath = _htmlTagsFilePath =
ConcordiaConfig::_readConfigParameterStr(HTML_TAGS_PARAM); ConcordiaConfig::_readConfigParameterStr(HTML_TAGS_PARAM);
_spaceSymbolsFilePath =
ConcordiaConfig::_readConfigParameterStr(SPACE_SYMBOLS_PARAM);
_stopWordsEnabled = _stopWordsEnabled =
ConcordiaConfig::_readConfigParameterStr( ConcordiaConfig::_readConfigParameterStr(
STOP_WORDS_ENABLED_PARAM) != "false"; STOP_WORDS_ENABLED_PARAM) != "false";
@ -44,8 +40,6 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
ConcordiaConfig::_readConfigParameterStr(STOP_WORDS_PARAM, ""); ConcordiaConfig::_readConfigParameterStr(STOP_WORDS_PARAM, "");
_namedEntitiesFilePath = _namedEntitiesFilePath =
ConcordiaConfig::_readConfigParameterStr(NAMED_ENTITIES_PARAM); ConcordiaConfig::_readConfigParameterStr(NAMED_ENTITIES_PARAM);
_stopSymbolsFilePath =
ConcordiaConfig::_readConfigParameterStr(STOP_SYMBOLS_PARAM);
_anubisThreshold = atof(ConcordiaConfig::_readConfigParameterStr( _anubisThreshold = atof(ConcordiaConfig::_readConfigParameterStr(
ANUBIS_THRESHOLD_PARAM, ANUBIS_THRESHOLD_PARAM,
"0.3").c_str()); "0.3").c_str());

View File

@ -56,14 +56,6 @@ public:
return _htmlTagsFilePath; return _htmlTagsFilePath;
} }
/*! Getter for space symbols file path.
For more information see \ref tutorial3.
\returns space symbols file path
*/
std::string & getSpaceSymbolsFilePath() {
return _spaceSymbolsFilePath;
}
/*! Getter for stop symbols enabled parameter. /*! Getter for stop symbols enabled parameter.
For more information see \ref tutorial3. For more information see \ref tutorial3.
\returns true if stop words are enabled \returns true if stop words are enabled
@ -88,14 +80,6 @@ public:
return _namedEntitiesFilePath; return _namedEntitiesFilePath;
} }
/*! Getter for stop symbols file path.
For more information see \ref tutorial3.
\returns stop symbols file path
*/
std::string & getStopSymbolsFilePath() {
return _stopSymbolsFilePath;
}
/*! Getter for anubis threshold. Anubis search results with /*! Getter for anubis threshold. Anubis search results with
scores below that threshold will be discarded. scores below that threshold will be discarded.
\returns anubis threshold \returns anubis threshold
@ -115,16 +99,12 @@ private:
std::string _htmlTagsFilePath; std::string _htmlTagsFilePath;
std::string _spaceSymbolsFilePath;
bool _stopWordsEnabled; bool _stopWordsEnabled;
std::string _stopWordsFilePath; std::string _stopWordsFilePath;
std::string _namedEntitiesFilePath; std::string _namedEntitiesFilePath;
std::string _stopSymbolsFilePath;
double _anubisThreshold; double _anubisThreshold;
std::string _readConfigParameterStr(const std::string & name) std::string _readConfigParameterStr(const std::string & name)

View File

@ -12,8 +12,8 @@ HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
throw(ConcordiaException) : throw(ConcordiaException) :
_wordMapFilePath(config->getWordMapFilePath()), _wordMapFilePath(config->getWordMapFilePath()),
_wordMap(boost::shared_ptr<WordMap>(new WordMap)), _wordMap(boost::shared_ptr<WordMap>(new WordMap)),
_sentenceAnonymizer(boost::shared_ptr<SentenceAnonymizer>( _sentenceTokenizer(boost::shared_ptr<SentenceTokenizer>(
new SentenceAnonymizer(config))) { new SentenceTokenizer(config))) {
if (boost::filesystem::exists(_wordMapFilePath)) { if (boost::filesystem::exists(_wordMapFilePath)) {
std::ifstream ifs(_wordMapFilePath.c_str(), std::ios::binary); std::ifstream ifs(_wordMapFilePath.c_str(), std::ios::binary);
boost::archive::binary_iarchive ia(ifs); boost::archive::binary_iarchive ia(ifs);
@ -44,11 +44,11 @@ std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(
std::vector<std::string> HashGenerator::generateTokenVector( std::vector<std::string> HashGenerator::generateTokenVector(
const std::string & sentence) { const std::string & sentence) {
boost::shared_ptr<AnonymizedSentence> as = _sentenceAnonymizer->anonymize(sentence); boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
std::string anonymizedSentence = as->getSentence(); std::string tokenizedSentence = ts->getSentence();
boost::trim(anonymizedSentence); boost::trim(tokenizedSentence);
std::vector<std::string> tokenTexts; std::vector<std::string> tokenTexts;
boost::split(tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"), boost::split(tokenTexts, tokenizedSentence, boost::is_any_of(" \t\r\n"),
boost::algorithm::token_compress_on); boost::algorithm::token_compress_on);
return tokenTexts; return tokenTexts;
} }

View File

@ -8,7 +8,7 @@
#include <boost/algorithm/string/predicate.hpp> #include <boost/algorithm/string/predicate.hpp>
#include "concordia/word_map.hpp" #include "concordia/word_map.hpp"
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include "concordia/sentence_anonymizer.hpp" #include "concordia/sentence_tokenizer.hpp"
#include "concordia/concordia_config.hpp" #include "concordia/concordia_config.hpp"
#include "concordia/concordia_exception.hpp" #include "concordia/concordia_exception.hpp"
@ -71,7 +71,7 @@ public:
private: private:
boost::shared_ptr<WordMap> _wordMap; boost::shared_ptr<WordMap> _wordMap;
boost::shared_ptr<SentenceAnonymizer> _sentenceAnonymizer; boost::shared_ptr<SentenceTokenizer> _sentenceTokenizer;
std::string _wordMapFilePath; std::string _wordMapFilePath;
}; };

View File

@ -48,7 +48,7 @@ public:
/*! Method for adding an original word position to the list. /*! Method for adding an original word position to the list.
\param original word position \param original word position
*/ */
void addWordOriginalWordPosition(Interval & originalWordPosition) { void addOriginalWordPosition(Interval & originalWordPosition) {
_originalWordPositions.push_back(originalWordPosition); _originalWordPositions.push_back(originalWordPosition);
} }

View File

@ -5,10 +5,12 @@
#include <boost/throw_exception.hpp> #include <boost/throw_exception.hpp>
RegexRule::RegexRule(std::string patternString, RegexRule::RegexRule(std::string patternString,
std::string value, char annotationType,
bool caseSensitive) std::string value,
throw(ConcordiaException): bool caseSensitive)
_value(value) { throw(ConcordiaException):
_annotationType(annotationType),
_value(value) {
try { try {
if (caseSensitive) { if (caseSensitive) {
_pattern = boost::make_u32regex(UnicodeString(patternString.c_str())); _pattern = boost::make_u32regex(UnicodeString(patternString.c_str()));
@ -32,7 +34,7 @@ RegexRule::RegexRule(std::string patternString,
RegexRule::~RegexRule() { RegexRule::~RegexRule() {
} }
void RegexRule::apply(boost::shared_ptr<AnonymizedSentence> sentence) { void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
try { try {
UnicodeString s(sentence->getSentence().c_str()); UnicodeString s(sentence->getSentence().c_str());
boost::u32regex_iterator<const UChar*> begin(boost::make_u32regex_iterator(s, _pattern)); boost::u32regex_iterator<const UChar*> begin(boost::make_u32regex_iterator(s, _pattern));
@ -41,14 +43,14 @@ void RegexRule::apply(boost::shared_ptr<AnonymizedSentence> sentence) {
for (; begin != end; ++begin) { for (; begin != end; ++begin) {
SUFFIX_MARKER_TYPE matchBegin = begin->position(); SUFFIX_MARKER_TYPE matchBegin = begin->position();
SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length(); SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length();
TokenAnnotation annotation(matchBegin, matchEnd, 'a', _value); TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, "");
annotations.push_back(annotation); annotations.push_back(annotation);
} }
sentence->addAnnotations(annotations); sentence->addAnnotations(annotations);
} catch(const std::exception & e) { } catch(const std::exception & e) {
std::stringstream ss; std::stringstream ss;
ss << "Exception while applying regex rule: " ss << "Exception while applying regex rule: "
<< _value << " to text: " << sentence->getSentence(); << _annotationType << " to text: " << sentence->getSentence();
ss << ", message: " << e.what(); ss << ", message: " << e.what();
throw ConcordiaException(ss.str()); throw ConcordiaException(ss.str());
} }

View File

@ -3,7 +3,7 @@
#include <string> #include <string>
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include "concordia/anonymized_sentence.hpp" #include "concordia/tokenized_sentence.hpp"
#include "concordia/concordia_exception.hpp" #include "concordia/concordia_exception.hpp"
#include <boost/shared_ptr.hpp> #include <boost/shared_ptr.hpp>
#include <boost/regex.hpp> #include <boost/regex.hpp>
@ -24,12 +24,14 @@ public:
/*! /*!
Constructor. Constructor.
\param patternString regex pattern to match \param patternString regex pattern to match
\param replacement string to substitute the found match \param annoationType type of annotation
\param caseSensitive case sensitivity of the pattern \param caseSensitive case sensitivity of the pattern
*/ */
RegexRule(std::string patternString, std::string value, RegexRule(std::string patternString,
bool caseSensitive = true) char annotationType,
throw(ConcordiaException); std::string value,
bool caseSensitive = true)
throw(ConcordiaException);
/*! Destructor. /*! Destructor.
*/ */
@ -38,12 +40,14 @@ public:
/*! Applies the operation on anonymized sentence. /*! Applies the operation on anonymized sentence.
\param sentence the input sentence \param sentence the input sentence
*/ */
void apply(boost::shared_ptr<AnonymizedSentence> sentence); void apply(boost::shared_ptr<TokenizedSentence> sentence);
private: private:
boost::u32regex _pattern; char _annotationType;
std::string _value; std::string _value;
boost::u32regex _pattern;
}; };
#endif #endif

View File

@ -1,4 +1,5 @@
#include "concordia/sentence_anonymizer.hpp" #include "concordia/sentence_tokenizer.hpp"
#include "concordia/token_annotation.hpp"
#include <boost/foreach.hpp> #include <boost/foreach.hpp>
#include <fstream> #include <fstream>
@ -6,29 +7,27 @@
#include <iostream> #include <iostream>
#include <boost/algorithm/string.hpp> #include <boost/algorithm/string.hpp>
SentenceAnonymizer::SentenceAnonymizer( SentenceTokenizer::SentenceTokenizer(
boost::shared_ptr<ConcordiaConfig> config) boost::shared_ptr<ConcordiaConfig> config)
throw(ConcordiaException) { throw(ConcordiaException) {
_createNeRules(config->getNamedEntitiesFilePath()); _createNeRules(config->getNamedEntitiesFilePath());
_createHtmlTagsRule(config->getHtmlTagsFilePath()); _createHtmlTagsRule(config->getHtmlTagsFilePath());
_stopWordsEnabled = config->isStopWordsEnabled(); _stopWordsEnabled = config->isStopWordsEnabled();
if (_stopWordsEnabled) { if (_stopWordsEnabled) {
_stopWords = _getMultipleReplacementRule( _stopWords = _getMultipleRegexRule(
config->getStopWordsFilePath(), "", true); config->getStopWordsFilePath(),
TokenAnnotation::STOP_WORD_TYPE,
"", true);
} }
_stopSymbols = _getMultipleReplacementRule(
config->getStopSymbolsFilePath(), "");
_spaceSymbols = _getMultipleReplacementRule(
config->getSpaceSymbolsFilePath(), " ");
} }
SentenceAnonymizer::~SentenceAnonymizer() { SentenceTokenizer::~SentenceTokenizer() {
} }
boost::shared_ptr<AnonymizedSentence> boost::shared_ptr<TokenizedSentence>
SentenceAnonymizer::anonymize(const std::string & sentence) { SentenceTokenizer::tokenize(const std::string & sentence) {
boost::shared_ptr<AnonymizedSentence> boost::shared_ptr<TokenizedSentence>
result(new AnonymizedSentence(sentence)); result(new TokenizedSentence(sentence));
_htmlTags->apply(result); _htmlTags->apply(result);
@ -41,13 +40,14 @@ boost::shared_ptr<AnonymizedSentence>
if (_stopWordsEnabled) { if (_stopWordsEnabled) {
_stopWords->apply(result); _stopWords->apply(result);
} }
_stopSymbols->apply(result);
_spaceSymbols->apply(result); boost::shared_ptr<RegexRule> wordsRule(
new RegexRule("\\w+", TokenAnnotation::WORD_TYPE, "word"));
return result; return result;
} }
void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) { void SentenceTokenizer::_createNeRules(std::string & namedEntitiesPath) {
if (boost::filesystem::exists(namedEntitiesPath)) { if (boost::filesystem::exists(namedEntitiesPath)) {
std::string line; std::string line;
std::ifstream neFile(namedEntitiesPath.c_str()); std::ifstream neFile(namedEntitiesPath.c_str());
@ -66,7 +66,9 @@ void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) {
throw ConcordiaException(ss.str()); throw ConcordiaException(ss.str());
} else { } else {
_namedEntities.push_back(RegexRule( _namedEntities.push_back(RegexRule(
tokenTexts->at(0), tokenTexts->at(1))); tokenTexts->at(0),
TokenAnnotation::NE_TYPE,
tokenTexts->at(1)));
} }
} }
neFile.close(); neFile.close();
@ -78,7 +80,7 @@ void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) {
} }
} }
void SentenceAnonymizer::_createHtmlTagsRule(std::string & htmlTagsPath) { void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
std::string tagsExpression = "<\\/?("; std::string tagsExpression = "<\\/?(";
if (boost::filesystem::exists(htmlTagsPath)) { if (boost::filesystem::exists(htmlTagsPath)) {
std::string line; std::string line;
@ -97,12 +99,15 @@ void SentenceAnonymizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1); tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
tagsExpression += "br).*?>"; tagsExpression += "br).*?>";
_htmlTags = boost::shared_ptr<RegexRule>( _htmlTags = boost::shared_ptr<RegexRule>(
new RegexRule(tagsExpression, "", false)); new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG_TYPE, "", false));
} }
boost::shared_ptr<RegexRule> boost::shared_ptr<RegexRule>
SentenceAnonymizer::_getMultipleReplacementRule( SentenceTokenizer::_getMultipleRegexRule(
std::string & filePath, std::string replacement, bool wholeWord) { std::string filePath,
char annotationType,
std::string value,
bool wholeWord) {
std::string expression = "("; std::string expression = "(";
if (boost::filesystem::exists(filePath)) { if (boost::filesystem::exists(filePath)) {
std::string line; std::string line;
@ -128,6 +133,6 @@ boost::shared_ptr<RegexRule>
expression = expression.substr(0, expression.size()-1); expression = expression.substr(0, expression.size()-1);
expression += ")"; expression += ")";
return boost::shared_ptr<RegexRule>( return boost::shared_ptr<RegexRule>(
new RegexRule(expression, replacement, false)); new RegexRule(expression, annotationType, value, false));
} }

View File

@ -1,10 +1,10 @@
#ifndef SENTENCE_ANONYMIZER_HDR #ifndef SENTENCE_TOKENIZER_HDR
#define SENTENCE_ANONYMIZER_HDR #define SENTENCE_TOKENIZER_HDR
#include <string> #include <string>
#include <vector> #include <vector>
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include "concordia/anonymized_sentence.hpp" #include "concordia/tokenized_sentence.hpp"
#include "concordia/regex_rule.hpp" #include "concordia/regex_rule.hpp"
#include "concordia/concordia_config.hpp" #include "concordia/concordia_config.hpp"
#include "concordia/concordia_exception.hpp" #include "concordia/concordia_exception.hpp"
@ -13,42 +13,42 @@
/*! /*!
Class for anonymizing sentence before generating hash. Class for tokenizing sentence before generating hash.
This operation is is used to This operation is is used to
remove unnecessary symbols and possibly words from sentences added to index remove unnecessary symbols and possibly words from sentences added to index
and search patterns. Anonymizer removes html tags, substitutes predefined symbols and search patterns. Tokenizer annotates html tags, removes stop words (if the option is enabled),
with a single space, removes stop words (if the option is enabled), as well as as well as annotates named entities and special symbols. All these have to be listed in files
named entities and special symbols. All these have to be listed in files
(see \ref tutorial3). (see \ref tutorial3).
*/ */
class SentenceAnonymizer { class SentenceTokenizer {
public: public:
/*! Constructor. /*! Constructor.
\param config config object, holding paths to necessary files \param config config object, holding paths to necessary files
*/ */
explicit SentenceAnonymizer(boost::shared_ptr<ConcordiaConfig> config) explicit SentenceTokenizer(boost::shared_ptr<ConcordiaConfig> config)
throw(ConcordiaException); throw(ConcordiaException);
/*! Destructor. /*! Destructor.
*/ */
virtual ~SentenceAnonymizer(); virtual ~SentenceTokenizer();
/*! Anonymizes the sentence. /*! Tokenizes the sentence.
\param sentence input sentence \param sentence input sentence
\returns altered version of the input sentence \returns altered version of the input sentence
*/ */
boost::shared_ptr<AnonymizedSentence> boost::shared_ptr<TokenizedSentence>
anonymize(const std::string & sentence); tokenize(const std::string & sentence);
private: private:
void _createNeRules(std::string & namedEntitiesPath); void _createNeRules(std::string & namedEntitiesPath);
void _createHtmlTagsRule(std::string & htmlTagsPath); void _createHtmlTagsRule(std::string & htmlTagsPath);
boost::shared_ptr<RegexRule> _getMultipleReplacementRule( boost::shared_ptr<RegexRule> _getMultipleRegexRule(
std::string & filePath, std::string filePath,
std::string replacement, char annotationType,
std::string value,
bool wholeWord = false); bool wholeWord = false);
std::vector<RegexRule> _namedEntities; std::vector<RegexRule> _namedEntities;
@ -59,9 +59,6 @@ private:
boost::shared_ptr<RegexRule> _stopWords; boost::shared_ptr<RegexRule> _stopWords;
boost::shared_ptr<RegexRule> _stopSymbols;
boost::shared_ptr<RegexRule> _spaceSymbols;
}; };
#endif #endif

View File

@ -1,8 +1,8 @@
add_library(concordia-tests add_library(concordia-tests
test_regex_rule.cpp test_regex_rule.cpp
test_anonymized_sentence.cpp test_tokenized_sentence.cpp
test_concordia_searcher.cpp test_concordia_searcher.cpp
test_sentence_anonymizer.cpp test_sentence_tokenizer.cpp
test_text_utils.cpp test_text_utils.cpp
test_example.cpp test_example.cpp
test_tm_matches.cpp test_tm_matches.cpp

View File

@ -16,10 +16,8 @@ BOOST_AUTO_TEST_CASE( ConfigParameters )
BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "/tmp/hi.bin" ); BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "/tmp/hi.bin" );
BOOST_CHECK_EQUAL( config.getMarkersFilePath() , "/tmp/ma.bin" ); BOOST_CHECK_EQUAL( config.getMarkersFilePath() , "/tmp/ma.bin" );
BOOST_CHECK_EQUAL( config.getHtmlTagsFilePath() , "/tmp/html_tags.txt" ); BOOST_CHECK_EQUAL( config.getHtmlTagsFilePath() , "/tmp/html_tags.txt" );
BOOST_CHECK_EQUAL( config.getSpaceSymbolsFilePath() , "/tmp/space_symbols.txt" );
BOOST_CHECK_EQUAL( config.getStopWordsFilePath() , "/tmp/stop_words.txt" ); BOOST_CHECK_EQUAL( config.getStopWordsFilePath() , "/tmp/stop_words.txt" );
BOOST_CHECK_EQUAL( config.getNamedEntitiesFilePath() , "/tmp/named_entities.txt" ); BOOST_CHECK_EQUAL( config.getNamedEntitiesFilePath() , "/tmp/named_entities.txt" );
BOOST_CHECK_EQUAL( config.getStopSymbolsFilePath() , "/tmp/stop_symbols.txt" );
} }
BOOST_AUTO_TEST_CASE( NonexistentConfigTest ) BOOST_AUTO_TEST_CASE( NonexistentConfigTest )

View File

@ -1,6 +1,7 @@
#include "tests/unit-tests/unit_tests_globals.hpp" #include "tests/unit-tests/unit_tests_globals.hpp"
#include "concordia/regex_rule.hpp" #include "concordia/regex_rule.hpp"
#include "concordia/anonymized_sentence.hpp" #include "concordia/tokenized_sentence.hpp"
#include "concordia/token_annotation.hpp"
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include <boost/shared_ptr.hpp> #include <boost/shared_ptr.hpp>
#include <boost/algorithm/string/predicate.hpp> #include <boost/algorithm/string/predicate.hpp>
@ -11,11 +12,11 @@ BOOST_AUTO_TEST_SUITE(regex_rule)
BOOST_AUTO_TEST_CASE( SimpleReplacement ) BOOST_AUTO_TEST_CASE( SimpleReplacement )
{ {
RegexRule rr("a","b"); RegexRule rr("a", TokenAnnotation::WORD_TYPE, "b");
boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("xxxxxxxaxxxaxxaxaxa")); boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("xxxxxxxaxxxaxxaxaxa"));
rr.apply(as); rr.apply(ts);
BOOST_CHECK_EQUAL(as->getAnnotations().size(),5); BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
std::list<TokenAnnotation> annotations = as->getAnnotations(); std::list<TokenAnnotation> annotations = ts->getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin(); std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),7); BOOST_CHECK_EQUAL(iter->getStart(),7);
@ -43,7 +44,7 @@ BOOST_AUTO_TEST_CASE( BadRegex )
bool exceptionThrown = false; bool exceptionThrown = false;
std::string message = ""; std::string message = "";
try { try {
RegexRule rr("+a","b"); RegexRule rr("+a",TokenAnnotation::WORD_TYPE, "b");
} catch (ConcordiaException & e) { } catch (ConcordiaException & e) {
exceptionThrown = true; exceptionThrown = true;
message = e.what(); message = e.what();
@ -54,11 +55,11 @@ BOOST_AUTO_TEST_CASE( BadRegex )
BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement ) BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
{ {
RegexRule rr("['\"\\\\.]",""); RegexRule rr("['\"\\\\.]", TokenAnnotation::WORD_TYPE, "");
boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("Don't stop believin' \\ Hold on to the feelin'.")); boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("Don't stop believin' \\ Hold on to the feelin'."));
rr.apply(as); rr.apply(ts);
BOOST_CHECK_EQUAL(as->getAnnotations().size(),5); BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
std::list<TokenAnnotation> annotations = as->getAnnotations(); std::list<TokenAnnotation> annotations = ts->getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin(); std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),3); BOOST_CHECK_EQUAL(iter->getStart(),3);
@ -84,11 +85,11 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement ) BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
{ {
RegexRule rr("abc","xxx", false); RegexRule rr("abc", TokenAnnotation::WORD_TYPE, "xxx", false);
boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("This is AbC and ABC and abc and aBC.")); boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("This is AbC and ABC and abc and aBC."));
rr.apply(as); rr.apply(ts);
BOOST_CHECK_EQUAL(as->getAnnotations().size(),4); BOOST_CHECK_EQUAL(ts->getAnnotations().size(),4);
std::list<TokenAnnotation> annotations = as->getAnnotations(); std::list<TokenAnnotation> annotations = ts->getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin(); std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),8); BOOST_CHECK_EQUAL(iter->getStart(),8);
@ -109,11 +110,11 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
BOOST_AUTO_TEST_CASE( UnicodeReplacement ) BOOST_AUTO_TEST_CASE( UnicodeReplacement )
{ {
RegexRule rr("ą","x"); RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x");
boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń")); boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń"));
rr.apply(as); rr.apply(ts);
BOOST_CHECK_EQUAL(as->getAnnotations().size(),1); BOOST_CHECK_EQUAL(ts->getAnnotations().size(),1);
std::list<TokenAnnotation> annotations = as->getAnnotations(); std::list<TokenAnnotation> annotations = ts->getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin(); std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),11); BOOST_CHECK_EQUAL(iter->getStart(),11);
@ -122,11 +123,11 @@ BOOST_AUTO_TEST_CASE( UnicodeReplacement )
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement ) BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
{ {
RegexRule rr("ą","x", false); RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x", false);
boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ")); boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
rr.apply(as); rr.apply(ts);
BOOST_CHECK_EQUAL(as->getAnnotations().size(),2); BOOST_CHECK_EQUAL(ts->getAnnotations().size(),2);
std::list<TokenAnnotation> annotations = as->getAnnotations(); std::list<TokenAnnotation> annotations = ts->getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin(); std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),11); BOOST_CHECK_EQUAL(iter->getStart(),11);
@ -139,11 +140,11 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement ) BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
{ {
RegexRule rr("[ąćęłńóśżź]","x", false); RegexRule rr("[ąćęłńóśżź]", TokenAnnotation::WORD_TYPE, "x", false);
boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ")); boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
rr.apply(as); rr.apply(ts);
BOOST_CHECK_EQUAL(as->getAnnotations().size(),18); BOOST_CHECK_EQUAL(ts->getAnnotations().size(),18);
std::list<TokenAnnotation> annotations = as->getAnnotations(); std::list<TokenAnnotation> annotations = ts->getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin(); std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),2); BOOST_CHECK_EQUAL(iter->getStart(),2);

View File

@ -1,76 +0,0 @@
#include <boost/filesystem.hpp>
#include "tests/unit-tests/unit_tests_globals.hpp"
#include <string>
#include <sstream>
#include <boost/shared_ptr.hpp>
#include "concordia/common/config.hpp"
#include "concordia/sentence_anonymizer.hpp"
#include "tests/common/test_resources_manager.hpp"
BOOST_AUTO_TEST_SUITE(sentence_anonymizer)
BOOST_AUTO_TEST_CASE( NETest )
{
// Named-entity anonymization: the date, e-mail address and decimal number in
// the input are replaced by the ne_date / ne_email / ne_number placeholders,
// and the remaining words are lower-cased with punctuation removed.
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceAnonymizer anonymizer(config);
std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34";
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"date ne_date mail ne_email number ne_number");
}
BOOST_AUTO_TEST_CASE( HtmlTagsTest )
{
// HTML tags (<a>, <b>, <br/>) must be stripped from the anonymized sentence,
// leaving only the visible text (note the trailing space left by <br/>).
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceAnonymizer anonymizer(config);
std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"link and bold and newline ");
}
BOOST_AUTO_TEST_CASE( StopWordsTest )
{
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
// Stop-word removal is an optional feature; only exercise it when the
// loaded configuration has it switched on.
if (config->isStopWordsEnabled()) {
SentenceAnonymizer anonymizer(config);
std::string sentence = "Aczkolwiek nie wiem, czy to konieczne";
// Polish stop words ("Aczkolwiek", "nie", "czy", "to") are removed;
// the expected string deliberately starts with a space.
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence()," wiem konieczne");
}
}
BOOST_AUTO_TEST_CASE( StopSymbolsTest )
{
// Stop symbols (',', '.', '#', '$', '@') are removed outright, so only the
// word tokens remain, separated by single spaces.
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceAnonymizer anonymizer(config);
std::string sentence = "xxx, . xxx # xx $xx@ xx";
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx xxx xx xx xx");
}
BOOST_AUTO_TEST_CASE( SpaceSymbolsTest )
{
// Space symbols ('-' and '|') are replaced with a single space each, per the
// expected output below, splitting the joined tokens apart.
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceAnonymizer anonymizer(config);
std::string sentence = "xxx-xxx xx|xx";
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx xxx xx xx");
}
BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
{
// Stress test: a long real-world product-listing string (camera model
// numbers) full of dashes, slashes and pipes. Pins down the exact normalized
// output so regressions in symbol/number handling are caught.
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceAnonymizer anonymizer(config);
std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"sony dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap rm mne_numbergrm mne_numbere — ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number ");
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -0,0 +1,89 @@
#include <boost/filesystem.hpp>
#include "tests/unit-tests/unit_tests_globals.hpp"
#include <string>
#include <sstream>
#include <iostream>
#include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp>
#include "concordia/common/config.hpp"
#include "concordia/sentence_tokenizer.hpp"
#include "concordia/tokenized_sentence.hpp"
#include "tests/common/test_resources_manager.hpp"
BOOST_AUTO_TEST_SUITE(sentence_tokenizer)
BOOST_AUTO_TEST_CASE( NETest )
{
    // Tokenize a sentence containing a date, an e-mail address and a decimal
    // number; the tokenizer is expected to produce exactly 8 annotations.
    // The annotations are dumped to stdout for inspection while the expected
    // surface form below is still undecided (commented-out check).
    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(
        TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
    SentenceTokenizer tokenizer(config);
    std::string inputSentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34";
    boost::shared_ptr<TokenizedSentence> tokenized = tokenizer.tokenize(inputSentence);
    std::list<TokenAnnotation> annotationList = tokenized->getAnnotations();
    BOOST_CHECK_EQUAL(8,annotationList.size());
    for (std::list<TokenAnnotation>::iterator it = annotationList.begin();
            it != annotationList.end(); ++it) {
        std::cout << it->getStart() << ","
                  << it->getEnd() << " type: "
                  << it->getType() << " value: "
                  << it->getValue() << std::endl;
    }
// BOOST_CHECK_EQUAL(,"date ne_date mail ne_email number ne_number");
}
BOOST_AUTO_TEST_CASE( HtmlTagsTest )
{
    // HTML tags (<a>, <b>, <br/>) must be stripped from the tokenized
    // sentence; only the visible text survives (with a trailing space).
    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(
        TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
    SentenceTokenizer tokenizer(config);
    std::string markup = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
    boost::shared_ptr<TokenizedSentence> result = tokenizer.tokenize(markup);
    BOOST_CHECK_EQUAL(result->getSentence(),"link and bold and newline ");
}
BOOST_AUTO_TEST_CASE( StopWordsTest )
{
    // Stop-word removal is optional; skip the check entirely when the
    // loaded configuration has the feature disabled.
    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(
        TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
    if (!config->isStopWordsEnabled()) {
        return;
    }
    SentenceTokenizer tokenizer(config);
    // Polish stop words are removed; the expected result keeps the leading space.
    std::string phrase = "Aczkolwiek nie wiem, czy to konieczne";
    BOOST_CHECK_EQUAL(tokenizer.tokenize(phrase)->getSentence()," wiem konieczne");
}
BOOST_AUTO_TEST_CASE( StopSymbolsTest )
{
    // Special symbols (',', '.', '#', '$', '@') are dropped, leaving only
    // the word tokens separated by single spaces.
    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(
        TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
    SentenceTokenizer tokenizer(config);
    BOOST_CHECK_EQUAL(
        tokenizer.tokenize(std::string("xxx, . xxx # xx $xx@ xx"))->getSentence(),
        "xxx xxx xx xx xx");
}
BOOST_AUTO_TEST_CASE( SpaceSymbolsTest )
{
    // Per the expected output, '-' and '|' are turned into single spaces,
    // splitting the joined tokens apart.
    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(
        TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
    SentenceTokenizer tokenizer(config);
    std::string joined = "xxx-xxx xx|xx";
    boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(joined);
    BOOST_CHECK_EQUAL(ts->getSentence(),"xxx xxx xx xx");
}
BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
{
// Stress test: a long real-world product-listing string (camera model
// numbers) full of dashes, slashes and pipes. Pins down the exact normalized
// output so regressions in symbol/number tokenization are caught.
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceTokenizer tokenizer(config);
std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"sony dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap rm mne_numbergrm mne_numbere — ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number ");
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -1,14 +1,14 @@
#include "tests/unit-tests/unit_tests_globals.hpp" #include "tests/unit-tests/unit_tests_globals.hpp"
#include "concordia/anonymized_sentence.hpp" #include "concordia/tokenized_sentence.hpp"
#include "concordia/token_annotation.hpp" #include "concordia/token_annotation.hpp"
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include <iostream> #include <iostream>
BOOST_AUTO_TEST_SUITE(anonymized_sentence) BOOST_AUTO_TEST_SUITE(tokenized_sentence)
BOOST_AUTO_TEST_CASE( AnnotationsTrivial ) BOOST_AUTO_TEST_CASE( AnnotationsTrivial )
{ {
AnonymizedSentence as("This is a test sentence"); TokenizedSentence ts("This is a test sentence");
std::vector<TokenAnnotation> annotations; std::vector<TokenAnnotation> annotations;
annotations.push_back(TokenAnnotation(0,1,'a',"val")); annotations.push_back(TokenAnnotation(0,1,'a',"val"));
@ -16,22 +16,22 @@ BOOST_AUTO_TEST_CASE( AnnotationsTrivial )
annotations.push_back(TokenAnnotation(7,10,'a',"val")); annotations.push_back(TokenAnnotation(7,10,'a',"val"));
annotations.push_back(TokenAnnotation(12,14,'a',"val")); annotations.push_back(TokenAnnotation(12,14,'a',"val"));
as.addAnnotations(annotations); ts.addAnnotations(annotations);
BOOST_CHECK_EQUAL(as.getAnnotations().size(), 4); BOOST_CHECK_EQUAL(ts.getAnnotations().size(), 4);
} }
BOOST_AUTO_TEST_CASE( AnnotationsIntersecting ) BOOST_AUTO_TEST_CASE( AnnotationsIntersecting )
{ {
AnonymizedSentence as("This is a test sentence"); TokenizedSentence ts("This is a test sentence");
std::vector<TokenAnnotation> annotations1; std::vector<TokenAnnotation> annotations1;
annotations1.push_back(TokenAnnotation(0,1,'a',"val")); annotations1.push_back(TokenAnnotation(0,1,'a',"val"));
annotations1.push_back(TokenAnnotation(4,6,'a',"val")); annotations1.push_back(TokenAnnotation(4,6,'a',"val"));
annotations1.push_back(TokenAnnotation(7,10,'a',"val")); annotations1.push_back(TokenAnnotation(7,10,'a',"val"));
annotations1.push_back(TokenAnnotation(12,14,'a',"val")); annotations1.push_back(TokenAnnotation(12,14,'a',"val"));
as.addAnnotations(annotations1); ts.addAnnotations(annotations1);
/* annotation /* annotation
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
- ---- ------- ----- - ---- ------- -----
@ -43,7 +43,7 @@ BOOST_AUTO_TEST_CASE( AnnotationsIntersecting )
annotations2.push_back(TokenAnnotation(4,7,'a',"val")); annotations2.push_back(TokenAnnotation(4,7,'a',"val"));
annotations2.push_back(TokenAnnotation(10,11,'a',"val")); annotations2.push_back(TokenAnnotation(10,11,'a',"val"));
annotations2.push_back(TokenAnnotation(11,13,'a',"val")); annotations2.push_back(TokenAnnotation(11,13,'a',"val"));
as.addAnnotations(annotations2); ts.addAnnotations(annotations2);
/* annotations2 /* annotations2
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
------- ------- -- ----- ------- ------- -- -----
@ -53,8 +53,8 @@ BOOST_AUTO_TEST_CASE( AnnotationsIntersecting )
- ------- ---- ------- -- ----- - ------- ---- ------- -- -----
*/ */
BOOST_CHECK_EQUAL(as.getAnnotations().size(), 6); BOOST_CHECK_EQUAL(ts.getAnnotations().size(), 6);
std::list<TokenAnnotation> annotations = as.getAnnotations(); std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin(); std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(iter->getStart(),0); BOOST_CHECK_EQUAL(iter->getStart(),0);

View File

@ -13,3 +13,7 @@ TokenAnnotation::TokenAnnotation(const SUFFIX_MARKER_TYPE start,
TokenAnnotation::~TokenAnnotation() { TokenAnnotation::~TokenAnnotation() {
} }
// Definitions of the static annotation-type codes (declared in the class
// header). Each TokenAnnotation carries one of these values as its type.
char TokenAnnotation::NE_TYPE = 0;         // named entity (regex-matched)
char TokenAnnotation::WORD_TYPE = 1;       // ordinary word token
char TokenAnnotation::HTML_TAG_TYPE = 2;   // html tag
char TokenAnnotation::STOP_WORD_TYPE = 3;  // stop word

View File

@ -44,6 +44,14 @@ public:
return _value; return _value;
} }
static char NE_TYPE;
static char WORD_TYPE;
static char HTML_TAG_TYPE;
static char STOP_WORD_TYPE;
protected: protected:
char _annotationType; char _annotationType;

View File

@ -1,16 +1,16 @@
#include "concordia/anonymized_sentence.hpp" #include "concordia/tokenized_sentence.hpp"
#include "concordia/common/text_utils.hpp" #include "concordia/common/text_utils.hpp"
#include <iostream> #include <iostream>
AnonymizedSentence::AnonymizedSentence(std::string sentence): TokenizedSentence::TokenizedSentence(std::string sentence):
_sentence(sentence) { _sentence(sentence) {
} }
AnonymizedSentence::~AnonymizedSentence() { TokenizedSentence::~TokenizedSentence() {
} }
void AnonymizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) { void TokenizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) {
std::vector<TokenAnnotation>::iterator newAnnotation = annotations.begin(); std::vector<TokenAnnotation>::iterator newAnnotation = annotations.begin();
std::list<TokenAnnotation>::iterator existingAnnotation = _tokenAnnotations.begin(); std::list<TokenAnnotation>::iterator existingAnnotation = _tokenAnnotations.begin();
@ -43,6 +43,6 @@ void AnonymizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations
} }
void AnonymizedSentence::toLowerCase() { void TokenizedSentence::toLowerCase() {
_sentence = TextUtils::getInstance().toLowerCase(_sentence); _sentence = TextUtils::getInstance().toLowerCase(_sentence);
} }

View File

@ -1,5 +1,5 @@
#ifndef ANONYMIZED_SENTENCE_HDR #ifndef TOKENIZED_SENTENCE_HDR
#define ANONYMIZED_SENTENCE_HDR #define TOKENIZED_SENTENCE_HDR
#include "concordia/common/config.hpp" #include "concordia/common/config.hpp"
#include "concordia/token_annotation.hpp" #include "concordia/token_annotation.hpp"
@ -13,17 +13,17 @@
along with the annotations list. along with the annotations list.
*/ */
class AnonymizedSentence { class TokenizedSentence {
public: public:
/*! /*!
Constructor. Constructor.
*/ */
AnonymizedSentence(std::string sentence); TokenizedSentence(std::string sentence);
/*! Destructor. /*! Destructor.
*/ */
virtual ~AnonymizedSentence(); virtual ~TokenizedSentence();
/*! Getter for sentence /*! Getter for sentence
\returns sentence \returns sentence

View File

@ -207,9 +207,8 @@ markers_path = "<CONCORDIA_HOME>/tests/resources/temp/temp_markers.bin"
word_map_path = "<CONCORDIA_HOME>/tests/resources/temp/temp_word_map.bin" word_map_path = "<CONCORDIA_HOME>/tests/resources/temp/temp_word_map.bin"
#------------------------------------------------------------------------------- #-------------------------------------------------------------------------------
# The following settings control the sentence anonymizer mechanism. It is used to # The following settings control the sentence tokenizer mechanism. Tokenizer
# remove unnecessary symbols and possibly words from sentences added to index # takes into account html tags, substitutes predefined symbols
# and search patterns. Anonymizer removes html tags, substitutes predefined symbols
# with a single space, removes stop words (if the option is enabled), as well as # with a single space, removes stop words (if the option is enabled), as well as
# named entities and special symbols. All these have to be listed in files. # named entities and special symbols. All these have to be listed in files.

View File

@ -1,6 +0,0 @@
\|
\
\-
\/
;
:

View File

@ -1,37 +0,0 @@
\\tab
\\emdash
\&lt;
\&gt;
\&amp;
\&quot;
\&dash;
\&nbsp;
<
>
=
\+
\"
\.
\,
\?
!
'
\(
\)
\{
\}
\@
\#
\$
\%
\^
\&
\*
\[
\]
\\
\~
&#\d+

View File

@ -22,21 +22,15 @@ word_map_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
# named entities and special symbols. All these have to be listed in files. # named entities and special symbols. All these have to be listed in files.
# File containing all html tags (one per line) # File containing all html tags (one per line)
html_tags_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt" html_tags_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/html_tags.txt"
# File containing all symbols to be replaced by spaces
space_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt"
# If set to true, words from predefined list are removed # If set to true, words from predefined list are removed
stop_words_enabled = "@STOP_WORDS_ENABLED@" stop_words_enabled = "@STOP_WORDS_ENABLED@"
# If stop_words_enabled is true, set the path to the stop words file # If stop_words_enabled is true, set the path to the stop words file
#stop_words_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt" #stop_words_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/stop_words.txt"
# File containing regular expressions that match named entities # File containing regular expressions that match named entities
named_entities_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt" named_entities_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/named_entities.txt"
# File containing special symbols (one per line) to be removed
stop_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_symbols.txt"
### eof ### eof

View File

@ -1,6 +0,0 @@
\|
\
\-
\/
;
:

View File

@ -1,37 +0,0 @@
\\tab
\\emdash
\&lt;
\&gt;
\&amp;
\&quot;
\&dash;
\&nbsp;
<
>
=
\+
\"
\.
\,
\?
!
'
\(
\)
\{
\}
\@
\#
\$
\%
\^
\&
\*
\[
\]
\\
\~
&#\d+

View File

@ -14,15 +14,11 @@ markers_path = "/tmp/ma.bin"
html_tags_path = "/tmp/html_tags.txt" html_tags_path = "/tmp/html_tags.txt"
space_symbols_path = "/tmp/space_symbols.txt"
stop_words_enabled = "true" stop_words_enabled = "true"
stop_words_path = "/tmp/stop_words.txt" stop_words_path = "/tmp/stop_words.txt"
named_entities_path = "/tmp/named_entities.txt" named_entities_path = "/tmp/named_entities.txt"
stop_symbols_path = "/tmp/stop_symbols.txt"
### eof ### eof

View File

@ -22,21 +22,15 @@ word_map_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
# named entities and special symbols. All these have to be listed in files. # named entities and special symbols. All these have to be listed in files.
# File containing all html tags (one per line) # File containing all html tags (one per line)
html_tags_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt" html_tags_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/html_tags.txt"
# File containing all symbols to be replaced by spaces
space_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt"
# If set to true, words from predefined list are removed # If set to true, words from predefined list are removed
stop_words_enabled = "@STOP_WORDS_ENABLED@" stop_words_enabled = "@STOP_WORDS_ENABLED@"
# If stop_words_enabled is true, set the path to the stop words file # If stop_words_enabled is true, set the path to the stop words file
#stop_words_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt" #stop_words_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/stop_words.txt"
# File containing regular expressions that match named entities # File containing regular expressions that match named entities
named_entities_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt" named_entities_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/named_entities.txt"
# File containing special symbols (one per line) to be removed
stop_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_symbols.txt"
### eof ### eof