tokenizer in progress
parent 0baf3e4ef2
commit 8432dd321f

TODO.txt (4 changed lines)
@@ -1,6 +1,7 @@
---------------------------- Developer's private notes (language may vary, as it is sometimes simply more convenient) -----------------------------

- implement tokenAnnotations vector as interval tree
- work on word regex pattern (allow for some symbols and digits within the word)
- document the code (classes, cfg files) and update tutorial
IN PROGRESS - concordia search returns token positions from the hash. How do we map these back to the examples in the corpus? The original token positions need to be stored in the concordia-server database. These positions will be computed by the generateTokenVector function (using a list of original positions, modified synchronously with every anonymization step) - a sketch of this idea follows these notes.
- concordia_search_result does not need to store the whole tokenVector (it only uses it to read its size()).
- multiple translation memories: they can be stored in a single index, but tm_id has to be added as sentence metadata (e.g. in place of example length). At search time, results have to be filtered so that they come from the right translation memory.
@@ -11,6 +12,7 @@ IN PROGRESS - concordia search returns token positions from the hash.

---------------------------- Archive -----------------------------
DONE - implement tokenAnnotations vector as interval tree (not interval tree, but list, which is even better)
DONE (IT IS GOOD AS IT IS) - mess with gcc performance optimization options (https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html)
DONE - concordia-server (consider whether to write a CAT tool, and whether to split concordia-server off into a separate project).
DONE - document the code
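As a side note to the IN PROGRESS item: the synchronous position-tracking idea can be prototyped independently of Concordia. The sketch below is hypothetical (the class name and API are not part of this commit); it keeps a working-offset-to-original-offset map that is updated on every replacement, so token intervals computed on the processed sentence can be projected back onto the raw corpus text.

#include <string>
#include <vector>
#include <utility>

// Hypothetical helper: keep offsets into the ORIGINAL sentence while
// the working copy is rewritten by anonymization rules.
class OriginalPositionTracker {
public:
    explicit OriginalPositionTracker(const std::string & sentence) {
        // Initially every working offset maps to itself (n+1 entries,
        // so end-exclusive interval boundaries stay addressable).
        for (size_t i = 0; i <= sentence.size(); ++i) {
            _toOriginal.push_back(i);
        }
    }

    // Record that working[begin, end) was replaced by newLength characters.
    void recordReplacement(size_t begin, size_t end, size_t newLength) {
        std::vector<size_t> updated(_toOriginal.begin(), _toOriginal.begin() + begin);
        // All replacement characters map to the start of the replaced span.
        for (size_t i = 0; i < newLength; ++i) updated.push_back(_toOriginal[begin]);
        updated.insert(updated.end(), _toOriginal.begin() + end, _toOriginal.end());
        _toOriginal.swap(updated);
    }

    // Map a token interval in the working sentence back to the original.
    std::pair<size_t, size_t> toOriginal(size_t begin, size_t end) const {
        return std::make_pair(_toOriginal[begin], _toOriginal[end]);
    }

private:
    std::vector<size_t> _toOriginal;  // working offset -> original offset
};

// Usage: "Date: 12.04.2012" -> "Date: ne_date"
//   OriginalPositionTracker t("Date: 12.04.2012");
//   t.recordReplacement(6, 16, 7);   // "12.04.2012" (10 chars) -> "ne_date" (7)
//   t.toOriginal(6, 13);             // yields (6, 16), the original span

Storing such projected intervals alongside each example in the concordia-server database would then let search results point at the original, unprocessed text.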
@@ -7,13 +7,13 @@ endforeach(dir)

add_library(concordia SHARED
    token_annotation.cpp
-    anonymized_sentence.cpp
+    tokenized_sentence.cpp
    hashed_sentence.cpp
    concordia_search_result.cpp
    matched_pattern_fragment.cpp
    concordia_searcher.cpp
    regex_rule.cpp
-    sentence_anonymizer.cpp
+    sentence_tokenizer.cpp
    interval.cpp
    tm_matches.cpp
    anubis_search_result.cpp
@@ -37,13 +37,13 @@ add_subdirectory(t)

install(TARGETS concordia DESTINATION lib/)
install(FILES
    token_annotation.hpp
-    anonymized_sentence.hpp
+    tokenized_sentence.hpp
    hashed_sentence.hpp
    concordia_search_result.hpp
    matched_pattern_fragment.hpp
    concordia_searcher.hpp
    regex_rule.hpp
-    sentence_anonymizer.hpp
+    sentence_tokenizer.hpp
    interval.hpp
    tm_matches.hpp
    anubis_search_result.hpp
@@ -9,11 +9,9 @@
#define MARKERS_PARAM "markers_path"
#define SUFFIX_ARRAY_PARAM "suffix_array_path"
#define HTML_TAGS_PARAM "html_tags_path"
-#define SPACE_SYMBOLS_PARAM "space_symbols_path"
#define STOP_WORDS_ENABLED_PARAM "stop_words_enabled"
#define STOP_WORDS_PARAM "stop_words_path"
#define NAMED_ENTITIES_PARAM "named_entities_path"
-#define STOP_SYMBOLS_PARAM "stop_symbols_path"
#define ANUBIS_THRESHOLD_PARAM "anubis_threshold"

ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
@@ -35,8 +33,6 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
        ConcordiaConfig::_readConfigParameterStr(MARKERS_PARAM);
    _htmlTagsFilePath =
        ConcordiaConfig::_readConfigParameterStr(HTML_TAGS_PARAM);
-    _spaceSymbolsFilePath =
-        ConcordiaConfig::_readConfigParameterStr(SPACE_SYMBOLS_PARAM);
    _stopWordsEnabled =
        ConcordiaConfig::_readConfigParameterStr(
            STOP_WORDS_ENABLED_PARAM) != "false";
@@ -44,8 +40,6 @@ ConcordiaConfig::ConcordiaConfig(const std::string & configFilePath)
        ConcordiaConfig::_readConfigParameterStr(STOP_WORDS_PARAM, "");
    _namedEntitiesFilePath =
        ConcordiaConfig::_readConfigParameterStr(NAMED_ENTITIES_PARAM);
-    _stopSymbolsFilePath =
-        ConcordiaConfig::_readConfigParameterStr(STOP_SYMBOLS_PARAM);
    _anubisThreshold = atof(ConcordiaConfig::_readConfigParameterStr(
        ANUBIS_THRESHOLD_PARAM,
        "0.3").c_str());
@@ -56,14 +56,6 @@ public:
        return _htmlTagsFilePath;
    }

-    /*! Getter for space symbols file path.
-    For more information see \ref tutorial3.
-    \returns space symbols file path
-    */
-    std::string & getSpaceSymbolsFilePath() {
-        return _spaceSymbolsFilePath;
-    }
-
    /*! Getter for stop symbols enabled parameter.
    For more information see \ref tutorial3.
    \returns true if stop words are enabled
@@ -88,14 +80,6 @@ public:
        return _namedEntitiesFilePath;
    }

-    /*! Getter for stop symbols file path.
-    For more information see \ref tutorial3.
-    \returns stop symbols file path
-    */
-    std::string & getStopSymbolsFilePath() {
-        return _stopSymbolsFilePath;
-    }
-
    /*! Getter for anubis threshold. Anubis search results with
    scores below that threshold will be discarded.
    \returns anubis threshold
@@ -115,16 +99,12 @@ private:

    std::string _htmlTagsFilePath;

-    std::string _spaceSymbolsFilePath;
-
    bool _stopWordsEnabled;

    std::string _stopWordsFilePath;

    std::string _namedEntitiesFilePath;

-    std::string _stopSymbolsFilePath;
-
    double _anubisThreshold;

    std::string _readConfigParameterStr(const std::string & name)
@@ -12,8 +12,8 @@ HashGenerator::HashGenerator(boost::shared_ptr<ConcordiaConfig> config)
    throw(ConcordiaException) :
    _wordMapFilePath(config->getWordMapFilePath()),
    _wordMap(boost::shared_ptr<WordMap>(new WordMap)),
-    _sentenceAnonymizer(boost::shared_ptr<SentenceAnonymizer>(
-        new SentenceAnonymizer(config))) {
+    _sentenceTokenizer(boost::shared_ptr<SentenceTokenizer>(
+        new SentenceTokenizer(config))) {
    if (boost::filesystem::exists(_wordMapFilePath)) {
        std::ifstream ifs(_wordMapFilePath.c_str(), std::ios::binary);
        boost::archive::binary_iarchive ia(ifs);
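The constructor above restores a serialized WordMap with Boost.Serialization's binary archive. For reference, the save/load idiom in isolation looks like this (Counter is a hypothetical stand-in type, not part of the library):

#include <fstream>
#include <map>
#include <string>
#include <boost/archive/binary_iarchive.hpp>
#include <boost/archive/binary_oarchive.hpp>
#include <boost/serialization/map.hpp>
#include <boost/serialization/string.hpp>

// Hypothetical stand-in for WordMap: a string -> int map with serialization.
struct Counter {
    std::map<std::string, int> counts;
    template <class Archive>
    void serialize(Archive & ar, const unsigned int /*version*/) {
        ar & counts;
    }
};

int main() {
    const char * path = "/tmp/counter.bin";
    {   // save
        Counter c;
        c.counts["word"] = 1;
        std::ofstream ofs(path, std::ios::binary);
        boost::archive::binary_oarchive oa(ofs);
        oa << c;
    }
    {   // load -- mirrors the HashGenerator constructor's pattern
        Counter c;
        std::ifstream ifs(path, std::ios::binary);
        boost::archive::binary_iarchive ia(ifs);
        ia >> c;
    }
    return 0;
}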
@@ -44,11 +44,11 @@ std::vector<INDEX_CHARACTER_TYPE> HashGenerator::generateHash(

std::vector<std::string> HashGenerator::generateTokenVector(
    const std::string & sentence) {
-    boost::shared_ptr<AnonymizedSentence> as = _sentenceAnonymizer->anonymize(sentence);
-    std::string anonymizedSentence = as->getSentence();
-    boost::trim(anonymizedSentence);
+    boost::shared_ptr<TokenizedSentence> ts = _sentenceTokenizer->tokenize(sentence);
+    std::string tokenizedSentence = ts->getSentence();
+    boost::trim(tokenizedSentence);
    std::vector<std::string> tokenTexts;
-    boost::split(tokenTexts, anonymizedSentence, boost::is_any_of(" \t\r\n"),
+    boost::split(tokenTexts, tokenizedSentence, boost::is_any_of(" \t\r\n"),
        boost::algorithm::token_compress_on);
    return tokenTexts;
}
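After tokenization, generateTokenVector trims the sentence and splits it on whitespace. The Boost.StringAlgo splitting idiom on its own (a self-contained sketch, no Concordia types involved):

#include <iostream>
#include <string>
#include <vector>
#include <boost/algorithm/string.hpp>

int main() {
    std::string sentence = "  date ne_date  mail\tne_email \n";
    boost::trim(sentence);  // strip leading/trailing whitespace first
    std::vector<std::string> tokens;
    // token_compress_on collapses runs of separators into one split point,
    // so consecutive spaces do not produce empty tokens
    boost::split(tokens, sentence, boost::is_any_of(" \t\r\n"),
                 boost::algorithm::token_compress_on);
    for (size_t i = 0; i < tokens.size(); ++i) {
        std::cout << tokens[i] << "\n";  // date, ne_date, mail, ne_email
    }
    return 0;
}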
@@ -8,7 +8,7 @@
#include <boost/algorithm/string/predicate.hpp>
#include "concordia/word_map.hpp"
#include "concordia/common/config.hpp"
-#include "concordia/sentence_anonymizer.hpp"
+#include "concordia/sentence_tokenizer.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_exception.hpp"

@@ -71,7 +71,7 @@ public:
private:
    boost::shared_ptr<WordMap> _wordMap;

-    boost::shared_ptr<SentenceAnonymizer> _sentenceAnonymizer;
+    boost::shared_ptr<SentenceTokenizer> _sentenceTokenizer;

    std::string _wordMapFilePath;
};
@@ -48,7 +48,7 @@ public:
    /*! Method for adding an original word position to the list.
    \param originalWordPosition original word position
    */
-    void addWordOriginalWordPosition(Interval & originalWordPosition) {
+    void addOriginalWordPosition(Interval & originalWordPosition) {
        _originalWordPositions.push_back(originalWordPosition);
    }
@@ -5,9 +5,11 @@
#include <boost/throw_exception.hpp>

RegexRule::RegexRule(std::string patternString,
+                    char annotationType,
                     std::string value,
                     bool caseSensitive)
    throw(ConcordiaException):
+    _annotationType(annotationType),
    _value(value) {
    try {
        if (caseSensitive) {
@@ -32,7 +34,7 @@ RegexRule::RegexRule(std::string patternString,
RegexRule::~RegexRule() {
}

-void RegexRule::apply(boost::shared_ptr<AnonymizedSentence> sentence) {
+void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
    try {
        UnicodeString s(sentence->getSentence().c_str());
        boost::u32regex_iterator<const UChar*> begin(boost::make_u32regex_iterator(s, _pattern));
@@ -41,14 +43,14 @@ void RegexRule::apply(boost::shared_ptr<AnonymizedSentence> sentence) {
        for (; begin != end; ++begin) {
            SUFFIX_MARKER_TYPE matchBegin = begin->position();
            SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length();
-            TokenAnnotation annotation(matchBegin, matchEnd, 'a', _value);
+            TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, "");
            annotations.push_back(annotation);
        }
        sentence->addAnnotations(annotations);
    } catch(const std::exception & e) {
        std::stringstream ss;
        ss << "Exception while applying regex rule: "
-            << _value << " to text: " << sentence->getSentence();
+            << _annotationType << " to text: " << sentence->getSentence();
        ss << ", message: " << e.what();
        throw ConcordiaException(ss.str());
    }
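RegexRule::apply walks the ICU-aware matches of _pattern and records each (position, length) pair as a TokenAnnotation. A minimal standalone sketch of the same iteration idiom (it assumes Boost.Regex was built with ICU support; the printed offsets are UTF-16 code units, matching what apply stores):

#include <iostream>
#include <boost/regex/icu.hpp>
#include <unicode/unistr.h>

int main() {
    // The word rule added in this commit uses the same pattern: "\\w+".
    boost::u32regex pattern = boost::make_u32regex("\\w+");
    icu::UnicodeString s("concordia tokenizer test");
    boost::u32regex_iterator<const UChar*> it(
        boost::make_u32regex_iterator(s, pattern));
    boost::u32regex_iterator<const UChar*> end;
    for (; it != end; ++it) {
        // position() / length() are the offsets RegexRule::apply
        // passes to the TokenAnnotation constructor.
        std::cout << "match at " << it->position()
                  << ", length " << it->length() << std::endl;
    }
    return 0;
}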
@@ -3,7 +3,7 @@

#include <string>
#include "concordia/common/config.hpp"
-#include "concordia/anonymized_sentence.hpp"
+#include "concordia/tokenized_sentence.hpp"
#include "concordia/concordia_exception.hpp"
#include <boost/shared_ptr.hpp>
#include <boost/regex.hpp>
@@ -24,10 +24,12 @@ public:
    /*!
    Constructor.
    \param patternString regex pattern to match
-    \param replacement string to substitute the found match
+    \param annotationType type of annotation
    \param caseSensitive case sensitivity of the pattern
    */
-    RegexRule(std::string patternString, std::string value,
+    RegexRule(std::string patternString,
+              char annotationType,
+              std::string value,
              bool caseSensitive = true)
        throw(ConcordiaException);

@@ -38,12 +40,14 @@ public:
    /*! Applies the operation on anonymized sentence.
    \param sentence the input sentence
    */
-    void apply(boost::shared_ptr<AnonymizedSentence> sentence);
+    void apply(boost::shared_ptr<TokenizedSentence> sentence);

private:
-    boost::u32regex _pattern;
+    char _annotationType;

    std::string _value;

+    boost::u32regex _pattern;
};

#endif
@@ -1,4 +1,5 @@
-#include "concordia/sentence_anonymizer.hpp"
+#include "concordia/sentence_tokenizer.hpp"
+#include "concordia/token_annotation.hpp"

#include <boost/foreach.hpp>
#include <fstream>
@@ -6,29 +7,27 @@
#include <iostream>
#include <boost/algorithm/string.hpp>

-SentenceAnonymizer::SentenceAnonymizer(
+SentenceTokenizer::SentenceTokenizer(
        boost::shared_ptr<ConcordiaConfig> config)
        throw(ConcordiaException) {
    _createNeRules(config->getNamedEntitiesFilePath());
    _createHtmlTagsRule(config->getHtmlTagsFilePath());
    _stopWordsEnabled = config->isStopWordsEnabled();
    if (_stopWordsEnabled) {
-        _stopWords = _getMultipleReplacementRule(
-            config->getStopWordsFilePath(), "", true);
+        _stopWords = _getMultipleRegexRule(
+            config->getStopWordsFilePath(),
+            TokenAnnotation::STOP_WORD_TYPE,
+            "", true);
    }
-    _stopSymbols = _getMultipleReplacementRule(
-        config->getStopSymbolsFilePath(), "");
-    _spaceSymbols = _getMultipleReplacementRule(
-        config->getSpaceSymbolsFilePath(), " ");
}

-SentenceAnonymizer::~SentenceAnonymizer() {
+SentenceTokenizer::~SentenceTokenizer() {
}

-boost::shared_ptr<AnonymizedSentence>
-    SentenceAnonymizer::anonymize(const std::string & sentence) {
-    boost::shared_ptr<AnonymizedSentence>
-        result(new AnonymizedSentence(sentence));
+boost::shared_ptr<TokenizedSentence>
+    SentenceTokenizer::tokenize(const std::string & sentence) {
+    boost::shared_ptr<TokenizedSentence>
+        result(new TokenizedSentence(sentence));

    _htmlTags->apply(result);

@@ -41,13 +40,14 @@ boost::shared_ptr<AnonymizedSentence>
    if (_stopWordsEnabled) {
        _stopWords->apply(result);
    }
-    _stopSymbols->apply(result);
-    _spaceSymbols->apply(result);
+
+    boost::shared_ptr<RegexRule> wordsRule(
+        new RegexRule("\\w+", TokenAnnotation::WORD_TYPE, "word"));

    return result;
}

-void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) {
+void SentenceTokenizer::_createNeRules(std::string & namedEntitiesPath) {
    if (boost::filesystem::exists(namedEntitiesPath)) {
        std::string line;
        std::ifstream neFile(namedEntitiesPath.c_str());
@@ -66,7 +66,9 @@ void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) {
                throw ConcordiaException(ss.str());
            } else {
                _namedEntities.push_back(RegexRule(
-                    tokenTexts->at(0), tokenTexts->at(1)));
+                    tokenTexts->at(0),
+                    TokenAnnotation::NE_TYPE,
+                    tokenTexts->at(1)));
            }
        }
        neFile.close();
@@ -78,7 +80,7 @@ void SentenceAnonymizer::_createNeRules(std::string & namedEntitiesPath) {
    }
}

-void SentenceAnonymizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
+void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
    std::string tagsExpression = "<\\/?(";
    if (boost::filesystem::exists(htmlTagsPath)) {
        std::string line;
@@ -97,12 +99,15 @@ void SentenceAnonymizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
    tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
    tagsExpression += "br).*?>";
    _htmlTags = boost::shared_ptr<RegexRule>(
-        new RegexRule(tagsExpression, "", false));
+        new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG_TYPE, "", false));
}

boost::shared_ptr<RegexRule>
-    SentenceAnonymizer::_getMultipleReplacementRule(
-        std::string & filePath, std::string replacement, bool wholeWord) {
+    SentenceTokenizer::_getMultipleRegexRule(
+        std::string filePath,
+        char annotationType,
+        std::string value,
+        bool wholeWord) {
    std::string expression = "(";
    if (boost::filesystem::exists(filePath)) {
        std::string line;
@@ -128,6 +133,6 @@ boost::shared_ptr<RegexRule>
    expression = expression.substr(0, expression.size()-1);
    expression += ")";
    return boost::shared_ptr<RegexRule>(
-        new RegexRule(expression, replacement, false));
+        new RegexRule(expression, annotationType, value, false));
}
@@ -1,10 +1,10 @@
-#ifndef SENTENCE_ANONYMIZER_HDR
-#define SENTENCE_ANONYMIZER_HDR
+#ifndef SENTENCE_TOKENIZER_HDR
+#define SENTENCE_TOKENIZER_HDR

#include <string>
#include <vector>
#include "concordia/common/config.hpp"
-#include "concordia/anonymized_sentence.hpp"
+#include "concordia/tokenized_sentence.hpp"
#include "concordia/regex_rule.hpp"
#include "concordia/concordia_config.hpp"
#include "concordia/concordia_exception.hpp"
@@ -13,42 +13,42 @@


/*!
-  Class for anonymizing sentence before generating hash.
+  Class for tokenizing sentence before generating hash.
  This operation is used to
  remove unnecessary symbols and possibly words from sentences added to index
-  and search patterns. Anonymizer removes html tags, substitutes predefined symbols
-  with a single space, removes stop words (if the option is enabled), as well as
-  named entities and special symbols. All these have to be listed in files
+  and search patterns. Tokenizer annotates html tags, removes stop words (if the option is enabled),
+  as well as annotates named entities and special symbols. All these have to be listed in files
  (see \ref tutorial3).
*/

-class SentenceAnonymizer {
+class SentenceTokenizer {
public:
    /*! Constructor.
    \param config config object, holding paths to necessary files
    */
-    explicit SentenceAnonymizer(boost::shared_ptr<ConcordiaConfig> config)
+    explicit SentenceTokenizer(boost::shared_ptr<ConcordiaConfig> config)
        throw(ConcordiaException);

    /*! Destructor.
    */
-    virtual ~SentenceAnonymizer();
+    virtual ~SentenceTokenizer();

-    /*! Anonymizes the sentence.
+    /*! Tokenizes the sentence.
    \param sentence input sentence
    \returns altered version of the input sentence
    */
-    boost::shared_ptr<AnonymizedSentence>
-        anonymize(const std::string & sentence);
+    boost::shared_ptr<TokenizedSentence>
+        tokenize(const std::string & sentence);

private:
    void _createNeRules(std::string & namedEntitiesPath);

    void _createHtmlTagsRule(std::string & htmlTagsPath);

-    boost::shared_ptr<RegexRule> _getMultipleReplacementRule(
-        std::string & filePath,
-        std::string replacement,
+    boost::shared_ptr<RegexRule> _getMultipleRegexRule(
+        std::string filePath,
+        char annotationType,
+        std::string value,
        bool wholeWord = false);

    std::vector<RegexRule> _namedEntities;
@@ -59,9 +59,6 @@ private:

    boost::shared_ptr<RegexRule> _stopWords;

-    boost::shared_ptr<RegexRule> _stopSymbols;
-
-    boost::shared_ptr<RegexRule> _spaceSymbols;
};

#endif
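Putting the renamed pieces together, typical usage of the new API mirrors the unit tests further down; a minimal sketch (assuming a concordia.cfg whose paths point at the tokenizer resource files):

#include <iostream>
#include <list>
#include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp>
#include "concordia/concordia_config.hpp"
#include "concordia/sentence_tokenizer.hpp"
#include "concordia/tokenized_sentence.hpp"

int main() {
    // Assumes a valid config file with tokenizer resource paths.
    boost::shared_ptr<ConcordiaConfig> config(
        new ConcordiaConfig("concordia.cfg"));
    SentenceTokenizer tokenizer(config);

    boost::shared_ptr<TokenizedSentence> ts =
        tokenizer.tokenize("Date: 12.04.2012, mail: test@example.com");
    std::cout << ts->getSentence() << std::endl;

    // Each annotation records the interval, type and value of one token.
    std::list<TokenAnnotation> annotations = ts->getAnnotations();
    BOOST_FOREACH(TokenAnnotation a, annotations) {
        std::cout << a.getStart() << "-" << a.getEnd()
                  << " type " << a.getType()
                  << " value " << a.getValue() << std::endl;
    }
    return 0;
}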
@@ -1,8 +1,8 @@
add_library(concordia-tests
    test_regex_rule.cpp
-    test_anonymized_sentence.cpp
+    test_tokenized_sentence.cpp
    test_concordia_searcher.cpp
-    test_sentence_anonymizer.cpp
+    test_sentence_tokenizer.cpp
    test_text_utils.cpp
    test_example.cpp
    test_tm_matches.cpp
@@ -16,10 +16,8 @@ BOOST_AUTO_TEST_CASE( ConfigParameters )
    BOOST_CHECK_EQUAL( config.getHashedIndexFilePath() , "/tmp/hi.bin" );
    BOOST_CHECK_EQUAL( config.getMarkersFilePath() , "/tmp/ma.bin" );
    BOOST_CHECK_EQUAL( config.getHtmlTagsFilePath() , "/tmp/html_tags.txt" );
-    BOOST_CHECK_EQUAL( config.getSpaceSymbolsFilePath() , "/tmp/space_symbols.txt" );
    BOOST_CHECK_EQUAL( config.getStopWordsFilePath() , "/tmp/stop_words.txt" );
    BOOST_CHECK_EQUAL( config.getNamedEntitiesFilePath() , "/tmp/named_entities.txt" );
-    BOOST_CHECK_EQUAL( config.getStopSymbolsFilePath() , "/tmp/stop_symbols.txt" );
}

BOOST_AUTO_TEST_CASE( NonexistentConfigTest )
@@ -1,6 +1,7 @@
#include "tests/unit-tests/unit_tests_globals.hpp"
#include "concordia/regex_rule.hpp"
-#include "concordia/anonymized_sentence.hpp"
+#include "concordia/tokenized_sentence.hpp"
+#include "concordia/token_annotation.hpp"
#include "concordia/common/config.hpp"
#include <boost/shared_ptr.hpp>
#include <boost/algorithm/string/predicate.hpp>
@@ -11,11 +12,11 @@ BOOST_AUTO_TEST_SUITE(regex_rule)

BOOST_AUTO_TEST_CASE( SimpleReplacement )
{
-    RegexRule rr("a","b");
-    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("xxxxxxxaxxxaxxaxaxa"));
-    rr.apply(as);
-    BOOST_CHECK_EQUAL(as->getAnnotations().size(),5);
-    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    RegexRule rr("a", TokenAnnotation::WORD_TYPE, "b");
+    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("xxxxxxxaxxxaxxaxaxa"));
+    rr.apply(ts);
+    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
+    std::list<TokenAnnotation> annotations = ts->getAnnotations();
    std::list<TokenAnnotation>::iterator iter = annotations.begin();

    BOOST_CHECK_EQUAL(iter->getStart(),7);
@@ -43,7 +44,7 @@ BOOST_AUTO_TEST_CASE( BadRegex )
    bool exceptionThrown = false;
    std::string message = "";
    try {
-        RegexRule rr("+a","b");
+        RegexRule rr("+a",TokenAnnotation::WORD_TYPE, "b");
    } catch (ConcordiaException & e) {
        exceptionThrown = true;
        message = e.what();
@@ -54,11 +55,11 @@ BOOST_AUTO_TEST_CASE( BadRegex )

BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
{
-    RegexRule rr("['\"\\\\.]","");
-    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("Don't stop believin' \\ Hold on to the feelin'."));
-    rr.apply(as);
-    BOOST_CHECK_EQUAL(as->getAnnotations().size(),5);
-    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    RegexRule rr("['\"\\\\.]", TokenAnnotation::WORD_TYPE, "");
+    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("Don't stop believin' \\ Hold on to the feelin'."));
+    rr.apply(ts);
+    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
+    std::list<TokenAnnotation> annotations = ts->getAnnotations();
    std::list<TokenAnnotation>::iterator iter = annotations.begin();

    BOOST_CHECK_EQUAL(iter->getStart(),3);
@@ -84,11 +85,11 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )

BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
{
-    RegexRule rr("abc","xxx", false);
-    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("This is AbC and ABC and abc and aBC."));
-    rr.apply(as);
-    BOOST_CHECK_EQUAL(as->getAnnotations().size(),4);
-    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    RegexRule rr("abc", TokenAnnotation::WORD_TYPE, "xxx", false);
+    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("This is AbC and ABC and abc and aBC."));
+    rr.apply(ts);
+    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),4);
+    std::list<TokenAnnotation> annotations = ts->getAnnotations();
    std::list<TokenAnnotation>::iterator iter = annotations.begin();

    BOOST_CHECK_EQUAL(iter->getStart(),8);
@@ -109,11 +110,11 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )

BOOST_AUTO_TEST_CASE( UnicodeReplacement )
{
-    RegexRule rr("ą","x");
-    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń"));
-    rr.apply(as);
-    BOOST_CHECK_EQUAL(as->getAnnotations().size(),1);
-    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x");
+    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń"));
+    rr.apply(ts);
+    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),1);
+    std::list<TokenAnnotation> annotations = ts->getAnnotations();
    std::list<TokenAnnotation>::iterator iter = annotations.begin();

    BOOST_CHECK_EQUAL(iter->getStart(),11);
@@ -122,11 +123,11 @@ BOOST_AUTO_TEST_CASE( UnicodeReplacement )

BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
{
-    RegexRule rr("ą","x", false);
-    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
-    rr.apply(as);
-    BOOST_CHECK_EQUAL(as->getAnnotations().size(),2);
-    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x", false);
+    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
+    rr.apply(ts);
+    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),2);
+    std::list<TokenAnnotation> annotations = ts->getAnnotations();
    std::list<TokenAnnotation>::iterator iter = annotations.begin();

    BOOST_CHECK_EQUAL(iter->getStart(),11);
@@ -139,11 +140,11 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )

BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
{
-    RegexRule rr("[ąćęłńóśżź]","x", false);
-    boost::shared_ptr<AnonymizedSentence> as(new AnonymizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
-    rr.apply(as);
-    BOOST_CHECK_EQUAL(as->getAnnotations().size(),18);
-    std::list<TokenAnnotation> annotations = as->getAnnotations();
+    RegexRule rr("[ąćęłńóśżź]", TokenAnnotation::WORD_TYPE, "x", false);
+    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
+    rr.apply(ts);
+    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),18);
+    std::list<TokenAnnotation> annotations = ts->getAnnotations();
    std::list<TokenAnnotation>::iterator iter = annotations.begin();

    BOOST_CHECK_EQUAL(iter->getStart(),2);
@@ -1,76 +0,0 @@
-#include <boost/filesystem.hpp>
-#include "tests/unit-tests/unit_tests_globals.hpp"
-#include <string>
-#include <sstream>
-
-#include <boost/shared_ptr.hpp>
-#include "concordia/common/config.hpp"
-#include "concordia/sentence_anonymizer.hpp"
-#include "tests/common/test_resources_manager.hpp"
-
-BOOST_AUTO_TEST_SUITE(sentence_anonymizer)
-
-BOOST_AUTO_TEST_CASE( NETest )
-{
-    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
-    SentenceAnonymizer anonymizer(config);
-
-    std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34";
-    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"date ne_date mail ne_email number ne_number");
-}
-
-BOOST_AUTO_TEST_CASE( HtmlTagsTest )
-{
-    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
-    SentenceAnonymizer anonymizer(config);
-
-    std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
-    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"link and bold and newline ");
-}
-
-BOOST_AUTO_TEST_CASE( StopWordsTest )
-{
-    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
-    if (config->isStopWordsEnabled()) {
-        SentenceAnonymizer anonymizer(config);
-        std::string sentence = "Aczkolwiek nie wiem, czy to konieczne";
-        BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence()," wiem konieczne");
-    }
-}
-
-BOOST_AUTO_TEST_CASE( StopSymbolsTest )
-{
-    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
-    SentenceAnonymizer anonymizer(config);
-
-    std::string sentence = "xxx, . xxx # xx $xx@ xx";
-    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx xxx xx xx xx");
-}
-
-BOOST_AUTO_TEST_CASE( SpaceSymbolsTest )
-{
-    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
-    SentenceAnonymizer anonymizer(config);
-
-    std::string sentence = "xxx-xxx xx|xx";
-    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"xxx xxx xx xx");
-}
-
-BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
-{
-    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
-    SentenceAnonymizer anonymizer(config);
-
-    std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
-    BOOST_CHECK_EQUAL(anonymizer.anonymize(sentence)->getSentence(),"sony dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap rm mne_numbergrm mne_numbere — ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number ");
-}
-
-BOOST_AUTO_TEST_SUITE_END()
concordia/t/test_sentence_tokenizer.cpp (new file, 89 lines)
@@ -0,0 +1,89 @@
+#include <boost/filesystem.hpp>
+#include "tests/unit-tests/unit_tests_globals.hpp"
+#include <string>
+#include <sstream>
+#include <iostream>
+
+#include <boost/shared_ptr.hpp>
+#include <boost/foreach.hpp>
+#include "concordia/common/config.hpp"
+#include "concordia/sentence_tokenizer.hpp"
+#include "concordia/tokenized_sentence.hpp"
+#include "tests/common/test_resources_manager.hpp"
+
+BOOST_AUTO_TEST_SUITE(sentence_tokenizer)
+
+BOOST_AUTO_TEST_CASE( NETest )
+{
+    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
+    SentenceTokenizer tokenizer(config);
+
+    std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34";
+    boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
+    std::list<TokenAnnotation> annotations = ts->getAnnotations();
+
+    BOOST_CHECK_EQUAL(8,annotations.size());
+    BOOST_FOREACH(TokenAnnotation annotation, annotations) {
+        std::cout << annotation.getStart() << ","
+                  << annotation.getEnd() << " type: "
+                  << annotation.getType() << " value: "
+                  << annotation.getValue() << std::endl;
+    }
+    // BOOST_CHECK_EQUAL(,"date ne_date mail ne_email number ne_number");
+}
+
+BOOST_AUTO_TEST_CASE( HtmlTagsTest )
+{
+    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
+    SentenceTokenizer tokenizer(config);
+
+    std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
+    BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"link and bold and newline ");
+}
+
+BOOST_AUTO_TEST_CASE( StopWordsTest )
+{
+    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
+    if (config->isStopWordsEnabled()) {
+        SentenceTokenizer tokenizer(config);
+        std::string sentence = "Aczkolwiek nie wiem, czy to konieczne";
+        BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence()," wiem konieczne");
+    }
+}
+
+BOOST_AUTO_TEST_CASE( StopSymbolsTest )
+{
+    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
+    SentenceTokenizer tokenizer(config);
+
+    std::string sentence = "xxx, . xxx # xx $xx@ xx";
+    BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"xxx xxx xx xx xx");
+}
+
+BOOST_AUTO_TEST_CASE( SpaceSymbolsTest )
+{
+    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
+    SentenceTokenizer tokenizer(config);
+
+    std::string sentence = "xxx-xxx xx|xx";
+    BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"xxx xxx xx xx");
+}
+
+BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
+{
+    boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
+    SentenceTokenizer tokenizer(config);
+
+    std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
+    BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"sony dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap rm mne_numbergrm mne_numbere — ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number ");
+}
+
+BOOST_AUTO_TEST_SUITE_END()
@@ -1,14 +1,14 @@
#include "tests/unit-tests/unit_tests_globals.hpp"
-#include "concordia/anonymized_sentence.hpp"
+#include "concordia/tokenized_sentence.hpp"
#include "concordia/token_annotation.hpp"
#include "concordia/common/config.hpp"
#include <iostream>

-BOOST_AUTO_TEST_SUITE(anonymized_sentence)
+BOOST_AUTO_TEST_SUITE(tokenized_sentence)

BOOST_AUTO_TEST_CASE( AnnotationsTrivial )
{
-    AnonymizedSentence as("This is a test sentence");
+    TokenizedSentence ts("This is a test sentence");

    std::vector<TokenAnnotation> annotations;
    annotations.push_back(TokenAnnotation(0,1,'a',"val"));
@@ -16,22 +16,22 @@ BOOST_AUTO_TEST_CASE( AnnotationsTrivial )
    annotations.push_back(TokenAnnotation(7,10,'a',"val"));
    annotations.push_back(TokenAnnotation(12,14,'a',"val"));

-    as.addAnnotations(annotations);
+    ts.addAnnotations(annotations);

-    BOOST_CHECK_EQUAL(as.getAnnotations().size(), 4);
+    BOOST_CHECK_EQUAL(ts.getAnnotations().size(), 4);
}

BOOST_AUTO_TEST_CASE( AnnotationsIntersecting )
{
-    AnonymizedSentence as("This is a test sentence");
+    TokenizedSentence ts("This is a test sentence");

    std::vector<TokenAnnotation> annotations1;
    annotations1.push_back(TokenAnnotation(0,1,'a',"val"));
    annotations1.push_back(TokenAnnotation(4,6,'a',"val"));
    annotations1.push_back(TokenAnnotation(7,10,'a',"val"));
    annotations1.push_back(TokenAnnotation(12,14,'a',"val"));
-    as.addAnnotations(annotations1);
+    ts.addAnnotations(annotations1);
    /* annotation
    0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
    - ---- ------- -----
@@ -43,7 +43,7 @@ BOOST_AUTO_TEST_CASE( AnnotationsIntersecting )
    annotations2.push_back(TokenAnnotation(4,7,'a',"val"));
    annotations2.push_back(TokenAnnotation(10,11,'a',"val"));
    annotations2.push_back(TokenAnnotation(11,13,'a',"val"));
-    as.addAnnotations(annotations2);
+    ts.addAnnotations(annotations2);
    /* annotations2
    0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
    ------- ------- -- -----
@@ -53,8 +53,8 @@ BOOST_AUTO_TEST_CASE( AnnotationsIntersecting )
    - ------- ---- ------- -- -----
    */
-    BOOST_CHECK_EQUAL(as.getAnnotations().size(), 6);
-    std::list<TokenAnnotation> annotations = as.getAnnotations();
+    BOOST_CHECK_EQUAL(ts.getAnnotations().size(), 6);
+    std::list<TokenAnnotation> annotations = ts.getAnnotations();
    std::list<TokenAnnotation>::iterator iter = annotations.begin();

    BOOST_CHECK_EQUAL(iter->getStart(),0);
@@ -13,3 +13,7 @@ TokenAnnotation::TokenAnnotation(const SUFFIX_MARKER_TYPE start,
TokenAnnotation::~TokenAnnotation() {
}

+char TokenAnnotation::NE_TYPE = 0;
+char TokenAnnotation::WORD_TYPE = 1;
+char TokenAnnotation::HTML_TAG_TYPE = 2;
+char TokenAnnotation::STOP_WORD_TYPE = 3;
@@ -44,6 +44,14 @@ public:
        return _value;
    }

+    static char NE_TYPE;
+
+    static char WORD_TYPE;
+
+    static char HTML_TAG_TYPE;
+
+    static char STOP_WORD_TYPE;
+
protected:
    char _annotationType;

@@ -1,16 +1,16 @@
-#include "concordia/anonymized_sentence.hpp"
+#include "concordia/tokenized_sentence.hpp"
#include "concordia/common/text_utils.hpp"

#include <iostream>

-AnonymizedSentence::AnonymizedSentence(std::string sentence):
+TokenizedSentence::TokenizedSentence(std::string sentence):
    _sentence(sentence) {
}

-AnonymizedSentence::~AnonymizedSentence() {
+TokenizedSentence::~TokenizedSentence() {
}

-void AnonymizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) {
+void TokenizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations) {
    std::vector<TokenAnnotation>::iterator newAnnotation = annotations.begin();
    std::list<TokenAnnotation>::iterator existingAnnotation = _tokenAnnotations.begin();

@@ -43,6 +43,6 @@ void AnonymizedSentence::addAnnotations(std::vector<TokenAnnotation> annotations

}

-void AnonymizedSentence::toLowerCase() {
+void TokenizedSentence::toLowerCase() {
    _sentence = TextUtils::getInstance().toLowerCase(_sentence);
}
@@ -1,5 +1,5 @@
-#ifndef ANONYMIZED_SENTENCE_HDR
-#define ANONYMIZED_SENTENCE_HDR
+#ifndef TOKENIZED_SENTENCE_HDR
+#define TOKENIZED_SENTENCE_HDR

#include "concordia/common/config.hpp"
#include "concordia/token_annotation.hpp"
@@ -13,17 +13,17 @@
    along with the annotations list.
*/

-class AnonymizedSentence {
+class TokenizedSentence {
public:
    /*!
    Constructor.
    */
-    AnonymizedSentence(std::string sentence);
+    TokenizedSentence(std::string sentence);

    /*! Destructor.
    */
-    virtual ~AnonymizedSentence();
+    virtual ~TokenizedSentence();

    /*! Getter for sentence
    \returns sentence
@@ -207,9 +207,8 @@ markers_path = "<CONCORDIA_HOME>/tests/resources/temp/temp_markers.bin"
word_map_path = "<CONCORDIA_HOME>/tests/resources/temp/temp_word_map.bin"

#-------------------------------------------------------------------------------
-# The following settings control the sentence anonymizer mechanism. It is used to
-# remove unnecessary symbols and possibly words from sentences added to index
-# and search patterns. Anonymizer removes html tags, substitutes predefined symbols
+# The following settings control the sentence tokenizer mechanism. Tokenizer
+# takes into account html tags, substitutes predefined symbols
# with a single space, removes stop words (if the option is enabled), as well as
# named entities and special symbols. All these have to be listed in files.
@@ -1,6 +0,0 @@
-\|
-\–
-\-
-\/
-;
-:
@@ -1,37 +0,0 @@
-\\tab
-\\emdash
-\<
-\>
-\&
-\"
-\‐
-\
-<
->
-=
-\+
-„
-”
-\"
-…
-\.
-\,
-\?
-!
-'
-\(
-\)
-\{
-\}
-\@
-\#
-\$
-\%
-\^
-\&
-\*
-\[
-\]
-\\
-\~
-&#\d+
@@ -22,21 +22,15 @@ word_map_path = "@PROD_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
# named entities and special symbols. All these have to be listed in files.

# File containing all html tags (one per line)
-html_tags_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt"
-
-# File containing all symbols to be replaced by spaces
-space_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt"
+html_tags_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/html_tags.txt"

# If set to true, words from predefined list are removed
stop_words_enabled = "@STOP_WORDS_ENABLED@"

# If stop_words_enabled is true, set the path to the stop words file
-#stop_words_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt"
+#stop_words_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/stop_words.txt"

# File containing regular expressions that match named entities
-named_entities_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt"
-
-# File containing special symbols (one per line) to be removed
-stop_symbols_path = "@PROD_RESOURCES_DIRECTORY@/anonymizer/stop_symbols.txt"
+named_entities_path = "@PROD_RESOURCES_DIRECTORY@/tokenizer/named_entities.txt"

### eof
@@ -1,6 +0,0 @@
-\|
-\–
-\-
-\/
-;
-:
@@ -1,37 +0,0 @@
-\\tab
-\\emdash
-\<
-\>
-\&
-\"
-\‐
-\
-<
->
-=
-\+
-„
-”
-\"
-…
-\.
-\,
-\?
-!
-'
-\(
-\)
-\{
-\}
-\@
-\#
-\$
-\%
-\^
-\&
-\*
-\[
-\]
-\\
-\~
-&#\d+
@@ -14,15 +14,11 @@ markers_path = "/tmp/ma.bin"

html_tags_path = "/tmp/html_tags.txt"

-space_symbols_path = "/tmp/space_symbols.txt"
-
stop_words_enabled = "true"

stop_words_path = "/tmp/stop_words.txt"

named_entities_path = "/tmp/named_entities.txt"

-stop_symbols_path = "/tmp/stop_symbols.txt"
-
### eof
@@ -22,21 +22,15 @@ word_map_path = "@TEST_RESOURCES_DIRECTORY@/temp/@TEMP_WORD_MAP@"
# named entities and special symbols. All these have to be listed in files.

# File containing all html tags (one per line)
-html_tags_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/html_tags.txt"
-
-# File containing all symbols to be replaced by spaces
-space_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt"
+html_tags_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/html_tags.txt"

# If set to true, words from predefined list are removed
stop_words_enabled = "@STOP_WORDS_ENABLED@"

# If stop_words_enabled is true, set the path to the stop words file
-#stop_words_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_words.txt"
+#stop_words_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/stop_words.txt"

# File containing regular expressions that match named entities
-named_entities_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/named_entities.txt"
-
-# File containing special symbols (one per line) to be removed
-stop_symbols_path = "@TEST_RESOURCES_DIRECTORY@/anonymizer/stop_symbols.txt"
+named_entities_path = "@TEST_RESOURCES_DIRECTORY@/tokenizer/named_entities.txt"

### eof