working sentence tokenizer
This commit is contained in:
parent
8432dd321f
commit
9b1735516c
1
TODO.txt
1
TODO.txt
@ -1,5 +1,6 @@
|
|||||||
---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) -----------------------------
|
---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) -----------------------------
|
||||||
|
|
||||||
|
- repair stop words feature
|
||||||
- work on word regex pattern (allow for some symbols and digits within the word)
|
- work on word regex pattern (allow for some symbols and digits within the word)
|
||||||
- document the code (classes, cfg files) and update tutorial
|
- document the code (classes, cfg files) and update tutorial
|
||||||
IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? Należy przechowywać oryginalne pozycje tokenów w bazie danych concordia-server. Pozycje te będą obliczane przez funkcję generateTokenVector (przy użyciu listy oryginalnych pozycji, która będzie modyfikowana synchronicznie z każdą anonimizacją)
|
IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? Należy przechowywać oryginalne pozycje tokenów w bazie danych concordia-server. Pozycje te będą obliczane przez funkcję generateTokenVector (przy użyciu listy oryginalnych pozycji, która będzie modyfikowana synchronicznie z każdą anonimizacją)
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
#include <boost/throw_exception.hpp>
|
#include <boost/throw_exception.hpp>
|
||||||
|
|
||||||
RegexRule::RegexRule(std::string patternString,
|
RegexRule::RegexRule(std::string patternString,
|
||||||
char annotationType,
|
int annotationType,
|
||||||
std::string value,
|
std::string value,
|
||||||
bool caseSensitive)
|
bool caseSensitive)
|
||||||
throw(ConcordiaException):
|
throw(ConcordiaException):
|
||||||
@ -43,7 +43,15 @@ void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
|
|||||||
for (; begin != end; ++begin) {
|
for (; begin != end; ++begin) {
|
||||||
SUFFIX_MARKER_TYPE matchBegin = begin->position();
|
SUFFIX_MARKER_TYPE matchBegin = begin->position();
|
||||||
SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length();
|
SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length();
|
||||||
TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, "");
|
std::string value;
|
||||||
|
if (_annotationType == TokenAnnotation::WORD) {
|
||||||
|
UnicodeString unicodeValue;
|
||||||
|
s.extract(begin->position(), begin->length(), unicodeValue);
|
||||||
|
unicodeValue.toUTF8String(value);
|
||||||
|
} else {
|
||||||
|
value = _value;
|
||||||
|
}
|
||||||
|
TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, value);
|
||||||
annotations.push_back(annotation);
|
annotations.push_back(annotation);
|
||||||
}
|
}
|
||||||
sentence->addAnnotations(annotations);
|
sentence->addAnnotations(annotations);
|
||||||
|
@ -28,7 +28,7 @@ public:
|
|||||||
\param caseSensitive case sensitivity of the pattern
|
\param caseSensitive case sensitivity of the pattern
|
||||||
*/
|
*/
|
||||||
RegexRule(std::string patternString,
|
RegexRule(std::string patternString,
|
||||||
char annotationType,
|
int annotationType,
|
||||||
std::string value,
|
std::string value,
|
||||||
bool caseSensitive = true)
|
bool caseSensitive = true)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
@ -43,7 +43,7 @@ public:
|
|||||||
void apply(boost::shared_ptr<TokenizedSentence> sentence);
|
void apply(boost::shared_ptr<TokenizedSentence> sentence);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
char _annotationType;
|
int _annotationType;
|
||||||
|
|
||||||
std::string _value;
|
std::string _value;
|
||||||
|
|
||||||
|
@ -16,7 +16,7 @@ SentenceTokenizer::SentenceTokenizer(
|
|||||||
if (_stopWordsEnabled) {
|
if (_stopWordsEnabled) {
|
||||||
_stopWords = _getMultipleRegexRule(
|
_stopWords = _getMultipleRegexRule(
|
||||||
config->getStopWordsFilePath(),
|
config->getStopWordsFilePath(),
|
||||||
TokenAnnotation::STOP_WORD_TYPE,
|
TokenAnnotation::STOP_WORD,
|
||||||
"", true);
|
"", true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -42,7 +42,11 @@ boost::shared_ptr<TokenizedSentence>
|
|||||||
}
|
}
|
||||||
|
|
||||||
boost::shared_ptr<RegexRule> wordsRule(
|
boost::shared_ptr<RegexRule> wordsRule(
|
||||||
new RegexRule("\\w+", TokenAnnotation::WORD_TYPE, "word"));
|
new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}", TokenAnnotation::WORD, ""));
|
||||||
|
wordsRule->apply(result);
|
||||||
|
boost::shared_ptr<RegexRule> singleLetterWordsRule(
|
||||||
|
new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
|
||||||
|
singleLetterWordsRule->apply(result);
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
@ -67,7 +71,7 @@ void SentenceTokenizer::_createNeRules(std::string & namedEntitiesPath) {
|
|||||||
} else {
|
} else {
|
||||||
_namedEntities.push_back(RegexRule(
|
_namedEntities.push_back(RegexRule(
|
||||||
tokenTexts->at(0),
|
tokenTexts->at(0),
|
||||||
TokenAnnotation::NE_TYPE,
|
TokenAnnotation::NE,
|
||||||
tokenTexts->at(1)));
|
tokenTexts->at(1)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -99,7 +103,7 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
|
|||||||
tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
|
tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
|
||||||
tagsExpression += "br).*?>";
|
tagsExpression += "br).*?>";
|
||||||
_htmlTags = boost::shared_ptr<RegexRule>(
|
_htmlTags = boost::shared_ptr<RegexRule>(
|
||||||
new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG_TYPE, "", false));
|
new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG, "", false));
|
||||||
}
|
}
|
||||||
|
|
||||||
boost::shared_ptr<RegexRule>
|
boost::shared_ptr<RegexRule>
|
||||||
|
@ -10,9 +10,9 @@
|
|||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE(regex_rule)
|
BOOST_AUTO_TEST_SUITE(regex_rule)
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( SimpleReplacement )
|
BOOST_AUTO_TEST_CASE( SimpleAnnotation )
|
||||||
{
|
{
|
||||||
RegexRule rr("a", TokenAnnotation::WORD_TYPE, "b");
|
RegexRule rr("a", TokenAnnotation::WORD, "b");
|
||||||
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("xxxxxxxaxxxaxxaxaxa"));
|
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("xxxxxxxaxxxaxxaxaxa"));
|
||||||
rr.apply(ts);
|
rr.apply(ts);
|
||||||
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
|
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
|
||||||
@ -44,7 +44,7 @@ BOOST_AUTO_TEST_CASE( BadRegex )
|
|||||||
bool exceptionThrown = false;
|
bool exceptionThrown = false;
|
||||||
std::string message = "";
|
std::string message = "";
|
||||||
try {
|
try {
|
||||||
RegexRule rr("+a",TokenAnnotation::WORD_TYPE, "b");
|
RegexRule rr("+a",TokenAnnotation::WORD, "b");
|
||||||
} catch (ConcordiaException & e) {
|
} catch (ConcordiaException & e) {
|
||||||
exceptionThrown = true;
|
exceptionThrown = true;
|
||||||
message = e.what();
|
message = e.what();
|
||||||
@ -53,9 +53,9 @@ BOOST_AUTO_TEST_CASE( BadRegex )
|
|||||||
BOOST_CHECK_EQUAL(boost::starts_with(message, "Bad regex pattern"), true);
|
BOOST_CHECK_EQUAL(boost::starts_with(message, "Bad regex pattern"), true);
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
|
BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation )
|
||||||
{
|
{
|
||||||
RegexRule rr("['\"\\\\.]", TokenAnnotation::WORD_TYPE, "");
|
RegexRule rr("['\"\\\\.]", TokenAnnotation::WORD, "");
|
||||||
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("Don't stop believin' \\ Hold on to the feelin'."));
|
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("Don't stop believin' \\ Hold on to the feelin'."));
|
||||||
rr.apply(ts);
|
rr.apply(ts);
|
||||||
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
|
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
|
||||||
@ -83,9 +83,9 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
|
BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation )
|
||||||
{
|
{
|
||||||
RegexRule rr("abc", TokenAnnotation::WORD_TYPE, "xxx", false);
|
RegexRule rr("abc", TokenAnnotation::WORD, "xxx", false);
|
||||||
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("This is AbC and ABC and abc and aBC."));
|
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("This is AbC and ABC and abc and aBC."));
|
||||||
rr.apply(ts);
|
rr.apply(ts);
|
||||||
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),4);
|
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),4);
|
||||||
@ -108,9 +108,9 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
|
|||||||
BOOST_CHECK_EQUAL(iter->getEnd(),35);
|
BOOST_CHECK_EQUAL(iter->getEnd(),35);
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( UnicodeReplacement )
|
BOOST_AUTO_TEST_CASE( UnicodeAnnotation )
|
||||||
{
|
{
|
||||||
RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x");
|
RegexRule rr("ą", TokenAnnotation::WORD, "x");
|
||||||
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń"));
|
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń"));
|
||||||
rr.apply(ts);
|
rr.apply(ts);
|
||||||
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),1);
|
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),1);
|
||||||
@ -121,9 +121,9 @@ BOOST_AUTO_TEST_CASE( UnicodeReplacement )
|
|||||||
BOOST_CHECK_EQUAL(iter->getEnd(),12);
|
BOOST_CHECK_EQUAL(iter->getEnd(),12);
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
|
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation )
|
||||||
{
|
{
|
||||||
RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x", false);
|
RegexRule rr("ą", TokenAnnotation::WORD, "x", false);
|
||||||
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
|
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
|
||||||
rr.apply(ts);
|
rr.apply(ts);
|
||||||
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),2);
|
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),2);
|
||||||
@ -140,7 +140,7 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
|
|||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
|
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
|
||||||
{
|
{
|
||||||
RegexRule rr("[ąćęłńóśżź]", TokenAnnotation::WORD_TYPE, "x", false);
|
RegexRule rr("[ąćęłńóśżź]", TokenAnnotation::WORD, "x", false);
|
||||||
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
|
boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
|
||||||
rr.apply(ts);
|
rr.apply(ts);
|
||||||
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),18);
|
BOOST_CHECK_EQUAL(ts->getAnnotations().size(),18);
|
||||||
|
@ -19,29 +19,310 @@ BOOST_AUTO_TEST_CASE( NETest )
|
|||||||
SentenceTokenizer tokenizer(config);
|
SentenceTokenizer tokenizer(config);
|
||||||
|
|
||||||
|
|
||||||
std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34";
|
std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
|
||||||
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
|
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
|
||||||
std::list<TokenAnnotation> annotations = ts->getAnnotations();
|
std::list<TokenAnnotation> annotations = ts->getAnnotations();
|
||||||
|
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(14,annotations.size());
|
||||||
|
|
||||||
|
/*
|
||||||
|
0,4 type: 1 value: date
|
||||||
|
6,16 type: 0 value: ne_date
|
||||||
|
18,22 type: 1 value: mail
|
||||||
|
24,40 type: 0 value: ne_email
|
||||||
|
42,48 type: 1 value: number
|
||||||
|
50,54 type: 0 value: ne_number
|
||||||
|
56,61 type: 1 value: hello
|
||||||
|
61,62 type: 0 value: ne_number
|
||||||
|
63,69 type: 1 value: zażółć
|
||||||
|
70,75 type: 1 value: gęślą
|
||||||
|
76,80 type: 1 value: jaźń
|
||||||
|
82,88 type: 1 value: zażółć
|
||||||
|
89,94 type: 1 value: gęślą
|
||||||
|
95,99 type: 1 value: jaźń
|
||||||
|
*/
|
||||||
|
|
||||||
BOOST_CHECK_EQUAL(8,annotations.size());
|
BOOST_CHECK_EQUAL(iter->getStart(),0);
|
||||||
BOOST_FOREACH(TokenAnnotation annotation, annotations) {
|
BOOST_CHECK_EQUAL(iter->getEnd(),4);
|
||||||
std::cout << annotation.getStart() << ","
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
<< annotation.getEnd() << " type: "
|
BOOST_CHECK_EQUAL(iter->getValue(), "date");
|
||||||
<< annotation.getType() << " value: "
|
iter++;
|
||||||
<< annotation.getValue() << std::endl;
|
|
||||||
}
|
BOOST_CHECK_EQUAL(iter->getStart(),6);
|
||||||
// BOOST_CHECK_EQUAL(,"date ne_date mail ne_email number ne_number");
|
BOOST_CHECK_EQUAL(iter->getEnd(),16);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(), "ne_date");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),18);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),22);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(), "mail");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),24);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),40);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(), "ne_email");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),42);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),48);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(), "number");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),50);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),54);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),56);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),61);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(), "hello");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),61);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),62);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),63);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),69);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),70);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),75);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),76);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),80);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),82);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),88);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),89);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),94);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),95);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),99);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( HtmlTagsTest )
|
BOOST_AUTO_TEST_CASE( HtmlTagsTest )
|
||||||
{
|
{
|
||||||
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
||||||
SentenceTokenizer tokenizer(config);
|
SentenceTokenizer tokenizer(config);
|
||||||
|
|
||||||
|
|
||||||
std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
|
std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
|
||||||
BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"link and bold and newline ");
|
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
|
||||||
|
std::list<TokenAnnotation> annotations = ts->getAnnotations();
|
||||||
|
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
||||||
|
|
||||||
|
/*
|
||||||
|
0,23 type: 2 value:
|
||||||
|
23,27 type: 1 value: link
|
||||||
|
27,31 type: 2 value:
|
||||||
|
32,35 type: 1 value: and
|
||||||
|
36,39 type: 2 value:
|
||||||
|
39,43 type: 1 value: bold
|
||||||
|
43,47 type: 2 value:
|
||||||
|
48,51 type: 1 value: and
|
||||||
|
52,59 type: 1 value: newline
|
||||||
|
60,65 type: 2 value:
|
||||||
|
*/
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(10,annotations.size());
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),0);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),23);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),23);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),27);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(),"link");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),27);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),31);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),32);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),35);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(),"and");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),36);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),39);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),39);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),43);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(),"bold");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),43);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),47);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),48);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),51);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(),"and");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),52);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),59);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(),"newline");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),60);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),65);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
|
||||||
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
|
||||||
|
{
|
||||||
|
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
||||||
|
SentenceTokenizer tokenizer(config);
|
||||||
|
|
||||||
|
std::string sentence = "This is a sentence, don't over-analyze it. zażółć' gęś'lą -jaźń ZAŻ-ÓŁĆ GĘŚLĄ JAŹ'Ń";
|
||||||
|
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
|
||||||
|
std::list<TokenAnnotation> annotations = ts->getAnnotations();
|
||||||
|
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
||||||
|
|
||||||
|
/*
|
||||||
|
BOOST_FOREACH(TokenAnnotation annotation, annotations) {
|
||||||
|
std::cout << annotation.getStart() << ","
|
||||||
|
<< annotation.getEnd() << " type: "
|
||||||
|
<< annotation.getType() << " value: "
|
||||||
|
<< annotation.getValue() << std::endl;
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
0,4 type: 1 value: this
|
||||||
|
5,7 type: 1 value: is
|
||||||
|
8,9 type: 1 value: a
|
||||||
|
10,18 type: 1 value: sentence
|
||||||
|
20,25 type: 1 value: don't
|
||||||
|
26,38 type: 1 value: over-analyze
|
||||||
|
39,41 type: 1 value: it
|
||||||
|
43,49 type: 1 value: zażółć
|
||||||
|
51,57 type: 1 value: gęś'lą
|
||||||
|
59,63 type: 1 value: jaźń
|
||||||
|
64,71 type: 1 value: zaż-ółć
|
||||||
|
72,77 type: 1 value: gęślą
|
||||||
|
78,83 type: 1 value: jaź'ń
|
||||||
|
*/
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(13,annotations.size());
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),0);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),4);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(),"this");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),5);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),7);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(),"is");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),8);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),9);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(),"a");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),10);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),18);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(),"sentence");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),20);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),25);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(),"don't");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),26);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),38);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(),"over-analyze");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),39);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),41);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(),"it");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),43);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),49);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(),"zażółć");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),51);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),57);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(),"gęś'lą");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),59);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),63);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(),"jaźń");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),64);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),71);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(),"zaż-ółć");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),72);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),77);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(),"gęślą");
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(iter->getStart(),78);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getEnd(),83);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
|
||||||
|
BOOST_CHECK_EQUAL(iter->getValue(),"jaź'ń");
|
||||||
|
iter++;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( StopWordsTest )
|
BOOST_AUTO_TEST_CASE( StopWordsTest )
|
||||||
@ -54,36 +335,17 @@ BOOST_AUTO_TEST_CASE( StopWordsTest )
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( StopSymbolsTest )
|
|
||||||
{
|
|
||||||
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
|
||||||
SentenceTokenizer tokenizer(config);
|
|
||||||
|
|
||||||
|
|
||||||
std::string sentence = "xxx, . xxx # xx $xx@ xx";
|
|
||||||
BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"xxx xxx xx xx xx");
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( SpaceSymbolsTest )
|
|
||||||
{
|
|
||||||
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
|
||||||
SentenceTokenizer tokenizer(config);
|
|
||||||
|
|
||||||
|
|
||||||
std::string sentence = "xxx-xxx xx|xx";
|
|
||||||
BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"xxx xxx xx xx");
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
|
BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
|
||||||
{
|
{
|
||||||
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
|
||||||
SentenceTokenizer tokenizer(config);
|
SentenceTokenizer tokenizer(config);
|
||||||
|
|
||||||
std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
|
std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
|
||||||
BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"sony dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap rm mne_numbergrm mne_numbere — ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number ");
|
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
|
||||||
|
std::list<TokenAnnotation> annotations = ts->getAnnotations();
|
||||||
|
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
||||||
|
|
||||||
|
BOOST_CHECK_EQUAL(161, annotations.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE_END()
|
BOOST_AUTO_TEST_SUITE_END()
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
|
|
||||||
TokenAnnotation::TokenAnnotation(const SUFFIX_MARKER_TYPE start,
|
TokenAnnotation::TokenAnnotation(const SUFFIX_MARKER_TYPE start,
|
||||||
const SUFFIX_MARKER_TYPE end,
|
const SUFFIX_MARKER_TYPE end,
|
||||||
const char annotationType,
|
const int annotationType,
|
||||||
const std::string & value):
|
const std::string & value):
|
||||||
Interval(start, end),
|
Interval(start, end),
|
||||||
_annotationType(annotationType),
|
_annotationType(annotationType),
|
||||||
@ -13,7 +13,7 @@ TokenAnnotation::TokenAnnotation(const SUFFIX_MARKER_TYPE start,
|
|||||||
TokenAnnotation::~TokenAnnotation() {
|
TokenAnnotation::~TokenAnnotation() {
|
||||||
}
|
}
|
||||||
|
|
||||||
char TokenAnnotation::NE_TYPE = 0;
|
int TokenAnnotation::NE = 0;
|
||||||
char TokenAnnotation::WORD_TYPE = 1;
|
int TokenAnnotation::WORD = 1;
|
||||||
char TokenAnnotation::HTML_TAG_TYPE = 2;
|
int TokenAnnotation::HTML_TAG = 2;
|
||||||
char TokenAnnotation::STOP_WORD_TYPE = 3;
|
int TokenAnnotation::STOP_WORD = 3;
|
||||||
|
@ -23,7 +23,7 @@ public:
|
|||||||
*/
|
*/
|
||||||
TokenAnnotation(const SUFFIX_MARKER_TYPE start,
|
TokenAnnotation(const SUFFIX_MARKER_TYPE start,
|
||||||
const SUFFIX_MARKER_TYPE end,
|
const SUFFIX_MARKER_TYPE end,
|
||||||
const char annotationType,
|
const int annotationType,
|
||||||
const std::string & value);
|
const std::string & value);
|
||||||
|
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
@ -33,7 +33,7 @@ public:
|
|||||||
/*! Getter for annotation type.
|
/*! Getter for annotation type.
|
||||||
\returns annotation type
|
\returns annotation type
|
||||||
*/
|
*/
|
||||||
char getType() const {
|
int getType() const {
|
||||||
return _annotationType;
|
return _annotationType;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -44,16 +44,16 @@ public:
|
|||||||
return _value;
|
return _value;
|
||||||
}
|
}
|
||||||
|
|
||||||
static char NE_TYPE;
|
static int NE;
|
||||||
|
|
||||||
static char WORD_TYPE;
|
static int WORD;
|
||||||
|
|
||||||
static char HTML_TAG_TYPE;
|
static int HTML_TAG;
|
||||||
|
|
||||||
static char STOP_WORD_TYPE;
|
static int STOP_WORD;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
char _annotationType;
|
int _annotationType;
|
||||||
|
|
||||||
std::string _value;
|
std::string _value;
|
||||||
};
|
};
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
#include "concordia/common/text_utils.hpp"
|
#include "concordia/common/text_utils.hpp"
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
#include <boost/foreach.hpp>
|
||||||
|
|
||||||
TokenizedSentence::TokenizedSentence(std::string sentence):
|
TokenizedSentence::TokenizedSentence(std::string sentence):
|
||||||
_sentence(sentence) {
|
_sentence(sentence) {
|
||||||
|
Loading…
Reference in New Issue
Block a user