working sentence tokenizer

This commit is contained in:
rjawor 2015-06-25 20:49:22 +02:00
parent 8432dd321f
commit 9b1735516c
9 changed files with 343 additions and 67 deletions

View File

@@ -1,5 +1,6 @@
---------------------------- Developer's private notes (language may vary, because that is sometimes more convenient) -----------------------------
- repair stop words feature
- work on word regex pattern (allow for some symbols and digits within the word)
- document the code (classes, cfg files) and update tutorial
IN PROGRESS - concordia search returns token positions from the hash. How can these be related to the examples in the corpus? The original token positions need to be stored in the concordia-server database. These positions will be computed by the generateTokenVector function (using a list of original positions, which will be modified synchronously with every anonymization).
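A minimal sketch of the bookkeeping described in the note above (hypothetical names, not concordia-server code): keep each token's character interval in the original, pre-anonymization text, so that a token index returned by concordia search can be mapped back to the source example.

// Hypothetical sketch only; helper names are assumptions for illustration.
#include <cstddef>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main() {
    std::string original = "Date: 12.04.2012, mail: test@example.com";

    // One entry per token, in token order: [start, end) offsets in `original`.
    std::vector<std::pair<std::size_t, std::size_t> > originalPositions;
    originalPositions.push_back(std::make_pair(0, 4));    // "Date"
    originalPositions.push_back(std::make_pair(6, 16));   // "12.04.2012" -> ne_date
    originalPositions.push_back(std::make_pair(18, 22));  // "mail"
    originalPositions.push_back(std::make_pair(24, 40));  // "test@example.com" -> ne_email

    // A search hit reported as token index 1 resolves back to characters 6..16
    // of the original example text.
    std::size_t hitToken = 1;
    std::cout << original.substr(originalPositions[hitToken].first,
                                 originalPositions[hitToken].second
                                     - originalPositions[hitToken].first)
              << std::endl;
    return 0;
}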

View File

@@ -5,7 +5,7 @@
#include <boost/throw_exception.hpp>
RegexRule::RegexRule(std::string patternString,
-    char annotationType,
+    int annotationType,
    std::string value,
    bool caseSensitive)
    throw(ConcordiaException):
@@ -43,7 +43,15 @@ void RegexRule::apply(boost::shared_ptr<TokenizedSentence> sentence) {
    for (; begin != end; ++begin) {
        SUFFIX_MARKER_TYPE matchBegin = begin->position();
        SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length();
-        TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, "");
+        std::string value;
+        if (_annotationType == TokenAnnotation::WORD) {
+            UnicodeString unicodeValue;
+            s.extract(begin->position(), begin->length(), unicodeValue);
+            unicodeValue.toUTF8String(value);
+        } else {
+            value = _value;
+        }
+        TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, value);
        annotations.push_back(annotation);
    }
    sentence->addAnnotations(annotations);
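The hunk above makes WORD annotations carry the matched text itself. A minimal standalone sketch of that behaviour (assumptions: std::string::substr stands in for the ICU UnicodeString extraction, and the type codes follow the new constants in token_annotation.cpp):

// Simplified stand-in, not the library code.
#include <cstddef>
#include <iostream>
#include <string>

enum AnnotationType { NE = 0, WORD = 1, HTML_TAG = 2, STOP_WORD = 3 };

std::string annotationValue(int annotationType,
                            const std::string & sentence,
                            std::size_t matchBegin,
                            std::size_t matchEnd,
                            const std::string & fixedValue) {
    if (annotationType == WORD) {
        // the surface form of the match becomes the annotation value
        return sentence.substr(matchBegin, matchEnd - matchBegin);
    }
    return fixedValue;  // e.g. "ne_date" or "ne_email" for named entities
}

int main() {
    std::string s = "mail: test@example.com";
    std::cout << annotationValue(WORD, s, 0, 4, "") << std::endl;         // mail
    std::cout << annotationValue(NE, s, 6, 22, "ne_email") << std::endl;  // ne_email
    return 0;
}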

View File

@@ -28,7 +28,7 @@ public:
    \param caseSensitive case sensitivity of the pattern
    */
    RegexRule(std::string patternString,
-        char annotationType,
+        int annotationType,
        std::string value,
        bool caseSensitive = true)
        throw(ConcordiaException);
@@ -43,7 +43,7 @@ public:
    void apply(boost::shared_ptr<TokenizedSentence> sentence);
private:
-    char _annotationType;
+    int _annotationType;
    std::string _value;

View File

@@ -16,7 +16,7 @@ SentenceTokenizer::SentenceTokenizer(
    if (_stopWordsEnabled) {
        _stopWords = _getMultipleRegexRule(
            config->getStopWordsFilePath(),
-            TokenAnnotation::STOP_WORD_TYPE,
+            TokenAnnotation::STOP_WORD,
            "", true);
    }
}
@@ -42,7 +42,11 @@
    }
    boost::shared_ptr<RegexRule> wordsRule(
-        new RegexRule("\\w+", TokenAnnotation::WORD_TYPE, "word"));
+        new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}", TokenAnnotation::WORD, ""));
+    wordsRule->apply(result);
+    boost::shared_ptr<RegexRule> singleLetterWordsRule(
+        new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
+    singleLetterWordsRule->apply(result);
    return result;
}
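The new word rules above replace the old \w+ pattern. A rough standalone illustration (assumption: an ASCII [A-Za-z] approximation, because std::regex has no portable \p{L}; the tokenizer itself matches Unicode letters with the rule shown in the hunk):

// Sketch only: approximates \p{L}(\p{L}|'|\-)*\p{L} with [A-Za-z].
#include <iostream>
#include <regex>
#include <string>

int main() {
    const std::string text = "This is a sentence, don't over-analyze it.";

    // Multi-letter words, allowing ' and - strictly inside the word.
    std::regex multiLetterWord("[A-Za-z](?:[A-Za-z'-])*[A-Za-z]");
    std::sregex_iterator it(text.begin(), text.end(), multiLetterWord), end;
    for (; it != end; ++it) {
        std::cout << it->position() << "," << it->position() + it->length()
                  << " word: " << it->str() << std::endl;
    }
    // The two-letter minimum excludes one-letter words such as "a", which is
    // why a separate single-letter pass (mirroring singleLetterWordsRule) is
    // still needed.
    return 0;
}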
@@ -67,7 +71,7 @@ void SentenceTokenizer::_createNeRules(std::string & namedEntitiesPath) {
    } else {
        _namedEntities.push_back(RegexRule(
            tokenTexts->at(0),
-            TokenAnnotation::NE_TYPE,
+            TokenAnnotation::NE,
            tokenTexts->at(1)));
    }
}
@@ -99,7 +103,7 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) {
    tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1);
    tagsExpression += "br).*?>";
    _htmlTags = boost::shared_ptr<RegexRule>(
-        new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG_TYPE, "", false));
+        new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG, "", false));
}
boost::shared_ptr<RegexRule>

View File

@@ -10,9 +10,9 @@
BOOST_AUTO_TEST_SUITE(regex_rule)
-BOOST_AUTO_TEST_CASE( SimpleReplacement )
+BOOST_AUTO_TEST_CASE( SimpleAnnotation )
{
-    RegexRule rr("a", TokenAnnotation::WORD_TYPE, "b");
+    RegexRule rr("a", TokenAnnotation::WORD, "b");
    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("xxxxxxxaxxxaxxaxaxa"));
    rr.apply(ts);
    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
@@ -44,7 +44,7 @@ BOOST_AUTO_TEST_CASE( BadRegex )
    bool exceptionThrown = false;
    std::string message = "";
    try {
-        RegexRule rr("+a",TokenAnnotation::WORD_TYPE, "b");
+        RegexRule rr("+a",TokenAnnotation::WORD, "b");
    } catch (ConcordiaException & e) {
        exceptionThrown = true;
        message = e.what();
@@ -53,9 +53,9 @@ BOOST_AUTO_TEST_CASE( BadRegex )
    BOOST_CHECK_EQUAL(boost::starts_with(message, "Bad regex pattern"), true);
}
-BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
+BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation )
{
-    RegexRule rr("['\"\\\\.]", TokenAnnotation::WORD_TYPE, "");
+    RegexRule rr("['\"\\\\.]", TokenAnnotation::WORD, "");
    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("Don't stop believin' \\ Hold on to the feelin'."));
    rr.apply(ts);
    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5);
@@ -83,9 +83,9 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement )
}
-BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
+BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation )
{
-    RegexRule rr("abc", TokenAnnotation::WORD_TYPE, "xxx", false);
+    RegexRule rr("abc", TokenAnnotation::WORD, "xxx", false);
    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("This is AbC and ABC and abc and aBC."));
    rr.apply(ts);
    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),4);
@@ -108,9 +108,9 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
    BOOST_CHECK_EQUAL(iter->getEnd(),35);
}
-BOOST_AUTO_TEST_CASE( UnicodeReplacement )
+BOOST_AUTO_TEST_CASE( UnicodeAnnotation )
{
-    RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x");
+    RegexRule rr("ą", TokenAnnotation::WORD, "x");
    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń"));
    rr.apply(ts);
    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),1);
@@ -121,9 +121,9 @@ BOOST_AUTO_TEST_CASE( UnicodeReplacement )
    BOOST_CHECK_EQUAL(iter->getEnd(),12);
}
-BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
+BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation )
{
-    RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x", false);
+    RegexRule rr("ą", TokenAnnotation::WORD, "x", false);
    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
    rr.apply(ts);
    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),2);
@@ -140,7 +140,7 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
{
-    RegexRule rr("[ąćęłńóśżź]", TokenAnnotation::WORD_TYPE, "x", false);
+    RegexRule rr("[ąćęłńóśżź]", TokenAnnotation::WORD, "x", false);
    boost::shared_ptr<TokenizedSentence> ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"));
    rr.apply(ts);
    BOOST_CHECK_EQUAL(ts->getAnnotations().size(),18);

View File

@@ -19,29 +19,310 @@ BOOST_AUTO_TEST_CASE( NETest )
SentenceTokenizer tokenizer(config);
-std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34";
std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(14,annotations.size());
/*
0,4 type: 1 value: date
6,16 type: 0 value: ne_date
18,22 type: 1 value: mail
24,40 type: 0 value: ne_email
42,48 type: 1 value: number
50,54 type: 0 value: ne_number
56,61 type: 1 value: hello
61,62 type: 0 value: ne_number
63,69 type: 1 value: zażółć
70,75 type: 1 value: gęślą
76,80 type: 1 value: jaźń
82,88 type: 1 value: zażółć
89,94 type: 1 value: gęślą
95,99 type: 1 value: jaźń
*/
-BOOST_CHECK_EQUAL(8,annotations.size());
-BOOST_FOREACH(TokenAnnotation annotation, annotations) {
-std::cout << annotation.getStart() << ","
-<< annotation.getEnd() << " type: "
-<< annotation.getType() << " value: "
-<< annotation.getValue() << std::endl;
-}
-// BOOST_CHECK_EQUAL(,"date ne_date mail ne_email number ne_number");
BOOST_CHECK_EQUAL(iter->getStart(),0);
BOOST_CHECK_EQUAL(iter->getEnd(),4);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "date");
iter++;

BOOST_CHECK_EQUAL(iter->getStart(),6);
BOOST_CHECK_EQUAL(iter->getEnd(),16);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_date");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),18);
BOOST_CHECK_EQUAL(iter->getEnd(),22);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "mail");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),24);
BOOST_CHECK_EQUAL(iter->getEnd(),40);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_email");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),42);
BOOST_CHECK_EQUAL(iter->getEnd(),48);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "number");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),50);
BOOST_CHECK_EQUAL(iter->getEnd(),54);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),56);
BOOST_CHECK_EQUAL(iter->getEnd(),61);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "hello");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),61);
BOOST_CHECK_EQUAL(iter->getEnd(),62);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE);
BOOST_CHECK_EQUAL(iter->getValue(), "ne_number");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),63);
BOOST_CHECK_EQUAL(iter->getEnd(),69);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),70);
BOOST_CHECK_EQUAL(iter->getEnd(),75);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),76);
BOOST_CHECK_EQUAL(iter->getEnd(),80);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),82);
BOOST_CHECK_EQUAL(iter->getEnd(),88);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "zażółć");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),89);
BOOST_CHECK_EQUAL(iter->getEnd(),94);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "gęślą");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),95);
BOOST_CHECK_EQUAL(iter->getEnd(),99);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(), "jaźń");
}
BOOST_AUTO_TEST_CASE( HtmlTagsTest )
{
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceTokenizer tokenizer(config);
std::string sentence = "<a href='http://wp.pl'>link</a> and <b>bold</b> and newline <br/>";
-BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"link and bold and newline ");
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
/*
0,23 type: 2 value:
23,27 type: 1 value: link
27,31 type: 2 value:
32,35 type: 1 value: and
36,39 type: 2 value:
39,43 type: 1 value: bold
43,47 type: 2 value:
48,51 type: 1 value: and
52,59 type: 1 value: newline
60,65 type: 2 value:
*/
BOOST_CHECK_EQUAL(10,annotations.size());
BOOST_CHECK_EQUAL(iter->getStart(),0);
BOOST_CHECK_EQUAL(iter->getEnd(),23);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),23);
BOOST_CHECK_EQUAL(iter->getEnd(),27);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"link");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),27);
BOOST_CHECK_EQUAL(iter->getEnd(),31);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),32);
BOOST_CHECK_EQUAL(iter->getEnd(),35);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"and");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),36);
BOOST_CHECK_EQUAL(iter->getEnd(),39);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),39);
BOOST_CHECK_EQUAL(iter->getEnd(),43);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"bold");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),43);
BOOST_CHECK_EQUAL(iter->getEnd(),47);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),48);
BOOST_CHECK_EQUAL(iter->getEnd(),51);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"and");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),52);
BOOST_CHECK_EQUAL(iter->getEnd(),59);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"newline");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),60);
BOOST_CHECK_EQUAL(iter->getEnd(),65);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG);
}
BOOST_AUTO_TEST_CASE( InWordSymbolsTest )
{
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceTokenizer tokenizer(config);
std::string sentence = "This is a sentence, don't over-analyze it. zażółć' gęś'lą -jaźń ZAŻ-ÓŁĆ GĘŚLĄ JAŹ'Ń";
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
/*
BOOST_FOREACH(TokenAnnotation annotation, annotations) {
std::cout << annotation.getStart() << ","
<< annotation.getEnd() << " type: "
<< annotation.getType() << " value: "
<< annotation.getValue() << std::endl;
}
*/
/*
0,4 type: 1 value: this
5,7 type: 1 value: is
8,9 type: 1 value: a
10,18 type: 1 value: sentence
20,25 type: 1 value: don't
26,38 type: 1 value: over-analyze
39,41 type: 1 value: it
43,49 type: 1 value: zażółć
51,57 type: 1 value: gęś'
59,63 type: 1 value: jaźń
64,71 type: 1 value: zaż-ółć
72,77 type: 1 value: gęślą
78,83 type: 1 value: jaź'ń
*/
BOOST_CHECK_EQUAL(13,annotations.size());
BOOST_CHECK_EQUAL(iter->getStart(),0);
BOOST_CHECK_EQUAL(iter->getEnd(),4);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"this");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),5);
BOOST_CHECK_EQUAL(iter->getEnd(),7);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"is");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),8);
BOOST_CHECK_EQUAL(iter->getEnd(),9);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"a");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),10);
BOOST_CHECK_EQUAL(iter->getEnd(),18);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"sentence");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),20);
BOOST_CHECK_EQUAL(iter->getEnd(),25);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"don't");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),26);
BOOST_CHECK_EQUAL(iter->getEnd(),38);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"over-analyze");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),39);
BOOST_CHECK_EQUAL(iter->getEnd(),41);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"it");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),43);
BOOST_CHECK_EQUAL(iter->getEnd(),49);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"zażółć");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),51);
BOOST_CHECK_EQUAL(iter->getEnd(),57);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"gęś'lą");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),59);
BOOST_CHECK_EQUAL(iter->getEnd(),63);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"jaźń");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),64);
BOOST_CHECK_EQUAL(iter->getEnd(),71);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"zaż-ółć");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),72);
BOOST_CHECK_EQUAL(iter->getEnd(),77);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"gęślą");
iter++;
BOOST_CHECK_EQUAL(iter->getStart(),78);
BOOST_CHECK_EQUAL(iter->getEnd(),83);
BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD);
BOOST_CHECK_EQUAL(iter->getValue(),"jaź'ń");
iter++;
}
BOOST_AUTO_TEST_CASE( StopWordsTest )
@@ -54,36 +335,17 @@ BOOST_AUTO_TEST_CASE( StopWordsTest )
}
}
-BOOST_AUTO_TEST_CASE( StopSymbolsTest )
-{
-boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
-SentenceTokenizer tokenizer(config);
-std::string sentence = "xxx, . xxx # xx $xx@ xx";
-BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"xxx xxx xx xx xx");
-}
-BOOST_AUTO_TEST_CASE( SpaceSymbolsTest )
-{
-boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
-SentenceTokenizer tokenizer(config);
-std::string sentence = "xxx-xxx xx|xx";
-BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"xxx xxx xx xx");
-}
BOOST_AUTO_TEST_CASE( WeirdSentenceTest )
{
boost::shared_ptr<ConcordiaConfig> config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg")));
SentenceTokenizer tokenizer(config);
std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |";
-BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"sony dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap rm mne_numbergrm mne_numbere — ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number ");
boost::shared_ptr<TokenizedSentence> ts = tokenizer.tokenize(sentence);
std::list<TokenAnnotation> annotations = ts->getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(161, annotations.size());
}
BOOST_AUTO_TEST_SUITE_END()

View File

@@ -3,7 +3,7 @@
TokenAnnotation::TokenAnnotation(const SUFFIX_MARKER_TYPE start,
    const SUFFIX_MARKER_TYPE end,
-    const char annotationType,
+    const int annotationType,
    const std::string & value):
    Interval(start, end),
    _annotationType(annotationType),
@@ -13,7 +13,7 @@ TokenAnnotation::TokenAnnotation(const SUFFIX_MARKER_TYPE start,
TokenAnnotation::~TokenAnnotation() {
}
-char TokenAnnotation::NE_TYPE = 0;
+int TokenAnnotation::NE = 0;
-char TokenAnnotation::WORD_TYPE = 1;
+int TokenAnnotation::WORD = 1;
-char TokenAnnotation::HTML_TAG_TYPE = 2;
+int TokenAnnotation::HTML_TAG = 2;
-char TokenAnnotation::STOP_WORD_TYPE = 3;
+int TokenAnnotation::STOP_WORD = 3;

View File

@@ -23,7 +23,7 @@ public:
    */
    TokenAnnotation(const SUFFIX_MARKER_TYPE start,
        const SUFFIX_MARKER_TYPE end,
-        const char annotationType,
+        const int annotationType,
        const std::string & value);
    /*! Destructor.
@@ -33,7 +33,7 @@ public:
    /*! Getter for annotation type.
        \returns annotation type
    */
-    char getType() const {
+    int getType() const {
        return _annotationType;
    }
@@ -44,16 +44,16 @@ public:
        return _value;
    }
-    static char NE_TYPE;
+    static int NE;
-    static char WORD_TYPE;
+    static int WORD;
-    static char HTML_TAG_TYPE;
+    static int HTML_TAG;
-    static char STOP_WORD_TYPE;
+    static int STOP_WORD;
protected:
-    char _annotationType;
+    int _annotationType;
    std::string _value;
};

View File

@@ -2,6 +2,7 @@
#include "concordia/common/text_utils.hpp"
#include <iostream>
+#include <boost/foreach.hpp>
TokenizedSentence::TokenizedSentence(std::string sentence):
    _sentence(sentence) {