From 9b1735516c1ee4a59a81dfbbdf89f5ef76886aac Mon Sep 17 00:00:00 2001 From: rjawor Date: Thu, 25 Jun 2015 20:49:22 +0200 Subject: [PATCH] working sentence tokenizer --- TODO.txt | 1 + concordia/regex_rule.cpp | 12 +- concordia/regex_rule.hpp | 4 +- concordia/sentence_tokenizer.cpp | 12 +- concordia/t/test_regex_rule.cpp | 24 +- concordia/t/test_sentence_tokenizer.cpp | 332 +++++++++++++++++++++--- concordia/token_annotation.cpp | 10 +- concordia/token_annotation.hpp | 14 +- concordia/tokenized_sentence.cpp | 1 + 9 files changed, 343 insertions(+), 67 deletions(-) diff --git a/TODO.txt b/TODO.txt index 01c38d3..41a583d 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,5 +1,6 @@ ---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) ----------------------------- +- repair stop words feature - work on word regex pattern (allow for some symbols and digits within the word) - document the code (classes, cfg files) and update tutorial IN PROGRESS - concordia search zwraca pozycje tokenów z hash'a. Jak to odnieść do examples w korpusie? Należy przechowywać oryginalne pozycje tokenów w bazie danych concordia-server. Pozycje te będą obliczane przez funkcję generateTokenVector (przy użyciu listy oryginalnych pozycji, która będzie modyfikowana synchronicznie z każdą anonimizacją) diff --git a/concordia/regex_rule.cpp b/concordia/regex_rule.cpp index 636dfda..062c118 100644 --- a/concordia/regex_rule.cpp +++ b/concordia/regex_rule.cpp @@ -5,7 +5,7 @@ #include RegexRule::RegexRule(std::string patternString, - char annotationType, + int annotationType, std::string value, bool caseSensitive) throw(ConcordiaException): @@ -43,7 +43,15 @@ void RegexRule::apply(boost::shared_ptr sentence) { for (; begin != end; ++begin) { SUFFIX_MARKER_TYPE matchBegin = begin->position(); SUFFIX_MARKER_TYPE matchEnd = matchBegin + begin->length(); - TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, ""); + std::string value; + if (_annotationType == TokenAnnotation::WORD) { + UnicodeString unicodeValue; + s.extract(begin->position(), begin->length(), unicodeValue); + unicodeValue.toUTF8String(value); + } else { + value = _value; + } + TokenAnnotation annotation(matchBegin, matchEnd, _annotationType, value); annotations.push_back(annotation); } sentence->addAnnotations(annotations); diff --git a/concordia/regex_rule.hpp b/concordia/regex_rule.hpp index 2c40bb3..dbc509a 100644 --- a/concordia/regex_rule.hpp +++ b/concordia/regex_rule.hpp @@ -28,7 +28,7 @@ public: \param caseSensitive case sensitivity of the pattern */ RegexRule(std::string patternString, - char annotationType, + int annotationType, std::string value, bool caseSensitive = true) throw(ConcordiaException); @@ -43,7 +43,7 @@ public: void apply(boost::shared_ptr sentence); private: - char _annotationType; + int _annotationType; std::string _value; diff --git a/concordia/sentence_tokenizer.cpp b/concordia/sentence_tokenizer.cpp index 663ed80..2adfbcd 100644 --- a/concordia/sentence_tokenizer.cpp +++ b/concordia/sentence_tokenizer.cpp @@ -16,7 +16,7 @@ SentenceTokenizer::SentenceTokenizer( if (_stopWordsEnabled) { _stopWords = _getMultipleRegexRule( config->getStopWordsFilePath(), - TokenAnnotation::STOP_WORD_TYPE, + TokenAnnotation::STOP_WORD, "", true); } } @@ -42,7 +42,11 @@ boost::shared_ptr } boost::shared_ptr wordsRule( - new RegexRule("\\w+", TokenAnnotation::WORD_TYPE, "word")); + new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}", TokenAnnotation::WORD, "")); + wordsRule->apply(result); + boost::shared_ptr singleLetterWordsRule( + new RegexRule("\\p{L}", TokenAnnotation::WORD, "")); + singleLetterWordsRule->apply(result); return result; } @@ -67,7 +71,7 @@ void SentenceTokenizer::_createNeRules(std::string & namedEntitiesPath) { } else { _namedEntities.push_back(RegexRule( tokenTexts->at(0), - TokenAnnotation::NE_TYPE, + TokenAnnotation::NE, tokenTexts->at(1))); } } @@ -99,7 +103,7 @@ void SentenceTokenizer::_createHtmlTagsRule(std::string & htmlTagsPath) { tagsExpression = tagsExpression.substr(0, tagsExpression.size()-1); tagsExpression += "br).*?>"; _htmlTags = boost::shared_ptr( - new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG_TYPE, "", false)); + new RegexRule(tagsExpression, TokenAnnotation::HTML_TAG, "", false)); } boost::shared_ptr diff --git a/concordia/t/test_regex_rule.cpp b/concordia/t/test_regex_rule.cpp index e650067..78685cf 100644 --- a/concordia/t/test_regex_rule.cpp +++ b/concordia/t/test_regex_rule.cpp @@ -10,9 +10,9 @@ BOOST_AUTO_TEST_SUITE(regex_rule) -BOOST_AUTO_TEST_CASE( SimpleReplacement ) +BOOST_AUTO_TEST_CASE( SimpleAnnotation ) { - RegexRule rr("a", TokenAnnotation::WORD_TYPE, "b"); + RegexRule rr("a", TokenAnnotation::WORD, "b"); boost::shared_ptr ts(new TokenizedSentence("xxxxxxxaxxxaxxaxaxa")); rr.apply(ts); BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5); @@ -44,7 +44,7 @@ BOOST_AUTO_TEST_CASE( BadRegex ) bool exceptionThrown = false; std::string message = ""; try { - RegexRule rr("+a",TokenAnnotation::WORD_TYPE, "b"); + RegexRule rr("+a",TokenAnnotation::WORD, "b"); } catch (ConcordiaException & e) { exceptionThrown = true; message = e.what(); @@ -53,9 +53,9 @@ BOOST_AUTO_TEST_CASE( BadRegex ) BOOST_CHECK_EQUAL(boost::starts_with(message, "Bad regex pattern"), true); } -BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement ) +BOOST_AUTO_TEST_CASE( WeirdSymbolsAnnotation ) { - RegexRule rr("['\"\\\\.]", TokenAnnotation::WORD_TYPE, ""); + RegexRule rr("['\"\\\\.]", TokenAnnotation::WORD, ""); boost::shared_ptr ts(new TokenizedSentence("Don't stop believin' \\ Hold on to the feelin'.")); rr.apply(ts); BOOST_CHECK_EQUAL(ts->getAnnotations().size(),5); @@ -83,9 +83,9 @@ BOOST_AUTO_TEST_CASE( WeirdSymbolsReplacement ) } -BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement ) +BOOST_AUTO_TEST_CASE( CaseInsensitiveAnnotation ) { - RegexRule rr("abc", TokenAnnotation::WORD_TYPE, "xxx", false); + RegexRule rr("abc", TokenAnnotation::WORD, "xxx", false); boost::shared_ptr ts(new TokenizedSentence("This is AbC and ABC and abc and aBC.")); rr.apply(ts); BOOST_CHECK_EQUAL(ts->getAnnotations().size(),4); @@ -108,9 +108,9 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement ) BOOST_CHECK_EQUAL(iter->getEnd(),35); } -BOOST_AUTO_TEST_CASE( UnicodeReplacement ) +BOOST_AUTO_TEST_CASE( UnicodeAnnotation ) { - RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x"); + RegexRule rr("ą", TokenAnnotation::WORD, "x"); boost::shared_ptr ts(new TokenizedSentence("zażółć gęślą jaźń")); rr.apply(ts); BOOST_CHECK_EQUAL(ts->getAnnotations().size(),1); @@ -121,9 +121,9 @@ BOOST_AUTO_TEST_CASE( UnicodeReplacement ) BOOST_CHECK_EQUAL(iter->getEnd(),12); } -BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement ) +BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeAnnotation ) { - RegexRule rr("ą", TokenAnnotation::WORD_TYPE, "x", false); + RegexRule rr("ą", TokenAnnotation::WORD, "x", false); boost::shared_ptr ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ")); rr.apply(ts); BOOST_CHECK_EQUAL(ts->getAnnotations().size(),2); @@ -140,7 +140,7 @@ BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement ) BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement ) { - RegexRule rr("[ąćęłńóśżź]", TokenAnnotation::WORD_TYPE, "x", false); + RegexRule rr("[ąćęłńóśżź]", TokenAnnotation::WORD, "x", false); boost::shared_ptr ts(new TokenizedSentence("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ")); rr.apply(ts); BOOST_CHECK_EQUAL(ts->getAnnotations().size(),18); diff --git a/concordia/t/test_sentence_tokenizer.cpp b/concordia/t/test_sentence_tokenizer.cpp index 49d7244..cd1df21 100644 --- a/concordia/t/test_sentence_tokenizer.cpp +++ b/concordia/t/test_sentence_tokenizer.cpp @@ -19,29 +19,310 @@ BOOST_AUTO_TEST_CASE( NETest ) SentenceTokenizer tokenizer(config); - std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34"; + std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ"; boost::shared_ptr ts = tokenizer.tokenize(sentence); std::list annotations = ts->getAnnotations(); + std::list::iterator iter = annotations.begin(); + + BOOST_CHECK_EQUAL(14,annotations.size()); + + /* + 0,4 type: 1 value: date + 6,16 type: 0 value: ne_date + 18,22 type: 1 value: mail + 24,40 type: 0 value: ne_email + 42,48 type: 1 value: number + 50,54 type: 0 value: ne_number + 56,61 type: 1 value: hello + 61,62 type: 0 value: ne_number + 63,69 type: 1 value: zażółć + 70,75 type: 1 value: gęślą + 76,80 type: 1 value: jaźń + 82,88 type: 1 value: zażółć + 89,94 type: 1 value: gęślą + 95,99 type: 1 value: jaźń + */ - BOOST_CHECK_EQUAL(8,annotations.size()); - BOOST_FOREACH(TokenAnnotation annotation, annotations) { - std::cout << annotation.getStart() << "," - << annotation.getEnd() << " type: " - << annotation.getType() << " value: " - << annotation.getValue() << std::endl; - } -// BOOST_CHECK_EQUAL(,"date ne_date mail ne_email number ne_number"); + BOOST_CHECK_EQUAL(iter->getStart(),0); + BOOST_CHECK_EQUAL(iter->getEnd(),4); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(), "date"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),6); + BOOST_CHECK_EQUAL(iter->getEnd(),16); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE); + BOOST_CHECK_EQUAL(iter->getValue(), "ne_date"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),18); + BOOST_CHECK_EQUAL(iter->getEnd(),22); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(), "mail"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),24); + BOOST_CHECK_EQUAL(iter->getEnd(),40); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE); + BOOST_CHECK_EQUAL(iter->getValue(), "ne_email"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),42); + BOOST_CHECK_EQUAL(iter->getEnd(),48); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(), "number"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),50); + BOOST_CHECK_EQUAL(iter->getEnd(),54); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE); + BOOST_CHECK_EQUAL(iter->getValue(), "ne_number"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),56); + BOOST_CHECK_EQUAL(iter->getEnd(),61); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(), "hello"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),61); + BOOST_CHECK_EQUAL(iter->getEnd(),62); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::NE); + BOOST_CHECK_EQUAL(iter->getValue(), "ne_number"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),63); + BOOST_CHECK_EQUAL(iter->getEnd(),69); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(), "zażółć"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),70); + BOOST_CHECK_EQUAL(iter->getEnd(),75); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(), "gęślą"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),76); + BOOST_CHECK_EQUAL(iter->getEnd(),80); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(), "jaźń"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),82); + BOOST_CHECK_EQUAL(iter->getEnd(),88); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(), "zażółć"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),89); + BOOST_CHECK_EQUAL(iter->getEnd(),94); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(), "gęślą"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),95); + BOOST_CHECK_EQUAL(iter->getEnd(),99); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(), "jaźń"); + } BOOST_AUTO_TEST_CASE( HtmlTagsTest ) { boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); SentenceTokenizer tokenizer(config); - - + std::string sentence = "link and bold and newline
"; - BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"link and bold and newline "); + boost::shared_ptr ts = tokenizer.tokenize(sentence); + std::list annotations = ts->getAnnotations(); + std::list::iterator iter = annotations.begin(); + + /* + 0,23 type: 2 value: + 23,27 type: 1 value: link + 27,31 type: 2 value: + 32,35 type: 1 value: and + 36,39 type: 2 value: + 39,43 type: 1 value: bold + 43,47 type: 2 value: + 48,51 type: 1 value: and + 52,59 type: 1 value: newline + 60,65 type: 2 value: + */ + + BOOST_CHECK_EQUAL(10,annotations.size()); + BOOST_CHECK_EQUAL(iter->getStart(),0); + BOOST_CHECK_EQUAL(iter->getEnd(),23); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),23); + BOOST_CHECK_EQUAL(iter->getEnd(),27); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(),"link"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),27); + BOOST_CHECK_EQUAL(iter->getEnd(),31); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),32); + BOOST_CHECK_EQUAL(iter->getEnd(),35); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(),"and"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),36); + BOOST_CHECK_EQUAL(iter->getEnd(),39); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),39); + BOOST_CHECK_EQUAL(iter->getEnd(),43); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(),"bold"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),43); + BOOST_CHECK_EQUAL(iter->getEnd(),47); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),48); + BOOST_CHECK_EQUAL(iter->getEnd(),51); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(),"and"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),52); + BOOST_CHECK_EQUAL(iter->getEnd(),59); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(),"newline"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),60); + BOOST_CHECK_EQUAL(iter->getEnd(),65); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::HTML_TAG); +} + +BOOST_AUTO_TEST_CASE( InWordSymbolsTest ) +{ + boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); + SentenceTokenizer tokenizer(config); + + std::string sentence = "This is a sentence, don't over-analyze it. zażółć' gęś'lą -jaźń ZAŻ-ÓŁĆ GĘŚLĄ JAŹ'Ń"; + boost::shared_ptr ts = tokenizer.tokenize(sentence); + std::list annotations = ts->getAnnotations(); + std::list::iterator iter = annotations.begin(); + + /* + BOOST_FOREACH(TokenAnnotation annotation, annotations) { + std::cout << annotation.getStart() << "," + << annotation.getEnd() << " type: " + << annotation.getType() << " value: " + << annotation.getValue() << std::endl; + } + */ + + /* + 0,4 type: 1 value: this + 5,7 type: 1 value: is + 8,9 type: 1 value: a + 10,18 type: 1 value: sentence + 20,25 type: 1 value: don't + 26,38 type: 1 value: over-analyze + 39,41 type: 1 value: it + 43,49 type: 1 value: zażółć + 51,57 type: 1 value: gęś'lą + 59,63 type: 1 value: jaźń + 64,71 type: 1 value: zaż-ółć + 72,77 type: 1 value: gęślą + 78,83 type: 1 value: jaź'ń + */ + + BOOST_CHECK_EQUAL(13,annotations.size()); + + BOOST_CHECK_EQUAL(iter->getStart(),0); + BOOST_CHECK_EQUAL(iter->getEnd(),4); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(),"this"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),5); + BOOST_CHECK_EQUAL(iter->getEnd(),7); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(),"is"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),8); + BOOST_CHECK_EQUAL(iter->getEnd(),9); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(),"a"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),10); + BOOST_CHECK_EQUAL(iter->getEnd(),18); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(),"sentence"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),20); + BOOST_CHECK_EQUAL(iter->getEnd(),25); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(),"don't"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),26); + BOOST_CHECK_EQUAL(iter->getEnd(),38); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(),"over-analyze"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),39); + BOOST_CHECK_EQUAL(iter->getEnd(),41); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(),"it"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),43); + BOOST_CHECK_EQUAL(iter->getEnd(),49); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(),"zażółć"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),51); + BOOST_CHECK_EQUAL(iter->getEnd(),57); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(),"gęś'lą"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),59); + BOOST_CHECK_EQUAL(iter->getEnd(),63); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(),"jaźń"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),64); + BOOST_CHECK_EQUAL(iter->getEnd(),71); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(),"zaż-ółć"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),72); + BOOST_CHECK_EQUAL(iter->getEnd(),77); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(),"gęślą"); + iter++; + + BOOST_CHECK_EQUAL(iter->getStart(),78); + BOOST_CHECK_EQUAL(iter->getEnd(),83); + BOOST_CHECK_EQUAL(iter->getType(),TokenAnnotation::WORD); + BOOST_CHECK_EQUAL(iter->getValue(),"jaź'ń"); + iter++; + } BOOST_AUTO_TEST_CASE( StopWordsTest ) @@ -54,36 +335,17 @@ BOOST_AUTO_TEST_CASE( StopWordsTest ) } } -BOOST_AUTO_TEST_CASE( StopSymbolsTest ) -{ - boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); - SentenceTokenizer tokenizer(config); - - - std::string sentence = "xxx, . xxx # xx $xx@ xx"; - BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"xxx xxx xx xx xx"); - -} - -BOOST_AUTO_TEST_CASE( SpaceSymbolsTest ) -{ - boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); - SentenceTokenizer tokenizer(config); - - - std::string sentence = "xxx-xxx xx|xx"; - BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"xxx xxx xx xx"); - -} - BOOST_AUTO_TEST_CASE( WeirdSentenceTest ) { boost::shared_ptr config(new ConcordiaConfig(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"))); SentenceTokenizer tokenizer(config); std::string sentence = "Sony | DXC-M7PKDXC-M7PDXC-M7PHDXC-M7PK/1DXC-M7P/1DXC-M7PH/1DXC-327PKDXC-327PLDXC-327PHDXC-327APKDXC-327APLDXC-327AHDXC-537PKDXC-537PLDXC-537PHDXC-537APKDXC-537APLDXC-537APHEVW-537PKEVW-327PKDXC-637PDXC-637PKDXC-637PLDXC-637PHPVW-637PKPVW-637PLDXC-D30PFDXC-D30PKDXC-D30PLDXC-D30PHDSR-130PFDSR-130PKDSR-130PLPVW-D30PFPVW-D30PKPVW-D30PLDXC-327BPFDXC-327BPKDXC-327BPLDXC-327BPHDXC-D30WSPDXC-D35PHDXC-D35PLDXC-D35PKDXC-D35WSPLDSR-135PL | DXF-3000CEDXF-325CEDXF-501CEDXF-M3CEDXF-M7CEDXF-40CEDXF-40ACEDXF-50CEDXF-601CEDXF-40BCEDXF-50BCEDXF-701CEDXF-WSCEDXF-801CEHDVF-C30W | CCU-M3PCCU-M5PCCU-M7PCUU-M5AP | RM-M7GRM-M7E | — | CA-325PCA-325APCA-325BCA-327PCA-537PCA-511CA-512PCA-513VCT-U14 |"; - BOOST_CHECK_EQUAL(tokenizer.tokenize(sentence)->getSentence(),"sony dxc mne_numberpkdxc mne_numberpdxc mne_numberphdxc mne_numberpk ne_numberdxc mne_numberp ne_numberdxc mne_numberph ne_numberdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberahdxc ne_numberpkdxc ne_numberpldxc ne_numberphdxc ne_numberapkdxc ne_numberapldxc ne_numberaphevw ne_numberpkevw ne_numberpkdxc ne_numberpdxc ne_numberpkdxc ne_numberpldxc ne_numberphpvw ne_numberpkpvw ne_numberpldxc dne_numberpfdxc dne_numberpkdxc dne_numberpldxc dne_numberphdsr ne_numberpfdsr ne_numberpkdsr ne_numberplpvw dne_numberpfpvw dne_numberpkpvw dne_numberpldxc ne_numberbpfdxc ne_numberbpkdxc ne_numberbpldxc ne_numberbphdxc dne_numberwspdxc dne_numberphdxc dne_numberpldxc dne_numberpkdxc dne_numberwspldsr ne_numberpl dxf ne_numbercedxf ne_numbercedxf ne_numbercedxf mne_numbercedxf mne_numbercedxf ne_numbercedxf ne_numberacedxf ne_numbercedxf ne_numbercedxf ne_numberbcedxf ne_numberbcedxf ne_numbercedxf wscedxf ne_numbercehdvf cne_numberw ccu mne_numberpccu mne_numberpccu mne_numberpcuu mne_numberap rm mne_numbergrm mne_numbere — ca ne_numberpca ne_numberapca ne_numberbca ne_numberpca ne_numberpca ne_numberca ne_numberpca ne_numbervct une_number "); + boost::shared_ptr ts = tokenizer.tokenize(sentence); + std::list annotations = ts->getAnnotations(); + std::list::iterator iter = annotations.begin(); + BOOST_CHECK_EQUAL(161, annotations.size()); } BOOST_AUTO_TEST_SUITE_END() diff --git a/concordia/token_annotation.cpp b/concordia/token_annotation.cpp index a44f820..2d0a470 100644 --- a/concordia/token_annotation.cpp +++ b/concordia/token_annotation.cpp @@ -3,7 +3,7 @@ TokenAnnotation::TokenAnnotation(const SUFFIX_MARKER_TYPE start, const SUFFIX_MARKER_TYPE end, - const char annotationType, + const int annotationType, const std::string & value): Interval(start, end), _annotationType(annotationType), @@ -13,7 +13,7 @@ TokenAnnotation::TokenAnnotation(const SUFFIX_MARKER_TYPE start, TokenAnnotation::~TokenAnnotation() { } -char TokenAnnotation::NE_TYPE = 0; -char TokenAnnotation::WORD_TYPE = 1; -char TokenAnnotation::HTML_TAG_TYPE = 2; -char TokenAnnotation::STOP_WORD_TYPE = 3; +int TokenAnnotation::NE = 0; +int TokenAnnotation::WORD = 1; +int TokenAnnotation::HTML_TAG = 2; +int TokenAnnotation::STOP_WORD = 3; diff --git a/concordia/token_annotation.hpp b/concordia/token_annotation.hpp index d98af1a..11153e5 100644 --- a/concordia/token_annotation.hpp +++ b/concordia/token_annotation.hpp @@ -23,7 +23,7 @@ public: */ TokenAnnotation(const SUFFIX_MARKER_TYPE start, const SUFFIX_MARKER_TYPE end, - const char annotationType, + const int annotationType, const std::string & value); /*! Destructor. @@ -33,7 +33,7 @@ public: /*! Getter for annotation type. \returns annotation type */ - char getType() const { + int getType() const { return _annotationType; } @@ -44,16 +44,16 @@ public: return _value; } - static char NE_TYPE; + static int NE; - static char WORD_TYPE; + static int WORD; - static char HTML_TAG_TYPE; + static int HTML_TAG; - static char STOP_WORD_TYPE; + static int STOP_WORD; protected: - char _annotationType; + int _annotationType; std::string _value; }; diff --git a/concordia/tokenized_sentence.cpp b/concordia/tokenized_sentence.cpp index 0c0c014..e828c54 100644 --- a/concordia/tokenized_sentence.cpp +++ b/concordia/tokenized_sentence.cpp @@ -2,6 +2,7 @@ #include "concordia/common/text_utils.hpp" #include +#include TokenizedSentence::TokenizedSentence(std::string sentence): _sentence(sentence) {