diff --git a/concordia/t/test_sentence_tokenizer.cpp b/concordia/t/test_sentence_tokenizer.cpp index 4843c84..0919774 100644 --- a/concordia/t/test_sentence_tokenizer.cpp +++ b/concordia/t/test_sentence_tokenizer.cpp @@ -19,18 +19,18 @@ BOOST_AUTO_TEST_CASE( NETest ) SentenceTokenizer tokenizer(config); - std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ"; + std::string sentence = "Dates: 12.04.2012, 03/03/2017 2012.04.12 mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ"; TokenizedSentence ts = tokenizer.tokenize(sentence); - BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "date ne_date mail ne_email number ne_number hello3 zażółć gęślą jaźń zażółć gęślą jaźń"); + BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "dates ne_date ne_date ne_date mail ne_email number ne_number hello3 zażółć gęślą jaźń zażółć gęślą jaźń"); std::list annotations = ts.getAnnotations(); std::list::iterator iter = annotations.begin(); - BOOST_CHECK_EQUAL(13,annotations.size()); + BOOST_CHECK_EQUAL(15,annotations.size()); std::stringstream ss; ss << ts; - BOOST_CHECK_EQUAL("[0,4][1][date] [6,16][0][ne_date] [18,22][1][mail] [24,40][0][ne_email] [42,48][1][number] [50,54][0][ne_number] [56,62][1][hello3] [63,69][1][zażółć] [70,75][1][gęślą] [76,80][1][jaźń] [82,88][1][zażółć] [89,94][1][gęślą] [95,99][1][jaźń]", ss.str()); + BOOST_CHECK_EQUAL("[0,5][1][dates] [7,17][0][ne_date] [19,29][0][ne_date] [30,40][0][ne_date] [41,45][1][mail] [47,63][0][ne_email] [65,71][1][number] [73,77][0][ne_number] [79,85][1][hello3] [86,92][1][zażółć] [93,98][1][gęślą] [99,103][1][jaźń] [105,111][1][zażółć] [112,117][1][gęślą] [118,122][1][jaźń]", ss.str()); } BOOST_AUTO_TEST_CASE( HtmlTagsTest ) diff --git a/prod/resources/tokenizer/named_entities.txt b/prod/resources/tokenizer/named_entities.txt index 905e61b..25e292a 100644 --- a/prod/resources/tokenizer/named_entities.txt +++ b/prod/resources/tokenizer/named_entities.txt @@ -1,3 +1,5 @@ [0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4} ne_date +[0-9]{4}[\.\-/][0-9]{1,2}[\.\-/][0-9]{1,2} ne_date [\w\._\d]+@\w+(\.\w+)* ne_email -[0-9]+([\.\,][0-9]+)? ne_number +[0-9]+[\.\)]([0-9]+\.)+ ne_bullet +\b[0-9]+([\.\,][0-9]+)?\b ne_number diff --git a/tests/resources/tokenizer/named_entities.txt b/tests/resources/tokenizer/named_entities.txt index 3403b2a..25e292a 100644 --- a/tests/resources/tokenizer/named_entities.txt +++ b/tests/resources/tokenizer/named_entities.txt @@ -1,4 +1,5 @@ [0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4} ne_date +[0-9]{4}[\.\-/][0-9]{1,2}[\.\-/][0-9]{1,2} ne_date [\w\._\d]+@\w+(\.\w+)* ne_email [0-9]+[\.\)]([0-9]+\.)+ ne_bullet \b[0-9]+([\.\,][0-9]+)?\b ne_number