date recognition

This commit is contained in:
rjawor 2017-04-27 10:37:29 +02:00
parent bd73749388
commit dceb0d9f47
3 changed files with 8 additions and 5 deletions

View File

@@ -19,18 +19,18 @@ BOOST_AUTO_TEST_CASE( NETest )
SentenceTokenizer tokenizer(config);
std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
std::string sentence = "Dates: 12.04.2012, 03/03/2017 2012.04.12 mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
TokenizedSentence ts = tokenizer.tokenize(sentence);
BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "date ne_date mail ne_email number ne_number hello3 zażółć gęślą jaźń zażółć gęślą jaźń");
BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "dates ne_date ne_date ne_date mail ne_email number ne_number hello3 zażółć gęślą jaźń zażółć gęślą jaźń");
std::list<TokenAnnotation> annotations = ts.getAnnotations();
std::list<TokenAnnotation>::iterator iter = annotations.begin();
BOOST_CHECK_EQUAL(13,annotations.size());
BOOST_CHECK_EQUAL(15,annotations.size());
std::stringstream ss;
ss << ts;
BOOST_CHECK_EQUAL("[0,4][1][date] [6,16][0][ne_date] [18,22][1][mail] [24,40][0][ne_email] [42,48][1][number] [50,54][0][ne_number] [56,62][1][hello3] [63,69][1][zażółć] [70,75][1][gęślą] [76,80][1][jaźń] [82,88][1][zażółć] [89,94][1][gęślą] [95,99][1][jaźń]", ss.str());
BOOST_CHECK_EQUAL("[0,5][1][dates] [7,17][0][ne_date] [19,29][0][ne_date] [30,40][0][ne_date] [41,45][1][mail] [47,63][0][ne_email] [65,71][1][number] [73,77][0][ne_number] [79,85][1][hello3] [86,92][1][zażółć] [93,98][1][gęślą] [99,103][1][jaźń] [105,111][1][zażółć] [112,117][1][gęślą] [118,122][1][jaźń]", ss.str());
}
BOOST_AUTO_TEST_CASE( HtmlTagsTest )

View File

@@ -1,3 +1,5 @@
[0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4} ne_date
[0-9]{4}[\.\-/][0-9]{1,2}[\.\-/][0-9]{1,2} ne_date
[\w\._\d]+@\w+(\.\w+)* ne_email
[0-9]+([\.\,][0-9]+)? ne_number
[0-9]+[\.\)]([0-9]+\.)+ ne_bullet
\b[0-9]+([\.\,][0-9]+)?\b ne_number

View File

@@ -1,4 +1,5 @@
[0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4} ne_date
[0-9]{4}[\.\-/][0-9]{1,2}[\.\-/][0-9]{1,2} ne_date
[\w\._\d]+@\w+(\.\w+)* ne_email
[0-9]+[\.\)]([0-9]+\.)+ ne_bullet
\b[0-9]+([\.\,][0-9]+)?\b ne_number