date recognition
This commit is contained in:
parent
bd73749388
commit
dceb0d9f47
@ -19,18 +19,18 @@ BOOST_AUTO_TEST_CASE( NETest )
|
||||
SentenceTokenizer tokenizer(config);
|
||||
|
||||
|
||||
std::string sentence = "Date: 12.04.2012, mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
|
||||
std::string sentence = "Dates: 12.04.2012, 03/03/2017 2012.04.12 mail: test@example.com, number: 5.34, hello3 zażółć gęślą jaźń, ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
|
||||
TokenizedSentence ts = tokenizer.tokenize(sentence);
|
||||
BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "date ne_date mail ne_email number ne_number hello3 zażółć gęślą jaźń zażółć gęślą jaźń");
|
||||
BOOST_CHECK_EQUAL(ts.getTokenizedSentence(), "dates ne_date ne_date ne_date mail ne_email number ne_number hello3 zażółć gęślą jaźń zażółć gęślą jaźń");
|
||||
|
||||
std::list<TokenAnnotation> annotations = ts.getAnnotations();
|
||||
std::list<TokenAnnotation>::iterator iter = annotations.begin();
|
||||
|
||||
BOOST_CHECK_EQUAL(13,annotations.size());
|
||||
BOOST_CHECK_EQUAL(15,annotations.size());
|
||||
|
||||
std::stringstream ss;
|
||||
ss << ts;
|
||||
BOOST_CHECK_EQUAL("[0,4][1][date] [6,16][0][ne_date] [18,22][1][mail] [24,40][0][ne_email] [42,48][1][number] [50,54][0][ne_number] [56,62][1][hello3] [63,69][1][zażółć] [70,75][1][gęślą] [76,80][1][jaźń] [82,88][1][zażółć] [89,94][1][gęślą] [95,99][1][jaźń]", ss.str());
|
||||
BOOST_CHECK_EQUAL("[0,5][1][dates] [7,17][0][ne_date] [19,29][0][ne_date] [30,40][0][ne_date] [41,45][1][mail] [47,63][0][ne_email] [65,71][1][number] [73,77][0][ne_number] [79,85][1][hello3] [86,92][1][zażółć] [93,98][1][gęślą] [99,103][1][jaźń] [105,111][1][zażółć] [112,117][1][gęślą] [118,122][1][jaźń]", ss.str());
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( HtmlTagsTest )
|
||||
|
@ -1,3 +1,5 @@
|
||||
[0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4} ne_date
|
||||
[0-9]{4}[\.\-/][0-9]{1,2}[\.\-/][0-9]{1,2} ne_date
|
||||
[\w\._\d]+@\w+(\.\w+)* ne_email
|
||||
[0-9]+([\.\,][0-9]+)? ne_number
|
||||
[0-9]+[\.\)]([0-9]+\.)+ ne_bullet
|
||||
\b[0-9]+([\.\,][0-9]+)?\b ne_number
|
||||
|
@ -1,4 +1,5 @@
|
||||
[0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4} ne_date
|
||||
[0-9]{4}[\.\-/][0-9]{1,2}[\.\-/][0-9]{1,2} ne_date
|
||||
[\w\._\d]+@\w+(\.\w+)* ne_email
|
||||
[0-9]+[\.\)]([0-9]+\.)+ ne_bullet
|
||||
\b[0-9]+([\.\,][0-9]+)?\b ne_number
|
||||
|
Loading…
Reference in New Issue
Block a user