From 9358863f8ddbc1572ae5c19088f5ae63ffbe6a0f Mon Sep 17 00:00:00 2001 From: rjawor Date: Thu, 24 Apr 2014 08:36:48 +0200 Subject: [PATCH] text utils stub Former-commit-id: d4459220f5696839d98848e9c30a61c084763a91 --- CMakeLists.txt | 2 +- TODO.txt | 3 + concordia/CMakeLists.txt | 4 + concordia/common/text_utils.cpp | 29 ++ concordia/common/text_utils.hpp | 30 ++ concordia/regex_replacement.cpp | 11 +- concordia/regex_replacement.hpp | 6 +- concordia/sentence_anonymizer.hpp | 10 + concordia/t/CMakeLists.txt | 1 + concordia/t/test_regex_replacement.cpp | 26 ++ concordia/t/test_text_utils.cpp | 21 ++ tests/resources/anonymizer/html_tags.txt | 92 ++++++ tests/resources/anonymizer/named_entities.txt | 5 + tests/resources/anonymizer/space_symbols.txt | 4 + tests/resources/anonymizer/stop_symbols.txt | 39 +++ tests/resources/anonymizer/stop_words.txt | 274 ++++++++++++++++++ 16 files changed, 551 insertions(+), 6 deletions(-) create mode 100644 TODO.txt create mode 100644 concordia/common/text_utils.cpp create mode 100644 concordia/common/text_utils.hpp create mode 100644 concordia/t/test_text_utils.cpp create mode 100644 tests/resources/anonymizer/html_tags.txt create mode 100644 tests/resources/anonymizer/named_entities.txt create mode 100644 tests/resources/anonymizer/space_symbols.txt create mode 100644 tests/resources/anonymizer/stop_symbols.txt create mode 100644 tests/resources/anonymizer/stop_words.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index f4f27d9..8b46060 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,7 +74,7 @@ endif(WITH_PCRE) set(Boost_USE_STATIC_LIBS OFF) set(Boost_USE_STATIC_RUNTIME OFF) find_package(Boost COMPONENTS - serialization unit_test_framework system filesystem program_options iostreams regex REQUIRED) + serialization unit_test_framework system filesystem program_options iostreams regex locale REQUIRED) # ---------------------------------------------------- # libconfig diff --git a/TODO.txt b/TODO.txt new file mode 100644 
index 0000000..2f60d3f --- /dev/null +++ b/TODO.txt @@ -0,0 +1,3 @@ +1. lokalizowane to_lower +2. anonimizacja zdań +3. Dzielenie zdań (max 255 tokenów) diff --git a/concordia/CMakeLists.txt b/concordia/CMakeLists.txt index 6a971c7..e054bce 100644 --- a/concordia/CMakeLists.txt +++ b/concordia/CMakeLists.txt @@ -22,6 +22,7 @@ add_library(concordia SHARED concordia_exception.cpp common/logging.cpp common/utils.cpp + common/text_utils.cpp ) add_subdirectory(t) @@ -29,6 +30,8 @@ add_subdirectory(t) install(TARGETS concordia DESTINATION lib/) install(FILES + regex_replacement.hpp + sentence_anonymizer.hpp interval.hpp tm_matches.hpp anubis_search_result.hpp @@ -47,6 +50,7 @@ install(FILES common/config.hpp common/logging.hpp common/utils.hpp + common/text_utils.hpp DESTINATION include/concordia/common/) # ---------------------------------------------------- diff --git a/concordia/common/text_utils.cpp b/concordia/common/text_utils.cpp new file mode 100644 index 0000000..c38bc02 --- /dev/null +++ b/concordia/common/text_utils.cpp @@ -0,0 +1,29 @@ +#include "concordia/common/text_utils.hpp" +#include +#include +#include + + +using namespace boost::locale; + +string TextUtils::toLowerCase(const string & text) { + generator gen; + locale loc=gen("pl_PL.UTF-8"); + locale::global(loc); + cout.imbue(loc); + + string result = text; + result = boost::locale::to_lower(result, loc); /* to_lower returns the converted copy; it never modifies its argument */ + return result; +} + +string TextUtils::toUpperCase(const string & text) { + generator gen; + locale loc=gen("pl_PL.UTF-8"); + locale::global(loc); + cout.imbue(loc); + + string result = text; + result = boost::locale::to_upper(result, loc); /* to_upper returns the converted copy; it never modifies its argument */ + return result; +} diff --git a/concordia/common/text_utils.hpp b/concordia/common/text_utils.hpp new file mode 100644 index 0000000..0c6d1b4 --- /dev/null +++ b/concordia/common/text_utils.hpp @@ -0,0 +1,30 @@ +#ifndef TEXT_UTILS_HDR +#define TEXT_UTILS_HDR + +#include + + +using namespace std; + +/*! Utility class for performing simple string operations. 
+*/ +class TextUtils { +public: + + /*! A method for converting all string letters to lower case. + \param text input string + \returns lower case version of the input string. + */ + static string toLowerCase(const string & text); + + /*! A method for converting all string letters to upper case. + \param text input string + \returns upper case version of the input string. + */ + static string toUpperCase(const string & text); + +private: + +}; + +#endif diff --git a/concordia/regex_replacement.cpp b/concordia/regex_replacement.cpp index cc09964..0b5f7ab 100644 --- a/concordia/regex_replacement.cpp +++ b/concordia/regex_replacement.cpp @@ -3,11 +3,16 @@ #include #include -RegexReplacement::RegexReplacement(string patternString, string replacement) +RegexReplacement::RegexReplacement(string patternString, string replacement, + bool caseSensitive) throw(ConcordiaException): _replacement(replacement) { try { - _pattern = boost::regex(patternString); + if (caseSensitive) { + _pattern = boost::make_u32regex(patternString); + } else { + _pattern = boost::make_u32regex(patternString, boost::regex::icase); + } } catch ( const std::exception & e ) { stringstream ss; @@ -25,7 +30,7 @@ RegexReplacement::~RegexReplacement() { } string RegexReplacement::apply(const string & text) { - return boost::regex_replace(text, _pattern, _replacement, + return boost::u32regex_replace(text, _pattern, _replacement, boost::match_default | boost::format_all); } diff --git a/concordia/regex_replacement.hpp b/concordia/regex_replacement.hpp index 88c33ce..9684218 100644 --- a/concordia/regex_replacement.hpp +++ b/concordia/regex_replacement.hpp @@ -6,6 +6,7 @@ #include "concordia/concordia_exception.hpp" #include #include +#include /*! 
@@ -19,7 +20,8 @@ typedef boost::error_info my_tag_error_info; class RegexReplacement { public: - explicit RegexReplacement(string patternString, string replacement) + RegexReplacement(string patternString, string replacement, + bool caseSensitive = true) throw(ConcordiaException); /*! Destructor. @@ -29,7 +31,7 @@ public: string apply(const string & text); private: - boost::regex _pattern; + boost::u32regex _pattern; string _replacement; }; diff --git a/concordia/sentence_anonymizer.hpp b/concordia/sentence_anonymizer.hpp index 6d0ded3..6259818 100644 --- a/concordia/sentence_anonymizer.hpp +++ b/concordia/sentence_anonymizer.hpp @@ -3,9 +3,11 @@ #include #include "concordia/common/config.hpp" +#include "concordia/regex_replacement.hpp" #include "concordia/concordia_config.hpp" #include "concordia/concordia_exception.hpp" #include +#include /*! @@ -27,6 +29,14 @@ public: string anonymize(const string & sentence); private: + + boost::ptr_vector _namedEntities; + + boost::shared_ptr _stopWords; + + boost::shared_ptr _stopSymbols; + + boost::shared_ptr _spaceSymbols; }; #endif diff --git a/concordia/t/CMakeLists.txt b/concordia/t/CMakeLists.txt index 054178c..05f892d 100644 --- a/concordia/t/CMakeLists.txt +++ b/concordia/t/CMakeLists.txt @@ -1,4 +1,5 @@ add_library(concordia-tests + test_text_utils.cpp test_regex_replacement.cpp test_example.cpp test_tm_matches.cpp diff --git a/concordia/t/test_regex_replacement.cpp b/concordia/t/test_regex_replacement.cpp index 02a6065..23c613a 100644 --- a/concordia/t/test_regex_replacement.cpp +++ b/concordia/t/test_regex_replacement.cpp @@ -2,6 +2,8 @@ #include "concordia/regex_replacement.hpp" #include "concordia/common/config.hpp" #include +#include +#include using namespace std; @@ -39,4 +41,28 @@ BOOST_AUTO_TEST_CASE( BackrefReplacement ) BOOST_CHECK_EQUAL(rr.apply("This is 12 and this is 812."),"This is the number: 12 and this is the number: 812."); } +BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement ) +{ + RegexReplacement 
rr("abc","xxx", false); + BOOST_CHECK_EQUAL(rr.apply("This is AbC and ABC and abc and aBC."),"This is xxx and xxx and xxx and xxx."); +} + +BOOST_AUTO_TEST_CASE( UnicodeReplacement ) +{ + RegexReplacement rr("ą","x"); + BOOST_CHECK_EQUAL(rr.apply("zażółć gęślą jaźń"),"zażółć gęślx jaźń"); +} + +BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement ) +{ + RegexReplacement rr("ą","x", false); + BOOST_CHECK_EQUAL(rr.apply("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"),"zażółć gęślx jaźń ZAŻÓŁĆ GĘŚLx JAŹŃ"); +} + +BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement ) +{ + RegexReplacement rr("[ąćęłńóśżź]","x", false); + BOOST_CHECK_EQUAL(rr.apply("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"),"zaxxxx gxxlx jaxx ZAxxxx GxxLx JAxx"); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/concordia/t/test_text_utils.cpp b/concordia/t/test_text_utils.cpp new file mode 100644 index 0000000..fdfa6c3 --- /dev/null +++ b/concordia/t/test_text_utils.cpp @@ -0,0 +1,21 @@ +#include "tests/unit-tests/unit_tests_globals.hpp" +#include "concordia/common/config.hpp" +#include "concordia/common/text_utils.hpp" + +using namespace std; + +BOOST_AUTO_TEST_SUITE(text_utils) + +BOOST_AUTO_TEST_CASE( ToLower ) +{ + string str = "ZAŻÓŁĆ GĘŚLĄ JAŹŃ"; + BOOST_CHECK_EQUAL(TextUtils::toLowerCase(str),"zażółć gęślą jaźń"); +} + +BOOST_AUTO_TEST_CASE( ToUpper ) +{ + string str = "zażółć gęślą jaźń"; + BOOST_CHECK_EQUAL(TextUtils::toUpperCase(str),"ZAŻÓŁĆ GĘŚLĄ JAŹŃ"); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/resources/anonymizer/html_tags.txt b/tests/resources/anonymizer/html_tags.txt new file mode 100644 index 0000000..4cd5a33 --- /dev/null +++ b/tests/resources/anonymizer/html_tags.txt @@ -0,0 +1,92 @@ +a +abbr +acronym +address +applet +area +b +base +basefont +bdo +big +blockquote +body +br +button +caption +center +cite +code +col +colgroup +dd +del +dir +div +dfn +dl +dt +em +fieldset +font +form +frame +frameset +h1 +h2 +h3 +h4 +h5 +h6 +head +hr +html +i +iframe +img +input +ins +isindex +kbd 
+label +legend +li +link +map +menu +meta +noframes +noscript +object +ol +optgroup +option +p +param +pre +q +s +samp +script +select +small +span +strike +strong +style +sub +sup +table +tbody +td +textarea +tfoot +th +thead +title +tr +tt +u +ul +var +xmp diff --git a/tests/resources/anonymizer/named_entities.txt b/tests/resources/anonymizer/named_entities.txt new file mode 100644 index 0000000..759e306 --- /dev/null +++ b/tests/resources/anonymizer/named_entities.txt @@ -0,0 +1,5 @@ +[0-9]{1,2})[\.\-/]([0-9]{1,2})[\.\-/]([0-9]{4} NE_DATE +[A-ZĄŻŹŚĘĆŃÓŁ][[A-ZĄŻŹŚĘĆŃÓŁ][a-zążźśęćńół]\-\.]*\s+Sp\.\s+z\s+o\.\s*o\. NE_COMPANY +[A-ZĄŻŹŚĘĆŃÓŁ][[A-ZĄŻŹŚĘĆŃÓŁ][a-zążźśęćńół]\-\.]*\s+S\.?\s?A\.? NE_COMPANY +[\w\._\d]+@\w+(\.\w+)* NE_EMAIL +[0-9]+([\.\,][0-9]+)? NE_NUMBER diff --git a/tests/resources/anonymizer/space_symbols.txt b/tests/resources/anonymizer/space_symbols.txt new file mode 100644 index 0000000..af86cef --- /dev/null +++ b/tests/resources/anonymizer/space_symbols.txt @@ -0,0 +1,4 @@ +\| +\– +\- +\/ diff --git a/tests/resources/anonymizer/stop_symbols.txt b/tests/resources/anonymizer/stop_symbols.txt new file mode 100644 index 0000000..bf699f6 --- /dev/null +++ b/tests/resources/anonymizer/stop_symbols.txt @@ -0,0 +1,39 @@ +\\tab +\\emdash +\< +\> +\& +\" +\‐ +\  +< +> += +\+ +„ +” +\" +… +\. +\, +\? +! 
+; +: +' +\( +\) +\{ +\} +\@ +\# +\$ +\% +\^ +\& +\* +\[ +\] +\\ +\~ +&#\d+ diff --git a/tests/resources/anonymizer/stop_words.txt b/tests/resources/anonymizer/stop_words.txt new file mode 100644 index 0000000..383533c --- /dev/null +++ b/tests/resources/anonymizer/stop_words.txt @@ -0,0 +1,274 @@ +a +aby +ach +acz +aczkolwiek +aj +albo +ale +ależ +aż +bardziej +bardzo +bez +bo +bowiem +by +byli +bynajmniej +być +był +była +było +były +będzie +będą +cali +cała +cały +ci +cię +ciebie +co +cokolwiek +coś +czasami +czasem +czemu +czy +czyli +daleko +dla +dlaczego +dlatego +do +dobrze +dokąd +dość +dużo +dwa +dwaj +dwie +dwoje +dziś +dzisiaj +gdy +gdyby +gdyż +gdzie +gdziekolwiek +gdzieś +go +i +ich +ile +im +inna +inne +inny +innych +iż +ja +ją +jak +jakaś +jakby +jaki +jakichś +jakie +jakiś +jakiż +jakkolwiek +jako +jakoś +je +jeden +jedna +jedno +jednak +jednakże +jego +jej +jemu +jest +jestem +jeszcze +jeśli +jeżeli +już +ją +każdy +kiedy +kilka +kimś +kto +ktokolwiek +ktoś +która +które +którego +której +który +których +którym +którzy +ku +lat +lecz +lub +ma +mają +mam +mi +mimo +między +mną +mnie +mogą +moi +moim +moja +moje +może +możliwe +można +mój +mu +musi +my +na +nad +nam +nami +nas +nasi +nasz +nasza +nasze +naszego +naszych +natomiast +natychmiast +nawet +nią +nic +nich +nie +niego +niej +niemu +nigdy +nim +nimi +niż +no +o +obok +od +około +on +ona +one +oni +ono +oraz +oto +owszem +pan +pana +pani +po +pod +podczas +pomimo +ponad +ponieważ +powinien +powinna +powinni +powinno +poza +prawie +przecież +przed +przede +przedtem +przez +przy +roku +również +sam +sama +są +się +skąd +sobie +sobą +sposób +swoje +ta +tak +taka +taki +takie +także +tam +te +tego +tej +ten +teraz +też +to +tobą +tobie +toteż +trzeba +tu +tutaj +twoi +twoim +twoja +twoje +twym +twój +ty +tych +tylko +tym +u +w +wam +wami +was +wasz +wasza +wasze +we +według +wiele +wielu +więc +więcej +wszyscy +wszystkich +wszystkie +wszystkim +wszystko +wtedy +wy +właśnie +z +za +zapewne +zawsze 
+ze +znowu +znów +został +żaden +żadna +żadne +żadnych +że +żeby