text utils stub

Former-commit-id: d4459220f5696839d98848e9c30a61c084763a91
2014-04-24 08:36:48 +02:00 · 2014-04-24 08:36:48 +02:00 · 9358863f8d
commit 9358863f8d
parent 13c97f572d
16 changed files with 551 additions and 6 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -74,7 +74,7 @@ endif(WITH_PCRE)
 set(Boost_USE_STATIC_LIBS OFF)
 set(Boost_USE_STATIC_RUNTIME OFF)
 find_package(Boost COMPONENTS 
-      serialization unit_test_framework system filesystem program_options iostreams regex REQUIRED)
+      serialization unit_test_framework system filesystem program_options iostreams regex locale REQUIRED)

 # ----------------------------------------------------
 # libconfig
--- a/TODO.txt
+++ b/TODO.txt
@ -0,0 +1,3 @@
+1. lokalizowane to_lower
+2. anonimizacja zdań
+3. Dzielenie zdań (max 255 tokenów)
--- a/concordia/CMakeLists.txt
+++ b/concordia/CMakeLists.txt
@ -22,6 +22,7 @@ add_library(concordia SHARED
  concordia_exception.cpp
  common/logging.cpp
  common/utils.cpp
+  common/text_utils.cpp
  )

 add_subdirectory(t)
@ -29,6 +30,8 @@ add_subdirectory(t)

 install(TARGETS concordia DESTINATION lib/)
 install(FILES 
+          regex_replacement.hpp
+          sentence_anonymizer.hpp
          interval.hpp
          tm_matches.hpp
          anubis_search_result.hpp
@ -47,6 +50,7 @@ install(FILES
          common/config.hpp
          common/logging.hpp
          common/utils.hpp
+          common/text_utils.hpp
        DESTINATION include/concordia/common/)

 # ----------------------------------------------------
--- a/concordia/common/text_utils.cpp
+++ b/concordia/common/text_utils.cpp
@ -0,0 +1,29 @@
+#include "concordia/common/text_utils.hpp"
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/case_conv.hpp>
+#include <boost/locale.hpp>
+
+
+using namespace boost::locale;
+ 
+string TextUtils::toLowerCase(const string & text) {
+    // NOTE: the locale is hard-coded to Polish UTF-8.
+    generator gen;
+    locale loc = gen("pl_PL.UTF-8");
+
+    // boost::locale::to_lower() returns the converted copy and leaves its
+    // argument untouched, so its result must be returned. Passing loc
+    // explicitly removes the need to replace the global locale per call.
+    return boost::locale::to_lower(text, loc);
+}
+
+string TextUtils::toUpperCase(const string & text) {
+    // NOTE: the locale is hard-coded to Polish UTF-8.
+    generator gen;
+    locale loc = gen("pl_PL.UTF-8");
+
+    // boost::locale::to_upper() returns the converted copy and leaves its
+    // argument untouched, so its result must be returned. Passing loc
+    // explicitly removes the need to replace the global locale per call.
+    return boost::locale::to_upper(text, loc);
+}
--- a/concordia/common/text_utils.hpp
+++ b/concordia/common/text_utils.hpp
@ -0,0 +1,30 @@
+#ifndef TEXT_UTILS_HDR
+#define TEXT_UTILS_HDR
+
+#include <string>
+
+
+using namespace std;
+
+/*! Utility class grouping static, locale-aware string case conversions.
+*/
+class TextUtils {
+public:
+
+    /*! Converts a UTF-8 string to lower case (pl_PL.UTF-8 locale).
+     \param text input string; it is not modified
+     \returns lower case copy of the input string.
+    */
+    static string toLowerCase(const string & text);
+
+    /*! Converts a UTF-8 string to upper case (pl_PL.UTF-8 locale).
+     \param text input string; it is not modified
+     \returns upper case copy of the input string.
+    */
+    static string toUpperCase(const string & text);
+
+private:
+
+};
+
+#endif
--- a/concordia/regex_replacement.cpp
+++ b/concordia/regex_replacement.cpp
@ -3,11 +3,16 @@
 #include <boost/exception/all.hpp>
 #include <boost/throw_exception.hpp>

-RegexReplacement::RegexReplacement(string patternString, string replacement)
+RegexReplacement::RegexReplacement(string patternString, string replacement,
+                                   bool caseSensitive)
                                         throw(ConcordiaException):
                                         _replacement(replacement) {
    try {
-        _pattern = boost::regex(patternString);
+        if (caseSensitive) {
+            _pattern = boost::make_u32regex(patternString);
+        } else {
+            _pattern = boost::make_u32regex(patternString, boost::regex::icase);        
+        }
    } catch ( const std::exception & e ) {
        stringstream ss;
        
@ -25,7 +30,7 @@ RegexReplacement::~RegexReplacement() {
 }

 string RegexReplacement::apply(const string & text) {
-    return boost::regex_replace(text, _pattern, _replacement,
+    return boost::u32regex_replace(text, _pattern, _replacement,
                    boost::match_default | boost::format_all);
 }

--- a/concordia/regex_replacement.hpp
+++ b/concordia/regex_replacement.hpp
@ -6,6 +6,7 @@
 #include "concordia/concordia_exception.hpp"
 #include <boost/shared_ptr.hpp>
 #include <boost/regex.hpp>
+#include <boost/regex/icu.hpp>


 /*!
@ -19,7 +20,8 @@ typedef boost::error_info<struct my_tag,std::string> my_tag_error_info;

 class RegexReplacement {
 public:
-    explicit RegexReplacement(string patternString, string replacement)
+    RegexReplacement(string patternString, string replacement,
+                             bool caseSensitive = true)
                                              throw(ConcordiaException);

    /*! Destructor.
@ -29,7 +31,7 @@ public:
    string apply(const string & text);

 private:
-    boost::regex _pattern;
+    boost::u32regex _pattern;
    
    string _replacement;    
 };
--- a/concordia/sentence_anonymizer.hpp
+++ b/concordia/sentence_anonymizer.hpp
@ -3,9 +3,11 @@

 #include <string>
 #include "concordia/common/config.hpp"
+#include "concordia/regex_replacement.hpp"
 #include "concordia/concordia_config.hpp"
 #include "concordia/concordia_exception.hpp"
 #include <boost/shared_ptr.hpp>
+#include <boost/ptr_container/ptr_vector.hpp>


 /*!
@ -27,6 +29,14 @@ public:
    string anonymize(const string & sentence);

 private:
+
+    boost::ptr_vector<RegexReplacement> _namedEntities;
+
+    boost::shared_ptr<RegexReplacement> _stopWords;
+    
+    boost::shared_ptr<RegexReplacement> _stopSymbols;
+    
+    boost::shared_ptr<RegexReplacement> _spaceSymbols;    
 };

 #endif
--- a/concordia/t/CMakeLists.txt
+++ b/concordia/t/CMakeLists.txt
@ -1,4 +1,5 @@
 add_library(concordia-tests
+  test_text_utils.cpp
  test_regex_replacement.cpp
  test_example.cpp
  test_tm_matches.cpp
--- a/concordia/t/test_regex_replacement.cpp
+++ b/concordia/t/test_regex_replacement.cpp
@ -2,6 +2,8 @@
 #include "concordia/regex_replacement.hpp"
 #include "concordia/common/config.hpp"
 #include <boost/algorithm/string/predicate.hpp>
+#include <boost/locale.hpp>
+#include <boost/algorithm/string/case_conv.hpp>

 using namespace std;

@ -39,4 +41,28 @@ BOOST_AUTO_TEST_CASE( BackrefReplacement )
    BOOST_CHECK_EQUAL(rr.apply("This is 12 and this is 812."),"This is the number: 12 and this is the number: 812.");
 }

+BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
+{
+    RegexReplacement rr("abc","xxx", false); // caseSensitive = false: "abc" must match in any casing
+    BOOST_CHECK_EQUAL(rr.apply("This is AbC and ABC and abc and aBC."),"This is xxx and xxx and xxx and xxx.");
+}
+
+BOOST_AUTO_TEST_CASE( UnicodeReplacement )
+{
+    RegexReplacement rr("ą","x"); // caseSensitive defaults to true; the pattern is a non-ASCII character
+    BOOST_CHECK_EQUAL(rr.apply("zażółć gęślą jaźń"),"zażółć gęślx jaźń");
+}
+
+BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
+{
+    RegexReplacement rr("ą","x", false); // caseSensitive = false: both "ą" and "Ą" are expected to be replaced
+    BOOST_CHECK_EQUAL(rr.apply("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"),"zażółć gęślx jaźń ZAŻÓŁĆ GĘŚLx JAŹŃ");
+}
+
+BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
+{
+    RegexReplacement rr("[ąćęłńóśżź]","x", false); // caseSensitive = false on a character class of Polish diacritics
+    BOOST_CHECK_EQUAL(rr.apply("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"),"zaxxxx gxxlx jaxx ZAxxxx GxxLx JAxx");
+}
+
 BOOST_AUTO_TEST_SUITE_END()
--- a/concordia/t/test_text_utils.cpp
+++ b/concordia/t/test_text_utils.cpp
@ -0,0 +1,21 @@
+#include "tests/unit-tests/unit_tests_globals.hpp"
+#include "concordia/common/config.hpp"
+#include "concordia/common/text_utils.hpp"
+
+using namespace std;
+
+BOOST_AUTO_TEST_SUITE(text_utils)
+
+BOOST_AUTO_TEST_CASE( ToLower )
+{
+    // The input covers all nine Polish diacritic letters, not just ASCII.
+    BOOST_CHECK_EQUAL(TextUtils::toLowerCase("ZAŻÓŁĆ GĘŚLĄ JAŹŃ"),"zażółć gęślą jaźń");
+}
+
+BOOST_AUTO_TEST_CASE( ToUpper )
+{
+    // The input covers all nine Polish diacritic letters, not just ASCII.
+    BOOST_CHECK_EQUAL(TextUtils::toUpperCase("zażółć gęślą jaźń"),"ZAŻÓŁĆ GĘŚLĄ JAŹŃ");
+}
+
+BOOST_AUTO_TEST_SUITE_END()
--- a/tests/resources/anonymizer/html_tags.txt
+++ b/tests/resources/anonymizer/html_tags.txt
@ -0,0 +1,92 @@
+a
+abbr
+acronym
+address
+applet
+area
+b
+base
+basefont
+bdo
+big
+blockquote
+body
+br
+button
+caption
+center
+cite
+code
+col
+colgroup
+dd
+del
+dir
+div
+dfn
+dl
+dt
+em
+fieldset
+font
+form
+frame
+frameset
+h1
+h2
+h3
+h4
+h5
+h6
+head
+hr
+html
+i
+iframe
+img
+input
+ins
+isindex
+kbd
+label
+legend
+li
+link
+map
+menu
+meta
+noframes
+noscript
+object
+ol
+optgroup
+option
+p
+param
+pre
+q
+s
+samp
+script
+select
+small
+span
+strike
+strong
+style
+sub
+sup
+table
+tbody
+td
+textarea
+tfoot
+th
+thead
+title
+tr
+tt
+u
+ul
+var
+xmp
--- a/tests/resources/anonymizer/named_entities.txt
+++ b/tests/resources/anonymizer/named_entities.txt
@ -0,0 +1,5 @@
+[0-9]{1,2})[\.\-/]([0-9]{1,2})[\.\-/]([0-9]{4}  NE_DATE
+[A-ZĄŻŹŚĘĆŃÓŁ][[A-ZĄŻŹŚĘĆŃÓŁ][a-zążźśęćńół]\-\.]*\s+Sp\.\s+z\s+o\.\s*o\.    NE_COMPANY
+[A-ZĄŻŹŚĘĆŃÓŁ][[A-ZĄŻŹŚĘĆŃÓŁ][a-zążźśęćńół]\-\.]*\s+S\.?\s?A\.? NE_COMPANY
+[\w\._\d]+@\w+(\.\w+)*  NE_EMAIL
+[0-9]+([\.\,][0-9]+)? NE_NUMBER
--- a/tests/resources/anonymizer/space_symbols.txt
+++ b/tests/resources/anonymizer/space_symbols.txt
@ -0,0 +1,4 @@
+\|
+\–
+\-
+\/
--- a/tests/resources/anonymizer/stop_symbols.txt
+++ b/tests/resources/anonymizer/stop_symbols.txt
@ -0,0 +1,39 @@
+\\tab
+\\emdash
+\&lt;
+\&gt;
+\&amp;
+\&quot;
+\&dash;
+\&nbsp;
+<
+>
+=
+\+
+„
+”
+\"
+…
+\.
+\,
+\?
+!
+;
+:
+'
+\(
+\)
+\{
+\}
+\@
+\#
+\$
+\%
+\^
+\&
+\*
+\[
+\]
+\\
+\~
+&#\d+
--- a/tests/resources/anonymizer/stop_words.txt
+++ b/tests/resources/anonymizer/stop_words.txt
@ -0,0 +1,274 @@
+a
+aby
+ach
+acz
+aczkolwiek
+aj
+albo
+ale
+ależ
+aż
+bardziej
+bardzo
+bez
+bo
+bowiem
+by
+byli
+bynajmniej
+być
+był
+była
+było
+były
+będzie
+będą
+cali
+cała
+cały
+ci
+cię
+ciebie
+co
+cokolwiek
+coś
+czasami
+czasem
+czemu
+czy
+czyli
+daleko
+dla
+dlaczego
+dlatego
+do
+dobrze
+dokąd
+dość
+dużo
+dwa
+dwaj
+dwie
+dwoje
+dziś
+dzisiaj
+gdy
+gdyby
+gdyż
+gdzie
+gdziekolwiek
+gdzieś
+go
+i
+ich
+ile
+im
+inna
+inne
+inny
+innych
+iż
+ja
+ją
+jak
+jakaś
+jakby
+jaki
+jakichś
+jakie
+jakiś
+jakiż
+jakkolwiek
+jako
+jakoś
+je
+jeden
+jedna
+jedno
+jednak
+jednakże
+jego
+jej
+jemu
+jest
+jestem
+jeszcze
+jeśli
+jeżeli
+już
+ją
+każdy
+kiedy
+kilka
+kimś
+kto
+ktokolwiek
+ktoś
+która
+które
+którego
+której
+który
+których
+którym
+którzy
+ku
+lat
+lecz
+lub
+ma
+mają
+mam
+mi
+mimo
+między
+mną
+mnie
+mogą
+moi
+moim
+moja
+moje
+może
+możliwe
+można
+mój
+mu
+musi
+my
+na
+nad
+nam
+nami
+nas
+nasi
+nasz
+nasza
+nasze
+naszego
+naszych
+natomiast
+natychmiast
+nawet
+nią
+nic
+nich
+nie
+niego
+niej
+niemu
+nigdy
+nim
+nimi
+niż
+no
+o
+obok
+od
+około
+on
+ona
+one
+oni
+ono
+oraz
+oto
+owszem
+pan
+pana
+pani
+po
+pod
+podczas
+pomimo
+ponad
+ponieważ
+powinien
+powinna
+powinni
+powinno
+poza
+prawie
+przecież
+przed
+przede
+przedtem
+przez
+przy
+roku
+również
+sam
+sama
+są
+się
+skąd
+sobie
+sobą
+sposób
+swoje
+ta
+tak
+taka
+taki
+takie
+także
+tam
+te
+tego
+tej
+ten
+teraz
+też
+to
+tobą
+tobie
+toteż
+trzeba
+tu
+tutaj
+twoi
+twoim
+twoja
+twoje
+twym
+twój
+ty
+tych
+tylko
+tym
+u
+w
+wam
+wami
+was
+wasz
+wasza
+wasze
+we
+według
+wiele
+wielu
+więc
+więcej
+wszyscy
+wszystkich
+wszystkie
+wszystkim
+wszystko
+wtedy
+wy
+właśnie
+z
+za
+zapewne
+zawsze
+ze
+znowu
+znów
+został
+żaden
+żadna
+żadne
+żadnych
+że
+żeby