From 9358863f8ddbc1572ae5c19088f5ae63ffbe6a0f Mon Sep 17 00:00:00 2001
From: rjawor <rjawor@amu.edu.pl>
Date: Thu, 24 Apr 2014 08:36:48 +0200
Subject: [PATCH] text utils stub

Former-commit-id: d4459220f5696839d98848e9c30a61c084763a91
---
 CMakeLists.txt                                |   2 +-
 TODO.txt                                      |   3 +
 concordia/CMakeLists.txt                      |   4 +
 concordia/common/text_utils.cpp               |  29 ++
 concordia/common/text_utils.hpp               |  30 ++
 concordia/regex_replacement.cpp               |  11 +-
 concordia/regex_replacement.hpp               |   6 +-
 concordia/sentence_anonymizer.hpp             |  10 +
 concordia/t/CMakeLists.txt                    |   1 +
 concordia/t/test_regex_replacement.cpp        |  26 ++
 concordia/t/test_text_utils.cpp               |  21 ++
 tests/resources/anonymizer/html_tags.txt      |  92 ++++++
 tests/resources/anonymizer/named_entities.txt |   5 +
 tests/resources/anonymizer/space_symbols.txt  |   4 +
 tests/resources/anonymizer/stop_symbols.txt   |  39 +++
 tests/resources/anonymizer/stop_words.txt     | 274 ++++++++++++++++++
 16 files changed, 551 insertions(+), 6 deletions(-)
 create mode 100644 TODO.txt
 create mode 100644 concordia/common/text_utils.cpp
 create mode 100644 concordia/common/text_utils.hpp
 create mode 100644 concordia/t/test_text_utils.cpp
 create mode 100644 tests/resources/anonymizer/html_tags.txt
 create mode 100644 tests/resources/anonymizer/named_entities.txt
 create mode 100644 tests/resources/anonymizer/space_symbols.txt
 create mode 100644 tests/resources/anonymizer/stop_symbols.txt
 create mode 100644 tests/resources/anonymizer/stop_words.txt

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f4f27d9..8b46060 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -74,7 +74,7 @@ endif(WITH_PCRE)
 set(Boost_USE_STATIC_LIBS OFF)
 set(Boost_USE_STATIC_RUNTIME OFF)
 find_package(Boost COMPONENTS 
-      serialization unit_test_framework system filesystem program_options iostreams regex REQUIRED)
+      serialization unit_test_framework system filesystem program_options iostreams regex locale REQUIRED)
 
 # ----------------------------------------------------
 # libconfig
diff --git a/TODO.txt b/TODO.txt
new file mode 100644
index 0000000..2f60d3f
--- /dev/null
+++ b/TODO.txt
@@ -0,0 +1,3 @@
+1. lokalizowane to_lower
+2. anonimizacja zdań
+3. Dzielenie zdań (max 255 tokenów)
diff --git a/concordia/CMakeLists.txt b/concordia/CMakeLists.txt
index 6a971c7..e054bce 100644
--- a/concordia/CMakeLists.txt
+++ b/concordia/CMakeLists.txt
@@ -22,6 +22,7 @@ add_library(concordia SHARED
   concordia_exception.cpp
   common/logging.cpp
   common/utils.cpp
+  common/text_utils.cpp
   )
 
 add_subdirectory(t)
@@ -29,6 +30,8 @@ add_subdirectory(t)
 
 install(TARGETS concordia DESTINATION lib/)
 install(FILES 
+          regex_replacement.hpp
+          sentence_anonymizer.hpp
           interval.hpp
           tm_matches.hpp
           anubis_search_result.hpp
@@ -47,6 +50,7 @@ install(FILES
           common/config.hpp
           common/logging.hpp
           common/utils.hpp
+          common/text_utils.hpp
         DESTINATION include/concordia/common/)
 
 # ----------------------------------------------------
diff --git a/concordia/common/text_utils.cpp b/concordia/common/text_utils.cpp
new file mode 100644
index 0000000..c38bc02
--- /dev/null
+++ b/concordia/common/text_utils.cpp
@@ -0,0 +1,29 @@
+#include "concordia/common/text_utils.hpp"
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/case_conv.hpp>
+#include <boost/locale.hpp>
+
+
+using namespace boost::locale;
+ 
+// Returns a lower case copy of text, converted with the pl_PL.UTF-8 locale.
+string TextUtils::toLowerCase(const string & text) {
+    // The locale is passed explicitly to the conversion; the process-wide
+    // global locale and the locale of cout are deliberately left untouched.
+    generator gen;
+    locale loc = gen("pl_PL.UTF-8");
+    // boost::locale::to_lower returns the converted string and does not
+    // modify its argument, so the returned value has to be propagated.
+    return boost::locale::to_lower(text, loc);
+}
+
+// Returns an upper case copy of text, converted with the pl_PL.UTF-8 locale.
+string TextUtils::toUpperCase(const string & text) {
+    // The locale is passed explicitly to the conversion; the process-wide
+    // global locale and the locale of cout are deliberately left untouched.
+    generator gen;
+    locale loc = gen("pl_PL.UTF-8");
+    // boost::locale::to_upper returns the converted string and does not
+    // modify its argument, so the returned value has to be propagated.
+    return boost::locale::to_upper(text, loc);
+}
diff --git a/concordia/common/text_utils.hpp b/concordia/common/text_utils.hpp
new file mode 100644
index 0000000..0c6d1b4
--- /dev/null
+++ b/concordia/common/text_utils.hpp
@@ -0,0 +1,30 @@
+#ifndef TEXT_UTILS_HDR
+#define TEXT_UTILS_HDR
+
+#include <string>
+
+
+using namespace std;
+
+/*! Stateless collection of string helpers; every member is static.
+ *  The functions are declared here and defined in text_utils.cpp.
+ */
+class TextUtils {
+public:
+
+    /*! Produces a copy of the given string with its letters in lower case.
+     \param text string to convert; taken by const reference
+     \returns the lower case copy of text
+    */
+    static std::string toLowerCase(const std::string & text);
+
+    /*! Produces a copy of the given string with its letters in upper case.
+     \param text string to convert; taken by const reference
+     \returns the upper case copy of text
+    */
+    static std::string toUpperCase(const std::string & text);
+
+    // No data members: the class only groups the static helpers above.
+};
+
+#endif
diff --git a/concordia/regex_replacement.cpp b/concordia/regex_replacement.cpp
index cc09964..0b5f7ab 100644
--- a/concordia/regex_replacement.cpp
+++ b/concordia/regex_replacement.cpp
@@ -3,11 +3,16 @@
 #include <boost/exception/all.hpp>
 #include <boost/throw_exception.hpp>
 
-RegexReplacement::RegexReplacement(string patternString, string replacement)
+RegexReplacement::RegexReplacement(string patternString, string replacement,
+                                   bool caseSensitive)
                                          throw(ConcordiaException):
                                          _replacement(replacement) {
     try {
-        _pattern = boost::regex(patternString);
+        if (caseSensitive) {
+            _pattern = boost::make_u32regex(patternString);
+        } else {
+            _pattern = boost::make_u32regex(patternString, boost::regex::icase);        
+        }
     } catch ( const std::exception & e ) {
         stringstream ss;
         
@@ -25,7 +30,7 @@ RegexReplacement::~RegexReplacement() {
 }
 
 string RegexReplacement::apply(const string & text) {
-    return boost::regex_replace(text, _pattern, _replacement,
+    return boost::u32regex_replace(text, _pattern, _replacement,
                     boost::match_default | boost::format_all);
 }
 
diff --git a/concordia/regex_replacement.hpp b/concordia/regex_replacement.hpp
index 88c33ce..9684218 100644
--- a/concordia/regex_replacement.hpp
+++ b/concordia/regex_replacement.hpp
@@ -6,6 +6,7 @@
 #include "concordia/concordia_exception.hpp"
 #include <boost/shared_ptr.hpp>
 #include <boost/regex.hpp>
+#include <boost/regex/icu.hpp>
 
 
 /*!
@@ -19,7 +20,8 @@ typedef boost::error_info<struct my_tag,std::string> my_tag_error_info;
 
 class RegexReplacement {
 public:
-    explicit RegexReplacement(string patternString, string replacement)
+    RegexReplacement(string patternString, string replacement,
+                             bool caseSensitive = true)
                                               throw(ConcordiaException);
 
     /*! Destructor.
@@ -29,7 +31,7 @@ public:
     string apply(const string & text);
 
 private:
-    boost::regex _pattern;
+    boost::u32regex _pattern;
     
     string _replacement;    
 };
diff --git a/concordia/sentence_anonymizer.hpp b/concordia/sentence_anonymizer.hpp
index 6d0ded3..6259818 100644
--- a/concordia/sentence_anonymizer.hpp
+++ b/concordia/sentence_anonymizer.hpp
@@ -3,9 +3,11 @@
 
 #include <string>
 #include "concordia/common/config.hpp"
+#include "concordia/regex_replacement.hpp"
 #include "concordia/concordia_config.hpp"
 #include "concordia/concordia_exception.hpp"
 #include <boost/shared_ptr.hpp>
+#include <boost/ptr_container/ptr_vector.hpp>
 
 
 /*!
@@ -27,6 +29,14 @@ public:
     string anonymize(const string & sentence);
 
 private:
+
+    boost::ptr_vector<RegexReplacement> _namedEntities;
+
+    boost::shared_ptr<RegexReplacement> _stopWords;
+    
+    boost::shared_ptr<RegexReplacement> _stopSymbols;
+    
+    boost::shared_ptr<RegexReplacement> _spaceSymbols;    
 };
 
 #endif
diff --git a/concordia/t/CMakeLists.txt b/concordia/t/CMakeLists.txt
index 054178c..05f892d 100644
--- a/concordia/t/CMakeLists.txt
+++ b/concordia/t/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_library(concordia-tests
+  test_text_utils.cpp
   test_regex_replacement.cpp
   test_example.cpp
   test_tm_matches.cpp
diff --git a/concordia/t/test_regex_replacement.cpp b/concordia/t/test_regex_replacement.cpp
index 02a6065..23c613a 100644
--- a/concordia/t/test_regex_replacement.cpp
+++ b/concordia/t/test_regex_replacement.cpp
@@ -2,6 +2,8 @@
 #include "concordia/regex_replacement.hpp"
 #include "concordia/common/config.hpp"
 #include <boost/algorithm/string/predicate.hpp>
+#include <boost/locale.hpp>
+#include <boost/algorithm/string/case_conv.hpp>
 
 using namespace std;
 
@@ -39,4 +41,28 @@ BOOST_AUTO_TEST_CASE( BackrefReplacement )
     BOOST_CHECK_EQUAL(rr.apply("This is 12 and this is 812."),"This is the number: 12 and this is the number: 812.");
 }
 
+BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement ) {
+    // Passing false as the third argument requests case-insensitive matching.
+    RegexReplacement replacer("abc", "xxx", false);
+    BOOST_CHECK_EQUAL(replacer.apply("This is AbC and ABC and abc and aBC."), "This is xxx and xxx and xxx and xxx.");
+}
+
+BOOST_AUTO_TEST_CASE( UnicodeReplacement ) {
+    // Default (case-sensitive) mode with a non-ASCII pattern character.
+    RegexReplacement replacer("ą", "x");
+    BOOST_CHECK_EQUAL(replacer.apply("zażółć gęślą jaźń"), "zażółć gęślx jaźń");
+}
+
+BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement ) {
+    // Case-insensitive mode must also replace the upper case form of the letter.
+    RegexReplacement replacer("ą", "x", false);
+    BOOST_CHECK_EQUAL(replacer.apply("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"), "zażółć gęślx jaźń ZAŻÓŁĆ GĘŚLx JAŹŃ");
+}
+
+BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement ) {
+    // A lower case character class must match both cases of every listed letter.
+    RegexReplacement replacer("[ąćęłńóśżź]", "x", false);
+    BOOST_CHECK_EQUAL(replacer.apply("zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ"), "zaxxxx gxxlx jaxx ZAxxxx GxxLx JAxx");
+}
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/concordia/t/test_text_utils.cpp b/concordia/t/test_text_utils.cpp
new file mode 100644
index 0000000..fdfa6c3
--- /dev/null
+++ b/concordia/t/test_text_utils.cpp
@@ -0,0 +1,21 @@
+#include "tests/unit-tests/unit_tests_globals.hpp"
+#include "concordia/common/config.hpp"
+#include "concordia/common/text_utils.hpp"
+
+using namespace std;
+
+BOOST_AUTO_TEST_SUITE(text_utils)
+
+BOOST_AUTO_TEST_CASE( ToLower ) {
+    // The input consists of upper case Polish letters, including diacritics.
+    const string upperCased = "ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
+    BOOST_CHECK_EQUAL(TextUtils::toLowerCase(upperCased), "zażółć gęślą jaźń");
+}
+
+BOOST_AUTO_TEST_CASE( ToUpper ) {
+    // The input consists of lower case Polish letters, including diacritics.
+    const string lowerCased = "zażółć gęślą jaźń";
+    BOOST_CHECK_EQUAL(TextUtils::toUpperCase(lowerCased), "ZAŻÓŁĆ GĘŚLĄ JAŹŃ");
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/tests/resources/anonymizer/html_tags.txt b/tests/resources/anonymizer/html_tags.txt
new file mode 100644
index 0000000..4cd5a33
--- /dev/null
+++ b/tests/resources/anonymizer/html_tags.txt
@@ -0,0 +1,92 @@
+a
+abbr
+acronym
+address
+applet
+area
+b
+base
+basefont
+bdo
+big
+blockquote
+body
+br
+button
+caption
+center
+cite
+code
+col
+colgroup
+dd
+del
+dir
+div
+dfn
+dl
+dt
+em
+fieldset
+font
+form
+frame
+frameset
+h1
+h2
+h3
+h4
+h5
+h6
+head
+hr
+html
+i
+iframe
+img
+input
+ins
+isindex
+kbd
+label
+legend
+li
+link
+map
+menu
+meta
+noframes
+noscript
+object
+ol
+optgroup
+option
+p
+param
+pre
+q
+s
+samp
+script
+select
+small
+span
+strike
+strong
+style
+sub
+sup
+table
+tbody
+td
+textarea
+tfoot
+th
+thead
+title
+tr
+tt
+u
+ul
+var
+xmp
diff --git a/tests/resources/anonymizer/named_entities.txt b/tests/resources/anonymizer/named_entities.txt
new file mode 100644
index 0000000..759e306
--- /dev/null
+++ b/tests/resources/anonymizer/named_entities.txt
@@ -0,0 +1,5 @@
+([0-9]{1,2})[\.\-/]([0-9]{1,2})[\.\-/]([0-9]{4})  NE_DATE
+[A-ZĄŻŹŚĘĆŃÓŁ][A-ZĄŻŹŚĘĆŃÓŁa-zążźśęćńół\-\.]*\s+Sp\.\s+z\s+o\.\s*o\.    NE_COMPANY
+[A-ZĄŻŹŚĘĆŃÓŁ][A-ZĄŻŹŚĘĆŃÓŁa-zążźśęćńół\-\.]*\s+S\.?\s?A\.? NE_COMPANY
+[\w\._\d]+@\w+(\.\w+)*  NE_EMAIL
+[0-9]+([\.\,][0-9]+)? NE_NUMBER
diff --git a/tests/resources/anonymizer/space_symbols.txt b/tests/resources/anonymizer/space_symbols.txt
new file mode 100644
index 0000000..af86cef
--- /dev/null
+++ b/tests/resources/anonymizer/space_symbols.txt
@@ -0,0 +1,4 @@
+\|
+\–
+\-
+\/
diff --git a/tests/resources/anonymizer/stop_symbols.txt b/tests/resources/anonymizer/stop_symbols.txt
new file mode 100644
index 0000000..bf699f6
--- /dev/null
+++ b/tests/resources/anonymizer/stop_symbols.txt
@@ -0,0 +1,39 @@
+\\tab
+\\emdash
+\&lt;
+\&gt;
+\&amp;
+\&quot;
+\&dash;
+\&nbsp;
+<
+>
+=
+\+
+„
+”
+\"
+…
+\.
+\,
+\?
+!
+;
+:
+'
+\(
+\)
+\{
+\}
+\@
+\#
+\$
+\%
+\^
+\&
+\*
+\[
+\]
+\\
+\~
+&#\d+
diff --git a/tests/resources/anonymizer/stop_words.txt b/tests/resources/anonymizer/stop_words.txt
new file mode 100644
index 0000000..383533c
--- /dev/null
+++ b/tests/resources/anonymizer/stop_words.txt
@@ -0,0 +1,274 @@
+a
+aby
+ach
+acz
+aczkolwiek
+aj
+albo
+ale
+ależ
+aż
+bardziej
+bardzo
+bez
+bo
+bowiem
+by
+byli
+bynajmniej
+być
+był
+była
+było
+były
+będzie
+będą
+cali
+cała
+cały
+ci
+cię
+ciebie
+co
+cokolwiek
+coś
+czasami
+czasem
+czemu
+czy
+czyli
+daleko
+dla
+dlaczego
+dlatego
+do
+dobrze
+dokąd
+dość
+dużo
+dwa
+dwaj
+dwie
+dwoje
+dziś
+dzisiaj
+gdy
+gdyby
+gdyż
+gdzie
+gdziekolwiek
+gdzieś
+go
+i
+ich
+ile
+im
+inna
+inne
+inny
+innych
+iż
+ja
+ją
+jak
+jakaś
+jakby
+jaki
+jakichś
+jakie
+jakiś
+jakiż
+jakkolwiek
+jako
+jakoś
+je
+jeden
+jedna
+jedno
+jednak
+jednakże
+jego
+jej
+jemu
+jest
+jestem
+jeszcze
+jeśli
+jeżeli
+już
+ją
+każdy
+kiedy
+kilka
+kimś
+kto
+ktokolwiek
+ktoś
+która
+które
+którego
+której
+który
+których
+którym
+którzy
+ku
+lat
+lecz
+lub
+ma
+mają
+mam
+mi
+mimo
+między
+mną
+mnie
+mogą
+moi
+moim
+moja
+moje
+może
+możliwe
+można
+mój
+mu
+musi
+my
+na
+nad
+nam
+nami
+nas
+nasi
+nasz
+nasza
+nasze
+naszego
+naszych
+natomiast
+natychmiast
+nawet
+nią
+nic
+nich
+nie
+niego
+niej
+niemu
+nigdy
+nim
+nimi
+niż
+no
+o
+obok
+od
+około
+on
+ona
+one
+oni
+ono
+oraz
+oto
+owszem
+pan
+pana
+pani
+po
+pod
+podczas
+pomimo
+ponad
+ponieważ
+powinien
+powinna
+powinni
+powinno
+poza
+prawie
+przecież
+przed
+przede
+przedtem
+przez
+przy
+roku
+również
+sam
+sama
+są
+się
+skąd
+sobie
+sobą
+sposób
+swoje
+ta
+tak
+taka
+taki
+takie
+także
+tam
+te
+tego
+tej
+ten
+teraz
+też
+to
+tobą
+tobie
+toteż
+trzeba
+tu
+tutaj
+twoi
+twoim
+twoja
+twoje
+twym
+twój
+ty
+tych
+tylko
+tym
+u
+w
+wam
+wami
+was
+wasz
+wasza
+wasze
+we
+według
+wiele
+wielu
+więc
+więcej
+wszyscy
+wszystkich
+wszystkie
+wszystkim
+wszystko
+wtedy
+wy
+właśnie
+z
+za
+zapewne
+zawsze
+ze
+znowu
+znów
+został
+żaden
+żadna
+żadne
+żadnych
+że
+żeby