text utils stub
Former-commit-id: d4459220f5696839d98848e9c30a61c084763a91
This commit is contained in:
parent
13c97f572d
commit
9358863f8d
@ -74,7 +74,7 @@ endif(WITH_PCRE)
|
||||
set(Boost_USE_STATIC_LIBS OFF)
|
||||
set(Boost_USE_STATIC_RUNTIME OFF)
|
||||
find_package(Boost COMPONENTS
|
||||
serialization unit_test_framework system filesystem program_options iostreams regex REQUIRED)
|
||||
serialization unit_test_framework system filesystem program_options iostreams regex locale REQUIRED)
|
||||
|
||||
# ----------------------------------------------------
|
||||
# libconfig
|
||||
|
3
TODO.txt
Normal file
3
TODO.txt
Normal file
@ -0,0 +1,3 @@
|
||||
1. lokalizowane to_lower
|
||||
2. anonimizacja zdań
|
||||
3. Dzielenie zdań (max 255 tokenów)
|
@ -22,6 +22,7 @@ add_library(concordia SHARED
|
||||
concordia_exception.cpp
|
||||
common/logging.cpp
|
||||
common/utils.cpp
|
||||
common/text_utils.cpp
|
||||
)
|
||||
|
||||
add_subdirectory(t)
|
||||
@ -29,6 +30,8 @@ add_subdirectory(t)
|
||||
|
||||
install(TARGETS concordia DESTINATION lib/)
|
||||
install(FILES
|
||||
regex_replacement.hpp
|
||||
sentence_anonymizer.hpp
|
||||
interval.hpp
|
||||
tm_matches.hpp
|
||||
anubis_search_result.hpp
|
||||
@ -47,6 +50,7 @@ install(FILES
|
||||
common/config.hpp
|
||||
common/logging.hpp
|
||||
common/utils.hpp
|
||||
common/text_utils.hpp
|
||||
DESTINATION include/concordia/common/)
|
||||
|
||||
# ----------------------------------------------------
|
||||
|
29
concordia/common/text_utils.cpp
Normal file
29
concordia/common/text_utils.cpp
Normal file
@ -0,0 +1,29 @@
|
||||
#include "concordia/common/text_utils.hpp"
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <boost/algorithm/string/case_conv.hpp>
|
||||
#include <boost/locale.hpp>
|
||||
|
||||
|
||||
using namespace boost::locale;
|
||||
|
||||
/*! Converts all letters of the input string to lower case.

    The conversion is performed by Boost.Locale with a locale generated
    for the name "pl_PL.UTF-8".

    \param text input string
    \returns lower case version of the input string.
*/
string TextUtils::toLowerCase(const string & text) {
    generator gen;
    locale loc = gen("pl_PL.UTF-8");

    // Side effects kept from the original implementation: the generated
    // locale becomes the process-wide global locale and is imbued into cout.
    // NOTE(review): the conversion below no longer needs them, because the
    // locale is passed explicitly - confirm whether any caller relies on
    // these side effects before removing them.
    locale::global(loc);
    cout.imbue(loc);

    // boost::locale::to_lower does not modify its argument; it returns the
    // converted copy. The previous code discarded that return value and
    // handed back the unchanged input.
    return boost::locale::to_lower(text, loc);
}
|
||||
|
||||
/*! Converts all letters of the input string to upper case.

    The conversion is performed by Boost.Locale with a locale generated
    for the name "pl_PL.UTF-8".

    \param text input string
    \returns upper case version of the input string.
*/
string TextUtils::toUpperCase(const string & text) {
    generator gen;
    locale loc = gen("pl_PL.UTF-8");

    // Side effects kept from the original implementation: the generated
    // locale becomes the process-wide global locale and is imbued into cout.
    // NOTE(review): the conversion below no longer needs them, because the
    // locale is passed explicitly - confirm whether any caller relies on
    // these side effects before removing them.
    locale::global(loc);
    cout.imbue(loc);

    // boost::locale::to_upper does not modify its argument; it returns the
    // converted copy. The previous code discarded that return value and
    // handed back the unchanged input.
    return boost::locale::to_upper(text, loc);
}
|
30
concordia/common/text_utils.hpp
Normal file
30
concordia/common/text_utils.hpp
Normal file
@ -0,0 +1,30 @@
|
||||
#ifndef TEXT_UTILS_HDR
|
||||
#define TEXT_UTILS_HDR
|
||||
|
||||
#include <string>
|
||||
|
||||
|
||||
using namespace std;
|
||||
|
||||
/*! Utility class for performing simple string operations.
|
||||
*/
|
||||
class TextUtils {
|
||||
public:
|
||||
|
||||
/*! A method for converting all string letters to lower case.
|
||||
\param text input string
|
||||
\returns lower case version of the input string.
|
||||
*/
|
||||
static string toLowerCase(const string & text);
|
||||
|
||||
/*! A method for converting all string letters to upper case.
|
||||
\param text input string
|
||||
\returns upper case version of the input string.
|
||||
*/
|
||||
static string toUpperCase(const string & text);
|
||||
|
||||
private:
|
||||
|
||||
};
|
||||
|
||||
#endif
|
@ -3,11 +3,16 @@
|
||||
#include <boost/exception/all.hpp>
|
||||
#include <boost/throw_exception.hpp>
|
||||
|
||||
RegexReplacement::RegexReplacement(string patternString, string replacement)
|
||||
RegexReplacement::RegexReplacement(string patternString, string replacement,
|
||||
bool caseSensitive)
|
||||
throw(ConcordiaException):
|
||||
_replacement(replacement) {
|
||||
try {
|
||||
_pattern = boost::regex(patternString);
|
||||
if (caseSensitive) {
|
||||
_pattern = boost::make_u32regex(patternString);
|
||||
} else {
|
||||
_pattern = boost::make_u32regex(patternString, boost::regex::icase);
|
||||
}
|
||||
} catch ( const std::exception & e ) {
|
||||
stringstream ss;
|
||||
|
||||
@ -25,7 +30,7 @@ RegexReplacement::~RegexReplacement() {
|
||||
}
|
||||
|
||||
string RegexReplacement::apply(const string & text) {
|
||||
return boost::regex_replace(text, _pattern, _replacement,
|
||||
return boost::u32regex_replace(text, _pattern, _replacement,
|
||||
boost::match_default | boost::format_all);
|
||||
}
|
||||
|
||||
|
@ -6,6 +6,7 @@
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/regex.hpp>
|
||||
#include <boost/regex/icu.hpp>
|
||||
|
||||
|
||||
/*!
|
||||
@ -19,7 +20,8 @@ typedef boost::error_info<struct my_tag,std::string> my_tag_error_info;
|
||||
|
||||
class RegexReplacement {
|
||||
public:
|
||||
explicit RegexReplacement(string patternString, string replacement)
|
||||
RegexReplacement(string patternString, string replacement,
|
||||
bool caseSensitive = true)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*! Destructor.
|
||||
@ -29,7 +31,7 @@ public:
|
||||
string apply(const string & text);
|
||||
|
||||
private:
|
||||
boost::regex _pattern;
|
||||
boost::u32regex _pattern;
|
||||
|
||||
string _replacement;
|
||||
};
|
||||
|
@ -3,9 +3,11 @@
|
||||
|
||||
#include <string>
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/regex_replacement.hpp"
|
||||
#include "concordia/concordia_config.hpp"
|
||||
#include "concordia/concordia_exception.hpp"
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/ptr_container/ptr_vector.hpp>
|
||||
|
||||
|
||||
/*!
|
||||
@ -27,6 +29,14 @@ public:
|
||||
string anonymize(const string & sentence);
|
||||
|
||||
private:
|
||||
|
||||
boost::ptr_vector<RegexReplacement> _namedEntities;
|
||||
|
||||
boost::shared_ptr<RegexReplacement> _stopWords;
|
||||
|
||||
boost::shared_ptr<RegexReplacement> _stopSymbols;
|
||||
|
||||
boost::shared_ptr<RegexReplacement> _spaceSymbols;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -1,4 +1,5 @@
|
||||
add_library(concordia-tests
|
||||
test_text_utils.cpp
|
||||
test_regex_replacement.cpp
|
||||
test_example.cpp
|
||||
test_tm_matches.cpp
|
||||
|
@ -2,6 +2,8 @@
|
||||
#include "concordia/regex_replacement.hpp"
|
||||
#include "concordia/common/config.hpp"
|
||||
#include <boost/algorithm/string/predicate.hpp>
|
||||
#include <boost/locale.hpp>
|
||||
#include <boost/algorithm/string/case_conv.hpp>
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -39,4 +41,28 @@ BOOST_AUTO_TEST_CASE( BackrefReplacement )
|
||||
BOOST_CHECK_EQUAL(rr.apply("This is 12 and this is 812."),"This is the number: 12 and this is the number: 812.");
|
||||
}
|
||||
|
||||
/*! A replacement built with caseSensitive = false is expected to replace
    every letter-case variant of the pattern.
*/
BOOST_AUTO_TEST_CASE( CaseInsensitiveReplacement )
{
    const string input = "This is AbC and ABC and abc and aBC.";
    const string expected = "This is xxx and xxx and xxx and xxx.";

    RegexReplacement replacement("abc", "xxx", false);
    BOOST_CHECK_EQUAL(replacement.apply(input), expected);
}
|
||||
|
||||
/*! A pattern consisting of a single non-ASCII letter is expected to be
    replaced inside a UTF-8 encoded sentence.
*/
BOOST_AUTO_TEST_CASE( UnicodeReplacement )
{
    const string input = "zażółć gęślą jaźń";
    const string expected = "zażółć gęślx jaźń";

    RegexReplacement replacement("ą", "x");
    BOOST_CHECK_EQUAL(replacement.apply(input), expected);
}
|
||||
|
||||
/*! With caseSensitive = false, a non-ASCII letter pattern is expected to
    match both its lower case and its upper case form.
*/
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeReplacement )
{
    const string input = "zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
    const string expected = "zażółć gęślx jaźń ZAŻÓŁĆ GĘŚLx JAŹŃ";

    RegexReplacement replacement("ą", "x", false);
    BOOST_CHECK_EQUAL(replacement.apply(input), expected);
}
|
||||
|
||||
/*! With caseSensitive = false, a character class of non-ASCII letters is
    expected to match those letters in either case.
*/
BOOST_AUTO_TEST_CASE( CaseInsensitiveUnicodeClassReplacement )
{
    const string input = "zażółć gęślą jaźń ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
    const string expected = "zaxxxx gxxlx jaxx ZAxxxx GxxLx JAxx";

    RegexReplacement replacement("[ąćęłńóśżź]", "x", false);
    BOOST_CHECK_EQUAL(replacement.apply(input), expected);
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
|
21
concordia/t/test_text_utils.cpp
Normal file
21
concordia/t/test_text_utils.cpp
Normal file
@ -0,0 +1,21 @@
|
||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||
#include "concordia/common/config.hpp"
|
||||
#include "concordia/common/text_utils.hpp"
|
||||
|
||||
using namespace std;
|
||||
|
||||
BOOST_AUTO_TEST_SUITE(text_utils)
|
||||
|
||||
/*! TextUtils::toLowerCase is expected to lower-case a sentence that
    contains non-ASCII capital letters.
*/
BOOST_AUTO_TEST_CASE( ToLower )
{
    const string upperCased = "ZAŻÓŁĆ GĘŚLĄ JAŹŃ";
    const string lowered = TextUtils::toLowerCase(upperCased);

    BOOST_CHECK_EQUAL(lowered, "zażółć gęślą jaźń");
}
|
||||
|
||||
/*! TextUtils::toUpperCase is expected to upper-case a sentence that
    contains non-ASCII small letters.
*/
BOOST_AUTO_TEST_CASE( ToUpper )
{
    const string lowerCased = "zażółć gęślą jaźń";
    const string raised = TextUtils::toUpperCase(lowerCased);

    BOOST_CHECK_EQUAL(raised, "ZAŻÓŁĆ GĘŚLĄ JAŹŃ");
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
92
tests/resources/anonymizer/html_tags.txt
Normal file
92
tests/resources/anonymizer/html_tags.txt
Normal file
@ -0,0 +1,92 @@
|
||||
a
|
||||
abbr
|
||||
acronym
|
||||
address
|
||||
applet
|
||||
area
|
||||
b
|
||||
base
|
||||
basefont
|
||||
bdo
|
||||
big
|
||||
blockquote
|
||||
body
|
||||
br
|
||||
button
|
||||
caption
|
||||
center
|
||||
cite
|
||||
code
|
||||
col
|
||||
colgroup
|
||||
dd
|
||||
del
|
||||
dir
|
||||
div
|
||||
dfn
|
||||
dl
|
||||
dt
|
||||
em
|
||||
fieldset
|
||||
font
|
||||
form
|
||||
frame
|
||||
frameset
|
||||
h1
|
||||
h2
|
||||
h3
|
||||
h4
|
||||
h5
|
||||
h6
|
||||
head
|
||||
hr
|
||||
html
|
||||
i
|
||||
iframe
|
||||
img
|
||||
input
|
||||
ins
|
||||
isindex
|
||||
kbd
|
||||
label
|
||||
legend
|
||||
li
|
||||
link
|
||||
map
|
||||
menu
|
||||
meta
|
||||
noframes
|
||||
noscript
|
||||
object
|
||||
ol
|
||||
optgroup
|
||||
option
|
||||
p
|
||||
param
|
||||
pre
|
||||
q
|
||||
s
|
||||
samp
|
||||
script
|
||||
select
|
||||
small
|
||||
span
|
||||
strike
|
||||
strong
|
||||
style
|
||||
sub
|
||||
sup
|
||||
table
|
||||
tbody
|
||||
td
|
||||
textarea
|
||||
tfoot
|
||||
th
|
||||
thead
|
||||
title
|
||||
tr
|
||||
tt
|
||||
u
|
||||
ul
|
||||
var
|
||||
xmp
|
5
tests/resources/anonymizer/named_entities.txt
Normal file
5
tests/resources/anonymizer/named_entities.txt
Normal file
@ -0,0 +1,5 @@
|
||||
[0-9]{1,2})[\.\-/]([0-9]{1,2})[\.\-/]([0-9]{4} NE_DATE
|
||||
[A-ZĄŻŹŚĘĆŃÓŁ][[A-ZĄŻŹŚĘĆŃÓŁ][a-zążźśęćńół]\-\.]*\s+Sp\.\s+z\s+o\.\s*o\. NE_COMPANY
|
||||
[A-ZĄŻŹŚĘĆŃÓŁ][[A-ZĄŻŹŚĘĆŃÓŁ][a-zążźśęćńół]\-\.]*\s+S\.?\s?A\.? NE_COMPANY
|
||||
[\w\._\d]+@\w+(\.\w+)* NE_EMAIL
|
||||
[0-9]+([\.\,][0-9]+)? NE_NUMBER
|
4
tests/resources/anonymizer/space_symbols.txt
Normal file
4
tests/resources/anonymizer/space_symbols.txt
Normal file
@ -0,0 +1,4 @@
|
||||
\|
|
||||
\–
|
||||
\-
|
||||
\/
|
39
tests/resources/anonymizer/stop_symbols.txt
Normal file
39
tests/resources/anonymizer/stop_symbols.txt
Normal file
@ -0,0 +1,39 @@
|
||||
\\tab
|
||||
\\emdash
|
||||
\<
|
||||
\>
|
||||
\&
|
||||
\"
|
||||
\‐
|
||||
\
|
||||
<
|
||||
>
|
||||
=
|
||||
\+
|
||||
„
|
||||
”
|
||||
\"
|
||||
…
|
||||
\.
|
||||
\,
|
||||
\?
|
||||
!
|
||||
;
|
||||
:
|
||||
'
|
||||
\(
|
||||
\)
|
||||
\{
|
||||
\}
|
||||
\@
|
||||
\#
|
||||
\$
|
||||
\%
|
||||
\^
|
||||
\&
|
||||
\*
|
||||
\[
|
||||
\]
|
||||
\\
|
||||
\~
|
||||
&#\d+
|
274
tests/resources/anonymizer/stop_words.txt
Normal file
274
tests/resources/anonymizer/stop_words.txt
Normal file
@ -0,0 +1,274 @@
|
||||
a
|
||||
aby
|
||||
ach
|
||||
acz
|
||||
aczkolwiek
|
||||
aj
|
||||
albo
|
||||
ale
|
||||
ależ
|
||||
aż
|
||||
bardziej
|
||||
bardzo
|
||||
bez
|
||||
bo
|
||||
bowiem
|
||||
by
|
||||
byli
|
||||
bynajmniej
|
||||
być
|
||||
był
|
||||
była
|
||||
było
|
||||
były
|
||||
będzie
|
||||
będą
|
||||
cali
|
||||
cała
|
||||
cały
|
||||
ci
|
||||
cię
|
||||
ciebie
|
||||
co
|
||||
cokolwiek
|
||||
coś
|
||||
czasami
|
||||
czasem
|
||||
czemu
|
||||
czy
|
||||
czyli
|
||||
daleko
|
||||
dla
|
||||
dlaczego
|
||||
dlatego
|
||||
do
|
||||
dobrze
|
||||
dokąd
|
||||
dość
|
||||
dużo
|
||||
dwa
|
||||
dwaj
|
||||
dwie
|
||||
dwoje
|
||||
dziś
|
||||
dzisiaj
|
||||
gdy
|
||||
gdyby
|
||||
gdyż
|
||||
gdzie
|
||||
gdziekolwiek
|
||||
gdzieś
|
||||
go
|
||||
i
|
||||
ich
|
||||
ile
|
||||
im
|
||||
inna
|
||||
inne
|
||||
inny
|
||||
innych
|
||||
iż
|
||||
ja
|
||||
ją
|
||||
jak
|
||||
jakaś
|
||||
jakby
|
||||
jaki
|
||||
jakichś
|
||||
jakie
|
||||
jakiś
|
||||
jakiż
|
||||
jakkolwiek
|
||||
jako
|
||||
jakoś
|
||||
je
|
||||
jeden
|
||||
jedna
|
||||
jedno
|
||||
jednak
|
||||
jednakże
|
||||
jego
|
||||
jej
|
||||
jemu
|
||||
jest
|
||||
jestem
|
||||
jeszcze
|
||||
jeśli
|
||||
jeżeli
|
||||
już
|
||||
ją
|
||||
każdy
|
||||
kiedy
|
||||
kilka
|
||||
kimś
|
||||
kto
|
||||
ktokolwiek
|
||||
ktoś
|
||||
która
|
||||
które
|
||||
którego
|
||||
której
|
||||
który
|
||||
których
|
||||
którym
|
||||
którzy
|
||||
ku
|
||||
lat
|
||||
lecz
|
||||
lub
|
||||
ma
|
||||
mają
|
||||
mam
|
||||
mi
|
||||
mimo
|
||||
między
|
||||
mną
|
||||
mnie
|
||||
mogą
|
||||
moi
|
||||
moim
|
||||
moja
|
||||
moje
|
||||
może
|
||||
możliwe
|
||||
można
|
||||
mój
|
||||
mu
|
||||
musi
|
||||
my
|
||||
na
|
||||
nad
|
||||
nam
|
||||
nami
|
||||
nas
|
||||
nasi
|
||||
nasz
|
||||
nasza
|
||||
nasze
|
||||
naszego
|
||||
naszych
|
||||
natomiast
|
||||
natychmiast
|
||||
nawet
|
||||
nią
|
||||
nic
|
||||
nich
|
||||
nie
|
||||
niego
|
||||
niej
|
||||
niemu
|
||||
nigdy
|
||||
nim
|
||||
nimi
|
||||
niż
|
||||
no
|
||||
o
|
||||
obok
|
||||
od
|
||||
około
|
||||
on
|
||||
ona
|
||||
one
|
||||
oni
|
||||
ono
|
||||
oraz
|
||||
oto
|
||||
owszem
|
||||
pan
|
||||
pana
|
||||
pani
|
||||
po
|
||||
pod
|
||||
podczas
|
||||
pomimo
|
||||
ponad
|
||||
ponieważ
|
||||
powinien
|
||||
powinna
|
||||
powinni
|
||||
powinno
|
||||
poza
|
||||
prawie
|
||||
przecież
|
||||
przed
|
||||
przede
|
||||
przedtem
|
||||
przez
|
||||
przy
|
||||
roku
|
||||
również
|
||||
sam
|
||||
sama
|
||||
są
|
||||
się
|
||||
skąd
|
||||
sobie
|
||||
sobą
|
||||
sposób
|
||||
swoje
|
||||
ta
|
||||
tak
|
||||
taka
|
||||
taki
|
||||
takie
|
||||
także
|
||||
tam
|
||||
te
|
||||
tego
|
||||
tej
|
||||
ten
|
||||
teraz
|
||||
też
|
||||
to
|
||||
tobą
|
||||
tobie
|
||||
toteż
|
||||
trzeba
|
||||
tu
|
||||
tutaj
|
||||
twoi
|
||||
twoim
|
||||
twoja
|
||||
twoje
|
||||
twym
|
||||
twój
|
||||
ty
|
||||
tych
|
||||
tylko
|
||||
tym
|
||||
u
|
||||
w
|
||||
wam
|
||||
wami
|
||||
was
|
||||
wasz
|
||||
wasza
|
||||
wasze
|
||||
we
|
||||
według
|
||||
wiele
|
||||
wielu
|
||||
więc
|
||||
więcej
|
||||
wszyscy
|
||||
wszystkich
|
||||
wszystkie
|
||||
wszystkim
|
||||
wszystko
|
||||
wtedy
|
||||
wy
|
||||
właśnie
|
||||
z
|
||||
za
|
||||
zapewne
|
||||
zawsze
|
||||
ze
|
||||
znowu
|
||||
znów
|
||||
został
|
||||
żaden
|
||||
żadna
|
||||
żadne
|
||||
żadnych
|
||||
że
|
||||
żeby
|
Loading…
Reference in New Issue
Block a user