Use standard text normalization - fixes #2898 (#2900)

* Use standard text normalization - fixes #2898

Fixes #2898. Fixes #409. Refs #650

Replaces homegrown ISO Latin-1 only character substitution
with standard Java Normalizer decomposition to NFD, followed by diacritic
removal and a few custom character expansions/replacements.

* Fix Mac build

* Improve compatibility with previous code

One intentional change is folding O with stroke to
oe instead of o.

- Use more powerful NFKD instead of NFD
- strip punctuation after decomposition since it can generate
  new punctuation
- Add compatibility test for old asciify() method
- Add some graphically similar characters to substitution table

* Add oe character/ligature & more long S forms

* More tests for ligatures and Latin Extended

* Add Latin-1 Supplement tests
This commit is contained in:
Tom Morris 2020-07-07 15:35:41 -04:00 committed by GitHub
parent ab2ec8d28f
commit 0562638ffa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 131 additions and 229 deletions

View File

@ -21,7 +21,7 @@ jobs:
- brew services start mysql
- brew services start postgresql
- brew services start mariadb@10.3
#- sleep 15 # wait for databases to start up
- sleep 15 # wait for databases to start up
# Homebrew postgres workaround - create expected user postgres
- /usr/local/opt/postgres/bin/createuser -s postgres
# FIXME this is duplicated from linux config, but don't know a better way to do it

View File

@ -33,16 +33,51 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package com.google.refine.clustering.binning;
import java.text.Normalizer;
import java.util.Iterator;
import java.util.TreeSet;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import com.google.common.collect.ImmutableMap;
public class FingerprintKeyer extends Keyer {
// Punctuation and control characters (except for TAB which we need for split to work)
static final Pattern punctctrl = Pattern.compile("\\p{Punct}|[\\x00-\\x08\\x0A-\\x1F\\x7F]");
static final Pattern punctctrl = Pattern.compile("\\p{Punct}|[\\x00-\\x08\\x0A-\\x1F\\x7F]",
Pattern.UNICODE_CHARACTER_CLASS);
// Matches one or more consecutive characters that are either in the
// Combining Diacritical Marks block or have Unicode general category
// Lm (modifier letter) or Sk (modifier symbol). stripDiacritics() deletes
// every match after NFKD decomposition has split accented characters into
// a base character followed by such marks.
public static final Pattern DIACRITICS_AND_FRIENDS = Pattern
.compile("[\\p{InCombiningDiacriticalMarks}\\p{IsLm}\\p{IsSk}]+");
// Substitution table for characters that Unicode decomposition does not reduce
// to an ASCII base letter. stripNonDiacritics() looks up the input one char at
// a time, so every key must be exactly one (lower-case) character.
// First part of table based on https://stackoverflow.com/a/1453284/167425 by Andreas Petersson
private static final ImmutableMap<String, String> NONDIACRITICS = ImmutableMap.<String, String>builder()
        //Replace non-diacritics as their equivalent characters
        .put("ß", "ss")
        .put("æ", "ae")
        .put("ø", "oe")
        .put("å", "aa") // TODO: We'll never see this after decomposition
        .put("©", "c") // copyright character
        .put("\u00F0", "d") // Small letter Icelandic eth
        .put("\u0111", "d") // Small letter D with stroke
        .put("\u0256", "d") // Small letter African D
        .put("\u00FE", "th") // Lower case Icelandic thorn þ
        // Visually similar replacements from our private former asciify() method
        // (only need lower case forms since we're already downcased)
        .put("\u0127", "h") // small H with stroke
        .put("\u0131", "i") // dotless I
        .put("\u0138", "k") // small letter Kra
        .put("\u0142", "l") // small L with stroke (as in Białystok)
        .put("\u014B", "n") // Small letter Eng
        .put("\u017F", "s") // long s
        .put("\u0167", "t") // small letter T with stroke
        // Additional characters following the same principle
        .put("œ", "oe")
        // More long S forms. Written as escapes so the keys cannot be lost to
        // an encoding problem: an empty key would never match a one-char lookup,
        // and two identical keys make ImmutableMap.Builder.build() throw.
        .put("\u1E9C", "s") // U+1E9C long s with diagonal stroke
        .put("\u1E9D", "s") // U+1E9D long s with high stroke
        .build();
@Override
public String key(String s, Object... o) {
@ -50,10 +85,10 @@ public class FingerprintKeyer extends Keyer {
throw new IllegalArgumentException("Fingerprint keyer accepts a single string parameter");
}
s = s.trim(); // first off, remove whitespace around the string
s = s.toLowerCase(); // then lowercase it
s = punctctrl.matcher(s).replaceAll(""); // then remove all punctuation and control chars
s = asciify(s); // find ASCII equivalent to characters
String[] frags = StringUtils.split(s); // split by whitespace
s = s.toLowerCase(); // TODO: This is using the default locale. Is that what we want?
s = normalize(s);
s = punctctrl.matcher(s).replaceAll(""); // decomposition can generate punctuation so strip it after
String[] frags = StringUtils.split(s); // split by whitespace (excluding supplementary characters)
TreeSet<String> set = new TreeSet<String>();
for (String ss : frags) {
set.add(ss); // order fragments and dedupe
@ -69,222 +104,32 @@ public class FingerprintKeyer extends Keyer {
return b.toString();
}
protected String asciify(String s) {
char[] c = s.toCharArray();
StringBuffer b = new StringBuffer();
for (char element : c) {
b.append(translate(element));
}
return b.toString();
/**
 * Folds a string towards plain ASCII-like text in two passes: first
 * {@code stripDiacritics}, then {@code stripNonDiacritics} on its result.
 *
 * @param s the string to normalize
 * @return the result of applying both passes in that order
 */
protected String normalize(String s) {
    return stripNonDiacritics(stripDiacritics(s));
}
/**
* Translate the given unicode char in the closest ASCII representation
* NOTE: this function deals only with latin-1 supplement and latin-1 extended code charts
*/
private char translate(char c) {
switch(c) {
case '\u00C0':
case '\u00C1':
case '\u00C2':
case '\u00C3':
case '\u00C4':
case '\u00C5':
case '\u00E0':
case '\u00E1':
case '\u00E2':
case '\u00E3':
case '\u00E4':
case '\u00E5':
case '\u0100':
case '\u0101':
case '\u0102':
case '\u0103':
case '\u0104':
case '\u0105':
return 'a';
case '\u00C7':
case '\u00E7':
case '\u0106':
case '\u0107':
case '\u0108':
case '\u0109':
case '\u010A':
case '\u010B':
case '\u010C':
case '\u010D':
return 'c';
case '\u00D0':
case '\u00F0':
case '\u010E':
case '\u010F':
case '\u0110':
case '\u0111':
return 'd';
case '\u00C8':
case '\u00C9':
case '\u00CA':
case '\u00CB':
case '\u00E8':
case '\u00E9':
case '\u00EA':
case '\u00EB':
case '\u0112':
case '\u0113':
case '\u0114':
case '\u0115':
case '\u0116':
case '\u0117':
case '\u0118':
case '\u0119':
case '\u011A':
case '\u011B':
return 'e';
case '\u011C':
case '\u011D':
case '\u011E':
case '\u011F':
case '\u0120':
case '\u0121':
case '\u0122':
case '\u0123':
return 'g';
case '\u0124':
case '\u0125':
case '\u0126':
case '\u0127':
return 'h';
case '\u00CC':
case '\u00CD':
case '\u00CE':
case '\u00CF':
case '\u00EC':
case '\u00ED':
case '\u00EE':
case '\u00EF':
case '\u0128':
case '\u0129':
case '\u012A':
case '\u012B':
case '\u012C':
case '\u012D':
case '\u012E':
case '\u012F':
case '\u0130':
case '\u0131':
return 'i';
case '\u0134':
case '\u0135':
return 'j';
case '\u0136':
case '\u0137':
case '\u0138':
return 'k';
case '\u0139':
case '\u013A':
case '\u013B':
case '\u013C':
case '\u013D':
case '\u013E':
case '\u013F':
case '\u0140':
case '\u0141':
case '\u0142':
return 'l';
case '\u00D1':
case '\u00F1':
case '\u0143':
case '\u0144':
case '\u0145':
case '\u0146':
case '\u0147':
case '\u0148':
case '\u0149':
case '\u014A':
case '\u014B':
return 'n';
case '\u00D2':
case '\u00D3':
case '\u00D4':
case '\u00D5':
case '\u00D6':
case '\u00D8':
case '\u00F2':
case '\u00F3':
case '\u00F4':
case '\u00F5':
case '\u00F6':
case '\u00F8':
case '\u014C':
case '\u014D':
case '\u014E':
case '\u014F':
case '\u0150':
case '\u0151':
return 'o';
case '\u0154':
case '\u0155':
case '\u0156':
case '\u0157':
case '\u0158':
case '\u0159':
return 'r';
case '\u015A':
case '\u015B':
case '\u015C':
case '\u015D':
case '\u015E':
case '\u015F':
case '\u0160':
case '\u0161':
case '\u017F':
return 's';
case '\u0162':
case '\u0163':
case '\u0164':
case '\u0165':
case '\u0166':
case '\u0167':
return 't';
case '\u00D9':
case '\u00DA':
case '\u00DB':
case '\u00DC':
case '\u00F9':
case '\u00FA':
case '\u00FB':
case '\u00FC':
case '\u0168':
case '\u0169':
case '\u016A':
case '\u016B':
case '\u016C':
case '\u016D':
case '\u016E':
case '\u016F':
case '\u0170':
case '\u0171':
case '\u0172':
case '\u0173':
return 'u';
case '\u0174':
case '\u0175':
return 'w';
case '\u00DD':
case '\u00FD':
case '\u00FF':
case '\u0176':
case '\u0177':
case '\u0178':
return 'y';
case '\u0179':
case '\u017A':
case '\u017B':
case '\u017C':
case '\u017D':
case '\u017E':
return 'z';
}
return c;
/**
 * Legacy entry point that simply delegates to {@link #normalize(String)}.
 * NOTE(review): presumably retained so existing subclasses/callers of the old
 * method keep compiling - confirm before removing.
 *
 * @param s the string to fold
 * @return whatever {@code normalize(s)} returns
 * @deprecated use {@link #normalize(String)} instead
 */
@Deprecated
protected String asciify(String s) {
return normalize(s);
}
/**
 * Applies Unicode NFKD (compatibility decomposition) to the input and then
 * deletes every run of characters matched by {@code DIACRITICS_AND_FRIENDS}.
 *
 * @param str the string to process
 * @return the decomposed string with all matched characters removed
 */
protected static String stripDiacritics(String str) {
    final String decomposed = Normalizer.normalize(str, Normalizer.Form.NFKD);
    return DIACRITICS_AND_FRIENDS.matcher(decomposed).replaceAll("");
}
/**
 * Replaces each character that has an entry in {@code NONDIACRITICS} with its
 * mapped replacement and copies every other character through unchanged.
 * The input is examined one UTF-16 char at a time, so replacements may be
 * longer than the character they replace (e.g. one char becoming two).
 *
 * Based on https://stackoverflow.com/a/1453284/167425 by Andreas Petersson
 *
 * @param orig the string to process
 * @return a new string with all table matches substituted
 */
private static String stripNonDiacritics(String orig) {
    // Local, single-threaded accumulator: StringBuilder avoids StringBuffer's
    // needless synchronization. Presized to the common no-expansion case.
    StringBuilder result = new StringBuilder(orig.length());
    for (int i = 0; i < orig.length(); i++) {
        // The table is keyed by one-character strings.
        String source = orig.substring(i, i + 1);
        String replace = NONDIACRITICS.get(source);
        result.append(replace == null ? source : replace);
    }
    return result.toString();
}
}

View File

@ -55,9 +55,68 @@ public class KeyerTests extends RefineTest {
{" d c b a ","a b c d"},
{"\tABC \t DEF ","abc def"}, // test leading and trailing whitespace
{"bbb\taaa","aaa bbb"},
{"",""},
{"",""},
{"",""},
// {"å","aa"}, // Requested by issue #650, but conflicts with diacritic folding
{"æø","aeoe"}, // Norwegian replacements from #650
{"©ß","css"}, // issue #409 esszet
{"\u00D0\u00DE", "dth"}, // Icelandic eth and thorn
{"fffiflffiſtst", "fffiflffistst"}, // ligatures
// Test legacy replacements
{"\u00C0\u00C1\u00C2\u00C3\u00C4\u00C5\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5\u0100\u0101\u0102\u0103\u0104\u0105", "aaaaaaaaaaaaaaaaaa"},
{"\u00C7\u00E7\u0106\u0107\u0108\u0109\u010A\u010B\u010C\u010D", "cccccccccc"},
{"\u00D0\u00F0\u010E\u010F\u0110\u0111", "dddddd"},
{"\u00C8\u00C9\u00CA\u00CB\u00E8\u00E9\u00EA\u00EB\u0112\u0113\u0114\u0115\u0116\u0117\u0118\u0119\u011A\u011B", "eeeeeeeeeeeeeeeeee"},
{"\u011C\u011D\u011E\u011F\u0120\u0121\u0122\u0123", "gggggggg"},
{"\u0124\u0125", "hh"},
{"\u0126\u0127", "hh"},
{"\u00CC\u00CD\u00CE\u00CF\u00EC\u00ED\u00EE\u00EF\u0128\u0129\u012A\u012B\u012C\u012D\u012E\u012F\u0130", "iiiiiiiiiiiiiiiii"},
{"\u0131", "i"},
{"\u0134\u0135", "jj"},
{"\u0136\u0137", "kk"},
{"\u0138", "k"},
{"\u0139\u013A\u013B\u013C\u013D\u013E\u0141\u0142", "llllllll"},
{"\u013F\u0140", "ll"},
{"\u00D1\u00F1\u0143\u0144\u0145\u0146\u0147\u0148", "nnnnnnnn"},
{"\u0149", "n"}, // decomposed to 'n, but then punctuation is stripped
{"\u014A\u014B", "nn"},
{"\u00D2\u00D3\u00D4\u00D5\u00D6\u00F2\u00F3\u00F4\u00F5\u00F6\u014C\u014D\u014E\u014F\u0150\u0151", "oooooooooooooooo"},
{"\u00D8\u00F8", "oeoe"}, // O with stroke we decompose to oe instead of o
{"\u0154\u0155\u0156\u0157\u0158\u0159", "rrrrrr"},
{"\u015A\u015B\u015C\u015D\u015E\u015F\u0160\u0161\u017F", "sssssssss"},
{"\u0162\u0163\u0164\u0165\u0166\u0167", "tttttt"},
{"\u00D9\u00DA\u00DB\u00DC\u00F9\u00FA\u00FB\u00FC\u0168\u0169\u016A\u016B\u016C\u016D\u016E\u016F\u0170\u0171\u0172\u0173", "uuuuuuuuuuuuuuuuuuuu"},
{"\u0174\u0175", "ww"},
{"\u00DD\u00FD\u00FF\u0176\u0177\u0178", "yyyyyy"},
{"\u0179\u017A\u017B\u017C\u017D\u017E", "zzzzzz"},
// Latin-1 Supplement
{//"€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ " + // These are all considered control characters
//"¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿" // punctuation
"ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ",
"aaaaaaaeceeeeiiiidnooooooeuuuuythssaaaaaaaeceeeeiiiidnooooooeuuuuythy"},
// Latin Extended A
{"ĀāĂ㥹ĆćĈĉĊċČčĎďĐđĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħĨĩĪīĬĭĮįİıIJijĴĵĶķĸĹĺĻļĽľĿŀŁłŃńŅņŇňʼnŊŋŌōŎŏŐőŒœŔŕŖŗŘřŚśŜŝŞşŠšŢţŤťŦŧŨũŪūŬŭŮůŰűŲųŴŵŶŷŸŹźŻżŽžſ",
"aaaaaaccccccccddddeeeeeeeeeegggggggghhhhiiiiiiiiiiijijjjkkkllllllllllnnnnnnnnnoooooooeoerrrrrrssssssssttttttuuuuuuuuuuuuwwyyyzzzzzzs"},
// Latin Extended B
// TODO: These don't get folded to ASCII equivalents. Not sure if they should be
// {"ƄƅƉƊƋƌƔƕƖƗƘƙƚƛƜƝƞƟƠơƢƣƤƥƦƧƨƩƪƫƬƭƮƯưƱƲƳƴƵƶƷƸƹƺƻƼƽƾƿǀǁǂǃDŽDždžLJLjljNJNjnjǍǎǏǐǑǒǓǔǕǖǗǘǙǚǛǜǝ",
// "bbddddhiikkllmnnooooopprssssttttuuuuyyzz aaiioouuuuuuuuuux"},
// {"ǞǟǠǡǢǣǤǥǦǧǨǩǪǫǬǭǮǯǰDZDzdzǴǵǶǷǸǹǺǻǼǽǾǿȀȁȂȃȄȅȆȇȈȉȊȋȌȍȎȏȐȑȒȓȔȕȖȗȘșȚțȜȝȞȟȠȡȢȣȤȥȦȧȨȩȪȫȬȭȮȯȰȱȲȳȴȵȶȷȸȹȺȻȼȽȾȿɀɁɂɃɄɅɆɇɈɉɊɋɌɍɎɏ",
// "aaaaaeaeggggkkqqqqqssjdzdzggpnnaaaeaeooaaaeeeeeiiiioooorrrruuuusstt zzaaeeooooooooyy"},
// Latin Extended Additional
{"ḀḁḂḃḄḅḆḇḈḉḊḋḌḍḎḏḐḑḒḓḔḕḖḗḘḙḚḛḜḝḞḟḠḡḢḣḤḥḦḧḨḩḪḫḬḭḮḯḰḱḲḳḴḵḶḷḸḹḺḻḼḽḾḿṀṁṂṃṄṅṆṇṈṉṊṋṌṍṎṏṐṑṒṓṔṕṖṗ",
"aabbbbbbccddddddddddeeeeeeeeeeffgghhhhhhhhhhiiiikkkkkkllllllllmmmmmmnnnnnnnnoooooooopppp"},
{"ṘṙṚṛṜṝṞṟṠṡṢṣṤṥṦṧṨṩṪṫṬṭṮṯṰṱṲṳṴṵṶṷṸṹṺṻṼṽṾṿẀẁẂẃẄẅẆẇẈẉẊẋẌẍẎẏẐẑẒẓẔẕ",
"rrrrrrrrssssssssssttttttttuuuuuuuuuuvvvvwwwwwwwwwwxxxxyyzzzzzz"},
{"ẖẗẘẙẚẛẜẝẞẠạẢảẤấẦầẨẩẪẫẬậẮắẰằẲẳẴẵẶặẸẹẺẻẼẽẾếỀềỂểỄễỆệ",
"htwyasssssaaaaaaaaaaaaaaaaaaaaaaaaeeeeeeeeeeeeeeee"},
// Latin Extended C - TODO: not supported yet
// {"ⱠⱡⱢⱣⱤⱥⱦⱧⱨⱩⱪⱫⱬⱭⱮⱯⱰⱱⱲⱳⱴⱵⱶⱷⱸⱹⱺⱻⱼⱽⱾⱿ",
// "lllprathhkkzzamaaavwwwv e ejvsz"},
// Latin Extended D
{"", ""},
// Latin Extended E
{"", ""},
// TODO: Add tests for non-Western languages
{"", ""},
};
private static final String[][] testNGramStrings = {
@ -108,6 +167,4 @@ public class KeyerTests extends RefineTest {
}
}
}