Use standard text normalization - fixes #2898 (#2900)

* Use standard text normalization - fixes #2898 Fixes #2898. Fixes #409. Refs #650. Replaces homegrown ISO Latin-1 only character substitution with standard Java Normalizer decomposition to NFD, followed by diacritic removal and a few custom character expansions/replacements. * Fix Mac build * Improve compatibility with previous code One intentional change is folding O with stroke to oe instead of o. - Use more powerful NFKD instead of NFD - Strip punctuation after decomposition since it can generate new punctuation - Add compatibility test for old asciify() method - Add some graphically similar characters to substitution table * Add oe character/ligature & more long S forms * More tests for ligatures and Latin Extended * Add Latin-1 Supplement tests
2020-07-07 15:35:41 -04:00 · 2020-07-07 15:35:41 -04:00 · 0562638ffa
commit 0562638ffa
parent ab2ec8d28f
3 changed files with 131 additions and 229 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -21,7 +21,7 @@ jobs:
        - brew services start mysql
        - brew services start postgresql
        - brew services start mariadb@10.3
-        #- sleep 15 # wait for databases to start up
+        - sleep 15 # wait for databases to start up
        # Homebrew postgres workaround - create expected user postgres
        - /usr/local/opt/postgres/bin/createuser -s postgres
        # FIXME this is duplicated from linux config, but don't know a better way to do it
--- a/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java
+++ b/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java
@ -33,27 +33,62 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 package com.google.refine.clustering.binning;

+import java.text.Normalizer;
 import java.util.Iterator;
 import java.util.TreeSet;
 import java.util.regex.Pattern;

 import org.apache.commons.lang.StringUtils;

+import com.google.common.collect.ImmutableMap;
+
 public class FingerprintKeyer extends Keyer {

    // Punctuation and control characters (except for TAB which we need for split to work)
-    static final Pattern punctctrl = Pattern.compile("\\p{Punct}|[\\x00-\\x08\\x0A-\\x1F\\x7F]");
-    
+    static final Pattern punctctrl = Pattern.compile("\\p{Punct}|[\\x00-\\x08\\x0A-\\x1F\\x7F]",
+            Pattern.UNICODE_CHARACTER_CLASS);
+
+    public static final Pattern DIACRITICS_AND_FRIENDS = Pattern
+            .compile("[\\p{InCombiningDiacriticalMarks}\\p{IsLm}\\p{IsSk}]+");
+
+    // First part of table based on https://stackoverflow.com/a/1453284/167425 by Andreas Petersson
+    private static final ImmutableMap<String, String> NONDIACRITICS = ImmutableMap.<String, String>builder()
+            // Replace non-diacritic characters with their ASCII equivalents
+            .put("ß", "ss")
+            .put("æ", "ae")
+            .put("ø", "oe")
+            .put("å", "aa") // TODO: We'll never see this after decomposition
+            .put("©", "c") // copyright character
+            .put("\u00F0", "d") // Small letter Icelandic eth
+            .put("\u0111", "d") // Small letter D with stroke
+            .put("\u0256", "d") // Small letter African D
+            .put("\u00FE", "th") // Lower case Icelandic thorn þ
+            // Visually similar replacements from our private former asciify() method
+            // (only need lower case forms since we're already downcased)
+            .put("\u0127", "h") // small H with stroke
+            .put("\u0131", "i") // dotless I
+            .put("\u0138", "k") // small letter Kra
+            .put("\u0142", "l") // small letter L with stroke (as in Białystok)
+            .put("\u014B", "n") // Small letter Eng
+            .put("\u017F", "s") // long s
+            .put("\u0167", "t") // small letter T with stroke
+            // Additional characters following the same principle
+            .put("œ", "oe")
+            .put("ẜ", "s") // more long S forms
+            .put("ẝ", "s")
+            .build();
+
+
    @Override
    public String key(String s, Object... o) {
        if (s == null || o !=null && o.length > 0) {
            throw new IllegalArgumentException("Fingerprint keyer accepts a single string parameter");
        }
        s = s.trim(); // first off, remove whitespace around the string
-        s = s.toLowerCase(); // then lowercase it
-        s = punctctrl.matcher(s).replaceAll(""); // then remove all punctuation and control chars
-        s = asciify(s); // find ASCII equivalent to characters
-        String[] frags = StringUtils.split(s); // split by whitespace
+        s = s.toLowerCase(); // TODO: This is using the default locale. Is that what we want?
+        s = normalize(s);
+        s = punctctrl.matcher(s).replaceAll(""); // decomposition can generate punctuation so strip it after
+        String[] frags = StringUtils.split(s); // split by whitespace (excluding supplementary characters)
        TreeSet<String> set = new TreeSet<String>();
        for (String ss : frags) {
            set.add(ss); // order fragments and dedupe
@ -69,222 +104,32 @@ public class FingerprintKeyer extends Keyer {
        return b.toString();
    }

+    protected String normalize(String s) {
+        s = stripDiacritics(s);
+        s = stripNonDiacritics(s);
+        return s;
+    }
+
+    @Deprecated
    protected String asciify(String s) {
-        char[] c = s.toCharArray();
-        StringBuffer b = new StringBuffer();
-        for (char element : c) {
-            b.append(translate(element));
-        }
-        return b.toString();
+        return normalize(s);
    }
-    
-    /**
-     * Translate the given unicode char in the closest ASCII representation
-     * NOTE: this function deals only with latin-1 supplement and latin-1 extended code charts
-     */
-    private char translate(char c) {
-        switch(c) {
-            case '\u00C0':
-            case '\u00C1':
-            case '\u00C2':
-            case '\u00C3':
-            case '\u00C4':
-            case '\u00C5':
-            case '\u00E0':
-            case '\u00E1':
-            case '\u00E2':
-            case '\u00E3':
-            case '\u00E4':
-            case '\u00E5':
-            case '\u0100':
-            case '\u0101':
-            case '\u0102':
-            case '\u0103':
-            case '\u0104':
-            case '\u0105':
-                return 'a';
-            case '\u00C7':
-            case '\u00E7':
-            case '\u0106':
-            case '\u0107':
-            case '\u0108':
-            case '\u0109':
-            case '\u010A':
-            case '\u010B':
-            case '\u010C':
-            case '\u010D':
-                return 'c';
-            case '\u00D0':
-            case '\u00F0':
-            case '\u010E':
-            case '\u010F':
-            case '\u0110':
-            case '\u0111':
-                return 'd';
-            case '\u00C8':
-            case '\u00C9':
-            case '\u00CA':
-            case '\u00CB':
-            case '\u00E8':
-            case '\u00E9':
-            case '\u00EA':
-            case '\u00EB':
-            case '\u0112':
-            case '\u0113':
-            case '\u0114':
-            case '\u0115':
-            case '\u0116':
-            case '\u0117':
-            case '\u0118':
-            case '\u0119':
-            case '\u011A':
-            case '\u011B':
-                return 'e';
-            case '\u011C':
-            case '\u011D':
-            case '\u011E':
-            case '\u011F':
-            case '\u0120':
-            case '\u0121':
-            case '\u0122':
-            case '\u0123':
-                return 'g';
-            case '\u0124':
-            case '\u0125':
-            case '\u0126':
-            case '\u0127':
-                return 'h';
-            case '\u00CC':
-            case '\u00CD':
-            case '\u00CE':
-            case '\u00CF':
-            case '\u00EC':
-            case '\u00ED':
-            case '\u00EE':
-            case '\u00EF':
-            case '\u0128':
-            case '\u0129':
-            case '\u012A':
-            case '\u012B':
-            case '\u012C':
-            case '\u012D':
-            case '\u012E':
-            case '\u012F':
-            case '\u0130':
-            case '\u0131':
-                return 'i';
-            case '\u0134':
-            case '\u0135':
-                return 'j';
-            case '\u0136':
-            case '\u0137':
-            case '\u0138':
-                return 'k';
-            case '\u0139':
-            case '\u013A':
-            case '\u013B':
-            case '\u013C':
-            case '\u013D':
-            case '\u013E':
-            case '\u013F':
-            case '\u0140':
-            case '\u0141':
-            case '\u0142':
-                return 'l';
-            case '\u00D1':
-            case '\u00F1':
-            case '\u0143':
-            case '\u0144':
-            case '\u0145':
-            case '\u0146':
-            case '\u0147':
-            case '\u0148':
-            case '\u0149':
-            case '\u014A':
-            case '\u014B':
-                return 'n';
-            case '\u00D2':
-            case '\u00D3':
-            case '\u00D4':
-            case '\u00D5':
-            case '\u00D6':
-            case '\u00D8':
-            case '\u00F2':
-            case '\u00F3':
-            case '\u00F4':
-            case '\u00F5':
-            case '\u00F6':
-            case '\u00F8':
-            case '\u014C':
-            case '\u014D':
-            case '\u014E':
-            case '\u014F':
-            case '\u0150':
-            case '\u0151':
-                return 'o';
-            case '\u0154':
-            case '\u0155':
-            case '\u0156':
-            case '\u0157':
-            case '\u0158':
-            case '\u0159':
-                return 'r';
-            case '\u015A':
-            case '\u015B':
-            case '\u015C':
-            case '\u015D':
-            case '\u015E':
-            case '\u015F':
-            case '\u0160':
-            case '\u0161':
-            case '\u017F':
-                return 's';
-            case '\u0162':
-            case '\u0163':
-            case '\u0164':
-            case '\u0165':
-            case '\u0166':
-            case '\u0167':
-                return 't';
-            case '\u00D9':
-            case '\u00DA':
-            case '\u00DB':
-            case '\u00DC':
-            case '\u00F9':
-            case '\u00FA':
-            case '\u00FB':
-            case '\u00FC':
-            case '\u0168':
-            case '\u0169':
-            case '\u016A':
-            case '\u016B':
-            case '\u016C':
-            case '\u016D':
-            case '\u016E':
-            case '\u016F':
-            case '\u0170':
-            case '\u0171':
-            case '\u0172':
-            case '\u0173':
-                return 'u';
-            case '\u0174':
-            case '\u0175':
-                return 'w';
-            case '\u00DD':
-            case '\u00FD':
-            case '\u00FF':
-            case '\u0176':
-            case '\u0177':
-            case '\u0178':
-                return 'y';
-            case '\u0179':
-            case '\u017A':
-            case '\u017B':
-            case '\u017C':
-            case '\u017D':
-            case '\u017E':
-                return 'z';
-        }
-        return c;
+
+    protected static String stripDiacritics(String str) {
+        str = Normalizer.normalize(str, Normalizer.Form.NFKD);
+        str = DIACRITICS_AND_FRIENDS.matcher(str).replaceAll("");
+        return str;
    }
-}
+
+    // Based on https://stackoverflow.com/a/1453284/167425 by Andreas Petersson
+    private static String stripNonDiacritics(String orig) {
+        StringBuffer result = new StringBuffer();
+        for (int i = 0; i < orig.length(); i++) {
+            String source = orig.substring(i, i + 1);
+            String replace = NONDIACRITICS.get(source);
+            result.append(replace == null ? String.valueOf(source) : replace);
+        }
+        return result.toString();
+    }
+
+}
--- a/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java
+++ b/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java
@ -55,11 +55,70 @@ public class KeyerTests extends RefineTest {
        {" d c b a ","a b c d"},
        {"\tABC \t DEF ","abc def"}, // test leading and trailing whitespace
        {"bbb\taaa","aaa bbb"},
-        {"",""},
-        {"",""},
-        {"",""},
+//        {"å","aa"}, // Requested by issue #650, but conflicts with diacritic folding
+        {"æø","aeoe"}, // Norwegian replacements from #650
+        {"©ß","css"}, // issue #409 esszet
+        {"\u00D0\u00DE", "dth"}, // Icelandic eth and thorn 
+        {"ﬀﬁﬂﬃﬅﬆ", "fffiflffistst"}, // ligatures
+        // Test legacy replacements
+        {"\u00C0\u00C1\u00C2\u00C3\u00C4\u00C5\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5\u0100\u0101\u0102\u0103\u0104\u0105", "aaaaaaaaaaaaaaaaaa"},
+        {"\u00C7\u00E7\u0106\u0107\u0108\u0109\u010A\u010B\u010C\u010D", "cccccccccc"},
+        {"\u00D0\u00F0\u010E\u010F\u0110\u0111", "dddddd"},
+        {"\u00C8\u00C9\u00CA\u00CB\u00E8\u00E9\u00EA\u00EB\u0112\u0113\u0114\u0115\u0116\u0117\u0118\u0119\u011A\u011B", "eeeeeeeeeeeeeeeeee"},
+        {"\u011C\u011D\u011E\u011F\u0120\u0121\u0122\u0123", "gggggggg"},
+        {"\u0124\u0125", "hh"},
+        {"\u0126\u0127", "hh"},
+        {"\u00CC\u00CD\u00CE\u00CF\u00EC\u00ED\u00EE\u00EF\u0128\u0129\u012A\u012B\u012C\u012D\u012E\u012F\u0130", "iiiiiiiiiiiiiiiii"},
+        {"\u0131", "i"},
+        {"\u0134\u0135", "jj"},
+        {"\u0136\u0137", "kk"},
+        {"\u0138", "k"},
+        {"\u0139\u013A\u013B\u013C\u013D\u013E\u0141\u0142", "llllllll"},
+        {"\u013F\u0140", "ll"},
+        {"\u00D1\u00F1\u0143\u0144\u0145\u0146\u0147\u0148", "nnnnnnnn"},
+        {"\u0149", "n"}, // decomposed to ʼn (U+02BC modifier apostrophe + n); the apostrophe is then stripped
+        {"\u014A\u014B", "nn"},
+        {"\u00D2\u00D3\u00D4\u00D5\u00D6\u00F2\u00F3\u00F4\u00F5\u00F6\u014C\u014D\u014E\u014F\u0150\u0151", "oooooooooooooooo"},
+        {"\u00D8\u00F8", "oeoe"}, // O with stroke we decompose to oe instead of o
+        {"\u0154\u0155\u0156\u0157\u0158\u0159", "rrrrrr"},
+        {"\u015A\u015B\u015C\u015D\u015E\u015F\u0160\u0161\u017F", "sssssssss"},
+        {"\u0162\u0163\u0164\u0165\u0166\u0167", "tttttt"},
+        {"\u00D9\u00DA\u00DB\u00DC\u00F9\u00FA\u00FB\u00FC\u0168\u0169\u016A\u016B\u016C\u016D\u016E\u016F\u0170\u0171\u0172\u0173", "uuuuuuuuuuuuuuuuuuuu"},
+        {"\u0174\u0175", "ww"},
+        {"\u00DD\u00FD\u00FF\u0176\u0177\u0178", "yyyyyy"},
+        {"\u0179\u017A\u017B\u017C\u017D\u017E", "zzzzzz"},
+        // Latin-1 Supplement
+        {//"€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ " + // These are all considered control characters
+         //"¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿" // punctuation
+         "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ",
+         "aaaaaaaeceeeeiiiidnooooooeuuuuythssaaaaaaaeceeeeiiiidnooooooeuuuuythy"},
+        // Latin Extended A
+        {"ĀāĂăĄąĆćĈĉĊċČčĎďĐđĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħĨĩĪīĬĭĮįİıĲĳĴĵĶķĸĹĺĻļĽľĿŀŁłŃńŅņŇňŉŊŋŌōŎŏŐőŒœŔŕŖŗŘřŚśŜŝŞşŠšŢţŤťŦŧŨũŪūŬŭŮůŰűŲųŴŵŶŷŸŹźŻżŽžſ",
+         "aaaaaaccccccccddddeeeeeeeeeegggggggghhhhiiiiiiiiiiijijjjkkkllllllllllnnnnnnnnnoooooooeoerrrrrrssssssssttttttuuuuuuuuuuuuwwyyyzzzzzzs"},
+        // Latin Extended B
+        // TODO: These don't get folded to ASCII equivalents. Not sure if they should be
+//        {"ƄƅƉƊƋƌƔƕƖƗƘƙƚƛƜƝƞƟƠơƢƣƤƥƦƧƨƩƪƫƬƭƮƯưƱƲƳƴƵƶƷƸƹƺƻƼƽƾƿǀǁǂǃǄǅǆǇǈǉǊǋǌǍǎǏǐǑǒǓǔǕǖǗǘǙǚǛǜǝ",
+//         "bbddddhiikkllmnnooooopprssssttttuuuuyyzz                      aaiioouuuuuuuuuux"},
+//        {"ǞǟǠǡǢǣǤǥǦǧǨǩǪǫǬǭǮǯǰǱǲǳǴǵǶǷǸǹǺǻǼǽǾǿȀȁȂȃȄȅȆȇȈȉȊȋȌȍȎȏȐȑȒȓȔȕȖȗȘșȚțȜȝȞȟȠȡȢȣȤȥȦȧȨȩȪȫȬȭȮȯȰȱȲȳȴȵȶȷȸȹȺȻȼȽȾȿɀɁɂɃɄɅɆɇɈɉɊɋɌɍɎɏ",
+//         "aaaaaeaeggggkkqqqqqssjdzdzggpnnaaaeaeooaaaeeeeeiiiioooorrrruuuusstt    zzaaeeooooooooyy"},
+        // Latin Extended Additional
+        {"ḀḁḂḃḄḅḆḇḈḉḊḋḌḍḎḏḐḑḒḓḔḕḖḗḘḙḚḛḜḝḞḟḠḡḢḣḤḥḦḧḨḩḪḫḬḭḮḯḰḱḲḳḴḵḶḷḸḹḺḻḼḽḾḿṀṁṂṃṄṅṆṇṈṉṊṋṌṍṎṏṐṑṒṓṔṕṖṗ",
+         "aabbbbbbccddddddddddeeeeeeeeeeffgghhhhhhhhhhiiiikkkkkkllllllllmmmmmmnnnnnnnnoooooooopppp"},
+        {"ṘṙṚṛṜṝṞṟṠṡṢṣṤṥṦṧṨṩṪṫṬṭṮṯṰṱṲṳṴṵṶṷṸṹṺṻṼṽṾṿẀẁẂẃẄẅẆẇẈẉẊẋẌẍẎẏẐẑẒẓẔẕ",
+         "rrrrrrrrssssssssssttttttttuuuuuuuuuuvvvvwwwwwwwwwwxxxxyyzzzzzz"},
+        {"ẖẗẘẙẚẛẜẝẞẠạẢảẤấẦầẨẩẪẫẬậẮắẰằẲẳẴẵẶặẸẹẺẻẼẽẾếỀềỂểỄễỆệ",
+         "htwyasssssaaaaaaaaaaaaaaaaaaaaaaaaeeeeeeeeeeeeeeee"},
+        // Latin Extended C - TODO: not supported yet
+//        {"ⱠⱡⱢⱣⱤⱥⱦⱧⱨⱩⱪⱫⱬⱭⱮⱯⱰⱱⱲⱳⱴⱵⱶⱷⱸⱹⱺⱻⱼⱽⱾⱿ",
+//         "lllprathhkkzzamaaavwwwv   e ejvsz"},
+        // Latin Extended D
+        {"", ""},
+        // Latin Extended E
+        {"", ""},
+        // TODO: Add tests for non-Western languages
+        {"", ""},
    };
-    
+
    private static final String[][] testNGramStrings = {
        {"abcdefg","abbccddeeffg"},
        {" a,b.c d\te!f?g ","abbccddeeffg"},
@ -107,7 +166,5 @@ public class KeyerTests extends RefineTest {
                    "Fingerprint for string: " + ss[0] + " failed");
        }
    }
-    
- 

 }