Improve fingerprint keyers - fixes #3282 (#3283)

* Add more keyer tests - All forms of Unicode whitespace for both fingerprint & N-gram fingerprint - additional N-gram fingerprint cases * Improve fingerprint keyers - Update N-gram fingerprint keyer to match (missed last time) - refactor string normalization to reduce redundancy between two keyers - add C1 controls to control characters that are stripped - include all Unicode whitespace characters in splitting delimiter and don't strip controls which are whitespace (HT, LF, VT, FF, CR, NEL) - minor cleanups, simplifications, and performance optimizations
2020-10-25 15:32:30 -04:00 · 2020-10-25 15:32:30 -04:00 · c8220d687e
commit c8220d687e
parent a3fc40aa0d
3 changed files with 79 additions and 46 deletions
--- a/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java
+++ b/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java
@ -34,26 +34,35 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 package com.google.refine.clustering.binning;

 import java.text.Normalizer;
-import java.util.Iterator;
-import java.util.TreeSet;
 import java.util.regex.Pattern;
-
-import org.apache.commons.lang.StringUtils;
+import java.util.stream.Collectors;

 import com.google.common.collect.ImmutableMap;

+/**
+ * Fingerprint keyer where fingerprint is sorted list of unique words
+ * after case and diacritic folding and removing all punctuation. Word boundary
+ * is any whitespace character, while output key has words joined with a single
+ * ASCII space character.
+ *
+ */
 public class FingerprintKeyer extends Keyer {

-    // Punctuation and control characters (except for TAB which we need for split to work)
-    static final Pattern punctctrl = Pattern.compile("\\p{Punct}|[\\x00-\\x08\\x0A-\\x1F\\x7F]",
+    // Punctuation plus C0 & C1 controls (except for whitespace characters which we need for split to work)
+    // Added LF, VT, FF, CR, NEL to the control characters not stripped - tfm 2020-10-17
+    static final Pattern punctctrl =
+            Pattern.compile("\\p{Punct}|[\\x00-\\x08\\x0E-\\x1F\\x7F\\x80-\\x84\\x86-\\x9F]",
            Pattern.UNICODE_CHARACTER_CLASS);

    public static final Pattern DIACRITICS_AND_FRIENDS = Pattern
+            // Lm = modifier letter, Sk = modifier symbol
            .compile("[\\p{InCombiningDiacriticalMarks}\\p{IsLm}\\p{IsSk}]+");

+    private static final Pattern WHITESPACE = Pattern.compile("\\s+",
+            Pattern.UNICODE_CHARACTER_CLASS);
    // First part of table based on https://stackoverflow.com/a/1453284/167425 by Andreas Petersson
    private static final ImmutableMap<String, String> NONDIACRITICS = ImmutableMap.<String, String>builder()
-            //Replace non-diacritics as their equivalent characters
+            //Replace non-diacritics with their equivalent characters
            .put("ß", "ss")
            .put("æ", "ae")
            .put("ø", "oe")
@ -84,32 +93,32 @@ public class FingerprintKeyer extends Keyer {
        if (s == null || o !=null && o.length > 0) {
            throw new IllegalArgumentException("Fingerprint keyer accepts a single string parameter");
        }
-        s = s.trim(); // first off, remove whitespace around the string
-        s = s.toLowerCase(); // TODO: This is using the default locale. Is that what we want?
-        s = normalize(s);
-        s = punctctrl.matcher(s).replaceAll(""); // decomposition can generate punctuation so strip it after
-        String[] frags = StringUtils.split(s); // split by whitespace (excluding supplementary characters)
-        TreeSet<String> set = new TreeSet<String>();
-        for (String ss : frags) {
-            set.add(ss); // order fragments and dedupe
-        }
-        StringBuffer b = new StringBuffer();
-        Iterator<String> i = set.iterator();
-        while (i.hasNext()) {  // join ordered fragments back together
-            b.append(i.next());
-            if (i.hasNext()) {
-                b.append(' ');
-            }
-        }
-        return b.toString();
+        return WHITESPACE.splitAsStream(normalize(s, true)).sorted().distinct().collect(Collectors.joining(" "));
    }

    protected String normalize(String s) {
-        s = stripDiacritics(s);
-        s = stripNonDiacritics(s);
+        s = normalize(s, false); // letter transforms only for backward compatibility
        return s;
    }

+    protected String normalize(String s, boolean strong) {
+        if (strong) {
+            s = s.trim(); // first off, remove whitespace around the string
+            s = s.toLowerCase(); // TODO: This is using the default locale. Is that what we want?
+        }
+        s = stripDiacritics(s);
+        s = stripNonDiacritics(s);
+        if (strong) {
+            // TODO: Should these be converted to spaces instead of being removed?
+            s = punctctrl.matcher(s).replaceAll("");
+        }
+        return s;
+    }
+
+    /**
+     * @deprecated by tfmorris 2020-07-07 Use {@link #normalize(String)} or
+     *             {@link #normalize(String, boolean)}
+     */
    @Deprecated
    protected String asciify(String s) {
        return normalize(s);
@ -127,7 +136,7 @@ public class FingerprintKeyer extends Keyer {
        for (int i = 0; i < orig.length(); i++) {
            String source = orig.substring(i, i + 1);
            String replace = NONDIACRITICS.get(source);
-            result.append(replace == null ? String.valueOf(source) : replace);
+            result.append(replace == null ? source : replace);
        }
        return result.toString();
    }
--- a/main/src/com/google/refine/clustering/binning/NGramFingerprintKeyer.java
+++ b/main/src/com/google/refine/clustering/binning/NGramFingerprintKeyer.java
@ -33,13 +33,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 package com.google.refine.clustering.binning;

-import java.util.Iterator;
 import java.util.TreeSet;
 import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+import java.util.stream.Stream;

+
+/**
+ * Fingerprint keyer which generates a fingerprint from a sorted list of
+ * unique character N-grams after removing all whitespace, control characters,
+ * and punctuation. N-grams are concatenated to form a single output key.
+ *
+ */
 public class NGramFingerprintKeyer extends FingerprintKeyer {

-    static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}|\\p{Space}");
+    static final Pattern ctrlspace = Pattern.compile("\\p{Cntrl}|\\p{Space}", Pattern.UNICODE_CHARACTER_CLASS);
    
    @Override
    public String key(String s, Object... o) {
@ -47,24 +56,36 @@ public class NGramFingerprintKeyer extends FingerprintKeyer {
        if (o != null && o.length > 0 && o[0] instanceof Number) {
            ngram_size = (Integer) o[0];
        }
-        s = s.toLowerCase(); // then lowercase it
-        s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
-        s = asciify(s); // find ASCII equivalent to characters
-        TreeSet<String> set = ngram_split(s,ngram_size);
-        StringBuffer b = new StringBuffer();
-        Iterator<String> i = set.iterator();
-        while (i.hasNext()) { // join ordered fragments back together
-            b.append(i.next());
-        }
-        return b.toString();
+        s = normalize(s, true);
+        s = ctrlspace.matcher(s).replaceAll(""); // then remove all control chars & whitespace
+        return sorted_ngrams(s, ngram_size).collect(Collectors.joining());
    }

+    /**
+     * Generate a stream of sorted unique character N-grams from a string
+     * 
+     * @param s the string to generate N-grams from
+     * @param size number of characters per N-gram
+     * @return a stream of sorted unique N-gram Strings
+     */
+    protected Stream<String> sorted_ngrams(String s, int size) {
+        return IntStream.rangeClosed(0, s.length() - size)
+        .mapToObj(i -> s.substring(i,  i+size))
+        .sorted()
+        .distinct();
+    }
+
+    /**
+     * @deprecated 2020-10-17 by tfmorris. Use {@link #sorted_ngrams(String, int)}
+     */
+    @Deprecated
    protected TreeSet<String> ngram_split(String s, int size) {
        TreeSet<String> set = new TreeSet<String>();
-        char[] chars = s.toCharArray();
-        for (int i = 0; i + size <= chars.length; i++) {
-            set.add(new String(chars,i,size));
+        int length = s.length();
+        for (int i = 0; i + size <= length; i++) {
+            set.add(s.substring(i, i + size));
        }
        return set;
    }
+
 }
--- a/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java
+++ b/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java
@ -39,9 +39,6 @@ import org.testng.annotations.BeforeTest;
 import org.testng.annotations.Test;

 import com.google.refine.RefineTest;
-import com.google.refine.clustering.binning.FingerprintKeyer;
-import com.google.refine.clustering.binning.Keyer;
-import com.google.refine.clustering.binning.NGramFingerprintKeyer;


 public class KeyerTests extends RefineTest {
@ -87,6 +84,8 @@ public class KeyerTests extends RefineTest {
        {"\u0174\u0175", "ww"},
        {"\u00DD\u00FD\u00FF\u0176\u0177\u0178", "yyyyyy"},
        {"\u0179\u017A\u017B\u017C\u017D\u017E", "zzzzzz"},
+        // Various forms of Unicode whitespace characters - NBSP, em space, en space, etc
+        {"a\u0009\nb\u000Bc\u000Cd\re\u0085f\u00A0g\u1680h\u2000i\u2001j\u2002k\u2003l\u2004m\u2005n\u2006o\u2007p\u2008q\u2009r\u200As\u2028t\u2029u\u202Fv\u205Fw\u3000z","a b c d e f g h i j k l m n o p q r s t u v w z"},
        // Latin-1 Supplement
        {//"€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ " + // These are all considered control characters
         //"¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿" // punctuation
@ -121,8 +120,12 @@ public class KeyerTests extends RefineTest {

    private static final String[][] testNGramStrings = {
        {"abcdefg","abbccddeeffg"},
+        {"gfedcba","bacbdcedfegf"},
+        {"a b c d e f g","abbccddeeffg"},
        {" a,b.c d\te!f?g ","abbccddeeffg"},
        {"écÉCec","ceec"},
+        // All the whitespace characters below should be skipped
+        {"a\u0009\nb\u000Bc\u000Cd\re\u0085f\u00A0g\u1680h\u2000i\u2001j\u2002k\u2003l\u2004m\u2005n\u2006o\u2007p\u2008q\u2009r\u200As\u2028t\u2029u\u202Fv\u205Fw\u3000z","abbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwz"},
        {"",""}, //TODO: add more test cases
        {"",""},
    };