From c8220d687ed76f02168fd55c0fdcb336a8fb37eb Mon Sep 17 00:00:00 2001
From: Tom Morris <tfmorris@gmail.com>
Date: Sun, 25 Oct 2020 15:32:30 -0400
Subject: [PATCH] Improve fingerprint keyers - fixes #3282 (#3283)

* Add more keyer tests

- All forms of Unicode whitespace for both fingerprint & N-gram fingerprint
- additional N-gram fingerprint cases

* Improve fingerprint keyers

- Update N-gram fingerprint keyer to match (missed last time)
- refactor string normalization to reduce redundancy between two keyers
- add C1 controls to control characters that are stripped
- include all Unicode whitespace characters in splitting delimiter
  and don't strip controls which are whitespace (HT, LF, VT, FF, CR,
  NEL)
- minor cleanups, simplifications, and performance optimizations
---
 .../clustering/binning/FingerprintKeyer.java  | 65 +++++++++++--------
 .../binning/NGramFingerprintKeyer.java        | 51 ++++++++++-----
 .../refine/clustering/binning/KeyerTests.java |  9 ++-
 3 files changed, 79 insertions(+), 46 deletions(-)
diff --git a/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java b/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java
index 9ceddd989..728fa510c 100644
--- a/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java
+++ b/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java
@@ -34,26 +34,35 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 package com.google.refine.clustering.binning;
 
 import java.text.Normalizer;
-import java.util.Iterator;
-import java.util.TreeSet;
 import java.util.regex.Pattern;
-
-import org.apache.commons.lang.StringUtils;
+import java.util.stream.Collectors;
 
 import com.google.common.collect.ImmutableMap;
 
+/**
+ * Fingerprint keyer where the fingerprint is a sorted list of unique words
+ * after case and diacritic folding and removing all punctuation. Word boundary
+ * is any whitespace character, while output key has words joined with a single
+ * ASCII space character.
+ *
+ */
 public class FingerprintKeyer extends Keyer {
 
-    // Punctuation and control characters (except for TAB which we need for split to work)
-    static final Pattern punctctrl = Pattern.compile("\\p{Punct}|[\\x00-\\x08\\x0A-\\x1F\\x7F]",
+    // Punctuation plus C0 & C1 controls (except for whitespace characters which we need for split to work)
+    // Added LF, VT, FF, CR, NEL to the control characters not stripped - tfm 2020-10-17
+    static final Pattern punctctrl =
+            Pattern.compile("\\p{Punct}|[\\x00-\\x08\\x0E-\\x1F\\x7F\\x80-\\x84\\x86-\\x9F]",
             Pattern.UNICODE_CHARACTER_CLASS);
 
     public static final Pattern DIACRITICS_AND_FRIENDS = Pattern
+            // Lm = modifier letter, Sk = modifier symbol
             .compile("[\\p{InCombiningDiacriticalMarks}\\p{IsLm}\\p{IsSk}]+");
 
+    private static final Pattern WHITESPACE = Pattern.compile("\\s+",
+            Pattern.UNICODE_CHARACTER_CLASS);
     // First part of table based on https://stackoverflow.com/a/1453284/167425 by Andreas Petersson
     private static final ImmutableMap<String, String> NONDIACRITICS = ImmutableMap.<String, String>builder()
-            //Replace non-diacritics as their equivalent characters
+            //Replace non-diacritics with their equivalent characters
             .put("ß", "ss")
             .put("æ", "ae")
             .put("ø", "oe")
@@ -84,32 +93,32 @@ public class FingerprintKeyer extends Keyer {
         if (s == null || o !=null && o.length > 0) {
             throw new IllegalArgumentException("Fingerprint keyer accepts a single string parameter");
         }
-        s = s.trim(); // first off, remove whitespace around the string
-        s = s.toLowerCase(); // TODO: This is using the default locale. Is that what we want?
-        s = normalize(s);
-        s = punctctrl.matcher(s).replaceAll(""); // decomposition can generate punctuation so strip it after
-        String[] frags = StringUtils.split(s); // split by whitespace (excluding supplementary characters)
-        TreeSet<String> set = new TreeSet<String>();
-        for (String ss : frags) {
-            set.add(ss); // order fragments and dedupe
-        }
-        StringBuffer b = new StringBuffer();
-        Iterator<String> i = set.iterator();
-        while (i.hasNext()) {  // join ordered fragments back together
-            b.append(i.next());
-            if (i.hasNext()) {
-                b.append(' ');
-            }
-        }
-        return b.toString();
+        return WHITESPACE.splitAsStream(normalize(s, true)).sorted().distinct().collect(Collectors.joining(" "));
     }
 
     protected String normalize(String s) {
-        s = stripDiacritics(s);
-        s = stripNonDiacritics(s);
+        s = normalize(s, false); // letter transforms only for backward compatibility
         return s;
     }
 
+    protected String normalize(String s, boolean strong) {
+        if (strong) {
+            s = s.trim(); // first off, remove whitespace around the string
+            s = s.toLowerCase(); // TODO: This is using the default locale. Is that what we want?
+        }
+        s = stripDiacritics(s);
+        s = stripNonDiacritics(s);
+        if (strong) {
+            // TODO: Should these be converted to spaces instead of being removed?
+            s = punctctrl.matcher(s).replaceAll("");
+        }
+        return s;
+    }
+
+    /**
+     * @deprecated by tfmorris 2020-07-07 Use {@link #normalize(String)} or
+     *             {@link #normalize(String, boolean)}
+     */
     @Deprecated
     protected String asciify(String s) {
         return normalize(s);
@@ -127,7 +136,7 @@ public class FingerprintKeyer extends Keyer {
         for (int i = 0; i < orig.length(); i++) {
             String source = orig.substring(i, i + 1);
             String replace = NONDIACRITICS.get(source);
-            result.append(replace == null ? String.valueOf(source) : replace);
+            result.append(replace == null ? source : replace);
         }
         return result.toString();
     }
diff --git a/main/src/com/google/refine/clustering/binning/NGramFingerprintKeyer.java b/main/src/com/google/refine/clustering/binning/NGramFingerprintKeyer.java
index 927012ecd..aacd60e4e 100644
--- a/main/src/com/google/refine/clustering/binning/NGramFingerprintKeyer.java
+++ b/main/src/com/google/refine/clustering/binning/NGramFingerprintKeyer.java
@@ -33,13 +33,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 package com.google.refine.clustering.binning;
 
-import java.util.Iterator;
 import java.util.TreeSet;
 import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+import java.util.stream.Stream;
 
+
+/**
+ * Fingerprint keyer which generates a fingerprint from a sorted list of
+ * unique character N-grams after removing all whitespace, control characters,
+ * and punctuation. N-grams are concatenated to form a single output key.
+ *
+ */
 public class NGramFingerprintKeyer extends FingerprintKeyer {
 
-    static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}|\\p{Space}");
+    static final Pattern ctrlspace = Pattern.compile("\\p{Cntrl}|\\p{Space}", Pattern.UNICODE_CHARACTER_CLASS);
     
     @Override
     public String key(String s, Object... o) {
@@ -47,24 +56,36 @@ public class NGramFingerprintKeyer extends FingerprintKeyer {
         if (o != null && o.length > 0 && o[0] instanceof Number) {
             ngram_size = (Integer) o[0];
         }
-        s = s.toLowerCase(); // then lowercase it
-        s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
-        s = asciify(s); // find ASCII equivalent to characters
-        TreeSet<String> set = ngram_split(s,ngram_size);
-        StringBuffer b = new StringBuffer();
-        Iterator<String> i = set.iterator();
-        while (i.hasNext()) { // join ordered fragments back together
-            b.append(i.next());
-        }
-        return b.toString();
+        s = normalize(s, true);
+        s = ctrlspace.matcher(s).replaceAll(""); // then remove all control chars & whitespace
+        return sorted_ngrams(s, ngram_size).collect(Collectors.joining());
     }
 
+    /**
+     * Generate a stream of sorted unique character N-grams from a string
+     * 
+     * @param s string to generate N-grams from
+     * @param size number of characters per N-gram
+     * @return a stream of sorted unique N-gram Strings
+     */
+    protected Stream<String> sorted_ngrams(String s, int size) {
+        return IntStream.rangeClosed(0, s.length() - size)
+        .mapToObj(i -> s.substring(i,  i+size))
+        .sorted()
+        .distinct();
+    }
+
+    /**
+     * @deprecated 2020-10-17 by tfmorris. Use {@link #sorted_ngrams(String, int)}
+     */
+    @Deprecated
     protected TreeSet<String> ngram_split(String s, int size) {
         TreeSet<String> set = new TreeSet<String>();
-        char[] chars = s.toCharArray();
-        for (int i = 0; i + size <= chars.length; i++) {
-            set.add(new String(chars,i,size));
+        int length = s.length();
+        for (int i = 0; i + size <= length; i++) {
+            set.add(s.substring(i, i + size));
         }
         return set;
     }
+
 }
diff --git a/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java b/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java
index 396186919..1335ee463 100644
--- a/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java
+++ b/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java
@@ -39,9 +39,6 @@ import org.testng.annotations.BeforeTest;
 import org.testng.annotations.Test;
 
 import com.google.refine.RefineTest;
-import com.google.refine.clustering.binning.FingerprintKeyer;
-import com.google.refine.clustering.binning.Keyer;
-import com.google.refine.clustering.binning.NGramFingerprintKeyer;
 
 
 public class KeyerTests extends RefineTest {
@@ -87,6 +84,8 @@ public class KeyerTests extends RefineTest {
         {"\u0174\u0175", "ww"},
         {"\u00DD\u00FD\u00FF\u0176\u0177\u0178", "yyyyyy"},
         {"\u0179\u017A\u017B\u017C\u017D\u017E", "zzzzzz"},
+        // Various forms of Unicode whitespace characters - NBSP, em space, en space, etc
+        {"a\u0009\nb\u000Bc\u000Cd\re\u0085f\u00A0g\u1680h\u2000i\u2001j\u2002k\u2003l\u2004m\u2005n\u2006o\u2007p\u2008q\u2009r\u200As\u2028t\u2029u\u202Fv\u205Fw\u3000z","a b c d e f g h i j k l m n o p q r s t u v w z"},
         // Latin-1 Supplement
         {//"€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ " + // These are all considered control characters
          //"¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿" // punctuation
@@ -121,8 +120,12 @@ public class KeyerTests extends RefineTest {
 
     private static final String[][] testNGramStrings = {
         {"abcdefg","abbccddeeffg"},
+        {"gfedcba","bacbdcedfegf"},
+        {"a b c d e f g","abbccddeeffg"},
         {" a,b.c d\te!f?g ","abbccddeeffg"},
         {"écÉCec","ceec"},
+        // All the whitespace characters below should be skipped
+        {"a\u0009\nb\u000Bc\u000Cd\re\u0085f\u00A0g\u1680h\u2000i\u2001j\u2002k\u2003l\u2004m\u2005n\u2006o\u2007p\u2008q\u2009r\u200As\u2028t\u2029u\u202Fv\u205Fw\u3000z","abbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwz"},
         {"",""}, //TODO: add more test cases
         {"",""},
     };