From c8220d687ed76f02168fd55c0fdcb336a8fb37eb Mon Sep 17 00:00:00 2001 From: Tom Morris Date: Sun, 25 Oct 2020 15:32:30 -0400 Subject: [PATCH] Improve fingerprint keyers - fixes #3282 (#3283) * Add more keyer tests - All forms of Unicode whitespace for both fingerprint & N-gram fingerprint - additional N-gram fingerprint cases * Improve fingerprint keyers - Update N-gram fingerprint keyer to match (missed last time) - refactor string normalization to reduce redundancy between two keyers - add C1 controls to control characters that are stripped - include all Unicode whitespace characters in splitting delimiter and don't strip controls which are whitespace (HT, LF, VT, FF, CR, NEL) - minor cleanups, simplifications, and performance optimizations --- .../clustering/binning/FingerprintKeyer.java | 65 +++++++++++-------- .../binning/NGramFingerprintKeyer.java | 51 ++++++++++----- .../refine/clustering/binning/KeyerTests.java | 9 ++- 3 files changed, 79 insertions(+), 46 deletions(-) diff --git a/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java b/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java index 9ceddd989..728fa510c 100644 --- a/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java +++ b/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java @@ -34,26 +34,35 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. package com.google.refine.clustering.binning; import java.text.Normalizer; -import java.util.Iterator; -import java.util.TreeSet; import java.util.regex.Pattern; - -import org.apache.commons.lang.StringUtils; +import java.util.stream.Collectors; import com.google.common.collect.ImmutableMap; +/** + * Fingerprint keyer where fingerprint is sorted list of unique words + * after case and diacritic folding and removing all punctuation. Word boundary + * is any whitespace character, while output key has words joined with a single + * ASCII space character. 
+ * + */ public class FingerprintKeyer extends Keyer { - // Punctuation and control characters (except for TAB which we need for split to work) - static final Pattern punctctrl = Pattern.compile("\\p{Punct}|[\\x00-\\x08\\x0A-\\x1F\\x7F]", + // Punctuation plus C0 & C1 controls (except for whitespace characters which we need for split to work) + // Added LF, VT, FF, CR, NEL to the control characters not stripped - tfm 2020-10-17 + static final Pattern punctctrl = + Pattern.compile("\\p{Punct}|[\\x00-\\x08\\x0E-\\x1F\\x7F\\x80-\\x84\\x86-\\x9F]", Pattern.UNICODE_CHARACTER_CLASS); public static final Pattern DIACRITICS_AND_FRIENDS = Pattern + // Lm = modifier letter, Sk = modifier symbol .compile("[\\p{InCombiningDiacriticalMarks}\\p{IsLm}\\p{IsSk}]+"); + private static final Pattern WHITESPACE = Pattern.compile("\\s+", + Pattern.UNICODE_CHARACTER_CLASS); // First part of table based on https://stackoverflow.com/a/1453284/167425 by Andreas Petersson private static final ImmutableMap NONDIACRITICS = ImmutableMap.builder() - //Replace non-diacritics as their equivalent characters + //Replace non-diacritics with their equivalent characters .put("ß", "ss") .put("æ", "ae") .put("ø", "oe") @@ -84,32 +93,32 @@ public class FingerprintKeyer extends Keyer { if (s == null || o !=null && o.length > 0) { throw new IllegalArgumentException("Fingerprint keyer accepts a single string parameter"); } - s = s.trim(); // first off, remove whitespace around the string - s = s.toLowerCase(); // TODO: This is using the default locale. Is that what we want? 
- s = normalize(s); - s = punctctrl.matcher(s).replaceAll(""); // decomposition can generate punctuation so strip it after - String[] frags = StringUtils.split(s); // split by whitespace (excluding supplementary characters) - TreeSet set = new TreeSet(); - for (String ss : frags) { - set.add(ss); // order fragments and dedupe - } - StringBuffer b = new StringBuffer(); - Iterator i = set.iterator(); - while (i.hasNext()) { // join ordered fragments back together - b.append(i.next()); - if (i.hasNext()) { - b.append(' '); - } - } - return b.toString(); + return WHITESPACE.splitAsStream(normalize(s, true)).sorted().distinct().collect(Collectors.joining(" ")); } protected String normalize(String s) { - s = stripDiacritics(s); - s = stripNonDiacritics(s); + s = normalize(s, false); // letter transforms only for backward compatibility return s; } + protected String normalize(String s, boolean strong) { + if (strong) { + s = s.trim(); // first off, remove whitespace around the string + s = s.toLowerCase(); // TODO: This is using the default locale. Is that what we want? + } + s = stripDiacritics(s); + s = stripNonDiacritics(s); + if (strong) { + // TODO: Should these be converted to spaces instead of being removed? + s = punctctrl.matcher(s).replaceAll(""); + } + return s; + } + + /** + * @deprecated by tfmorris 2020-07-07 Use {@link #normalize(String)} or + * {@link #normalize(String, boolean)} + */ @Deprecated protected String asciify(String s) { return normalize(s); @@ -127,7 +136,7 @@ public class FingerprintKeyer extends Keyer { for (int i = 0; i < orig.length(); i++) { String source = orig.substring(i, i + 1); String replace = NONDIACRITICS.get(source); - result.append(replace == null ? String.valueOf(source) : replace); + result.append(replace == null ? 
source : replace); } return result.toString(); } diff --git a/main/src/com/google/refine/clustering/binning/NGramFingerprintKeyer.java b/main/src/com/google/refine/clustering/binning/NGramFingerprintKeyer.java index 927012ecd..aacd60e4e 100644 --- a/main/src/com/google/refine/clustering/binning/NGramFingerprintKeyer.java +++ b/main/src/com/google/refine/clustering/binning/NGramFingerprintKeyer.java @@ -33,13 +33,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. package com.google.refine.clustering.binning; -import java.util.Iterator; import java.util.TreeSet; import java.util.regex.Pattern; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +/** + * Fingerprint keyer which generates a fingerprint from a sorted list of + * unique character N-grams after removing all whitespace, control characters, + * and punctuation. N-grams are concatenated to form a single output key. + * + */ public class NGramFingerprintKeyer extends FingerprintKeyer { - static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}|\\p{Space}"); + static final Pattern ctrlspace = Pattern.compile("\\p{Cntrl}|\\p{Space}", Pattern.UNICODE_CHARACTER_CLASS); @Override public String key(String s, Object... 
o) { @@ -47,24 +56,36 @@ public class NGramFingerprintKeyer extends FingerprintKeyer { if (o != null && o.length > 0 && o[0] instanceof Number) { ngram_size = (Integer) o[0]; } - s = s.toLowerCase(); // then lowercase it - s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars - s = asciify(s); // find ASCII equivalent to characters - TreeSet set = ngram_split(s,ngram_size); - StringBuffer b = new StringBuffer(); - Iterator i = set.iterator(); - while (i.hasNext()) { // join ordered fragments back together - b.append(i.next()); - } - return b.toString(); + s = normalize(s, true); + s = ctrlspace.matcher(s).replaceAll(""); // then remove all control chars & whitespace + return sorted_ngrams(s, ngram_size).collect(Collectors.joining()); } + /** + * Generate a stream of sorted unique character N-grams from a string + * + * @param s String to generate N-grams from + * @param size number of characters per N-gram + * @return a stream of sorted unique N-gram Strings + */ + protected Stream sorted_ngrams(String s, int size) { + return IntStream.rangeClosed(0, s.length() - size) + .mapToObj(i -> s.substring(i, i+size)) + .sorted() + .distinct(); + } + + /** + * @deprecated 2020-10-17 by tfmorris. 
Use {@link #sorted_ngrams(String, int)} + */ + @Deprecated protected TreeSet ngram_split(String s, int size) { TreeSet set = new TreeSet(); - char[] chars = s.toCharArray(); - for (int i = 0; i + size <= chars.length; i++) { - set.add(new String(chars,i,size)); + int length = s.length(); + for (int i = 0; i + size <= length; i++) { + set.add(s.substring(i, i + size)); } return set; } + } diff --git a/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java b/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java index 396186919..1335ee463 100644 --- a/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java +++ b/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java @@ -39,9 +39,6 @@ import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; import com.google.refine.RefineTest; -import com.google.refine.clustering.binning.FingerprintKeyer; -import com.google.refine.clustering.binning.Keyer; -import com.google.refine.clustering.binning.NGramFingerprintKeyer; public class KeyerTests extends RefineTest { @@ -87,6 +84,8 @@ public class KeyerTests extends RefineTest { {"\u0174\u0175", "ww"}, {"\u00DD\u00FD\u00FF\u0176\u0177\u0178", "yyyyyy"}, {"\u0179\u017A\u017B\u017C\u017D\u017E", "zzzzzz"}, + // Various forms of Unicode whitespace characters - NBSP, em space, en space, etc + {"a\u0009\nb\u000Bc\u000Cd\re\u0085f\u00A0g\u1680h\u2000i\u2001j\u2002k\u2003l\u2004m\u2005n\u2006o\u2007p\u2008q\u2009r\u200As\u2028t\u2029u\u202Fv\u205Fw\u3000z","a b c d e f g h i j k l m n o p q r s t u v w z"}, // Latin-1 Supplement {//"€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ " + // These are all considered control characters //"¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿" // punctuation @@ -121,8 +120,12 @@ public class KeyerTests extends RefineTest { private static final String[][] testNGramStrings = { {"abcdefg","abbccddeeffg"}, + {"gfedcba","bacbdcedfegf"}, + {"a b c d e f g","abbccddeeffg"}, {" a,b.c d\te!f?g 
","abbccddeeffg"}, {"écÉCec","ceec"}, + // All the whitespace characters below should be skipped + {"a\u0009\nb\u000Bc\u000Cd\re\u0085f\u00A0g\u1680h\u2000i\u2001j\u2002k\u2003l\u2004m\u2005n\u2006o\u2007p\u2008q\u2009r\u200As\u2028t\u2029u\u202Fv\u205Fw\u3000z","abbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwz"}, {"",""}, //TODO: add more test cases {"",""}, };