Improve fingerprint keyers - fixes #3282 (#3283)

* Add more keyer tests

- All forms of Unicode whitespace for both fingerprint & N-gram fingerprint
- additional N-gram fingerprint cases

* Improve fingerprint keyers

- Update N-gram fingerprint keyer to match (missed last time)
- refactor string normalization to reduce redundancy between two keyers
- add C1 controls to control characters that are stripped
- include all Unicode whitespace characters in splitting delimiter
  and don't strip controls which are whitespace (HT, LF, VT, FF, CR,
NEL)
- minor cleanups, simplifications, and performance optimizations
This commit is contained in:
Tom Morris 2020-10-25 15:32:30 -04:00 committed by GitHub
parent a3fc40aa0d
commit c8220d687e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 79 additions and 46 deletions

View File

@ -34,26 +34,35 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package com.google.refine.clustering.binning;
import java.text.Normalizer;
import java.util.Iterator;
import java.util.TreeSet;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import java.util.stream.Collectors;
import com.google.common.collect.ImmutableMap;
/**
* Fingerprint keyer where fingerprint is sorted list of unique words
* after case and diacritic folding and removing all punctuation. Word boundary
* is any whitespace character, while output key has words joined with a single
* ASCII space character.
*
*/
public class FingerprintKeyer extends Keyer {
// Punctuation and control characters (except for TAB which we need for split to work)
static final Pattern punctctrl = Pattern.compile("\\p{Punct}|[\\x00-\\x08\\x0A-\\x1F\\x7F]",
// Punctuation plus C0 & C1 controls (except for whitespace characters which we need for split to work)
// Added LF, VT, FF, CR, NEL to the control characters not stripped - tfm 2020-10-17
static final Pattern punctctrl =
Pattern.compile("\\p{Punct}|[\\x00-\\x08\\x0E-\\x1F\\x7F\\x80-\\x84\\x86-\\x9F]",
Pattern.UNICODE_CHARACTER_CLASS);
public static final Pattern DIACRITICS_AND_FRIENDS = Pattern
// Lm = modifier letter, Sk = modifier symbol
.compile("[\\p{InCombiningDiacriticalMarks}\\p{IsLm}\\p{IsSk}]+");
private static final Pattern WHITESPACE = Pattern.compile("\\s+",
Pattern.UNICODE_CHARACTER_CLASS);
// First part of table based on https://stackoverflow.com/a/1453284/167425 by Andreas Petersson
private static final ImmutableMap<String, String> NONDIACRITICS = ImmutableMap.<String, String>builder()
//Replace non-diacritics as their equivalent characters
//Replace non-diacritics with their equivalent characters
.put("ß", "ss")
.put("æ", "ae")
.put("ø", "oe")
@ -84,32 +93,32 @@ public class FingerprintKeyer extends Keyer {
if (s == null || o !=null && o.length > 0) {
throw new IllegalArgumentException("Fingerprint keyer accepts a single string parameter");
}
s = s.trim(); // first off, remove whitespace around the string
s = s.toLowerCase(); // TODO: This is using the default locale. Is that what we want?
s = normalize(s);
s = punctctrl.matcher(s).replaceAll(""); // decomposition can generate punctuation so strip it after
String[] frags = StringUtils.split(s); // split by whitespace (excluding supplementary characters)
TreeSet<String> set = new TreeSet<String>();
for (String ss : frags) {
set.add(ss); // order fragments and dedupe
}
StringBuffer b = new StringBuffer();
Iterator<String> i = set.iterator();
while (i.hasNext()) { // join ordered fragments back together
b.append(i.next());
if (i.hasNext()) {
b.append(' ');
}
}
return b.toString();
return WHITESPACE.splitAsStream(normalize(s, true)).sorted().distinct().collect(Collectors.joining(" "));
}
/**
 * Apply only the letter transforms (diacritic stripping and non-diacritic
 * replacement) to a string. Retained for backward compatibility.
 *
 * @param s string to normalize
 * @return the string after the letter transforms performed by
 *         {@link #normalize(String, boolean)} with {@code strong == false}
 */
protected String normalize(String s) {
    // The two-argument overload already runs stripDiacritics() and
    // stripNonDiacritics(), so calling them here first only repeated the work.
    return normalize(s, false); // letter transforms only for backward compatibility
}
/**
 * Normalize a string for use in a fingerprint key.
 *
 * @param s string to normalize
 * @param strong if true, additionally trim and lowercase the string before the
 *               letter transforms and strip everything matched by
 *               {@code punctctrl} afterwards; if false, only the letter
 *               transforms (diacritic and non-diacritic replacement) are applied
 * @return the normalized string
 */
protected String normalize(String s, boolean strong) {
    if (strong) {
        s = s.trim(); // first off, remove whitespace around the string
        // Lowercase with a fixed locale so that the generated key does not
        // depend on the JVM's default locale (e.g. Turkish dotted/dotless i).
        s = s.toLowerCase(java.util.Locale.ROOT);
    }
    s = stripDiacritics(s);
    s = stripNonDiacritics(s);
    if (strong) {
        // Stripping happens after the letter transforms because decomposition
        // can generate punctuation.
        // TODO: Should these be converted to spaces instead of being removed?
        s = punctctrl.matcher(s).replaceAll("");
    }
    return s;
}
/**
* @deprecated by tfmorris 2020-07-07 Use {@link #normalize(String)} or
* {@link #normalize(String, boolean)}
*/
@Deprecated
protected String asciify(String s) {
return normalize(s);
@ -127,7 +136,7 @@ public class FingerprintKeyer extends Keyer {
for (int i = 0; i < orig.length(); i++) {
String source = orig.substring(i, i + 1);
String replace = NONDIACRITICS.get(source);
result.append(replace == null ? String.valueOf(source) : replace);
result.append(replace == null ? source : replace);
}
return result.toString();
}

View File

@ -33,13 +33,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package com.google.refine.clustering.binning;
import java.util.Iterator;
import java.util.TreeSet;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
/**
* Fingerprint keyer which generates a fingerprint from a sorted list of
* unique character N-grams after removing all whitespace, control characters,
* and punctuation. N-grams are concatenated to form a single output key.
*
*/
public class NGramFingerprintKeyer extends FingerprintKeyer {
static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}|\\p{Space}");
static final Pattern ctrlspace = Pattern.compile("\\p{Cntrl}|\\p{Space}", Pattern.UNICODE_CHARACTER_CLASS);
@Override
public String key(String s, Object... o) {
@ -47,24 +56,36 @@ public class NGramFingerprintKeyer extends FingerprintKeyer {
if (o != null && o.length > 0 && o[0] instanceof Number) {
ngram_size = (Integer) o[0];
}
s = s.toLowerCase(); // then lowercase it
s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
s = asciify(s); // find ASCII equivalent to characters
TreeSet<String> set = ngram_split(s,ngram_size);
StringBuffer b = new StringBuffer();
Iterator<String> i = set.iterator();
while (i.hasNext()) { // join ordered fragments back together
b.append(i.next());
}
return b.toString();
s = normalize(s, true);
s = ctrlspace.matcher(s).replaceAll(""); // then remove all control chars & whitespace
return sorted_ngrams(s, ngram_size).collect(Collectors.joining());
}
/**
 * Produce the unique N-grams of a string as a lazily evaluated stream in
 * sorted (natural String) order. Every substring of {@code size} chars
 * starting at each index from 0 through {@code s.length() - size} is a
 * candidate; duplicates are dropped.
 *
 * @param s string to generate N-grams from
 * @param size number of characters per N-gram
 * @return a stream of sorted unique N-gram Strings (empty when the string is
 *         shorter than {@code size})
 */
protected Stream<String> sorted_ngrams(String s, int size) {
    final int lastStart = s.length() - size;
    IntStream starts = IntStream.rangeClosed(0, lastStart);
    Stream<String> grams = starts.mapToObj(begin -> s.substring(begin, begin + size));
    // De-duplicating before sorting yields the same elements in the same order
    return grams.distinct().sorted();
}
/**
* @deprecated 2020-10-17 by tfmorris. Use {@link #sorted_ngrams(String, int)}
*/
@Deprecated
protected TreeSet<String> ngram_split(String s, int size) {
TreeSet<String> set = new TreeSet<String>();
char[] chars = s.toCharArray();
for (int i = 0; i + size <= chars.length; i++) {
set.add(new String(chars,i,size));
int length = s.length();
for (int i = 0; i + size <= length; i++) {
set.add(s.substring(i, i + size));
}
return set;
}
}

View File

@ -39,9 +39,6 @@ import org.testng.annotations.BeforeTest;
import org.testng.annotations.Test;
import com.google.refine.RefineTest;
import com.google.refine.clustering.binning.FingerprintKeyer;
import com.google.refine.clustering.binning.Keyer;
import com.google.refine.clustering.binning.NGramFingerprintKeyer;
public class KeyerTests extends RefineTest {
@ -87,6 +84,8 @@ public class KeyerTests extends RefineTest {
{"\u0174\u0175", "ww"},
{"\u00DD\u00FD\u00FF\u0176\u0177\u0178", "yyyyyy"},
{"\u0179\u017A\u017B\u017C\u017D\u017E", "zzzzzz"},
// Various forms of Unicode whitespace characters - NBSP, em space, en space, etc
{"a\u0009\nb\u000Bc\u000Cd\re\u0085f\u00A0g\u1680h\u2000i\u2001j\u2002k\u2003l\u2004m\u2005n\u2006o\u2007p\u2008q\u2009r\u200As\u2028t\u2029u\u202Fv\u205Fw\u3000z","a b c d e f g h i j k l m n o p q r s t u v w z"},
// Latin-1 Supplement
{//"€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ " + // These are all considered control characters
//"¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿" // punctuation
@ -121,8 +120,12 @@ public class KeyerTests extends RefineTest {
private static final String[][] testNGramStrings = {
{"abcdefg","abbccddeeffg"},
{"gfedcba","bacbdcedfegf"},
{"a b c d e f g","abbccddeeffg"},
{" a,b.c d\te!f?g ","abbccddeeffg"},
{"écÉCec","ceec"},
// All the whitespace characters below should be skipped
{"a\u0009\nb\u000Bc\u000Cd\re\u0085f\u00A0g\u1680h\u2000i\u2001j\u2002k\u2003l\u2004m\u2005n\u2006o\u2007p\u2008q\u2009r\u200As\u2028t\u2029u\u202Fv\u205Fw\u3000z","abbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwz"},
{"",""}, //TODO: add more test cases
{"",""},
};