* Add more keyer tests - All forms of Unicode whitespace for both fingerprint & N-gram fingerprint - additional N-gram fingerprint cases * Improve fingerprint keyers - Update N-gram fingerprint keyer to match (missed last time) - refactor string normalization to reduce redundancy between two keyers - add C1 controls to control characters that are stripped - include all Unicode whitespace characters in splitting delimiter and don't strip controls which are whitespace (HT, LF, VT, FF, CR, NEL) - minor cleanups, simplifications, and performance optimizations
This commit is contained in:
parent
a3fc40aa0d
commit
c8220d687e
@ -34,26 +34,35 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
package com.google.refine.clustering.binning;
|
||||
|
||||
import java.text.Normalizer;
|
||||
import java.util.Iterator;
|
||||
import java.util.TreeSet;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
|
||||
/**
|
||||
* Fingerprint keyer where fingerprint is sorted list of unique words
|
||||
* after case and diacritic folding and removing all punctuation. Word boundary
|
||||
* is any whitespace character, while output key has words joined with a single
|
||||
* ASCII space character.
|
||||
*
|
||||
*/
|
||||
public class FingerprintKeyer extends Keyer {
|
||||
|
||||
// Punctuation and control characters (except for TAB which we need for split to work)
|
||||
static final Pattern punctctrl = Pattern.compile("\\p{Punct}|[\\x00-\\x08\\x0A-\\x1F\\x7F]",
|
||||
// Punctuation plus C0 & C1 controls (except for whitespace characters which we need for split to work)
|
||||
// Added LF, VT, FF, CR, NEL to the control characters not stripped - tfm 2020-10-17
|
||||
static final Pattern punctctrl =
|
||||
Pattern.compile("\\p{Punct}|[\\x00-\\x08\\x0E-\\x1F\\x7F\\x80-\\x84\\x86-\\x9F]",
|
||||
Pattern.UNICODE_CHARACTER_CLASS);
|
||||
|
||||
public static final Pattern DIACRITICS_AND_FRIENDS = Pattern
|
||||
// Lm = modifier letter, Sk = modifier symbol
|
||||
.compile("[\\p{InCombiningDiacriticalMarks}\\p{IsLm}\\p{IsSk}]+");
|
||||
|
||||
private static final Pattern WHITESPACE = Pattern.compile("\\s+",
|
||||
Pattern.UNICODE_CHARACTER_CLASS);
|
||||
// First part of table based on https://stackoverflow.com/a/1453284/167425 by Andreas Petersson
|
||||
private static final ImmutableMap<String, String> NONDIACRITICS = ImmutableMap.<String, String>builder()
|
||||
//Replace non-diacritics as their equivalent characters
|
||||
//Replace non-diacritics with their equivalent characters
|
||||
.put("ß", "ss")
|
||||
.put("æ", "ae")
|
||||
.put("ø", "oe")
|
||||
@ -84,32 +93,32 @@ public class FingerprintKeyer extends Keyer {
|
||||
if (s == null || o !=null && o.length > 0) {
|
||||
throw new IllegalArgumentException("Fingerprint keyer accepts a single string parameter");
|
||||
}
|
||||
s = s.trim(); // first off, remove whitespace around the string
|
||||
s = s.toLowerCase(); // TODO: This is using the default locale. Is that what we want?
|
||||
s = normalize(s);
|
||||
s = punctctrl.matcher(s).replaceAll(""); // decomposition can generate punctuation so strip it after
|
||||
String[] frags = StringUtils.split(s); // split by whitespace (excluding supplementary characters)
|
||||
TreeSet<String> set = new TreeSet<String>();
|
||||
for (String ss : frags) {
|
||||
set.add(ss); // order fragments and dedupe
|
||||
}
|
||||
StringBuffer b = new StringBuffer();
|
||||
Iterator<String> i = set.iterator();
|
||||
while (i.hasNext()) { // join ordered fragments back together
|
||||
b.append(i.next());
|
||||
if (i.hasNext()) {
|
||||
b.append(' ');
|
||||
}
|
||||
}
|
||||
return b.toString();
|
||||
return WHITESPACE.splitAsStream(normalize(s, true)).sorted().distinct().collect(Collectors.joining(" "));
|
||||
}
|
||||
|
||||
protected String normalize(String s) {
|
||||
s = stripDiacritics(s);
|
||||
s = stripNonDiacritics(s);
|
||||
s = normalize(s, false); // letter transforms only for backward compatibility
|
||||
return s;
|
||||
}
|
||||
|
||||
protected String normalize(String s, boolean strong) {
|
||||
if (strong) {
|
||||
s = s.trim(); // first off, remove whitespace around the string
|
||||
s = s.toLowerCase(); // TODO: This is using the default locale. Is that what we want?
|
||||
}
|
||||
s = stripDiacritics(s);
|
||||
s = stripNonDiacritics(s);
|
||||
if (strong) {
|
||||
// TODO: Should these be converted to spaces instead of being removed?
|
||||
s = punctctrl.matcher(s).replaceAll("");
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated by tfmorris 2020-07-07 Use {@link #normalize(String)} or
|
||||
* {@link #normalize(String, boolean)}
|
||||
*/
|
||||
@Deprecated
|
||||
protected String asciify(String s) {
|
||||
return normalize(s);
|
||||
@ -127,7 +136,7 @@ public class FingerprintKeyer extends Keyer {
|
||||
for (int i = 0; i < orig.length(); i++) {
|
||||
String source = orig.substring(i, i + 1);
|
||||
String replace = NONDIACRITICS.get(source);
|
||||
result.append(replace == null ? String.valueOf(source) : replace);
|
||||
result.append(replace == null ? source : replace);
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
@ -33,13 +33,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
package com.google.refine.clustering.binning;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.TreeSet;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
|
||||
/**
|
||||
* Fingerprint keyer which generates a fingerprint from a sorted list of
|
||||
* unique character N-grams after removing all whitespace, control characters,
|
||||
* and punctuation. N-grams are concatenated to form a single output key.
|
||||
*
|
||||
*/
|
||||
public class NGramFingerprintKeyer extends FingerprintKeyer {
|
||||
|
||||
static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}|\\p{Space}");
|
||||
static final Pattern ctrlspace = Pattern.compile("\\p{Cntrl}|\\p{Space}", Pattern.UNICODE_CHARACTER_CLASS);
|
||||
|
||||
@Override
|
||||
public String key(String s, Object... o) {
|
||||
@ -47,24 +56,36 @@ public class NGramFingerprintKeyer extends FingerprintKeyer {
|
||||
if (o != null && o.length > 0 && o[0] instanceof Number) {
|
||||
ngram_size = (Integer) o[0];
|
||||
}
|
||||
s = s.toLowerCase(); // then lowercase it
|
||||
s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
|
||||
s = asciify(s); // find ASCII equivalent to characters
|
||||
TreeSet<String> set = ngram_split(s,ngram_size);
|
||||
StringBuffer b = new StringBuffer();
|
||||
Iterator<String> i = set.iterator();
|
||||
while (i.hasNext()) { // join ordered fragments back together
|
||||
b.append(i.next());
|
||||
}
|
||||
return b.toString();
|
||||
s = normalize(s, true);
|
||||
s = ctrlspace.matcher(s).replaceAll(""); // then remove all control chars & whitespace
|
||||
return sorted_ngrams(s, ngram_size).collect(Collectors.joining());
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a stream of sorted unique character N-grams from a string
|
||||
*
|
||||
* @param s string to generate N-grams from
|
||||
* @param size number of characters per N-gram
|
||||
* @return a stream of sorted unique N-gram Strings
|
||||
*/
|
||||
protected Stream<String> sorted_ngrams(String s, int size) {
|
||||
return IntStream.rangeClosed(0, s.length() - size)
|
||||
.mapToObj(i -> s.substring(i, i+size))
|
||||
.sorted()
|
||||
.distinct();
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated 2020-10-17 by tfmorris. Use {@link #sorted_ngrams(String, int)}
|
||||
*/
|
||||
@Deprecated
|
||||
protected TreeSet<String> ngram_split(String s, int size) {
|
||||
TreeSet<String> set = new TreeSet<String>();
|
||||
char[] chars = s.toCharArray();
|
||||
for (int i = 0; i + size <= chars.length; i++) {
|
||||
set.add(new String(chars,i,size));
|
||||
int length = s.length();
|
||||
for (int i = 0; i + size <= length; i++) {
|
||||
set.add(s.substring(i, i + size));
|
||||
}
|
||||
return set;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -39,9 +39,6 @@ import org.testng.annotations.BeforeTest;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import com.google.refine.RefineTest;
|
||||
import com.google.refine.clustering.binning.FingerprintKeyer;
|
||||
import com.google.refine.clustering.binning.Keyer;
|
||||
import com.google.refine.clustering.binning.NGramFingerprintKeyer;
|
||||
|
||||
|
||||
public class KeyerTests extends RefineTest {
|
||||
@ -87,6 +84,8 @@ public class KeyerTests extends RefineTest {
|
||||
{"\u0174\u0175", "ww"},
|
||||
{"\u00DD\u00FD\u00FF\u0176\u0177\u0178", "yyyyyy"},
|
||||
{"\u0179\u017A\u017B\u017C\u017D\u017E", "zzzzzz"},
|
||||
// Various forms of Unicode whitespace characters - NBSP, em space, en space, etc
|
||||
{"a\u0009\nb\u000Bc\u000Cd\re\u0085f\u00A0g\u1680h\u2000i\u2001j\u2002k\u2003l\u2004m\u2005n\u2006o\u2007p\u2008q\u2009r\u200As\u2028t\u2029u\u202Fv\u205Fw\u3000z","a b c d e f g h i j k l m n o p q r s t u v w z"},
|
||||
// Latin-1 Supplement
|
||||
{//"€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ " + // These are all considered control characters
|
||||
//"¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿" // punctuation
|
||||
@ -121,8 +120,12 @@ public class KeyerTests extends RefineTest {
|
||||
|
||||
private static final String[][] testNGramStrings = {
|
||||
{"abcdefg","abbccddeeffg"},
|
||||
{"gfedcba","bacbdcedfegf"},
|
||||
{"a b c d e f g","abbccddeeffg"},
|
||||
{" a,b.c d\te!f?g ","abbccddeeffg"},
|
||||
{"écÉCec","ceec"},
|
||||
// All the whitespace characters below should be skipped
|
||||
{"a\u0009\nb\u000Bc\u000Cd\re\u0085f\u00A0g\u1680h\u2000i\u2001j\u2002k\u2003l\u2004m\u2005n\u2006o\u2007p\u2008q\u2009r\u200As\u2028t\u2029u\u202Fv\u205Fw\u3000z","abbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwz"},
|
||||
{"",""}, //TODO: add more test cases
|
||||
{"",""},
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user