From 1da3c00cb197e871787af03024cc746be8394dc1 Mon Sep 17 00:00:00 2001 From: Antonin Delpeuch Date: Wed, 27 Sep 2017 09:23:40 +0100 Subject: [PATCH] Perform ASCII normalization earlier in FingerprintKeyer. This closes #1256. --- .../com/google/refine/clustering/binning/FingerprintKeyer.java | 3 ++- .../com/google/refine/tests/clustering/binning/KeyerTests.java | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java b/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java index 005207fdc..5d8885996 100644 --- a/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java +++ b/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java @@ -52,6 +52,7 @@ public class FingerprintKeyer extends Keyer { s = s.trim(); // first off, remove whitespace around the string s = s.toLowerCase(); // then lowercase it s = punctctrl.matcher(s).replaceAll(""); // then remove all punctuation and control chars + s = asciify(s); // find ASCII equivalent to characters String[] frags = StringUtils.split(s); // split by whitespace TreeSet set = new TreeSet(); for (String ss : frags) { @@ -65,7 +66,7 @@ public class FingerprintKeyer extends Keyer { b.append(' '); } } - return asciify(b.toString()); // find ASCII equivalent to characters + return b.toString(); } protected String asciify(String s) { diff --git a/main/tests/server/src/com/google/refine/tests/clustering/binning/KeyerTests.java b/main/tests/server/src/com/google/refine/tests/clustering/binning/KeyerTests.java index 237ca1ea8..12604439b 100644 --- a/main/tests/server/src/com/google/refine/tests/clustering/binning/KeyerTests.java +++ b/main/tests/server/src/com/google/refine/tests/clustering/binning/KeyerTests.java @@ -50,6 +50,7 @@ public class KeyerTests extends RefineTest { private static final String[][] testStrings = { {"the multi multi word test","multi test the word"}, + {" école ÉCole ecoLe ", "ecole"}, {"a b c d","a b c d"}, {" d c b a ","a b c d"}, {"\tABC \t DEF ","abc def"}, // test leading and trailing whitespace