diff --git a/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java b/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java index 005207fdc..5d8885996 100644 --- a/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java +++ b/main/src/com/google/refine/clustering/binning/FingerprintKeyer.java @@ -52,6 +52,7 @@ public class FingerprintKeyer extends Keyer { s = s.trim(); // first off, remove whitespace around the string s = s.toLowerCase(); // then lowercase it s = punctctrl.matcher(s).replaceAll(""); // then remove all punctuation and control chars + s = asciify(s); // find ASCII equivalent to characters String[] frags = StringUtils.split(s); // split by whitespace TreeSet set = new TreeSet(); for (String ss : frags) { @@ -65,7 +66,7 @@ public class FingerprintKeyer extends Keyer { b.append(' '); } } - return asciify(b.toString()); // find ASCII equivalent to characters + return b.toString(); } protected String asciify(String s) { diff --git a/main/tests/server/src/com/google/refine/tests/clustering/binning/KeyerTests.java b/main/tests/server/src/com/google/refine/tests/clustering/binning/KeyerTests.java index 237ca1ea8..12604439b 100644 --- a/main/tests/server/src/com/google/refine/tests/clustering/binning/KeyerTests.java +++ b/main/tests/server/src/com/google/refine/tests/clustering/binning/KeyerTests.java @@ -50,6 +50,7 @@ public class KeyerTests extends RefineTest { private static final String[][] testStrings = { {"the multi multi word test","multi test the word"}, + {" école ÉCole ecoLe ", "ecole"}, {"a b c d","a b c d"}, {" d c b a ","a b c d"}, {"\tABC \t DEF ","abc def"}, // test leading and trailing whitespace