diff --git a/benchmark/src/main/java/org/openrefine/ToNumberBenchmark.java b/benchmark/src/main/java/org/openrefine/benchmark/ToNumberBenchmark.java similarity index 100% rename from benchmark/src/main/java/org/openrefine/ToNumberBenchmark.java rename to benchmark/src/main/java/org/openrefine/benchmark/ToNumberBenchmark.java diff --git a/main/src/com/google/refine/clustering/binning/NGramFingerprintKeyer.java b/main/src/com/google/refine/clustering/binning/NGramFingerprintKeyer.java index e5e27264d..927012ecd 100644 --- a/main/src/com/google/refine/clustering/binning/NGramFingerprintKeyer.java +++ b/main/src/com/google/refine/clustering/binning/NGramFingerprintKeyer.java @@ -49,13 +49,14 @@ public class NGramFingerprintKeyer extends FingerprintKeyer { } s = s.toLowerCase(); // then lowercase it s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars + s = asciify(s); // find ASCII equivalent to characters TreeSet set = ngram_split(s,ngram_size); StringBuffer b = new StringBuffer(); Iterator i = set.iterator(); while (i.hasNext()) { // join ordered fragments back together b.append(i.next()); } - return asciify(b.toString()); // find ASCII equivalent to characters + return b.toString(); } protected TreeSet ngram_split(String s, int size) { diff --git a/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java b/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java index 77d6cf48e..458501414 100644 --- a/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java +++ b/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java @@ -62,9 +62,10 @@ public class KeyerTests extends RefineTest { private static final String[][] testNGramStrings = { {"abcdefg","abbccddeeffg"}, + {" a,b.c d\te!f?g ","abbccddeeffg"}, + {"écÉCec","ceec"}, {"",""}, //TODO: add more test cases {"",""}, - {"",""}, }; @Override