From e61d50a1aa35a13507c54946147663a4ba1863be Mon Sep 17 00:00:00 2001 From: Tom Morris Date: Tue, 7 Jul 2020 03:02:49 -0400 Subject: [PATCH] Fix NGramFingerprintKeyer to ignore accents - fixes #1161 (#2899) Fixes #1161 This change parallels what was done in #1257 1da3c00 to fix the FingerprintKeyer and moves the diacritic removal before the deduping. Includes a test. --- .../java/org/openrefine/{ => benchmark}/ToNumberBenchmark.java | 0 .../refine/clustering/binning/NGramFingerprintKeyer.java | 3 ++- .../src/com/google/refine/clustering/binning/KeyerTests.java | 3 ++- 3 files changed, 4 insertions(+), 2 deletions(-) rename benchmark/src/main/java/org/openrefine/{ => benchmark}/ToNumberBenchmark.java (100%) diff --git a/benchmark/src/main/java/org/openrefine/ToNumberBenchmark.java b/benchmark/src/main/java/org/openrefine/benchmark/ToNumberBenchmark.java similarity index 100% rename from benchmark/src/main/java/org/openrefine/ToNumberBenchmark.java rename to benchmark/src/main/java/org/openrefine/benchmark/ToNumberBenchmark.java diff --git a/main/src/com/google/refine/clustering/binning/NGramFingerprintKeyer.java b/main/src/com/google/refine/clustering/binning/NGramFingerprintKeyer.java index e5e27264d..927012ecd 100644 --- a/main/src/com/google/refine/clustering/binning/NGramFingerprintKeyer.java +++ b/main/src/com/google/refine/clustering/binning/NGramFingerprintKeyer.java @@ -49,13 +49,14 @@ public class NGramFingerprintKeyer extends FingerprintKeyer { } s = s.toLowerCase(); // then lowercase it s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars + s = asciify(s); // find ASCII equivalent to characters TreeSet set = ngram_split(s,ngram_size); StringBuffer b = new StringBuffer(); Iterator i = set.iterator(); while (i.hasNext()) { // join ordered fragments back together b.append(i.next()); } - return asciify(b.toString()); // find ASCII equivalent to characters + return b.toString(); } protected TreeSet ngram_split(String s, int size) { diff --git a/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java b/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java index 77d6cf48e..458501414 100644 --- a/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java +++ b/main/tests/server/src/com/google/refine/clustering/binning/KeyerTests.java @@ -62,9 +62,10 @@ public class KeyerTests extends RefineTest { private static final String[][] testNGramStrings = { {"abcdefg","abbccddeeffg"}, + {" a,b.c d\te!f?g ","abbccddeeffg"}, + {"écÉCec","ceec"}, {"",""}, //TODO: add more test cases {"",""}, - {"",""}, }; @Override