Merge pull request #1257 from OpenRefine/issue1256

Perform ASCII normalization earlier in FingerprintKeyer.
This commit is contained in:
Antonin Delpeuch 2017-09-27 17:54:43 +02:00 committed by GitHub
commit 4036f3ff91
2 changed files with 3 additions and 1 deletion

View File

@@ -52,6 +52,7 @@ public class FingerprintKeyer extends Keyer {
s = s.trim(); // first off, remove whitespace around the string s = s.trim(); // first off, remove whitespace around the string
s = s.toLowerCase(); // then lowercase it s = s.toLowerCase(); // then lowercase it
s = punctctrl.matcher(s).replaceAll(""); // then remove all punctuation and control chars s = punctctrl.matcher(s).replaceAll(""); // then remove all punctuation and control chars
s = asciify(s); // find ASCII equivalent to characters
String[] frags = StringUtils.split(s); // split by whitespace String[] frags = StringUtils.split(s); // split by whitespace
TreeSet<String> set = new TreeSet<String>(); TreeSet<String> set = new TreeSet<String>();
for (String ss : frags) { for (String ss : frags) {
@@ -65,7 +66,7 @@ public class FingerprintKeyer extends Keyer {
b.append(' '); b.append(' ');
} }
} }
return asciify(b.toString()); // find ASCII equivalent to characters return b.toString();
} }
protected String asciify(String s) { protected String asciify(String s) {

View File

@@ -50,6 +50,7 @@ public class KeyerTests extends RefineTest {
private static final String[][] testStrings = { private static final String[][] testStrings = {
{"the multi multi word test","multi test the word"}, {"the multi multi word test","multi test the word"},
{" école ÉCole ecoLe ", "ecole"},
{"a b c d","a b c d"}, {"a b c d","a b c d"},
{" d c b a ","a b c d"}, {" d c b a ","a b c d"},
{"\tABC \t DEF ","abc def"}, // test leading and trailing whitespace {"\tABC \t DEF ","abc def"}, // test leading and trailing whitespace