Perform ASCII normalization earlier in FingerprintKeyer.
This closes #1256.
This commit is contained in:
parent
5d387b1df8
commit
1da3c00cb1
@@ -52,6 +52,7 @@ public class FingerprintKeyer extends Keyer {
|
||||
s = s.trim(); // first off, remove whitespace around the string
|
||||
s = s.toLowerCase(); // then lowercase it
|
||||
s = punctctrl.matcher(s).replaceAll(""); // then remove all punctuation and control chars
|
||||
+s = asciify(s); // find ASCII equivalent to characters
|
||||
String[] frags = StringUtils.split(s); // split by whitespace
|
||||
TreeSet<String> set = new TreeSet<String>();
|
||||
for (String ss : frags) {
|
||||
@@ -65,7 +66,7 @@ public class FingerprintKeyer extends Keyer {
|
||||
b.append(' ');
|
||||
}
|
||||
}
|
||||
-return asciify(b.toString()); // find ASCII equivalent to characters
|
||||
+return b.toString();
|
||||
}
|
||||
|
||||
protected String asciify(String s) {
|
||||
|
@@ -50,6 +50,7 @@ public class KeyerTests extends RefineTest {
|
||||
|
||||
private static final String[][] testStrings = {
|
||||
{"the multi multi word test","multi test the word"},
|
||||
{" école ÉCole ecoLe ", "ecole"},
|
||||
{"a b c d","a b c d"},
|
||||
{" d c b a ","a b c d"},
|
||||
{"\tABC \t DEF ","abc def"}, // test leading and trailing whitespace
|
||||
|
Loading…
Reference in New Issue
Block a user