Perform ASCII normalization earlier in FingerprintKeyer.
This closes #1256.
This commit is contained in:
parent
5d387b1df8
commit
1da3c00cb1
@@ -52,6 +52,7 @@ public class FingerprintKeyer extends Keyer {
|
|||||||
s = s.trim(); // first off, remove whitespace around the string
|
s = s.trim(); // first off, remove whitespace around the string
|
||||||
s = s.toLowerCase(); // then lowercase it
|
s = s.toLowerCase(); // then lowercase it
|
||||||
s = punctctrl.matcher(s).replaceAll(""); // then remove all punctuation and control chars
|
s = punctctrl.matcher(s).replaceAll(""); // then remove all punctuation and control chars
|
||||||
|
s = asciify(s); // find ASCII equivalent to characters
|
||||||
String[] frags = StringUtils.split(s); // split by whitespace
|
String[] frags = StringUtils.split(s); // split by whitespace
|
||||||
TreeSet<String> set = new TreeSet<String>();
|
TreeSet<String> set = new TreeSet<String>();
|
||||||
for (String ss : frags) {
|
for (String ss : frags) {
|
||||||
@@ -65,7 +66,7 @@ public class FingerprintKeyer extends Keyer {
|
|||||||
b.append(' ');
|
b.append(' ');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return asciify(b.toString()); // find ASCII equivalent to characters
|
return b.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String asciify(String s) {
|
protected String asciify(String s) {
|
||||||
|
@@ -50,6 +50,7 @@ public class KeyerTests extends RefineTest {
|
|||||||
|
|
||||||
private static final String[][] testStrings = {
|
private static final String[][] testStrings = {
|
||||||
{"the multi multi word test","multi test the word"},
|
{"the multi multi word test","multi test the word"},
|
||||||
|
{" école ÉCole ecoLe ", "ecole"},
|
||||||
{"a b c d","a b c d"},
|
{"a b c d","a b c d"},
|
||||||
{" d c b a ","a b c d"},
|
{" d c b a ","a b c d"},
|
||||||
{"\tABC \t DEF ","abc def"}, // test leading and trailing whitespace
|
{"\tABC \t DEF ","abc def"}, // test leading and trailing whitespace
|
||||||
|
Loading…
Reference in New Issue
Block a user