Fix NGramFingerprintKeyer to ignore accents - fixes #1161 (#2899)

Fixes #1161
This change parallels what was done in #1257 (commit 1da3c00) to fix
the FingerprintKeyer: it moves the diacritic removal so that it runs
before the deduping. Includes a test.
This commit is contained in:
Tom Morris 2020-07-07 03:02:49 -04:00 committed by GitHub
parent 66aeaa4409
commit e61d50a1aa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 4 additions and 2 deletions

View File

@@ -49,13 +49,14 @@ public class NGramFingerprintKeyer extends FingerprintKeyer {
}
s = s.toLowerCase(); // then lowercase it
s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
s = asciify(s); // find ASCII equivalent to characters
TreeSet<String> set = ngram_split(s,ngram_size);
StringBuffer b = new StringBuffer();
Iterator<String> i = set.iterator();
while (i.hasNext()) { // join ordered fragments back together
b.append(i.next());
}
return asciify(b.toString()); // find ASCII equivalent to characters
return b.toString();
}
protected TreeSet<String> ngram_split(String s, int size) {

View File

@@ -62,9 +62,10 @@ public class KeyerTests extends RefineTest {
private static final String[][] testNGramStrings = {
{"abcdefg","abbccddeeffg"},
{" a,b.c d\te!f?g ","abbccddeeffg"},
{"écÉCec","ceec"},
{"",""}, //TODO: add more test cases
{"",""},
{"",""},
};
@Override