Fixes #1161
This change parallels what was done in #1257 (commit 1da3c00) to fix
the FingerprintKeyer, and moves the diacritic removal before
the deduping. Includes a test.
This commit is contained in:
parent
66aeaa4409
commit
e61d50a1aa
@ -49,13 +49,14 @@ public class NGramFingerprintKeyer extends FingerprintKeyer {
|
||||
}
|
||||
s = s.toLowerCase(); // then lowercase it
|
||||
s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
|
||||
s = asciify(s); // find ASCII equivalent to characters
|
||||
TreeSet<String> set = ngram_split(s,ngram_size);
|
||||
StringBuffer b = new StringBuffer();
|
||||
Iterator<String> i = set.iterator();
|
||||
while (i.hasNext()) { // join ordered fragments back together
|
||||
b.append(i.next());
|
||||
}
|
||||
return asciify(b.toString()); // find ASCII equivalent to characters
|
||||
return b.toString();
|
||||
}
|
||||
|
||||
protected TreeSet<String> ngram_split(String s, int size) {
|
||||
|
@ -62,9 +62,10 @@ public class KeyerTests extends RefineTest {
|
||||
|
||||
private static final String[][] testNGramStrings = {
|
||||
{"abcdefg","abbccddeeffg"},
|
||||
{" a,b.c d\te!f?g ","abbccddeeffg"},
|
||||
{"écÉCec","ceec"},
|
||||
{"",""}, //TODO: add more test cases
|
||||
{"",""},
|
||||
{"",""},
|
||||
};
|
||||
|
||||
@Override
|
||||
|
Loading…
Reference in New Issue
Block a user