Fixes #1161
This change parallels what was done in #1257 (1da3c00) to fix
the FingerprintKeyer and moves the diacritic removal before
the deduping. Includes a test.
This commit is contained in:
parent
66aeaa4409
commit
e61d50a1aa
@ -49,13 +49,14 @@ public class NGramFingerprintKeyer extends FingerprintKeyer {
|
|||||||
}
|
}
|
||||||
s = s.toLowerCase(); // then lowercase it
|
s = s.toLowerCase(); // then lowercase it
|
||||||
s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
|
s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
|
||||||
|
s = asciify(s); // find ASCII equivalent to characters
|
||||||
TreeSet<String> set = ngram_split(s,ngram_size);
|
TreeSet<String> set = ngram_split(s,ngram_size);
|
||||||
StringBuffer b = new StringBuffer();
|
StringBuffer b = new StringBuffer();
|
||||||
Iterator<String> i = set.iterator();
|
Iterator<String> i = set.iterator();
|
||||||
while (i.hasNext()) { // join ordered fragments back together
|
while (i.hasNext()) { // join ordered fragments back together
|
||||||
b.append(i.next());
|
b.append(i.next());
|
||||||
}
|
}
|
||||||
return asciify(b.toString()); // find ASCII equivalent to characters
|
return b.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected TreeSet<String> ngram_split(String s, int size) {
|
protected TreeSet<String> ngram_split(String s, int size) {
|
||||||
|
@ -62,9 +62,10 @@ public class KeyerTests extends RefineTest {
|
|||||||
|
|
||||||
private static final String[][] testNGramStrings = {
|
private static final String[][] testNGramStrings = {
|
||||||
{"abcdefg","abbccddeeffg"},
|
{"abcdefg","abbccddeeffg"},
|
||||||
|
{" a,b.c d\te!f?g ","abbccddeeffg"},
|
||||||
|
{"écÉCec","ceec"},
|
||||||
{"",""}, //TODO: add more test cases
|
{"",""}, //TODO: add more test cases
|
||||||
{"",""},
|
{"",""},
|
||||||
{"",""},
|
|
||||||
};
|
};
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
Loading…
Reference in New Issue
Block a user