Clean up to pass tests:
- Don't include TAB in the control characters that get stripped, so we can still use it for splitting.
- Remove the trailing space from normalized strings.
This commit is contained in:
parent
1907bcd8dc
commit
067fcacec7
@ -41,13 +41,17 @@ import org.apache.commons.lang.StringUtils;
|
|||||||
|
|
||||||
public class FingerprintKeyer extends Keyer {
|
public class FingerprintKeyer extends Keyer {
|
||||||
|
|
||||||
static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}");
|
// Punctuation and control characters (except for TAB which we need for split to work)
|
||||||
|
static final Pattern punctctrl = Pattern.compile("\\p{Punct}|[\\x00-\\x08\\x0A-\\x1F\\x7F]");
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String key(String s, Object... o) {
|
public String key(String s, Object... o) {
|
||||||
|
if (s == null || o !=null && o.length > 0) {
|
||||||
|
throw new IllegalArgumentException("Fingerprint keyer accepts a single string parameter");
|
||||||
|
}
|
||||||
s = s.trim(); // first off, remove whitespace around the string
|
s = s.trim(); // first off, remove whitespace around the string
|
||||||
s = s.toLowerCase(); // then lowercase it
|
s = s.toLowerCase(); // then lowercase it
|
||||||
s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
|
s = punctctrl.matcher(s).replaceAll(""); // then remove all punctuation and control chars
|
||||||
String[] frags = StringUtils.split(s); // split by whitespace
|
String[] frags = StringUtils.split(s); // split by whitespace
|
||||||
TreeSet<String> set = new TreeSet<String>();
|
TreeSet<String> set = new TreeSet<String>();
|
||||||
for (String ss : frags) {
|
for (String ss : frags) {
|
||||||
@ -57,7 +61,9 @@ public class FingerprintKeyer extends Keyer {
|
|||||||
Iterator<String> i = set.iterator();
|
Iterator<String> i = set.iterator();
|
||||||
while (i.hasNext()) { // join ordered fragments back together
|
while (i.hasNext()) { // join ordered fragments back together
|
||||||
b.append(i.next());
|
b.append(i.next());
|
||||||
b.append(' ');
|
if (i.hasNext()) {
|
||||||
|
b.append(' ');
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return asciify(b.toString()); // find ASCII equivalent to characters
|
return asciify(b.toString()); // find ASCII equivalent to characters
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user