Clean up to pass tests:

- don't include TAB in control characters which get stripped so we can
use it for splitting
- remove trailing space from normalized strings
This commit is contained in:
Tom Morris 2013-05-31 17:06:03 -04:00
parent 1907bcd8dc
commit 067fcacec7

View File

@ -41,13 +41,17 @@ import org.apache.commons.lang.StringUtils;
public class FingerprintKeyer extends Keyer { public class FingerprintKeyer extends Keyer {
static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}"); // Punctuation and control characters (except for TAB which we need for split to work)
static final Pattern punctctrl = Pattern.compile("\\p{Punct}|[\\x00-\\x08\\x0A-\\x1F\\x7F]");
@Override @Override
public String key(String s, Object... o) { public String key(String s, Object... o) {
if (s == null || o !=null && o.length > 0) {
throw new IllegalArgumentException("Fingerprint keyer accepts a single string parameter");
}
s = s.trim(); // first off, remove whitespace around the string s = s.trim(); // first off, remove whitespace around the string
s = s.toLowerCase(); // then lowercase it s = s.toLowerCase(); // then lowercase it
s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars s = punctctrl.matcher(s).replaceAll(""); // then remove all punctuation and control chars
String[] frags = StringUtils.split(s); // split by whitespace String[] frags = StringUtils.split(s); // split by whitespace
TreeSet<String> set = new TreeSet<String>(); TreeSet<String> set = new TreeSet<String>();
for (String ss : frags) { for (String ss : frags) {
@ -57,7 +61,9 @@ public class FingerprintKeyer extends Keyer {
Iterator<String> i = set.iterator(); Iterator<String> i = set.iterator();
while (i.hasNext()) { // join ordered fragments back together while (i.hasNext()) { // join ordered fragments back together
b.append(i.next()); b.append(i.next());
b.append(' '); if (i.hasNext()) {
b.append(' ');
}
} }
return asciify(b.toString()); // find ASCII equivalent to characters return asciify(b.toString()); // find ASCII equivalent to characters
} }