Use a TreeSet to both sort and de-duplicate the split fragments

git-svn-id: http://google-refine.googlecode.com/svn/trunk@190 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Stefano Mazzocchi 2010-03-04 02:37:06 +00:00
parent 70df6821a0
commit 5c3ca7723a

View File

@ -1,7 +1,8 @@
package com.metaweb.gridworks.expr.functions.strings;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Properties;
import java.util.TreeSet;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
@ -22,9 +23,17 @@ public class Fingerprint implements Function {
s = s.toLowerCase(); // then lowercase it
s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
String[] frags = StringUtils.split(s); // split by whitespace
Arrays.sort(frags); // sort the fragments
return StringUtils.join(frags," "); // rejoin them with a single space between them
TreeSet<String> set = new TreeSet<String>();
for (String ss : frags) {
set.add(ss); // order fragments and dedupe
}
StringBuffer b = new StringBuffer();
Iterator<String> i = set.iterator();
while (i.hasNext()) {
b.append(i.next());
b.append(' ');
}
return b.toString(); // join ordered fragments back together
}
return null;
}