Use a TreeSet to both sort and de-duplicate the split fragments
git-svn-id: http://google-refine.googlecode.com/svn/trunk@190 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
70df6821a0
commit
5c3ca7723a
@ -1,7 +1,8 @@
|
|||||||
package com.metaweb.gridworks.expr.functions.strings;
|
package com.metaweb.gridworks.expr.functions.strings;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Iterator;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
|
import java.util.TreeSet;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
@ -22,9 +23,17 @@ public class Fingerprint implements Function {
|
|||||||
s = s.toLowerCase(); // then lowercase it
|
s = s.toLowerCase(); // then lowercase it
|
||||||
s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
|
s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
|
||||||
String[] frags = StringUtils.split(s); // split by whitespace
|
String[] frags = StringUtils.split(s); // split by whitespace
|
||||||
Arrays.sort(frags); // sort the fragments
|
TreeSet<String> set = new TreeSet<String>();
|
||||||
return StringUtils.join(frags," "); // rejoin them with a single space between them
|
for (String ss : frags) {
|
||||||
|
set.add(ss); // order fragments and dedupe
|
||||||
|
}
|
||||||
|
StringBuffer b = new StringBuffer();
|
||||||
|
Iterator<String> i = set.iterator();
|
||||||
|
while (i.hasNext()) {
|
||||||
|
b.append(i.next());
|
||||||
|
b.append(' ');
|
||||||
|
}
|
||||||
|
return b.toString(); // join ordered fragments back together
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user