From 5c3ca7723a754ae3832be2a3a04c38646eb0026b Mon Sep 17 00:00:00 2001
From: Stefano Mazzocchi
Date: Thu, 4 Mar 2010 02:37:06 +0000
Subject: [PATCH] use a TreeSet to do both sorting and de-dupe of the split
 fragments

git-svn-id: http://google-refine.googlecode.com/svn/trunk@190 7d457c2a-affb-35e4-300a-418c747d4874
---
Review notes (not part of the commit message):
 - The join loop now writes a space only between fragments, so a non-empty
   result no longer ends with a trailing space; this matches the output of
   the StringUtils.join(frags," ") call that this patch removes.
 - TreeSet and Iterator are parameterized with <String> instead of being
   used as raw types, and the method-local StringBuffer is a StringBuilder.
 - Hunk counts and the diffstat were adjusted for the two extra added lines;
   the post-image blob id on the "index" line was NOT regenerated.
 - Indentation was re-created from a whitespace-collapsed copy of the patch;
   verify context-line whitespace against the target file before applying.

 .../expr/functions/strings/Fingerprint.java | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/src/main/java/com/metaweb/gridworks/expr/functions/strings/Fingerprint.java b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Fingerprint.java
index c50c235f6..a236ac089 100644
--- a/src/main/java/com/metaweb/gridworks/expr/functions/strings/Fingerprint.java
+++ b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Fingerprint.java
@@ -1,7 +1,8 @@
 package com.metaweb.gridworks.expr.functions.strings;
 
-import java.util.Arrays;
+import java.util.Iterator;
 import java.util.Properties;
+import java.util.TreeSet;
 import java.util.regex.Pattern;
 
 import org.apache.commons.lang.StringUtils;
@@ -22,9 +23,19 @@ public class Fingerprint implements Function {
             s = s.toLowerCase(); // then lowercase it
             s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
             String[] frags = StringUtils.split(s); // split by whitespace
-            Arrays.sort(frags); // sort the fragments
-            return StringUtils.join(frags," "); // rejoin them with a single space between them
-
+            TreeSet<String> set = new TreeSet<String>();
+            for (String ss : frags) {
+                set.add(ss); // order fragments and dedupe
+            }
+            StringBuilder b = new StringBuilder();
+            Iterator<String> i = set.iterator();
+            while (i.hasNext()) {
+                b.append(i.next());
+                if (i.hasNext()) {
+                    b.append(' '); // separator goes between fragments only, never after the last one
+                }
+            }
+            return b.toString(); // join ordered fragments back together
         }
         return null;
     }