- moved from float to double to avoid excessive casting of the double values returned by SecondString

- added a few of the more powerful distances
- fixed a bug in the VPTree builder (although it is still not working as I expect it to)


git-svn-id: http://google-refine.googlecode.com/svn/trunk@248 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Stefano Mazzocchi 2010-03-09 05:11:36 +00:00
parent af9e9f590b
commit 5b079b04b7
18 changed files with 84 additions and 47 deletions

View File

@ -27,6 +27,8 @@ import edu.mit.simile.vicino.distances.BZip2Distance;
import edu.mit.simile.vicino.distances.GZipDistance; import edu.mit.simile.vicino.distances.GZipDistance;
import edu.mit.simile.vicino.distances.JaccardDistance; import edu.mit.simile.vicino.distances.JaccardDistance;
import edu.mit.simile.vicino.distances.JaroDistance; import edu.mit.simile.vicino.distances.JaroDistance;
import edu.mit.simile.vicino.distances.JaroWinklerDistance;
import edu.mit.simile.vicino.distances.JaroWinklerTFIDFDistance;
import edu.mit.simile.vicino.distances.LevenshteinDistance; import edu.mit.simile.vicino.distances.LevenshteinDistance;
import edu.mit.simile.vicino.distances.PPMDistance; import edu.mit.simile.vicino.distances.PPMDistance;
import edu.mit.simile.vicino.vptree.VPTreeBuilder; import edu.mit.simile.vicino.vptree.VPTreeBuilder;
@ -41,8 +43,10 @@ public class kNNClusterer extends Clusterer {
static { static {
_distances.put("levenshtein", new LevenshteinDistance()); _distances.put("levenshtein", new LevenshteinDistance());
_distances.put("jaro", new JaroDistance());
_distances.put("jaccard", new JaccardDistance()); _distances.put("jaccard", new JaccardDistance());
_distances.put("jaro", new JaroDistance());
_distances.put("jaro-winkler", new JaroWinklerDistance());
_distances.put("jaro-winkler-tfidf", new JaroWinklerTFIDFDistance());
_distances.put("gzip", new GZipDistance()); _distances.put("gzip", new GZipDistance());
_distances.put("bzip2", new BZip2Distance()); _distances.put("bzip2", new BZip2Distance());
_distances.put("ppm", new PPMDistance()); _distances.put("ppm", new PPMDistance());

View File

@ -2,6 +2,6 @@ package edu.mit.simile.vicino;
public interface Distance { public interface Distance {
public float d(String x, String y); public double d(String x, String y);
} }

View File

@ -27,8 +27,7 @@ public class Seeker extends Operator {
VPTreeSeeker seeker = new VPTreeSeeker(d, tree); VPTreeSeeker seeker = new VPTreeSeeker(d, tree);
log("type a string|range then hit return:"); log("type a string|range then hit return:");
BufferedReader input = new BufferedReader(new InputStreamReader( BufferedReader input = new BufferedReader(new InputStreamReader(System.in));
System.in));
String line = null; String line = null;
while ((line = input.readLine()) != null) { while ((line = input.readLine()) != null) {
int index = line.indexOf('|'); int index = line.indexOf('|');

View File

@ -12,7 +12,7 @@ public class Tester extends Operator {
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
int size = strings.size(); int size = strings.size();
for (int i = 0; i < size; i++) { for (int i = 0; i < size * size * size; i++) {
String x = (String) strings.get((int) (Math.random() * size)); String x = (String) strings.get((int) (Math.random() * size));
String y = (String) strings.get((int) (Math.random() * size)); String y = (String) strings.get((int) (Math.random() * size));
String z = (String) strings.get((int) (Math.random() * size)); String z = (String) strings.get((int) (Math.random() * size));
@ -31,20 +31,14 @@ public class Tester extends Operator {
} }
static boolean metric(String x, String y, String z, Distance d) { static boolean metric(String x, String y, String z, Distance d) {
float dxx = d.d(x, x); double dxx = d.d(x, x);
boolean identity = (dxx == 0.0f); boolean identity = (dxx == 0.0f);
float dxy = d.d(x, y); double dxy = d.d(x, y);
float dyx = d.d(y, x); double dyx = d.d(y, x);
boolean simmetrical = (dxy == dyx); boolean simmetrical = (dxy == dyx);
float dxz = d.d(x, z); double dxz = d.d(x, z);
float dyz = d.d(y, z); double dyz = d.d(y, z);
boolean triangular = (dxz <= dxy + dyz); boolean triangular = (dxz <= dxy + dyz);
return (identity && simmetrical && triangular); return (identity && simmetrical && triangular);
} }
static Distance getDistance(String distance) throws Exception {
return (Distance) Class.forName(
"edu.mit.simile.vicino.distances." + distance + "Distance")
.newInstance();
}
} }

View File

@ -7,7 +7,7 @@ import org.apache.tools.bzip2.CBZip2OutputStream;
public class BZip2Distance extends PseudoMetricDistance { public class BZip2Distance extends PseudoMetricDistance {
public float d2(String x, String y) { public double d2(String x, String y) {
String str = x + y; String str = x + y;
float result = 0.0f; float result = 0.0f;
try { try {

View File

@ -6,7 +6,7 @@ import java.util.zip.GZIPOutputStream;
public class GZipDistance extends PseudoMetricDistance { public class GZipDistance extends PseudoMetricDistance {
public float d2(String x, String y) { public double d2(String x, String y) {
String str = x + y; String str = x + y;
float result = 0.0f; float result = 0.0f;
try { try {

View File

@ -11,8 +11,8 @@ public class JaccardDistance extends MetricDistance {
this.distance = new Jaccard(); this.distance = new Jaccard();
} }
protected float d2(String x, String y) { protected double d2(String x, String y) {
return Math.abs((float) this.distance.score(x, y) - 1.0f); return this.distance.score(x, y);
} }
} }

View File

@ -11,8 +11,8 @@ public class JaroDistance extends MetricDistance {
this.distance = new Jaro(); this.distance = new Jaro();
} }
protected float d2(String x, String y) { protected double d2(String x, String y) {
return Math.abs((float) this.distance.score(x, y) - 1.0f); return this.distance.score(x, y);
} }
} }

View File

@ -0,0 +1,18 @@
package edu.mit.simile.vicino.distances;
import com.wcohen.ss.JaroWinkler;
import com.wcohen.ss.api.StringDistance;
public class JaroWinklerDistance extends MetricDistance {
StringDistance distance;
public JaroWinklerDistance() {
this.distance = new JaroWinkler();
}
protected double d2(String x, String y) {
return this.distance.score(x, y);
}
}

View File

@ -0,0 +1,18 @@
package edu.mit.simile.vicino.distances;
import com.wcohen.ss.JaroWinklerTFIDF;
import com.wcohen.ss.api.StringDistance;
public class JaroWinklerTFIDFDistance extends MetricDistance {
StringDistance distance;
public JaroWinklerTFIDFDistance() {
this.distance = new JaroWinklerTFIDF();
}
protected double d2(String x, String y) {
return this.distance.score(x, y);
}
}

View File

@ -13,10 +13,8 @@ public class LevenshteinDistance implements Distance {
this.distance = new Levenstein(); this.distance = new Levenstein();
} }
public float d(String x, String y) { public double d(String x, String y) {
float d = Math.abs((float) this.distance.score(x, y)); return this.distance.score(x, y);
// System.out.println(this.distance.explainScore(x,y));
return d / (x.length() + y.length());
} }
} }

View File

@ -5,15 +5,19 @@ import edu.mit.simile.vicino.Distance;
public abstract class MetricDistance implements Distance { public abstract class MetricDistance implements Distance {
/* /*
* public float d(String x,String y) { float dxy = d2(x,y); float dx = * public float d(String x,String y) {
* d2(x,""); float dy = d2(y,""); float result = dxy / (dx + dy); return * float dxy = d2(x,y);
* result; } * float dx = d2(x,"");
* float dy = d2(y,"");
* float result = dxy / (dx + dy);
* return result;
* }
*/ */
public float d(String x, String y) { public double d(String x, String y) {
return d2(x, y); return d2(x, y);
} }
abstract float d2(String x, String y); abstract double d2(String x, String y);
} }

View File

@ -8,7 +8,7 @@ import com.colloquial.arithcode.PPMModel;
public class PPMDistance extends PseudoMetricDistance { public class PPMDistance extends PseudoMetricDistance {
public float d2(String x, String y) { public double d2(String x, String y) {
String str = x + y; String str = x + y;
float result = 0.0f; float result = 0.0f;
try { try {

View File

@ -4,14 +4,14 @@ import edu.mit.simile.vicino.Distance;
public abstract class PseudoMetricDistance implements Distance { public abstract class PseudoMetricDistance implements Distance {
public float d(String x, String y) { public double d(String x, String y) {
float cxx = d2(x, x); double cxx = d2(x, x);
float cyy = d2(y, y); double cyy = d2(y, y);
float cxy = d2(x, y); double cxy = d2(x, y);
float cyx = d2(y, x); double cyx = d2(y, x);
float result1 = (cxy + cyx) / (cxx + cyy) - 1.0f; double result1 = (cxy + cyx) / (cxx + cyy) - 1.0d;
return result1; return result1;
} }
protected abstract float d2(String x, String y); protected abstract double d2(String x, String y);
} }

View File

@ -13,7 +13,7 @@ public class Node implements Serializable {
private static final long serialVersionUID = -2077473220894258550L; private static final long serialVersionUID = -2077473220894258550L;
private final Serializable obj; private final Serializable obj;
private float distance; private double distance;
public Node(Serializable obj, int i) { public Node(Serializable obj, int i) {
this.obj = obj; this.obj = obj;
@ -28,11 +28,11 @@ public class Node implements Serializable {
return this.obj; return this.obj;
} }
public void setDistance(float distance) { public void setDistance(double distance) {
this.distance = distance; this.distance = distance;
} }
public float getDistance() { public double getDistance() {
return distance; return distance;
} }

View File

@ -117,7 +117,7 @@ public class VPTreeBuilder {
for (int i = begin; i <= end; i++) { for (int i = begin; i <= end; i++) {
Object x = pivot.get(); Object x = pivot.get();
Object y = nodes[i].get(); Object y = nodes[i].get();
float d = (x == y) ? 0.0f : distance.d(x.toString(), y.toString()); double d = (x == y) ? 0.0d : distance.d(x.toString(), y.toString());
nodes[i].setDistance(d); nodes[i].setDistance(d);
} }
} }

View File

@ -19,14 +19,14 @@ public class VPTreeSeeker {
this.tree = tree; this.tree = tree;
} }
public List<? extends Serializable> range(Object query, float range) { public List<? extends Serializable> range(Serializable query, float range) {
return rangeTraversal(query, range, tree.getRoot(), new ArrayList<Serializable>()); return rangeTraversal(query, range, tree.getRoot(), new ArrayList<Serializable>());
} }
private List<Serializable> rangeTraversal(Object query, float range, TNode tNode, List<Serializable> results) { private List<Serializable> rangeTraversal(Serializable query, float range, TNode tNode, List<Serializable> results) {
if (tNode != null) { if (tNode != null) {
float distance = this.distance.d(query.toString(), tNode.toString()); double distance = this.distance.d(query.toString(), tNode.get().toString());
if (distance < range) { if (distance < range) {
results.add(tNode.get()); results.add(tNode.get());

View File

@ -40,8 +40,10 @@ FacetBasedEditDialog.prototype._createDialog = function() {
'</select></div>' + '</select></div>' +
'<div class="knn-controls hidden">Distance Function: <select bind="distanceFunctionSelector">' + '<div class="knn-controls hidden">Distance Function: <select bind="distanceFunctionSelector">' +
'<option selected="true">levenshtein</option>' + '<option selected="true">levenshtein</option>' +
'<option>jaro</option>' +
'<option>jaccard</option>' + '<option>jaccard</option>' +
'<option>jaro</option>' +
'<option>jaro-winkler</option>' +
'<option>jaro-winkler-TFIDF</option>' +
'<option>gzip</option>' + '<option>gzip</option>' +
'<option>bzip2</option>' + '<option>bzip2</option>' +
'<option>PPM</option>' + '<option>PPM</option>' +