- moved from float to double to avoid excessive casting of the values returned by SecondString

- added a few of the more powerful distances
- fixed a bug in the VPTree builder (although it is still not working as I expect it to)


git-svn-id: http://google-refine.googlecode.com/svn/trunk@248 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Stefano Mazzocchi 2010-03-09 05:11:36 +00:00
parent af9e9f590b
commit 5b079b04b7
18 changed files with 84 additions and 47 deletions

View File

@ -27,6 +27,8 @@ import edu.mit.simile.vicino.distances.BZip2Distance;
import edu.mit.simile.vicino.distances.GZipDistance;
import edu.mit.simile.vicino.distances.JaccardDistance;
import edu.mit.simile.vicino.distances.JaroDistance;
import edu.mit.simile.vicino.distances.JaroWinklerDistance;
import edu.mit.simile.vicino.distances.JaroWinklerTFIDFDistance;
import edu.mit.simile.vicino.distances.LevenshteinDistance;
import edu.mit.simile.vicino.distances.PPMDistance;
import edu.mit.simile.vicino.vptree.VPTreeBuilder;
@ -41,8 +43,10 @@ public class kNNClusterer extends Clusterer {
static {
_distances.put("levenshtein", new LevenshteinDistance());
_distances.put("jaro", new JaroDistance());
_distances.put("jaccard", new JaccardDistance());
_distances.put("jaro", new JaroDistance());
_distances.put("jaro-winkler", new JaroWinklerDistance());
_distances.put("jaro-winkler-tfidf", new JaroWinklerTFIDFDistance());
_distances.put("gzip", new GZipDistance());
_distances.put("bzip2", new BZip2Distance());
_distances.put("ppm", new PPMDistance());

View File

@ -2,6 +2,6 @@ package edu.mit.simile.vicino;
public interface Distance {
public float d(String x, String y);
public double d(String x, String y);
}

View File

@ -27,8 +27,7 @@ public class Seeker extends Operator {
VPTreeSeeker seeker = new VPTreeSeeker(d, tree);
log("type a string|range then hit return:");
BufferedReader input = new BufferedReader(new InputStreamReader(
System.in));
BufferedReader input = new BufferedReader(new InputStreamReader(System.in));
String line = null;
while ((line = input.readLine()) != null) {
int index = line.indexOf('|');

View File

@ -12,7 +12,7 @@ public class Tester extends Operator {
long start = System.currentTimeMillis();
int size = strings.size();
for (int i = 0; i < size; i++) {
for (int i = 0; i < size * size * size; i++) {
String x = (String) strings.get((int) (Math.random() * size));
String y = (String) strings.get((int) (Math.random() * size));
String z = (String) strings.get((int) (Math.random() * size));
@ -31,20 +31,14 @@ public class Tester extends Operator {
}
static boolean metric(String x, String y, String z, Distance d) {
float dxx = d.d(x, x);
double dxx = d.d(x, x);
boolean identity = (dxx == 0.0f);
float dxy = d.d(x, y);
float dyx = d.d(y, x);
double dxy = d.d(x, y);
double dyx = d.d(y, x);
boolean simmetrical = (dxy == dyx);
float dxz = d.d(x, z);
float dyz = d.d(y, z);
double dxz = d.d(x, z);
double dyz = d.d(y, z);
boolean triangular = (dxz <= dxy + dyz);
return (identity && simmetrical && triangular);
}
static Distance getDistance(String distance) throws Exception {
return (Distance) Class.forName(
"edu.mit.simile.vicino.distances." + distance + "Distance")
.newInstance();
}
}

View File

@ -7,7 +7,7 @@ import org.apache.tools.bzip2.CBZip2OutputStream;
public class BZip2Distance extends PseudoMetricDistance {
public float d2(String x, String y) {
public double d2(String x, String y) {
String str = x + y;
float result = 0.0f;
try {

View File

@ -6,7 +6,7 @@ import java.util.zip.GZIPOutputStream;
public class GZipDistance extends PseudoMetricDistance {
public float d2(String x, String y) {
public double d2(String x, String y) {
String str = x + y;
float result = 0.0f;
try {

View File

@ -11,8 +11,8 @@ public class JaccardDistance extends MetricDistance {
this.distance = new Jaccard();
}
protected float d2(String x, String y) {
return Math.abs((float) this.distance.score(x, y) - 1.0f);
protected double d2(String x, String y) {
return this.distance.score(x, y);
}
}

View File

@ -11,8 +11,8 @@ public class JaroDistance extends MetricDistance {
this.distance = new Jaro();
}
protected float d2(String x, String y) {
return Math.abs((float) this.distance.score(x, y) - 1.0f);
protected double d2(String x, String y) {
return this.distance.score(x, y);
}
}

View File

@ -0,0 +1,18 @@
package edu.mit.simile.vicino.distances;
import com.wcohen.ss.JaroWinkler;
import com.wcohen.ss.api.StringDistance;
public class JaroWinklerDistance extends MetricDistance {
StringDistance distance;
public JaroWinklerDistance() {
this.distance = new JaroWinkler();
}
protected double d2(String x, String y) {
return this.distance.score(x, y);
}
}

View File

@ -0,0 +1,18 @@
package edu.mit.simile.vicino.distances;
import com.wcohen.ss.JaroWinklerTFIDF;
import com.wcohen.ss.api.StringDistance;
public class JaroWinklerTFIDFDistance extends MetricDistance {
StringDistance distance;
public JaroWinklerTFIDFDistance() {
this.distance = new JaroWinklerTFIDF();
}
protected double d2(String x, String y) {
return this.distance.score(x, y);
}
}

View File

@ -13,10 +13,8 @@ public class LevenshteinDistance implements Distance {
this.distance = new Levenstein();
}
public float d(String x, String y) {
float d = Math.abs((float) this.distance.score(x, y));
// System.out.println(this.distance.explainScore(x,y));
return d / (x.length() + y.length());
public double d(String x, String y) {
return this.distance.score(x, y);
}
}

View File

@ -5,15 +5,19 @@ import edu.mit.simile.vicino.Distance;
public abstract class MetricDistance implements Distance {
/*
* public float d(String x,String y) { float dxy = d2(x,y); float dx =
* d2(x,""); float dy = d2(y,""); float result = dxy / (dx + dy); return
* result; }
* public float d(String x,String y) {
* float dxy = d2(x,y);
* float dx = d2(x,"");
* float dy = d2(y,"");
* float result = dxy / (dx + dy);
* return result;
* }
*/
public float d(String x, String y) {
public double d(String x, String y) {
return d2(x, y);
}
abstract float d2(String x, String y);
abstract double d2(String x, String y);
}

View File

@ -8,7 +8,7 @@ import com.colloquial.arithcode.PPMModel;
public class PPMDistance extends PseudoMetricDistance {
public float d2(String x, String y) {
public double d2(String x, String y) {
String str = x + y;
float result = 0.0f;
try {

View File

@ -4,14 +4,14 @@ import edu.mit.simile.vicino.Distance;
public abstract class PseudoMetricDistance implements Distance {
public float d(String x, String y) {
float cxx = d2(x, x);
float cyy = d2(y, y);
float cxy = d2(x, y);
float cyx = d2(y, x);
float result1 = (cxy + cyx) / (cxx + cyy) - 1.0f;
public double d(String x, String y) {
double cxx = d2(x, x);
double cyy = d2(y, y);
double cxy = d2(x, y);
double cyx = d2(y, x);
double result1 = (cxy + cyx) / (cxx + cyy) - 1.0d;
return result1;
}
protected abstract float d2(String x, String y);
protected abstract double d2(String x, String y);
}

View File

@ -13,7 +13,7 @@ public class Node implements Serializable {
private static final long serialVersionUID = -2077473220894258550L;
private final Serializable obj;
private float distance;
private double distance;
public Node(Serializable obj, int i) {
this.obj = obj;
@ -28,11 +28,11 @@ public class Node implements Serializable {
return this.obj;
}
public void setDistance(float distance) {
public void setDistance(double distance) {
this.distance = distance;
}
public float getDistance() {
public double getDistance() {
return distance;
}

View File

@ -117,7 +117,7 @@ public class VPTreeBuilder {
for (int i = begin; i <= end; i++) {
Object x = pivot.get();
Object y = nodes[i].get();
float d = (x == y) ? 0.0f : distance.d(x.toString(), y.toString());
double d = (x == y) ? 0.0d : distance.d(x.toString(), y.toString());
nodes[i].setDistance(d);
}
}

View File

@ -19,14 +19,14 @@ public class VPTreeSeeker {
this.tree = tree;
}
public List<? extends Serializable> range(Object query, float range) {
public List<? extends Serializable> range(Serializable query, float range) {
return rangeTraversal(query, range, tree.getRoot(), new ArrayList<Serializable>());
}
private List<Serializable> rangeTraversal(Object query, float range, TNode tNode, List<Serializable> results) {
private List<Serializable> rangeTraversal(Serializable query, float range, TNode tNode, List<Serializable> results) {
if (tNode != null) {
float distance = this.distance.d(query.toString(), tNode.toString());
double distance = this.distance.d(query.toString(), tNode.get().toString());
if (distance < range) {
results.add(tNode.get());

View File

@ -40,8 +40,10 @@ FacetBasedEditDialog.prototype._createDialog = function() {
'</select></div>' +
'<div class="knn-controls hidden">Distance Function: <select bind="distanceFunctionSelector">' +
'<option selected="true">levenshtein</option>' +
'<option>jaro</option>' +
'<option>jaccard</option>' +
'<option>jaro</option>' +
'<option>jaro-winkler</option>' +
'<option>jaro-winkler-TFIDF</option>' +
'<option>gzip</option>' +
'<option>bzip2</option>' +
'<option>PPM</option>' +