- moved from float to double to avoid excessive casting from secondstring
- added a few of the more powerful distances
- fixed a bug in the VPTree builder (although it is still not working as I expect it to)

git-svn-id: http://google-refine.googlecode.com/svn/trunk@248 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
af9e9f590b
commit
5b079b04b7
@ -27,6 +27,8 @@ import edu.mit.simile.vicino.distances.BZip2Distance;
|
||||
import edu.mit.simile.vicino.distances.GZipDistance;
|
||||
import edu.mit.simile.vicino.distances.JaccardDistance;
|
||||
import edu.mit.simile.vicino.distances.JaroDistance;
|
||||
import edu.mit.simile.vicino.distances.JaroWinklerDistance;
|
||||
import edu.mit.simile.vicino.distances.JaroWinklerTFIDFDistance;
|
||||
import edu.mit.simile.vicino.distances.LevenshteinDistance;
|
||||
import edu.mit.simile.vicino.distances.PPMDistance;
|
||||
import edu.mit.simile.vicino.vptree.VPTreeBuilder;
|
||||
@ -41,8 +43,10 @@ public class kNNClusterer extends Clusterer {
|
||||
|
||||
static {
|
||||
_distances.put("levenshtein", new LevenshteinDistance());
|
||||
_distances.put("jaro", new JaroDistance());
|
||||
_distances.put("jaccard", new JaccardDistance());
|
||||
_distances.put("jaro", new JaroDistance());
|
||||
_distances.put("jaro-winkler", new JaroWinklerDistance());
|
||||
_distances.put("jaro-winkler-tfidf", new JaroWinklerTFIDFDistance());
|
||||
_distances.put("gzip", new GZipDistance());
|
||||
_distances.put("bzip2", new BZip2Distance());
|
||||
_distances.put("ppm", new PPMDistance());
|
||||
|
@ -2,6 +2,6 @@ package edu.mit.simile.vicino;
|
||||
|
||||
public interface Distance {
|
||||
|
||||
public float d(String x, String y);
|
||||
public double d(String x, String y);
|
||||
|
||||
}
|
||||
|
@ -27,8 +27,7 @@ public class Seeker extends Operator {
|
||||
VPTreeSeeker seeker = new VPTreeSeeker(d, tree);
|
||||
|
||||
log("type a string|range then hit return:");
|
||||
BufferedReader input = new BufferedReader(new InputStreamReader(
|
||||
System.in));
|
||||
BufferedReader input = new BufferedReader(new InputStreamReader(System.in));
|
||||
String line = null;
|
||||
while ((line = input.readLine()) != null) {
|
||||
int index = line.indexOf('|');
|
||||
|
@ -12,7 +12,7 @@ public class Tester extends Operator {
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
int size = strings.size();
|
||||
for (int i = 0; i < size; i++) {
|
||||
for (int i = 0; i < size * size * size; i++) {
|
||||
String x = (String) strings.get((int) (Math.random() * size));
|
||||
String y = (String) strings.get((int) (Math.random() * size));
|
||||
String z = (String) strings.get((int) (Math.random() * size));
|
||||
@ -31,20 +31,14 @@ public class Tester extends Operator {
|
||||
}
|
||||
|
||||
static boolean metric(String x, String y, String z, Distance d) {
|
||||
float dxx = d.d(x, x);
|
||||
double dxx = d.d(x, x);
|
||||
boolean identity = (dxx == 0.0f);
|
||||
float dxy = d.d(x, y);
|
||||
float dyx = d.d(y, x);
|
||||
double dxy = d.d(x, y);
|
||||
double dyx = d.d(y, x);
|
||||
boolean simmetrical = (dxy == dyx);
|
||||
float dxz = d.d(x, z);
|
||||
float dyz = d.d(y, z);
|
||||
double dxz = d.d(x, z);
|
||||
double dyz = d.d(y, z);
|
||||
boolean triangular = (dxz <= dxy + dyz);
|
||||
return (identity && simmetrical && triangular);
|
||||
}
|
||||
|
||||
static Distance getDistance(String distance) throws Exception {
|
||||
return (Distance) Class.forName(
|
||||
"edu.mit.simile.vicino.distances." + distance + "Distance")
|
||||
.newInstance();
|
||||
}
|
||||
}
|
||||
|
@ -7,7 +7,7 @@ import org.apache.tools.bzip2.CBZip2OutputStream;
|
||||
|
||||
public class BZip2Distance extends PseudoMetricDistance {
|
||||
|
||||
public float d2(String x, String y) {
|
||||
public double d2(String x, String y) {
|
||||
String str = x + y;
|
||||
float result = 0.0f;
|
||||
try {
|
||||
|
@ -6,7 +6,7 @@ import java.util.zip.GZIPOutputStream;
|
||||
|
||||
public class GZipDistance extends PseudoMetricDistance {
|
||||
|
||||
public float d2(String x, String y) {
|
||||
public double d2(String x, String y) {
|
||||
String str = x + y;
|
||||
float result = 0.0f;
|
||||
try {
|
||||
|
@ -11,8 +11,8 @@ public class JaccardDistance extends MetricDistance {
|
||||
this.distance = new Jaccard();
|
||||
}
|
||||
|
||||
protected float d2(String x, String y) {
|
||||
return Math.abs((float) this.distance.score(x, y) - 1.0f);
|
||||
protected double d2(String x, String y) {
|
||||
return this.distance.score(x, y);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -11,8 +11,8 @@ public class JaroDistance extends MetricDistance {
|
||||
this.distance = new Jaro();
|
||||
}
|
||||
|
||||
protected float d2(String x, String y) {
|
||||
return Math.abs((float) this.distance.score(x, y) - 1.0f);
|
||||
protected double d2(String x, String y) {
|
||||
return this.distance.score(x, y);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -0,0 +1,18 @@
|
||||
package edu.mit.simile.vicino.distances;
|
||||
|
||||
import com.wcohen.ss.JaroWinkler;
|
||||
import com.wcohen.ss.api.StringDistance;
|
||||
|
||||
/**
 * {@link MetricDistance} implementation that delegates to a
 * {@code com.wcohen.ss.JaroWinkler} scorer.
 */
public class JaroWinklerDistance extends MetricDistance {
|
||||
|
||||
// Scorer that d2 delegates to; assigned in the constructor.
StringDistance distance;
|
||||
|
||||
/** Creates the distance with a fresh {@code JaroWinkler} scorer. */
public JaroWinklerDistance() {
|
||||
this.distance = new JaroWinkler();
|
||||
}
|
||||
|
||||
/**
 * Returns the scorer's value for the pair, unmodified.
 *
 * NOTE(review): if score() is a similarity (larger means closer) rather
 * than a distance, d(x, x) would not be 0 here -- confirm against the
 * scorer's documentation.
 *
 * @param x first string
 * @param y second string
 * @return the result of {@code distance.score(x, y)}
 */
protected double d2(String x, String y) {
|
||||
return this.distance.score(x, y);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,18 @@
|
||||
package edu.mit.simile.vicino.distances;
|
||||
|
||||
import com.wcohen.ss.JaroWinklerTFIDF;
|
||||
import com.wcohen.ss.api.StringDistance;
|
||||
|
||||
/**
 * {@link MetricDistance} implementation that delegates to a
 * {@code com.wcohen.ss.JaroWinklerTFIDF} scorer.
 */
public class JaroWinklerTFIDFDistance extends MetricDistance {
|
||||
|
||||
// Scorer that d2 delegates to; assigned in the constructor.
StringDistance distance;
|
||||
|
||||
/** Creates the distance with a fresh {@code JaroWinklerTFIDF} scorer. */
public JaroWinklerTFIDFDistance() {
|
||||
this.distance = new JaroWinklerTFIDF();
|
||||
}
|
||||
|
||||
/**
 * Returns the scorer's value for the pair, unmodified.
 *
 * NOTE(review): if score() is a similarity (larger means closer) rather
 * than a distance, d(x, x) would not be 0 here -- confirm against the
 * scorer's documentation.
 *
 * @param x first string
 * @param y second string
 * @return the result of {@code distance.score(x, y)}
 */
protected double d2(String x, String y) {
|
||||
return this.distance.score(x, y);
|
||||
}
|
||||
|
||||
}
|
@ -13,10 +13,8 @@ public class LevenshteinDistance implements Distance {
|
||||
this.distance = new Levenstein();
|
||||
}
|
||||
|
||||
public float d(String x, String y) {
|
||||
float d = Math.abs((float) this.distance.score(x, y));
|
||||
// System.out.println(this.distance.explainScore(x,y));
|
||||
return d / (x.length() + y.length());
|
||||
public double d(String x, String y) {
|
||||
return this.distance.score(x, y);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -5,15 +5,19 @@ import edu.mit.simile.vicino.Distance;
|
||||
public abstract class MetricDistance implements Distance {
|
||||
|
||||
/*
|
||||
* public float d(String x,String y) { float dxy = d2(x,y); float dx =
|
||||
* d2(x,""); float dy = d2(y,""); float result = dxy / (dx + dy); return
|
||||
* result; }
|
||||
* public float d(String x,String y) {
|
||||
* float dxy = d2(x,y);
|
||||
* float dx = d2(x,"");
|
||||
* float dy = d2(y,"");
|
||||
* float result = dxy / (dx + dy);
|
||||
* return result;
|
||||
* }
|
||||
*/
|
||||
|
||||
public float d(String x, String y) {
|
||||
public double d(String x, String y) {
|
||||
return d2(x, y);
|
||||
}
|
||||
|
||||
abstract float d2(String x, String y);
|
||||
abstract double d2(String x, String y);
|
||||
|
||||
}
|
||||
|
@ -8,7 +8,7 @@ import com.colloquial.arithcode.PPMModel;
|
||||
|
||||
public class PPMDistance extends PseudoMetricDistance {
|
||||
|
||||
public float d2(String x, String y) {
|
||||
public double d2(String x, String y) {
|
||||
String str = x + y;
|
||||
float result = 0.0f;
|
||||
try {
|
||||
|
@ -4,14 +4,14 @@ import edu.mit.simile.vicino.Distance;
|
||||
|
||||
public abstract class PseudoMetricDistance implements Distance {
|
||||
|
||||
public float d(String x, String y) {
|
||||
float cxx = d2(x, x);
|
||||
float cyy = d2(y, y);
|
||||
float cxy = d2(x, y);
|
||||
float cyx = d2(y, x);
|
||||
float result1 = (cxy + cyx) / (cxx + cyy) - 1.0f;
|
||||
public double d(String x, String y) {
|
||||
double cxx = d2(x, x);
|
||||
double cyy = d2(y, y);
|
||||
double cxy = d2(x, y);
|
||||
double cyx = d2(y, x);
|
||||
double result1 = (cxy + cyx) / (cxx + cyy) - 1.0d;
|
||||
return result1;
|
||||
}
|
||||
|
||||
protected abstract float d2(String x, String y);
|
||||
protected abstract double d2(String x, String y);
|
||||
}
|
||||
|
@ -13,7 +13,7 @@ public class Node implements Serializable {
|
||||
private static final long serialVersionUID = -2077473220894258550L;
|
||||
|
||||
private final Serializable obj;
|
||||
private float distance;
|
||||
private double distance;
|
||||
|
||||
public Node(Serializable obj, int i) {
|
||||
this.obj = obj;
|
||||
@ -28,11 +28,11 @@ public class Node implements Serializable {
|
||||
return this.obj;
|
||||
}
|
||||
|
||||
public void setDistance(float distance) {
|
||||
public void setDistance(double distance) {
|
||||
this.distance = distance;
|
||||
}
|
||||
|
||||
public float getDistance() {
|
||||
public double getDistance() {
|
||||
return distance;
|
||||
}
|
||||
|
||||
|
@ -117,7 +117,7 @@ public class VPTreeBuilder {
|
||||
for (int i = begin; i <= end; i++) {
|
||||
Object x = pivot.get();
|
||||
Object y = nodes[i].get();
|
||||
float d = (x == y) ? 0.0f : distance.d(x.toString(), y.toString());
|
||||
double d = (x == y) ? 0.0d : distance.d(x.toString(), y.toString());
|
||||
nodes[i].setDistance(d);
|
||||
}
|
||||
}
|
||||
|
@ -19,14 +19,14 @@ public class VPTreeSeeker {
|
||||
this.tree = tree;
|
||||
}
|
||||
|
||||
public List<? extends Serializable> range(Object query, float range) {
|
||||
public List<? extends Serializable> range(Serializable query, float range) {
|
||||
return rangeTraversal(query, range, tree.getRoot(), new ArrayList<Serializable>());
|
||||
}
|
||||
|
||||
private List<Serializable> rangeTraversal(Object query, float range, TNode tNode, List<Serializable> results) {
|
||||
private List<Serializable> rangeTraversal(Serializable query, float range, TNode tNode, List<Serializable> results) {
|
||||
|
||||
if (tNode != null) {
|
||||
float distance = this.distance.d(query.toString(), tNode.toString());
|
||||
double distance = this.distance.d(query.toString(), tNode.get().toString());
|
||||
|
||||
if (distance < range) {
|
||||
results.add(tNode.get());
|
||||
|
@ -40,8 +40,10 @@ FacetBasedEditDialog.prototype._createDialog = function() {
|
||||
'</select></div>' +
|
||||
'<div class="knn-controls hidden">Distance Function: <select bind="distanceFunctionSelector">' +
|
||||
'<option selected="true">levenshtein</option>' +
|
||||
'<option>jaro</option>' +
|
||||
'<option>jaccard</option>' +
|
||||
'<option>jaro</option>' +
|
||||
'<option>jaro-winkler</option>' +
|
||||
'<option>jaro-winkler-TFIDF</option>' +
|
||||
'<option>gzip</option>' +
|
||||
'<option>bzip2</option>' +
|
||||
'<option>PPM</option>' +
|
||||
|
Loading…
Reference in New Issue
Block a user