- moved from float to double to avoid excessive casting of the double scores returned by SecondString
- added a few of the more powerful distances
- fixed a bug in the VPTree builder (although it is still not working as I expect it to)

git-svn-id: http://google-refine.googlecode.com/svn/trunk@248 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
af9e9f590b
commit
5b079b04b7
@ -27,6 +27,8 @@ import edu.mit.simile.vicino.distances.BZip2Distance;
|
|||||||
import edu.mit.simile.vicino.distances.GZipDistance;
|
import edu.mit.simile.vicino.distances.GZipDistance;
|
||||||
import edu.mit.simile.vicino.distances.JaccardDistance;
|
import edu.mit.simile.vicino.distances.JaccardDistance;
|
||||||
import edu.mit.simile.vicino.distances.JaroDistance;
|
import edu.mit.simile.vicino.distances.JaroDistance;
|
||||||
|
import edu.mit.simile.vicino.distances.JaroWinklerDistance;
|
||||||
|
import edu.mit.simile.vicino.distances.JaroWinklerTFIDFDistance;
|
||||||
import edu.mit.simile.vicino.distances.LevenshteinDistance;
|
import edu.mit.simile.vicino.distances.LevenshteinDistance;
|
||||||
import edu.mit.simile.vicino.distances.PPMDistance;
|
import edu.mit.simile.vicino.distances.PPMDistance;
|
||||||
import edu.mit.simile.vicino.vptree.VPTreeBuilder;
|
import edu.mit.simile.vicino.vptree.VPTreeBuilder;
|
||||||
@ -41,8 +43,10 @@ public class kNNClusterer extends Clusterer {
|
|||||||
|
|
||||||
static {
|
static {
|
||||||
_distances.put("levenshtein", new LevenshteinDistance());
|
_distances.put("levenshtein", new LevenshteinDistance());
|
||||||
_distances.put("jaro", new JaroDistance());
|
|
||||||
_distances.put("jaccard", new JaccardDistance());
|
_distances.put("jaccard", new JaccardDistance());
|
||||||
|
_distances.put("jaro", new JaroDistance());
|
||||||
|
_distances.put("jaro-winkler", new JaroWinklerDistance());
|
||||||
|
_distances.put("jaro-winkler-tfidf", new JaroWinklerTFIDFDistance());
|
||||||
_distances.put("gzip", new GZipDistance());
|
_distances.put("gzip", new GZipDistance());
|
||||||
_distances.put("bzip2", new BZip2Distance());
|
_distances.put("bzip2", new BZip2Distance());
|
||||||
_distances.put("ppm", new PPMDistance());
|
_distances.put("ppm", new PPMDistance());
|
||||||
|
@ -2,6 +2,6 @@ package edu.mit.simile.vicino;
|
|||||||
|
|
||||||
public interface Distance {
|
public interface Distance {
|
||||||
|
|
||||||
public float d(String x, String y);
|
public double d(String x, String y);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -27,8 +27,7 @@ public class Seeker extends Operator {
|
|||||||
VPTreeSeeker seeker = new VPTreeSeeker(d, tree);
|
VPTreeSeeker seeker = new VPTreeSeeker(d, tree);
|
||||||
|
|
||||||
log("type a string|range then hit return:");
|
log("type a string|range then hit return:");
|
||||||
BufferedReader input = new BufferedReader(new InputStreamReader(
|
BufferedReader input = new BufferedReader(new InputStreamReader(System.in));
|
||||||
System.in));
|
|
||||||
String line = null;
|
String line = null;
|
||||||
while ((line = input.readLine()) != null) {
|
while ((line = input.readLine()) != null) {
|
||||||
int index = line.indexOf('|');
|
int index = line.indexOf('|');
|
||||||
|
@ -12,7 +12,7 @@ public class Tester extends Operator {
|
|||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
|
|
||||||
int size = strings.size();
|
int size = strings.size();
|
||||||
for (int i = 0; i < size; i++) {
|
for (int i = 0; i < size * size * size; i++) {
|
||||||
String x = (String) strings.get((int) (Math.random() * size));
|
String x = (String) strings.get((int) (Math.random() * size));
|
||||||
String y = (String) strings.get((int) (Math.random() * size));
|
String y = (String) strings.get((int) (Math.random() * size));
|
||||||
String z = (String) strings.get((int) (Math.random() * size));
|
String z = (String) strings.get((int) (Math.random() * size));
|
||||||
@ -31,20 +31,14 @@ public class Tester extends Operator {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static boolean metric(String x, String y, String z, Distance d) {
|
static boolean metric(String x, String y, String z, Distance d) {
|
||||||
float dxx = d.d(x, x);
|
double dxx = d.d(x, x);
|
||||||
boolean identity = (dxx == 0.0f);
|
boolean identity = (dxx == 0.0f);
|
||||||
float dxy = d.d(x, y);
|
double dxy = d.d(x, y);
|
||||||
float dyx = d.d(y, x);
|
double dyx = d.d(y, x);
|
||||||
boolean simmetrical = (dxy == dyx);
|
boolean simmetrical = (dxy == dyx);
|
||||||
float dxz = d.d(x, z);
|
double dxz = d.d(x, z);
|
||||||
float dyz = d.d(y, z);
|
double dyz = d.d(y, z);
|
||||||
boolean triangular = (dxz <= dxy + dyz);
|
boolean triangular = (dxz <= dxy + dyz);
|
||||||
return (identity && simmetrical && triangular);
|
return (identity && simmetrical && triangular);
|
||||||
}
|
}
|
||||||
|
|
||||||
static Distance getDistance(String distance) throws Exception {
|
|
||||||
return (Distance) Class.forName(
|
|
||||||
"edu.mit.simile.vicino.distances." + distance + "Distance")
|
|
||||||
.newInstance();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -7,7 +7,7 @@ import org.apache.tools.bzip2.CBZip2OutputStream;
|
|||||||
|
|
||||||
public class BZip2Distance extends PseudoMetricDistance {
|
public class BZip2Distance extends PseudoMetricDistance {
|
||||||
|
|
||||||
public float d2(String x, String y) {
|
public double d2(String x, String y) {
|
||||||
String str = x + y;
|
String str = x + y;
|
||||||
float result = 0.0f;
|
float result = 0.0f;
|
||||||
try {
|
try {
|
||||||
|
@ -6,7 +6,7 @@ import java.util.zip.GZIPOutputStream;
|
|||||||
|
|
||||||
public class GZipDistance extends PseudoMetricDistance {
|
public class GZipDistance extends PseudoMetricDistance {
|
||||||
|
|
||||||
public float d2(String x, String y) {
|
public double d2(String x, String y) {
|
||||||
String str = x + y;
|
String str = x + y;
|
||||||
float result = 0.0f;
|
float result = 0.0f;
|
||||||
try {
|
try {
|
||||||
|
@ -11,8 +11,8 @@ public class JaccardDistance extends MetricDistance {
|
|||||||
this.distance = new Jaccard();
|
this.distance = new Jaccard();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected float d2(String x, String y) {
|
protected double d2(String x, String y) {
|
||||||
return Math.abs((float) this.distance.score(x, y) - 1.0f);
|
return this.distance.score(x, y);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -11,8 +11,8 @@ public class JaroDistance extends MetricDistance {
|
|||||||
this.distance = new Jaro();
|
this.distance = new Jaro();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected float d2(String x, String y) {
|
protected double d2(String x, String y) {
|
||||||
return Math.abs((float) this.distance.score(x, y) - 1.0f);
|
return this.distance.score(x, y);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,18 @@
|
|||||||
|
package edu.mit.simile.vicino.distances;
|
||||||
|
|
||||||
|
import com.wcohen.ss.JaroWinkler;
|
||||||
|
import com.wcohen.ss.api.StringDistance;
|
||||||
|
|
||||||
|
public class JaroWinklerDistance extends MetricDistance {
|
||||||
|
|
||||||
|
StringDistance distance;
|
||||||
|
|
||||||
|
public JaroWinklerDistance() {
|
||||||
|
this.distance = new JaroWinkler();
|
||||||
|
}
|
||||||
|
|
||||||
|
protected double d2(String x, String y) {
|
||||||
|
return this.distance.score(x, y);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,18 @@
|
|||||||
|
package edu.mit.simile.vicino.distances;
|
||||||
|
|
||||||
|
import com.wcohen.ss.JaroWinklerTFIDF;
|
||||||
|
import com.wcohen.ss.api.StringDistance;
|
||||||
|
|
||||||
|
public class JaroWinklerTFIDFDistance extends MetricDistance {
|
||||||
|
|
||||||
|
StringDistance distance;
|
||||||
|
|
||||||
|
public JaroWinklerTFIDFDistance() {
|
||||||
|
this.distance = new JaroWinklerTFIDF();
|
||||||
|
}
|
||||||
|
|
||||||
|
protected double d2(String x, String y) {
|
||||||
|
return this.distance.score(x, y);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -13,10 +13,8 @@ public class LevenshteinDistance implements Distance {
|
|||||||
this.distance = new Levenstein();
|
this.distance = new Levenstein();
|
||||||
}
|
}
|
||||||
|
|
||||||
public float d(String x, String y) {
|
public double d(String x, String y) {
|
||||||
float d = Math.abs((float) this.distance.score(x, y));
|
return this.distance.score(x, y);
|
||||||
// System.out.println(this.distance.explainScore(x,y));
|
|
||||||
return d / (x.length() + y.length());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -5,15 +5,19 @@ import edu.mit.simile.vicino.Distance;
|
|||||||
public abstract class MetricDistance implements Distance {
|
public abstract class MetricDistance implements Distance {
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* public float d(String x,String y) { float dxy = d2(x,y); float dx =
|
* public float d(String x,String y) {
|
||||||
* d2(x,""); float dy = d2(y,""); float result = dxy / (dx + dy); return
|
* float dxy = d2(x,y);
|
||||||
* result; }
|
* float dx = d2(x,"");
|
||||||
|
* float dy = d2(y,"");
|
||||||
|
* float result = dxy / (dx + dy);
|
||||||
|
* return result;
|
||||||
|
* }
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public float d(String x, String y) {
|
public double d(String x, String y) {
|
||||||
return d2(x, y);
|
return d2(x, y);
|
||||||
}
|
}
|
||||||
|
|
||||||
abstract float d2(String x, String y);
|
abstract double d2(String x, String y);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -8,7 +8,7 @@ import com.colloquial.arithcode.PPMModel;
|
|||||||
|
|
||||||
public class PPMDistance extends PseudoMetricDistance {
|
public class PPMDistance extends PseudoMetricDistance {
|
||||||
|
|
||||||
public float d2(String x, String y) {
|
public double d2(String x, String y) {
|
||||||
String str = x + y;
|
String str = x + y;
|
||||||
float result = 0.0f;
|
float result = 0.0f;
|
||||||
try {
|
try {
|
||||||
|
@ -4,14 +4,14 @@ import edu.mit.simile.vicino.Distance;
|
|||||||
|
|
||||||
public abstract class PseudoMetricDistance implements Distance {
|
public abstract class PseudoMetricDistance implements Distance {
|
||||||
|
|
||||||
public float d(String x, String y) {
|
public double d(String x, String y) {
|
||||||
float cxx = d2(x, x);
|
double cxx = d2(x, x);
|
||||||
float cyy = d2(y, y);
|
double cyy = d2(y, y);
|
||||||
float cxy = d2(x, y);
|
double cxy = d2(x, y);
|
||||||
float cyx = d2(y, x);
|
double cyx = d2(y, x);
|
||||||
float result1 = (cxy + cyx) / (cxx + cyy) - 1.0f;
|
double result1 = (cxy + cyx) / (cxx + cyy) - 1.0d;
|
||||||
return result1;
|
return result1;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected abstract float d2(String x, String y);
|
protected abstract double d2(String x, String y);
|
||||||
}
|
}
|
||||||
|
@ -13,7 +13,7 @@ public class Node implements Serializable {
|
|||||||
private static final long serialVersionUID = -2077473220894258550L;
|
private static final long serialVersionUID = -2077473220894258550L;
|
||||||
|
|
||||||
private final Serializable obj;
|
private final Serializable obj;
|
||||||
private float distance;
|
private double distance;
|
||||||
|
|
||||||
public Node(Serializable obj, int i) {
|
public Node(Serializable obj, int i) {
|
||||||
this.obj = obj;
|
this.obj = obj;
|
||||||
@ -28,11 +28,11 @@ public class Node implements Serializable {
|
|||||||
return this.obj;
|
return this.obj;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setDistance(float distance) {
|
public void setDistance(double distance) {
|
||||||
this.distance = distance;
|
this.distance = distance;
|
||||||
}
|
}
|
||||||
|
|
||||||
public float getDistance() {
|
public double getDistance() {
|
||||||
return distance;
|
return distance;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -117,7 +117,7 @@ public class VPTreeBuilder {
|
|||||||
for (int i = begin; i <= end; i++) {
|
for (int i = begin; i <= end; i++) {
|
||||||
Object x = pivot.get();
|
Object x = pivot.get();
|
||||||
Object y = nodes[i].get();
|
Object y = nodes[i].get();
|
||||||
float d = (x == y) ? 0.0f : distance.d(x.toString(), y.toString());
|
double d = (x == y) ? 0.0d : distance.d(x.toString(), y.toString());
|
||||||
nodes[i].setDistance(d);
|
nodes[i].setDistance(d);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -19,14 +19,14 @@ public class VPTreeSeeker {
|
|||||||
this.tree = tree;
|
this.tree = tree;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<? extends Serializable> range(Object query, float range) {
|
public List<? extends Serializable> range(Serializable query, float range) {
|
||||||
return rangeTraversal(query, range, tree.getRoot(), new ArrayList<Serializable>());
|
return rangeTraversal(query, range, tree.getRoot(), new ArrayList<Serializable>());
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<Serializable> rangeTraversal(Object query, float range, TNode tNode, List<Serializable> results) {
|
private List<Serializable> rangeTraversal(Serializable query, float range, TNode tNode, List<Serializable> results) {
|
||||||
|
|
||||||
if (tNode != null) {
|
if (tNode != null) {
|
||||||
float distance = this.distance.d(query.toString(), tNode.toString());
|
double distance = this.distance.d(query.toString(), tNode.get().toString());
|
||||||
|
|
||||||
if (distance < range) {
|
if (distance < range) {
|
||||||
results.add(tNode.get());
|
results.add(tNode.get());
|
||||||
|
@ -40,8 +40,10 @@ FacetBasedEditDialog.prototype._createDialog = function() {
|
|||||||
'</select></div>' +
|
'</select></div>' +
|
||||||
'<div class="knn-controls hidden">Distance Function: <select bind="distanceFunctionSelector">' +
|
'<div class="knn-controls hidden">Distance Function: <select bind="distanceFunctionSelector">' +
|
||||||
'<option selected="true">levenshtein</option>' +
|
'<option selected="true">levenshtein</option>' +
|
||||||
'<option>jaro</option>' +
|
|
||||||
'<option>jaccard</option>' +
|
'<option>jaccard</option>' +
|
||||||
|
'<option>jaro</option>' +
|
||||||
|
'<option>jaro-winkler</option>' +
|
||||||
|
'<option>jaro-winkler-TFIDF</option>' +
|
||||||
'<option>gzip</option>' +
|
'<option>gzip</option>' +
|
||||||
'<option>bzip2</option>' +
|
'<option>bzip2</option>' +
|
||||||
'<option>PPM</option>' +
|
'<option>PPM</option>' +
|
||||||
|
Loading…
Reference in New Issue
Block a user