Expose own interface for similarity distances, isolating from Vicino
This commit is contained in:
parent
72d76ee22a
commit
d9af48a49a
@ -5,7 +5,6 @@ import java.util.Map;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import edu.mit.simile.vicino.distances.BZip2Distance;
|
import edu.mit.simile.vicino.distances.BZip2Distance;
|
||||||
import edu.mit.simile.vicino.distances.Distance;
|
|
||||||
import edu.mit.simile.vicino.distances.GZipDistance;
|
import edu.mit.simile.vicino.distances.GZipDistance;
|
||||||
import edu.mit.simile.vicino.distances.JaccardDistance;
|
import edu.mit.simile.vicino.distances.JaccardDistance;
|
||||||
import edu.mit.simile.vicino.distances.JaroDistance;
|
import edu.mit.simile.vicino.distances.JaroDistance;
|
||||||
@ -21,30 +20,30 @@ import edu.mit.simile.vicino.distances.PPMDistance;
|
|||||||
*/
|
*/
|
||||||
public class DistanceFactory {
|
public class DistanceFactory {
|
||||||
|
|
||||||
static final protected Map<String, Distance> _distances = new HashMap<String, Distance>();
|
static final protected Map<String, SimilarityDistance> _distances = new HashMap<>();
|
||||||
|
|
||||||
static {
|
static {
|
||||||
_distances.put("levenshtein", new LevenshteinDistance());
|
_distances.put("levenshtein", new VicinoDistance(new LevenshteinDistance()));
|
||||||
_distances.put("jaccard", new JaccardDistance());
|
_distances.put("jaccard", new VicinoDistance(new JaccardDistance()));
|
||||||
_distances.put("jaro", new JaroDistance());
|
_distances.put("jaro", new VicinoDistance(new JaroDistance()));
|
||||||
_distances.put("jaro-winkler", new JaroWinklerDistance());
|
_distances.put("jaro-winkler", new VicinoDistance(new JaroWinklerDistance()));
|
||||||
_distances.put("jaro-winkler-tfidf", new JaroWinklerTFIDFDistance());
|
_distances.put("jaro-winkler-tfidf", new VicinoDistance(new JaroWinklerTFIDFDistance()));
|
||||||
_distances.put("gzip", new GZipDistance());
|
_distances.put("gzip", new VicinoDistance(new GZipDistance()));
|
||||||
_distances.put("bzip2", new BZip2Distance());
|
_distances.put("bzip2", new VicinoDistance(new BZip2Distance()));
|
||||||
_distances.put("ppm", new PPMDistance());
|
_distances.put("ppm", new VicinoDistance(new PPMDistance()));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the distance registered under this name, or null if it does not exist.
|
* Returns the distance registered under this name, or null if it does not exist.
|
||||||
*/
|
*/
|
||||||
public static Distance get(String name) {
|
public static SimilarityDistance get(String name) {
|
||||||
return _distances.get(name);
|
return _distances.get(name);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Registers a new distance under a name.
|
* Registers a new distance under a name.
|
||||||
*/
|
*/
|
||||||
public static void put(String name, Distance distance) {
|
public static void put(String name, SimilarityDistance distance) {
|
||||||
_distances.put(name, distance);
|
_distances.put(name, distance);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -0,0 +1,17 @@
|
|||||||
|
package com.google.refine.clustering.knn;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A function that computes a similarity distance between
|
||||||
|
* two strings.
|
||||||
|
*
|
||||||
|
* This is introduced to provide a clean interface for extensions, independent
|
||||||
|
* of the library currently used (Simile Vicino, unmaintained since 2010 as of
|
||||||
|
* December 2018).
|
||||||
|
*/
|
||||||
|
public interface SimilarityDistance {
|
||||||
|
/**
|
||||||
|
* Compute the distance between two strings. This should return 0 when the
|
||||||
|
* two arguments are equal, and rise as their differences increase.
|
||||||
|
*/
|
||||||
|
public double compute(String a, String b);
|
||||||
|
}
|
@ -0,0 +1,23 @@
|
|||||||
|
package com.google.refine.clustering.knn;
|
||||||
|
|
||||||
|
import edu.mit.simile.vicino.distances.Distance;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Wrapper to expose a similarity function from the Vicino
|
||||||
|
* library.
|
||||||
|
*
|
||||||
|
* @author Antonin Delpeuch
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class VicinoDistance implements SimilarityDistance {
|
||||||
|
private final Distance _d;
|
||||||
|
|
||||||
|
public VicinoDistance(Distance d) {
|
||||||
|
_d = d;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double compute(String a, String b) {
|
||||||
|
return _d.d(a, b);
|
||||||
|
}
|
||||||
|
}
|
@ -61,15 +61,7 @@ import com.google.refine.model.Row;
|
|||||||
|
|
||||||
import edu.mit.simile.vicino.clustering.NGramClusterer;
|
import edu.mit.simile.vicino.clustering.NGramClusterer;
|
||||||
import edu.mit.simile.vicino.clustering.VPTreeClusterer;
|
import edu.mit.simile.vicino.clustering.VPTreeClusterer;
|
||||||
import edu.mit.simile.vicino.distances.BZip2Distance;
|
|
||||||
import edu.mit.simile.vicino.distances.Distance;
|
import edu.mit.simile.vicino.distances.Distance;
|
||||||
import edu.mit.simile.vicino.distances.GZipDistance;
|
|
||||||
import edu.mit.simile.vicino.distances.JaccardDistance;
|
|
||||||
import edu.mit.simile.vicino.distances.JaroDistance;
|
|
||||||
import edu.mit.simile.vicino.distances.JaroWinklerDistance;
|
|
||||||
import edu.mit.simile.vicino.distances.JaroWinklerTFIDFDistance;
|
|
||||||
import edu.mit.simile.vicino.distances.LevenshteinDistance;
|
|
||||||
import edu.mit.simile.vicino.distances.PPMDistance;
|
|
||||||
|
|
||||||
public class kNNClusterer extends Clusterer {
|
public class kNNClusterer extends Clusterer {
|
||||||
|
|
||||||
@ -77,12 +69,12 @@ public class kNNClusterer extends Clusterer {
|
|||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
private String _distanceStr;
|
private String _distanceStr;
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
private Distance _distance;
|
private SimilarityDistance _distance;
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
private kNNClustererConfigParameters _parameters = null;
|
private kNNClustererConfigParameters _parameters = null;
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
public Distance getDistance() {
|
public SimilarityDistance getDistance() {
|
||||||
return _distance;
|
return _distance;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -130,7 +122,7 @@ public class kNNClusterer extends Clusterer {
|
|||||||
public int blockingNgramSize = defaultBlockingNgramSize;
|
public int blockingNgramSize = defaultBlockingNgramSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
private Distance _distance;
|
private SimilarityDistance _distance;
|
||||||
private kNNClustererConfigParameters _params;
|
private kNNClustererConfigParameters _params;
|
||||||
|
|
||||||
List<Set<Serializable>> _clusters;
|
List<Set<Serializable>> _clusters;
|
||||||
@ -180,18 +172,31 @@ public class kNNClusterer extends Clusterer {
|
|||||||
|
|
||||||
class BlockingClusteringRowVisitor implements RowVisitor {
|
class BlockingClusteringRowVisitor implements RowVisitor {
|
||||||
|
|
||||||
Distance _distance;
|
SimilarityDistance _distance;
|
||||||
double _radius = 1.0d;
|
double _radius = 1.0d;
|
||||||
int _blockingNgramSize = 6;
|
int _blockingNgramSize = 6;
|
||||||
HashSet<String> _data;
|
HashSet<String> _data;
|
||||||
NGramClusterer _clusterer;
|
NGramClusterer _clusterer;
|
||||||
|
|
||||||
public BlockingClusteringRowVisitor(Distance d, kNNClustererConfigParameters params) {
|
private class DistanceWrapper extends Distance {
|
||||||
_distance = d;
|
private final SimilarityDistance _d;
|
||||||
|
|
||||||
|
protected DistanceWrapper(SimilarityDistance d) {
|
||||||
|
_d = d;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double d(String arg0, String arg1) {
|
||||||
|
return _d.compute(arg0, arg1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public BlockingClusteringRowVisitor(SimilarityDistance _distance2, kNNClustererConfigParameters params) {
|
||||||
|
_distance = _distance2;
|
||||||
_data = new HashSet<String>();
|
_data = new HashSet<String>();
|
||||||
_blockingNgramSize = params.blockingNgramSize;
|
_blockingNgramSize = params.blockingNgramSize;
|
||||||
_radius = params.radius;
|
_radius = params.radius;
|
||||||
_clusterer = new NGramClusterer(_distance, _blockingNgramSize);
|
_clusterer = new NGramClusterer(new DistanceWrapper(_distance), _blockingNgramSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -0,0 +1,23 @@
|
|||||||
|
package com.google.refine.commands.browsing;
|
||||||
|
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import javax.servlet.ServletException;
|
||||||
|
import javax.servlet.http.HttpServletRequest;
|
||||||
|
import javax.servlet.http.HttpServletResponse;
|
||||||
|
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.google.refine.commands.Command;
|
||||||
|
|
||||||
|
public class GetClusteringFunctionsAndDistances extends Command {
|
||||||
|
final static Logger logger = LoggerFactory.getLogger("get-clustering-functions-and-distances_command");
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void doGet(HttpServletRequest request, HttpServletResponse response)
|
||||||
|
throws ServletException, IOException {
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user