Expose own interface for similarity distances, isolating from Vicino
This commit is contained in:
parent
72d76ee22a
commit
d9af48a49a
@ -5,7 +5,6 @@ import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import edu.mit.simile.vicino.distances.BZip2Distance;
|
||||
import edu.mit.simile.vicino.distances.Distance;
|
||||
import edu.mit.simile.vicino.distances.GZipDistance;
|
||||
import edu.mit.simile.vicino.distances.JaccardDistance;
|
||||
import edu.mit.simile.vicino.distances.JaroDistance;
|
||||
@ -21,30 +20,30 @@ import edu.mit.simile.vicino.distances.PPMDistance;
|
||||
*/
|
||||
public class DistanceFactory {
|
||||
|
||||
/**
 * Registry of the available distances, keyed by the name they are registered under.
 */
protected static final Map<String, SimilarityDistance> _distances = new HashMap<>();

// Built-in distances. Each Vicino implementation is wrapped in a VicinoDistance
// so that it can be stored in the registry as a SimilarityDistance.
static {
    _distances.put("levenshtein", new VicinoDistance(new LevenshteinDistance()));
    _distances.put("jaccard", new VicinoDistance(new JaccardDistance()));
    _distances.put("jaro", new VicinoDistance(new JaroDistance()));
    _distances.put("jaro-winkler", new VicinoDistance(new JaroWinklerDistance()));
    _distances.put("jaro-winkler-tfidf", new VicinoDistance(new JaroWinklerTFIDFDistance()));
    _distances.put("gzip", new VicinoDistance(new GZipDistance()));
    _distances.put("bzip2", new VicinoDistance(new BZip2Distance()));
    _distances.put("ppm", new VicinoDistance(new PPMDistance()));
}
|
||||
|
||||
/**
|
||||
* Returns the distance registered under this name, or null if it does not exist.
|
||||
*/
|
||||
public static Distance get(String name) {
|
||||
public static SimilarityDistance get(String name) {
|
||||
return _distances.get(name);
|
||||
}
|
||||
|
||||
/**
|
||||
* Registers a new distance under a name.
|
||||
*/
|
||||
public static void put(String name, Distance distance) {
|
||||
public static void put(String name, SimilarityDistance distance) {
|
||||
_distances.put(name, distance);
|
||||
}
|
||||
|
||||
|
@ -0,0 +1,17 @@
|
||||
package com.google.refine.clustering.knn;
|
||||
|
||||
/**
|
||||
* A function that computes a similarity distance between
|
||||
* two strings.
|
||||
*
|
||||
* This is introduced to provide a clean interface for extensions, independent
|
||||
* of the library currently used (Simile Vicino, unmaintained since 2010 as of
|
||||
* December 2018).
|
||||
*/
|
||||
public interface SimilarityDistance {
|
||||
/**
|
||||
* Compute the distance between two strings. This should return 0 when the
|
||||
* two arguments are equal, and rise as their differences increase.
|
||||
*/
|
||||
public double compute(String a, String b);
|
||||
}
|
@ -0,0 +1,23 @@
|
||||
package com.google.refine.clustering.knn;
|
||||
|
||||
import edu.mit.simile.vicino.distances.Distance;
|
||||
|
||||
/**
|
||||
* Wrapper to expose a similarity function from the Vicino
|
||||
* library.
|
||||
*
|
||||
* @author Antonin Delpeuch
|
||||
*
|
||||
*/
|
||||
public class VicinoDistance implements SimilarityDistance {
|
||||
private final Distance _d;
|
||||
|
||||
public VicinoDistance(Distance d) {
|
||||
_d = d;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compute(String a, String b) {
|
||||
return _d.d(a, b);
|
||||
}
|
||||
}
|
@ -61,15 +61,7 @@ import com.google.refine.model.Row;
|
||||
|
||||
import edu.mit.simile.vicino.clustering.NGramClusterer;
|
||||
import edu.mit.simile.vicino.clustering.VPTreeClusterer;
|
||||
import edu.mit.simile.vicino.distances.BZip2Distance;
|
||||
import edu.mit.simile.vicino.distances.Distance;
|
||||
import edu.mit.simile.vicino.distances.GZipDistance;
|
||||
import edu.mit.simile.vicino.distances.JaccardDistance;
|
||||
import edu.mit.simile.vicino.distances.JaroDistance;
|
||||
import edu.mit.simile.vicino.distances.JaroWinklerDistance;
|
||||
import edu.mit.simile.vicino.distances.JaroWinklerTFIDFDistance;
|
||||
import edu.mit.simile.vicino.distances.LevenshteinDistance;
|
||||
import edu.mit.simile.vicino.distances.PPMDistance;
|
||||
|
||||
public class kNNClusterer extends Clusterer {
|
||||
|
||||
@ -77,12 +69,12 @@ public class kNNClusterer extends Clusterer {
|
||||
@JsonIgnore
|
||||
private String _distanceStr;
|
||||
@JsonIgnore
|
||||
private Distance _distance;
|
||||
private SimilarityDistance _distance;
|
||||
@JsonIgnore
|
||||
private kNNClustererConfigParameters _parameters = null;
|
||||
|
||||
@JsonIgnore
|
||||
public Distance getDistance() {
|
||||
public SimilarityDistance getDistance() {
|
||||
return _distance;
|
||||
}
|
||||
|
||||
@ -130,7 +122,7 @@ public class kNNClusterer extends Clusterer {
|
||||
public int blockingNgramSize = defaultBlockingNgramSize;
|
||||
}
|
||||
|
||||
private Distance _distance;
|
||||
private SimilarityDistance _distance;
|
||||
private kNNClustererConfigParameters _params;
|
||||
|
||||
List<Set<Serializable>> _clusters;
|
||||
@ -180,18 +172,31 @@ public class kNNClusterer extends Clusterer {
|
||||
|
||||
class BlockingClusteringRowVisitor implements RowVisitor {
|
||||
|
||||
Distance _distance;
|
||||
SimilarityDistance _distance;
|
||||
double _radius = 1.0d;
|
||||
int _blockingNgramSize = 6;
|
||||
HashSet<String> _data;
|
||||
NGramClusterer _clusterer;
|
||||
|
||||
/**
 * Adapter presenting a {@link SimilarityDistance} through Vicino's
 * {@code Distance} type, by forwarding {@code d(...)} to {@code compute(...)}.
 */
private class DistanceWrapper extends Distance {

    /** The distance every call is forwarded to. */
    private final SimilarityDistance _d;

    protected DistanceWrapper(SimilarityDistance d) {
        _d = d;
    }

    @Override
    public double d(String arg0, String arg1) {
        return _d.compute(arg0, arg1);
    }
}

/**
 * Creates a visitor that collects strings and clusters them with an
 * {@code NGramClusterer}.
 *
 * @param distance the distance used to compare strings
 * @param params supplies the blocking n-gram size and the radius
 */
public BlockingClusteringRowVisitor(SimilarityDistance distance, kNNClustererConfigParameters params) {
    _distance = distance;
    _data = new HashSet<String>();
    _blockingNgramSize = params.blockingNgramSize;
    _radius = params.radius;
    // The Vicino clusterer is handed a Vicino Distance, hence the wrapper
    // around our own SimilarityDistance.
    _clusterer = new NGramClusterer(new DistanceWrapper(_distance), _blockingNgramSize);
}
|
||||
|
||||
@Override
|
||||
|
@ -0,0 +1,23 @@
|
||||
package com.google.refine.commands.browsing;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import javax.servlet.ServletException;
|
||||
import javax.servlet.http.HttpServletRequest;
|
||||
import javax.servlet.http.HttpServletResponse;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.refine.commands.Command;
|
||||
|
||||
public class GetClusteringFunctionsAndDistances extends Command {
|
||||
final static Logger logger = LoggerFactory.getLogger("get-clustering-functions-and-distances_command");
|
||||
|
||||
@Override
|
||||
public void doGet(HttpServletRequest request, HttpServletResponse response)
|
||||
throws ServletException, IOException {
|
||||
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user