Expose own interface for similarity distances, isolating from Vicino

This commit is contained in:
Antonin Delpeuch 2018-12-11 09:57:26 +00:00
parent 72d76ee22a
commit d9af48a49a
5 changed files with 94 additions and 27 deletions

View File

@ -5,7 +5,6 @@ import java.util.Map;
import java.util.Set;
import edu.mit.simile.vicino.distances.BZip2Distance;
import edu.mit.simile.vicino.distances.Distance;
import edu.mit.simile.vicino.distances.GZipDistance;
import edu.mit.simile.vicino.distances.JaccardDistance;
import edu.mit.simile.vicino.distances.JaroDistance;
@ -21,30 +20,30 @@ import edu.mit.simile.vicino.distances.PPMDistance;
*/
public class DistanceFactory {
static final protected Map<String, Distance> _distances = new HashMap<String, Distance>();
static final protected Map<String, SimilarityDistance> _distances = new HashMap<>();
static {
_distances.put("levenshtein", new LevenshteinDistance());
_distances.put("jaccard", new JaccardDistance());
_distances.put("jaro", new JaroDistance());
_distances.put("jaro-winkler", new JaroWinklerDistance());
_distances.put("jaro-winkler-tfidf", new JaroWinklerTFIDFDistance());
_distances.put("gzip", new GZipDistance());
_distances.put("bzip2", new BZip2Distance());
_distances.put("ppm", new PPMDistance());
_distances.put("levenshtein", new VicinoDistance(new LevenshteinDistance()));
_distances.put("jaccard", new VicinoDistance(new JaccardDistance()));
_distances.put("jaro", new VicinoDistance(new JaroDistance()));
_distances.put("jaro-winkler", new VicinoDistance(new JaroWinklerDistance()));
_distances.put("jaro-winkler-tfidf", new VicinoDistance(new JaroWinklerTFIDFDistance()));
_distances.put("gzip", new VicinoDistance(new GZipDistance()));
_distances.put("bzip2", new VicinoDistance(new BZip2Distance()));
_distances.put("ppm", new VicinoDistance(new PPMDistance()));
}
/**
* Returns the distance registered under this name, or null if it does not exist.
*/
public static Distance get(String name) {
public static SimilarityDistance get(String name) {
return _distances.get(name);
}
/**
* Registers a new distance under a name.
*/
public static void put(String name, Distance distance) {
public static void put(String name, SimilarityDistance distance) {
_distances.put(name, distance);
}

View File

@ -0,0 +1,17 @@
package com.google.refine.clustering.knn;
/**
* A function that computes a similarity distance between
* two strings.
*
* This is introduced to provide a clean interface for extensions, independent
* of the library currently used (Simile Vicino, unmaintained since 2010 as of
* December 2018).
*/
public interface SimilarityDistance {
/**
* Compute the distance between two strings. This should return 0 when the
* two arguments are equal, and rise as their differences increase.
*/
public double compute(String a, String b);
}

View File

@ -0,0 +1,23 @@
package com.google.refine.clustering.knn;
import edu.mit.simile.vicino.distances.Distance;
/**
 * Adapter that exposes a distance from the Vicino library
 * as a {@link SimilarityDistance}.
 *
 * @author Antonin Delpeuch
 *
 */
public class VicinoDistance implements SimilarityDistance {

    /** The Vicino distance every call is forwarded to. */
    private final Distance _wrapped;

    /**
     * Wraps the given Vicino distance.
     *
     * @param d the Vicino distance that {@link #compute(String, String)} delegates to
     */
    public VicinoDistance(Distance d) {
        this._wrapped = d;
    }

    /**
     * Returns the value the wrapped Vicino distance yields for the two strings.
     *
     * @param a the first string
     * @param b the second string
     * @return the result of the wrapped distance applied to {@code a} and {@code b}
     */
    @Override
    public double compute(String a, String b) {
        final double result = this._wrapped.d(a, b);
        return result;
    }
}

View File

@ -61,15 +61,7 @@ import com.google.refine.model.Row;
import edu.mit.simile.vicino.clustering.NGramClusterer;
import edu.mit.simile.vicino.clustering.VPTreeClusterer;
import edu.mit.simile.vicino.distances.BZip2Distance;
import edu.mit.simile.vicino.distances.Distance;
import edu.mit.simile.vicino.distances.GZipDistance;
import edu.mit.simile.vicino.distances.JaccardDistance;
import edu.mit.simile.vicino.distances.JaroDistance;
import edu.mit.simile.vicino.distances.JaroWinklerDistance;
import edu.mit.simile.vicino.distances.JaroWinklerTFIDFDistance;
import edu.mit.simile.vicino.distances.LevenshteinDistance;
import edu.mit.simile.vicino.distances.PPMDistance;
public class kNNClusterer extends Clusterer {
@ -77,12 +69,12 @@ public class kNNClusterer extends Clusterer {
@JsonIgnore
private String _distanceStr;
@JsonIgnore
private Distance _distance;
private SimilarityDistance _distance;
@JsonIgnore
private kNNClustererConfigParameters _parameters = null;
@JsonIgnore
public Distance getDistance() {
public SimilarityDistance getDistance() {
return _distance;
}
@ -130,7 +122,7 @@ public class kNNClusterer extends Clusterer {
public int blockingNgramSize = defaultBlockingNgramSize;
}
private Distance _distance;
private SimilarityDistance _distance;
private kNNClustererConfigParameters _params;
List<Set<Serializable>> _clusters;
@ -180,18 +172,31 @@ public class kNNClusterer extends Clusterer {
class BlockingClusteringRowVisitor implements RowVisitor {
Distance _distance;
SimilarityDistance _distance;
double _radius = 1.0d;
int _blockingNgramSize = 6;
HashSet<String> _data;
NGramClusterer _clusterer;
public BlockingClusteringRowVisitor(Distance d, kNNClustererConfigParameters params) {
_distance = d;
/**
 * Adapts a {@link SimilarityDistance} to the Vicino {@code Distance} type,
 * forwarding every distance computation to the wrapped instance.
 */
private class DistanceWrapper extends Distance {

    /** The similarity distance every call is forwarded to. */
    private final SimilarityDistance _inner;

    /**
     * @param d the similarity distance to delegate to
     */
    protected DistanceWrapper(SimilarityDistance d) {
        this._inner = d;
    }

    /**
     * Returns the wrapped distance's result for the two strings.
     */
    @Override
    public double d(String arg0, String arg1) {
        final double result = this._inner.compute(arg0, arg1);
        return result;
    }
}
public BlockingClusteringRowVisitor(SimilarityDistance _distance2, kNNClustererConfigParameters params) {
_distance = _distance2;
_data = new HashSet<String>();
_blockingNgramSize = params.blockingNgramSize;
_radius = params.radius;
_clusterer = new NGramClusterer(_distance, _blockingNgramSize);
_clusterer = new NGramClusterer(new DistanceWrapper(_distance), _blockingNgramSize);
}
@Override

View File

@ -0,0 +1,23 @@
package com.google.refine.commands.browsing;
import java.io.IOException;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.refine.commands.Command;
/**
 * Command named after the clustering functions and distances.
 *
 * NOTE(review): the GET handler below has no implementation yet, so this
 * command currently produces no output of its own.
 */
public class GetClusteringFunctionsAndDistances extends Command {

    /** Logger registered under this command's name. */
    static final Logger logger =
            LoggerFactory.getLogger("get-clustering-functions-and-distances_command");

    /**
     * GET entry point of the command. The body is empty: nothing is read
     * from the request and nothing is written to the response here.
     *
     * @param request  the incoming HTTP request (unused)
     * @param response the HTTP response (left untouched)
     */
    @Override
    public void doGet(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        // No implementation yet.
    }
}