diff --git a/main/src/com/google/refine/clustering/knn/DistanceFactory.java b/main/src/com/google/refine/clustering/knn/DistanceFactory.java index fb6ff2032..7f7a71f07 100644 --- a/main/src/com/google/refine/clustering/knn/DistanceFactory.java +++ b/main/src/com/google/refine/clustering/knn/DistanceFactory.java @@ -5,7 +5,6 @@ import java.util.Map; import java.util.Set; import edu.mit.simile.vicino.distances.BZip2Distance; -import edu.mit.simile.vicino.distances.Distance; import edu.mit.simile.vicino.distances.GZipDistance; import edu.mit.simile.vicino.distances.JaccardDistance; import edu.mit.simile.vicino.distances.JaroDistance; @@ -21,30 +20,30 @@ import edu.mit.simile.vicino.distances.PPMDistance; */ public class DistanceFactory { - static final protected Map _distances = new HashMap(); + static final protected Map _distances = new HashMap<>(); static { - _distances.put("levenshtein", new LevenshteinDistance()); - _distances.put("jaccard", new JaccardDistance()); - _distances.put("jaro", new JaroDistance()); - _distances.put("jaro-winkler", new JaroWinklerDistance()); - _distances.put("jaro-winkler-tfidf", new JaroWinklerTFIDFDistance()); - _distances.put("gzip", new GZipDistance()); - _distances.put("bzip2", new BZip2Distance()); - _distances.put("ppm", new PPMDistance()); + _distances.put("levenshtein", new VicinoDistance(new LevenshteinDistance())); + _distances.put("jaccard", new VicinoDistance(new JaccardDistance())); + _distances.put("jaro", new VicinoDistance(new JaroDistance())); + _distances.put("jaro-winkler", new VicinoDistance(new JaroWinklerDistance())); + _distances.put("jaro-winkler-tfidf", new VicinoDistance(new JaroWinklerTFIDFDistance())); + _distances.put("gzip", new VicinoDistance(new GZipDistance())); + _distances.put("bzip2", new VicinoDistance(new BZip2Distance())); + _distances.put("ppm", new VicinoDistance(new PPMDistance())); } /** * Returns the distance registered under this name, or null if it does not exist. */ - public static Distance get(String name) { + public static SimilarityDistance get(String name) { return _distances.get(name); } /** * Registers a new distance under a name. */ - public static void put(String name, Distance distance) { + public static void put(String name, SimilarityDistance distance) { _distances.put(name, distance); } diff --git a/main/src/com/google/refine/clustering/knn/SimilarityDistance.java b/main/src/com/google/refine/clustering/knn/SimilarityDistance.java new file mode 100644 index 000000000..00075f036 --- /dev/null +++ b/main/src/com/google/refine/clustering/knn/SimilarityDistance.java @@ -0,0 +1,17 @@ +package com.google.refine.clustering.knn; + +/** + * A function that computes a similarity distance between + * two strings. + * + * This is introduced to provide a clean interface for extensions, independent + * of the library currently used (Simile Vicino, unmaintained since 2010 as of + * December 2018). + */ +public interface SimilarityDistance { + /** + * Compute the distance between two strings. This should return 0 when the + * two arguments are equal, and rise as their differences increase. + */ + public double compute(String a, String b); +} diff --git a/main/src/com/google/refine/clustering/knn/VicinoDistance.java b/main/src/com/google/refine/clustering/knn/VicinoDistance.java new file mode 100644 index 000000000..491d1f979 --- /dev/null +++ b/main/src/com/google/refine/clustering/knn/VicinoDistance.java @@ -0,0 +1,23 @@ +package com.google.refine.clustering.knn; + +import edu.mit.simile.vicino.distances.Distance; + +/** + * Wrapper to expose a similarity function from the Vicino + * library. + * + * @author Antonin Delpeuch + * + */ +public class VicinoDistance implements SimilarityDistance { + private final Distance _d; + + public VicinoDistance(Distance d) { + _d = d; + } + + @Override + public double compute(String a, String b) { + return _d.d(a, b); + } +} diff --git a/main/src/com/google/refine/clustering/knn/kNNClusterer.java b/main/src/com/google/refine/clustering/knn/kNNClusterer.java index 3ab66519a..ae01ee232 100644 --- a/main/src/com/google/refine/clustering/knn/kNNClusterer.java +++ b/main/src/com/google/refine/clustering/knn/kNNClusterer.java @@ -61,15 +61,7 @@ import com.google.refine.model.Row; import edu.mit.simile.vicino.clustering.NGramClusterer; import edu.mit.simile.vicino.clustering.VPTreeClusterer; -import edu.mit.simile.vicino.distances.BZip2Distance; import edu.mit.simile.vicino.distances.Distance; -import edu.mit.simile.vicino.distances.GZipDistance; -import edu.mit.simile.vicino.distances.JaccardDistance; -import edu.mit.simile.vicino.distances.JaroDistance; -import edu.mit.simile.vicino.distances.JaroWinklerDistance; -import edu.mit.simile.vicino.distances.JaroWinklerTFIDFDistance; -import edu.mit.simile.vicino.distances.LevenshteinDistance; -import edu.mit.simile.vicino.distances.PPMDistance; public class kNNClusterer extends Clusterer { @@ -77,12 +69,12 @@ public class kNNClusterer extends Clusterer { @JsonIgnore private String _distanceStr; @JsonIgnore - private Distance _distance; + private SimilarityDistance _distance; @JsonIgnore private kNNClustererConfigParameters _parameters = null; @JsonIgnore - public Distance getDistance() { + public SimilarityDistance getDistance() { return _distance; } @@ -130,7 +122,7 @@ public class kNNClusterer extends Clusterer { public int blockingNgramSize = defaultBlockingNgramSize; } - private Distance _distance; + private SimilarityDistance _distance; private kNNClustererConfigParameters _params; List> _clusters; @@ -180,18 +172,31 @@ public class kNNClusterer extends Clusterer { class BlockingClusteringRowVisitor implements RowVisitor { - Distance _distance; + SimilarityDistance _distance; double _radius = 1.0d; int _blockingNgramSize = 6; HashSet _data; NGramClusterer _clusterer; - public BlockingClusteringRowVisitor(Distance d, kNNClustererConfigParameters params) { - _distance = d; + private class DistanceWrapper extends Distance { + private final SimilarityDistance _d; + + protected DistanceWrapper(SimilarityDistance d) { + _d = d; + } + + @Override + public double d(String arg0, String arg1) { + return _d.compute(arg0, arg1); + } + } + + public BlockingClusteringRowVisitor(SimilarityDistance _distance2, kNNClustererConfigParameters params) { + _distance = _distance2; _data = new HashSet(); _blockingNgramSize = params.blockingNgramSize; _radius = params.radius; - _clusterer = new NGramClusterer(_distance, _blockingNgramSize); + _clusterer = new NGramClusterer(new DistanceWrapper(_distance), _blockingNgramSize); } @Override diff --git a/main/src/com/google/refine/commands/browsing/GetClusteringFunctionsAndDistances.java b/main/src/com/google/refine/commands/browsing/GetClusteringFunctionsAndDistances.java new file mode 100644 index 000000000..1ace8754c --- /dev/null +++ b/main/src/com/google/refine/commands/browsing/GetClusteringFunctionsAndDistances.java @@ -0,0 +1,23 @@ +package com.google.refine.commands.browsing; + + +import java.io.IOException; + +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.refine.commands.Command; + +public class GetClusteringFunctionsAndDistances extends Command { + final static Logger logger = LoggerFactory.getLogger("get-clustering-functions-and-distances_command"); + + @Override + public void doGet(HttpServletRequest request, HttpServletResponse response) + throws ServletException, IOException { + + } +}