Separate distance and keyer registration in dedicated factories
This commit is contained in:
parent
de86a162fa
commit
72d76ee22a
@ -81,7 +81,7 @@ public class BinningClusterer extends Clusterer {
|
|||||||
@JsonProperty("function")
|
@JsonProperty("function")
|
||||||
public void setKeyer(String keyerName) {
|
public void setKeyer(String keyerName) {
|
||||||
_keyerName = keyerName;
|
_keyerName = keyerName;
|
||||||
_keyer = _keyers.get(_keyerName.toLowerCase());
|
_keyer = KeyerFactory.get(_keyerName.toLowerCase());
|
||||||
}
|
}
|
||||||
|
|
||||||
@JsonProperty("function")
|
@JsonProperty("function")
|
||||||
@ -123,21 +123,9 @@ public class BinningClusterer extends Clusterer {
|
|||||||
protected Keyer _keyer;
|
protected Keyer _keyer;
|
||||||
protected BinningParameters _parameters;
|
protected BinningParameters _parameters;
|
||||||
|
|
||||||
static final protected Map<String, Keyer> _keyers = new HashMap<String, Keyer>();
|
|
||||||
|
|
||||||
final static Logger logger = LoggerFactory.getLogger("binning_clusterer");
|
final static Logger logger = LoggerFactory.getLogger("binning_clusterer");
|
||||||
|
|
||||||
List<Map<String,Integer>> _clusters;
|
List<Map<String,Integer>> _clusters;
|
||||||
|
|
||||||
static {
|
|
||||||
_keyers.put("fingerprint", new FingerprintKeyer());
|
|
||||||
_keyers.put("ngram-fingerprint", new NGramFingerprintKeyer());
|
|
||||||
_keyers.put("metaphone", new MetaphoneKeyer());
|
|
||||||
_keyers.put("double-metaphone", new DoubleMetaphoneKeyer());
|
|
||||||
_keyers.put("metaphone3", new Metaphone3Keyer());
|
|
||||||
_keyers.put("soundex", new SoundexKeyer());
|
|
||||||
_keyers.put("cologne-phonetic", new ColognePhoneticKeyer());
|
|
||||||
}
|
|
||||||
|
|
||||||
class BinningRowVisitor implements RowVisitor {
|
class BinningRowVisitor implements RowVisitor {
|
||||||
|
|
||||||
|
@ -0,0 +1,47 @@
|
|||||||
|
package com.google.refine.clustering.binning;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Registry of keyers for clustering.
|
||||||
|
*
|
||||||
|
* @author Antonin Delpeuch
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class KeyerFactory {
|
||||||
|
|
||||||
|
static final protected Map<String, Keyer> _keyers = new HashMap<String, Keyer>();
|
||||||
|
|
||||||
|
static {
|
||||||
|
_keyers.put("fingerprint", new FingerprintKeyer());
|
||||||
|
_keyers.put("ngram-fingerprint", new NGramFingerprintKeyer());
|
||||||
|
_keyers.put("metaphone", new MetaphoneKeyer());
|
||||||
|
_keyers.put("double-metaphone", new DoubleMetaphoneKeyer());
|
||||||
|
_keyers.put("metaphone3", new Metaphone3Keyer());
|
||||||
|
_keyers.put("soundex", new SoundexKeyer());
|
||||||
|
_keyers.put("cologne-phonetic", new ColognePhoneticKeyer());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the keyer registered under a given name, or null if it does not exist.
|
||||||
|
*/
|
||||||
|
public static Keyer get(String name) {
|
||||||
|
return _keyers.get(name);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Registers a keyer under a code name.
|
||||||
|
*/
|
||||||
|
public static void put(String name, Keyer keyer) {
|
||||||
|
_keyers.put(name, keyer);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set of available keyer, by names.
|
||||||
|
*/
|
||||||
|
public static Set<String> getKeyerNames() {
|
||||||
|
return _keyers.keySet();
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,57 @@
|
|||||||
|
package com.google.refine.clustering.knn;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import edu.mit.simile.vicino.distances.BZip2Distance;
|
||||||
|
import edu.mit.simile.vicino.distances.Distance;
|
||||||
|
import edu.mit.simile.vicino.distances.GZipDistance;
|
||||||
|
import edu.mit.simile.vicino.distances.JaccardDistance;
|
||||||
|
import edu.mit.simile.vicino.distances.JaroDistance;
|
||||||
|
import edu.mit.simile.vicino.distances.JaroWinklerDistance;
|
||||||
|
import edu.mit.simile.vicino.distances.JaroWinklerTFIDFDistance;
|
||||||
|
import edu.mit.simile.vicino.distances.LevenshteinDistance;
|
||||||
|
import edu.mit.simile.vicino.distances.PPMDistance;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Registry of distances for kNN clustering.
|
||||||
|
*
|
||||||
|
* @author Antonin Delpeuch
|
||||||
|
*/
|
||||||
|
public class DistanceFactory {
|
||||||
|
|
||||||
|
static final protected Map<String, Distance> _distances = new HashMap<String, Distance>();
|
||||||
|
|
||||||
|
static {
|
||||||
|
_distances.put("levenshtein", new LevenshteinDistance());
|
||||||
|
_distances.put("jaccard", new JaccardDistance());
|
||||||
|
_distances.put("jaro", new JaroDistance());
|
||||||
|
_distances.put("jaro-winkler", new JaroWinklerDistance());
|
||||||
|
_distances.put("jaro-winkler-tfidf", new JaroWinklerTFIDFDistance());
|
||||||
|
_distances.put("gzip", new GZipDistance());
|
||||||
|
_distances.put("bzip2", new BZip2Distance());
|
||||||
|
_distances.put("ppm", new PPMDistance());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the distance registered under this name, or null if it does not exist.
|
||||||
|
*/
|
||||||
|
public static Distance get(String name) {
|
||||||
|
return _distances.get(name);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Registers a new distance under a name.
|
||||||
|
*/
|
||||||
|
public static void put(String name, Distance distance) {
|
||||||
|
_distances.put(name, distance);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Lists the available distances, by name.
|
||||||
|
*/
|
||||||
|
public static Set<String> getDistanceNames() {
|
||||||
|
return _distances.keySet();
|
||||||
|
}
|
||||||
|
}
|
@ -89,7 +89,7 @@ public class kNNClusterer extends Clusterer {
|
|||||||
@JsonProperty("function")
|
@JsonProperty("function")
|
||||||
public void setDistance(String distanceStr) {
|
public void setDistance(String distanceStr) {
|
||||||
_distanceStr = distanceStr;
|
_distanceStr = distanceStr;
|
||||||
_distance = _distances.get(_distanceStr.toLowerCase());
|
_distance = DistanceFactory.get(_distanceStr.toLowerCase());
|
||||||
}
|
}
|
||||||
|
|
||||||
@JsonProperty("function")
|
@JsonProperty("function")
|
||||||
@ -132,25 +132,12 @@ public class kNNClusterer extends Clusterer {
|
|||||||
|
|
||||||
private Distance _distance;
|
private Distance _distance;
|
||||||
private kNNClustererConfigParameters _params;
|
private kNNClustererConfigParameters _params;
|
||||||
|
|
||||||
static final protected Map<String, Distance> _distances = new HashMap<String, Distance>();
|
|
||||||
|
|
||||||
List<Set<Serializable>> _clusters;
|
List<Set<Serializable>> _clusters;
|
||||||
|
|
||||||
Map<Serializable, Integer> _counts = new HashMap<Serializable, Integer>();
|
Map<Serializable, Integer> _counts = new HashMap<Serializable, Integer>();
|
||||||
|
|
||||||
final static Logger logger = LoggerFactory.getLogger("kNN_clusterer");
|
final static Logger logger = LoggerFactory.getLogger("kNN_clusterer");
|
||||||
|
|
||||||
static {
|
|
||||||
_distances.put("levenshtein", new LevenshteinDistance());
|
|
||||||
_distances.put("jaccard", new JaccardDistance());
|
|
||||||
_distances.put("jaro", new JaroDistance());
|
|
||||||
_distances.put("jaro-winkler", new JaroWinklerDistance());
|
|
||||||
_distances.put("jaro-winkler-tfidf", new JaroWinklerTFIDFDistance());
|
|
||||||
_distances.put("gzip", new GZipDistance());
|
|
||||||
_distances.put("bzip2", new BZip2Distance());
|
|
||||||
_distances.put("ppm", new PPMDistance());
|
|
||||||
}
|
|
||||||
|
|
||||||
class VPTreeClusteringRowVisitor implements RowVisitor {
|
class VPTreeClusteringRowVisitor implements RowVisitor {
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user