Separate distance and keyer registration into dedicated factories

This commit is contained in:
Antonin Delpeuch 2018-12-11 09:27:04 +00:00
parent de86a162fa
commit 72d76ee22a
4 changed files with 106 additions and 27 deletions

View File

@ -81,7 +81,7 @@ public class BinningClusterer extends Clusterer {
/**
 * Stores the requested keying function name and resolves the keyer registered under it.
 * The name is lowercased before the lookup, so a null name throws a NullPointerException.
 *
 * @param keyerName name of the keying function, bound to the "function" JSON property
 */
@JsonProperty("function")
public void setKeyer(String keyerName) {
_keyerName = keyerName;
// NOTE(review): the two assignments below look like the removed and added sides of this
// diff hunk (change markers lost in rendering); only the KeyerFactory lookup is expected
// to remain after the commit - confirm against the actual file.
_keyer = _keyers.get(_keyerName.toLowerCase());
_keyer = KeyerFactory.get(_keyerName.toLowerCase());
}
@JsonProperty("function")
@ -123,22 +123,10 @@ public class BinningClusterer extends Clusterer {
protected Keyer _keyer;
protected BinningParameters _parameters;
// Name-to-keyer table filled by the static initializer below.
// NOTE(review): the same seven entries are registered by KeyerFactory; this map and its
// initializer appear to be the lines removed by this commit - confirm against the actual file.
static final protected Map<String, Keyer> _keyers = new HashMap<String, Keyer>();
final static Logger logger = LoggerFactory.getLogger("binning_clusterer");
List<Map<String,Integer>> _clusters;
// Registers the built-in keyers under their lower-case code names.
static {
_keyers.put("fingerprint", new FingerprintKeyer());
_keyers.put("ngram-fingerprint", new NGramFingerprintKeyer());
_keyers.put("metaphone", new MetaphoneKeyer());
_keyers.put("double-metaphone", new DoubleMetaphoneKeyer());
_keyers.put("metaphone3", new Metaphone3Keyer());
_keyers.put("soundex", new SoundexKeyer());
_keyers.put("cologne-phonetic", new ColognePhoneticKeyer());
}
class BinningRowVisitor implements RowVisitor {
Keyer _keyer;

View File

@ -0,0 +1,47 @@
package com.google.refine.clustering.binning;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
/**
 * Registry of keyers for clustering.
 * <p>
 * Keyers are held in a static map under a code name. The built-in keyers are
 * registered when this class is initialized; further keyers can be added with
 * {@link #put(String, Keyer)}. The backing map is a plain {@link HashMap} and
 * is not synchronized.
 *
 * @author Antonin Delpeuch
 *
 */
public class KeyerFactory {

    static final protected Map<String, Keyer> _keyers = new HashMap<String, Keyer>();

    static {
        _keyers.put("fingerprint", new FingerprintKeyer());
        _keyers.put("ngram-fingerprint", new NGramFingerprintKeyer());
        _keyers.put("metaphone", new MetaphoneKeyer());
        _keyers.put("double-metaphone", new DoubleMetaphoneKeyer());
        _keyers.put("metaphone3", new Metaphone3Keyer());
        _keyers.put("soundex", new SoundexKeyer());
        _keyers.put("cologne-phonetic", new ColognePhoneticKeyer());
    }

    /**
     * Returns the keyer registered under a given name, or null if it does not exist.
     *
     * @param name
     *            the code name to look up; matched exactly as given (case-sensitive)
     * @return the registered keyer, or null when nothing is registered under that name
     */
    public static Keyer get(String name) {
        return _keyers.get(name);
    }

    /**
     * Registers a keyer under a code name, replacing any keyer previously
     * registered under the same name.
     * <p>
     * BinningClusterer lowercases the requested function name before calling
     * {@link #get(String)}, so a keyer should be registered under a lower-case
     * name to be selectable from there.
     *
     * @param name
     *            the code name of the keyer, must not be null
     * @param keyer
     *            the keyer to register, must not be null
     * @throws NullPointerException
     *             if the name or the keyer is null
     */
    public static void put(String name, Keyer keyer) {
        // A null keyer would make the name show up as available while get() returns null for it.
        java.util.Objects.requireNonNull(name, "name");
        java.util.Objects.requireNonNull(keyer, "keyer");
        _keyers.put(name, keyer);
    }

    /**
     * Set of available keyers, by name.
     *
     * @return a read-only view of the registered names; it reflects later
     *         registrations but cannot be used to alter the registry
     */
    public static Set<String> getKeyerNames() {
        // Wrap the key set so callers cannot remove registrations through it.
        return java.util.Collections.unmodifiableSet(_keyers.keySet());
    }
}

View File

@ -0,0 +1,57 @@
package com.google.refine.clustering.knn;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import edu.mit.simile.vicino.distances.BZip2Distance;
import edu.mit.simile.vicino.distances.Distance;
import edu.mit.simile.vicino.distances.GZipDistance;
import edu.mit.simile.vicino.distances.JaccardDistance;
import edu.mit.simile.vicino.distances.JaroDistance;
import edu.mit.simile.vicino.distances.JaroWinklerDistance;
import edu.mit.simile.vicino.distances.JaroWinklerTFIDFDistance;
import edu.mit.simile.vicino.distances.LevenshteinDistance;
import edu.mit.simile.vicino.distances.PPMDistance;
/**
 * Registry of distances for kNN clustering.
 * <p>
 * Distances are held in a static map under a name. The built-in distances are
 * registered when this class is initialized; further distances can be added
 * with {@link #put(String, Distance)}. The backing map is a plain
 * {@link HashMap} and is not synchronized.
 *
 * @author Antonin Delpeuch
 */
public class DistanceFactory {

    static final protected Map<String, Distance> _distances = new HashMap<String, Distance>();

    static {
        _distances.put("levenshtein", new LevenshteinDistance());
        _distances.put("jaccard", new JaccardDistance());
        _distances.put("jaro", new JaroDistance());
        _distances.put("jaro-winkler", new JaroWinklerDistance());
        _distances.put("jaro-winkler-tfidf", new JaroWinklerTFIDFDistance());
        _distances.put("gzip", new GZipDistance());
        _distances.put("bzip2", new BZip2Distance());
        _distances.put("ppm", new PPMDistance());
    }

    /**
     * Returns the distance registered under this name, or null if it does not exist.
     *
     * @param name
     *            the name to look up; matched exactly as given (case-sensitive)
     * @return the registered distance, or null when nothing is registered under that name
     */
    public static Distance get(String name) {
        return _distances.get(name);
    }

    /**
     * Registers a new distance under a name, replacing any distance previously
     * registered under the same name.
     * <p>
     * kNNClusterer lowercases the requested function name before calling
     * {@link #get(String)}, so a distance should be registered under a
     * lower-case name to be selectable from there.
     *
     * @param name
     *            the name of the distance, must not be null
     * @param distance
     *            the distance to register, must not be null
     * @throws NullPointerException
     *             if the name or the distance is null
     */
    public static void put(String name, Distance distance) {
        // A null distance would make the name show up as available while get() returns null for it.
        java.util.Objects.requireNonNull(name, "name");
        java.util.Objects.requireNonNull(distance, "distance");
        _distances.put(name, distance);
    }

    /**
     * Lists the available distances, by name.
     *
     * @return a read-only view of the registered names; it reflects later
     *         registrations but cannot be used to alter the registry
     */
    public static Set<String> getDistanceNames() {
        // Wrap the key set so callers cannot remove registrations through it.
        return java.util.Collections.unmodifiableSet(_distances.keySet());
    }
}

View File

@ -89,7 +89,7 @@ public class kNNClusterer extends Clusterer {
/**
 * Stores the requested distance function name and resolves the distance registered under it.
 * The name is lowercased before the lookup, so a null name throws a NullPointerException.
 *
 * @param distanceStr name of the distance function, bound to the "function" JSON property
 */
@JsonProperty("function")
public void setDistance(String distanceStr) {
_distanceStr = distanceStr;
// NOTE(review): the two assignments below look like the removed and added sides of this
// diff hunk (change markers lost in rendering); only the DistanceFactory lookup is expected
// to remain after the commit - confirm against the actual file.
_distance = _distances.get(_distanceStr.toLowerCase());
_distance = DistanceFactory.get(_distanceStr.toLowerCase());
}
@JsonProperty("function")
@ -133,25 +133,12 @@ public class kNNClusterer extends Clusterer {
private Distance _distance;
private kNNClustererConfigParameters _params;
// Name-to-distance table filled by the static initializer below.
// NOTE(review): the same eight entries are registered by DistanceFactory; this map and its
// initializer appear to be the lines removed by this commit - confirm against the actual file.
static final protected Map<String, Distance> _distances = new HashMap<String, Distance>();
List<Set<Serializable>> _clusters;
Map<Serializable, Integer> _counts = new HashMap<Serializable, Integer>();
final static Logger logger = LoggerFactory.getLogger("kNN_clusterer");
// Registers the built-in distances under their lower-case names.
static {
_distances.put("levenshtein", new LevenshteinDistance());
_distances.put("jaccard", new JaccardDistance());
_distances.put("jaro", new JaroDistance());
_distances.put("jaro-winkler", new JaroWinklerDistance());
_distances.put("jaro-winkler-tfidf", new JaroWinklerTFIDFDistance());
_distances.put("gzip", new GZipDistance());
_distances.put("bzip2", new BZip2Distance());
_distances.put("ppm", new PPMDistance());
}
class VPTreeClusteringRowVisitor implements RowVisitor {
Distance _distance;