From 31954862e8aab5ba82534c4923681ecc7a12ff8a Mon Sep 17 00:00:00 2001 From: Antonin Delpeuch Date: Wed, 5 Sep 2018 16:49:01 +0100 Subject: [PATCH 1/3] Refactor BinningClusterer for JSON serialization --- .../google/refine/clustering/Clusterer.java | 8 +- .../clustering/binning/BinningClusterer.java | 104 +++++++++++++++--- .../clustering/BinningClustererTests.java | 59 ++++++++++ 3 files changed, 151 insertions(+), 20 deletions(-) create mode 100644 main/tests/server/src/com/google/refine/tests/clustering/BinningClustererTests.java diff --git a/main/src/com/google/refine/clustering/Clusterer.java b/main/src/com/google/refine/clustering/Clusterer.java index 672977230..a18e42c81 100644 --- a/main/src/com/google/refine/clustering/Clusterer.java +++ b/main/src/com/google/refine/clustering/Clusterer.java @@ -33,8 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. package com.google.refine.clustering; -import org.json.JSONObject; - import com.google.refine.Jsonizable; import com.google.refine.browsing.Engine; import com.google.refine.model.Column; @@ -44,15 +42,13 @@ public abstract class Clusterer implements Jsonizable { protected Project _project; protected int _colindex; - protected JSONObject _config; public abstract void computeClusters(Engine engine); - public void initializeFromJSON(Project project, JSONObject o) throws Exception { + protected void initializeFromConfig(Project project, ClustererConfig c) { _project = project; - _config = o; - String colname = o.getString("column"); + String colname = c.getColumnName(); for (Column column : project.columnModel.columns) { if (column.getName().equals(colname)) { _colindex = column.getCellIndex(); diff --git a/main/src/com/google/refine/clustering/binning/BinningClusterer.java b/main/src/com/google/refine/clustering/binning/BinningClusterer.java index 8a796ab18..5d7e00ab9 100644 --- a/main/src/com/google/refine/clustering/binning/BinningClusterer.java +++ b/main/src/com/google/refine/clustering/binning/BinningClusterer.java @@ -50,17 +50,90 @@ import org.json.JSONWriter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.refine.Jsonizable; import com.google.refine.browsing.Engine; import com.google.refine.browsing.FilteredRows; import com.google.refine.browsing.RowVisitor; import com.google.refine.clustering.Clusterer; +import com.google.refine.clustering.ClustererConfig; import com.google.refine.model.Cell; import com.google.refine.model.Project; import com.google.refine.model.Row; public class BinningClusterer extends Clusterer { + + public static class BinningClustererConfig extends ClustererConfig { + + private String _keyerName; + private Keyer _keyer; + private BinningParameters _parameters; + + @Override + public void initializeFromJSON(JSONObject o) { + super.initializeFromJSON(o); + _keyerName = o.getString("function"); + _keyer = _keyers.get(_keyerName.toLowerCase()); + if(o.has("params")) { + _parameters = BinningParameters.reconstruct(o.getJSONObject("params")); + } else { + _parameters = null; + } + } + + public Keyer getKeyer() { + return _keyer; + } + + public BinningParameters getParameters() { + return _parameters; + } + + @Override + public void write(JSONWriter writer, Properties options) + throws JSONException { + writer.object(); + writer.key("function"); writer.value(_keyerName); + writer.key("type"); writer.value("binning"); + writer.key("column"); writer.value(getColumnName()); + if(_parameters != null) { + writer.key("params"); + _parameters.write(writer, options); + } + writer.endObject(); + } - private Keyer _keyer; + @Override + public BinningClusterer apply(Project project) { + BinningClusterer clusterer = new BinningClusterer(); + clusterer.initializeFromConfig(project, this); + return clusterer; + } + + } + + public static class BinningParameters implements Jsonizable { + public int ngramSize; + + @Override + public void write(JSONWriter writer, Properties options) + throws JSONException { + writer.object(); + if(ngramSize > 0) { + writer.key("ngram-size"); + writer.value(ngramSize); + } + writer.endObject(); + } + + public static BinningParameters reconstruct(JSONObject o) { + BinningParameters parameters = new BinningParameters(); + parameters.ngramSize = o.has("ngram-size") ? o.getInt("ngram-size") : 0; + return parameters; + } + } + + protected Keyer _keyer; + protected BinningParameters _parameters; static final protected Map _keyers = new HashMap(); @@ -82,21 +155,17 @@ public class BinningClusterer extends Clusterer { Keyer _keyer; Object[] _params; - JSONObject _config; + BinningParameters _parameters; Map> _map = new HashMap>(); - public BinningRowVisitor(Keyer k, JSONObject o) { + public BinningRowVisitor(Keyer k, BinningParameters parameters) { _keyer = k; - _config = o; + _parameters = parameters; if (k instanceof NGramFingerprintKeyer) { - try { - int size = _config.getJSONObject("params").getInt("ngram-size"); - logger.debug("Using ngram size: {}", size); + if(_parameters != null) { _params = new Object[1]; - _params[0] = size; - } catch (JSONException e) { - //Refine.warn("No params specified, using default"); + _params[0] = _parameters.ngramSize; } } } @@ -169,15 +238,22 @@ public class BinningClusterer extends Clusterer { } } - @Override + @Deprecated public void initializeFromJSON(Project project, JSONObject o) throws Exception { - super.initializeFromJSON(project, o); - _keyer = _keyers.get(o.getString("function").toLowerCase()); + BinningClustererConfig config = new BinningClustererConfig(); + config.initializeFromJSON(o); + initializeFromConfig(project, config); + } + + public void initializeFromConfig(Project project, BinningClustererConfig config) { + super.initializeFromConfig(project, config); + _keyer = config.getKeyer(); + _parameters = config.getParameters(); } @Override public void computeClusters(Engine engine) { - BinningRowVisitor visitor = new BinningRowVisitor(_keyer,_config); + BinningRowVisitor visitor = new BinningRowVisitor(_keyer,_parameters); FilteredRows filteredRows = engine.getAllFilteredRows(); filteredRows.accept(_project, visitor); diff --git a/main/tests/server/src/com/google/refine/tests/clustering/BinningClustererTests.java b/main/tests/server/src/com/google/refine/tests/clustering/BinningClustererTests.java new file mode 100644 index 000000000..340496467 --- /dev/null +++ b/main/tests/server/src/com/google/refine/tests/clustering/BinningClustererTests.java @@ -0,0 +1,59 @@ +package com.google.refine.tests.clustering; + +import org.json.JSONObject; +import org.testng.annotations.Test; + +import com.google.refine.browsing.Engine; +import com.google.refine.clustering.binning.BinningClusterer; +import com.google.refine.clustering.binning.BinningClusterer.BinningClustererConfig; +import com.google.refine.model.Project; +import com.google.refine.tests.RefineTest; +import com.google.refine.tests.util.TestUtils; + +public class BinningClustererTests extends RefineTest { + + String configJson = "{" + + "\"type\":\"binning\"," + + "\"function\":\"fingerprint\"," + + "\"column\":\"values\"," + + "\"params\":{}}"; + + String configNgramJson = "{" + + "\"type\":\"binning\"," + + "\"function\":\"ngram-fingerprint\"," + + "\"column\":\"values\"," + + "\"params\":{\"ngram-size\":2}}"; + + String clustererJson = "[" + + " [{\"v\":\"a\",\"c\":1},{\"v\":\"à\",\"c\":1}]," + + " [{\"v\":\"c\",\"c\":1},{\"v\":\"ĉ\",\"c\":1}]" + + "]"; + + @Test + public void testSerializeBinningClustererConfig() { + BinningClustererConfig config = new BinningClustererConfig(); + config.initializeFromJSON(new JSONObject(configJson)); + TestUtils.isSerializedTo(config, configJson); + } + + @Test + public void testSerializeBinningClustererConfigWithNgrams() { + BinningClustererConfig config = new BinningClustererConfig(); + config.initializeFromJSON(new JSONObject(configNgramJson)); + TestUtils.isSerializedTo(config, configNgramJson); + } + + @Test + public void testSerializeBinningClusterer() { + Project project = createCSVProject("column\n" + + "a\n" + + "à\n" + + "c\n" + + "ĉ\n"); + BinningClustererConfig config = new BinningClustererConfig(); + config.initializeFromJSON(new JSONObject(configJson)); + BinningClusterer clusterer = config.apply(project); + clusterer.computeClusters(new Engine(project)); + TestUtils.isSerializedTo(clusterer, clustererJson); + } +} From c9436f563d45e8495971ce352828de187e52581a Mon Sep 17 00:00:00 2001 From: Antonin Delpeuch Date: Wed, 5 Sep 2018 17:39:31 +0100 Subject: [PATCH 2/3] Refactor kNNClusterer for serialization --- .../refine/clustering/knn/kNNClusterer.java | 123 ++++++++++++++---- .../tests/clustering/kNNClustererTests.java | 47 +++++++ 2 files changed, 143 insertions(+), 27 deletions(-) create mode 100644 main/tests/server/src/com/google/refine/tests/clustering/kNNClustererTests.java diff --git a/main/src/com/google/refine/clustering/knn/kNNClusterer.java b/main/src/com/google/refine/clustering/knn/kNNClusterer.java index 9c4499c57..22ab1a3ff 100644 --- a/main/src/com/google/refine/clustering/knn/kNNClusterer.java +++ b/main/src/com/google/refine/clustering/knn/kNNClusterer.java @@ -51,10 +51,12 @@ import org.json.JSONWriter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.refine.Jsonizable; import com.google.refine.browsing.Engine; import com.google.refine.browsing.FilteredRows; import com.google.refine.browsing.RowVisitor; import com.google.refine.clustering.Clusterer; +import com.google.refine.clustering.ClustererConfig; import com.google.refine.model.Cell; import com.google.refine.model.Project; import com.google.refine.model.Row; @@ -72,8 +74,84 @@ import edu.mit.simile.vicino.distances.LevenshteinDistance; import edu.mit.simile.vicino.distances.PPMDistance; public class kNNClusterer extends Clusterer { + + public static class kNNClustererConfig extends ClustererConfig { + private String _distanceStr; + private Distance _distance; + private kNNClustererConfigParameters _parameters; + + @Override + public void write(JSONWriter writer, Properties options) + throws JSONException { + writer.object(); + writer.key("function"); writer.value(_distanceStr); + writer.key("type"); writer.value("knn"); + writer.key("column"); writer.value(getColumnName()); + if(_parameters != null) { + writer.key("params"); + _parameters.write(writer, options); + } + writer.endObject(); + } + + public void initializeFromJSON(JSONObject o) { + super.initializeFromJSON(o); + _distanceStr = o.getString("function"); + _distance = _distances.get(_distanceStr.toLowerCase()); + if(o.has("params")) { + _parameters = kNNClustererConfigParameters.reconstruct(o.getJSONObject("params")); + } else { + _parameters = null; + } + } + + public Distance getDistance() { + return _distance; + } + + public kNNClustererConfigParameters getParameters() { + return _parameters; + } + + @Override + public kNNClusterer apply(Project project) { + kNNClusterer clusterer = new kNNClusterer(); + clusterer.initializeFromConfig(project, this); + return clusterer; + } + + } + + public static class kNNClustererConfigParameters implements Jsonizable { + public static final double defaultRadius = 1.0d; + public static final int defaultBlockingNgramSize = 6; + public double radius = defaultRadius; + public int blockingNgramSize = defaultBlockingNgramSize; + + @Override + public void write(JSONWriter writer, Properties options) + throws JSONException { + writer.object(); + writer.key("radius"); writer.value(radius); + writer.key("blocking-ngram-size"); + writer.value(blockingNgramSize); + writer.endObject(); + } + + public static kNNClustererConfigParameters reconstruct(JSONObject o) { + kNNClustererConfigParameters params = new kNNClustererConfigParameters(); + if(o.has("radius")) { + params.radius = o.getDouble("radius"); + } + if(o.has("blocking-ngram-size")) { + params.blockingNgramSize = o.getInt("blocking-ngram-size"); + } + return params; + } + } private Distance _distance; + private kNNClustererConfigParameters _params; static final protected Map _distances = new HashMap(); @@ -97,20 +175,13 @@ public class kNNClusterer extends Clusterer { class VPTreeClusteringRowVisitor implements RowVisitor { Distance _distance; - JSONObject _config; + kNNClustererConfigParameters _params; VPTreeClusterer _clusterer; - double _radius = 1.0f; - public VPTreeClusteringRowVisitor(Distance d, JSONObject o) { + public VPTreeClusteringRowVisitor(Distance d, kNNClustererConfigParameters params) { _distance = d; - _config = o; _clusterer = new VPTreeClusterer(_distance); - try { - JSONObject params = o.getJSONObject("params"); - _radius = params.getDouble("radius"); - } catch (JSONException e) { - //Refine.warn("No parameters found, using defaults"); - } + _params = params; } @Override @@ -136,32 +207,23 @@ public class kNNClusterer extends Clusterer { } public List> getClusters() { - return _clusterer.getClusters(_radius); + return _clusterer.getClusters(_params.radius); } } class BlockingClusteringRowVisitor implements RowVisitor { Distance _distance; - JSONObject _config; double _radius = 1.0d; int _blockingNgramSize = 6; HashSet _data; NGramClusterer _clusterer; - public BlockingClusteringRowVisitor(Distance d, JSONObject o) { + public BlockingClusteringRowVisitor(Distance d, kNNClustererConfigParameters params) { _distance = d; - _config = o; _data = new HashSet(); - try { - JSONObject params = o.getJSONObject("params"); - _radius = params.getDouble("radius"); - logger.debug("Use radius: {}", _radius); - _blockingNgramSize = params.getInt("blocking-ngram-size"); - logger.debug("Use blocking ngram size: {}",_blockingNgramSize); - } catch (JSONException e) { - logger.debug("No parameters found, using defaults"); - } + _blockingNgramSize = params.blockingNgramSize; + _radius = params.radius; _clusterer = new NGramClusterer(_distance, _blockingNgramSize); } @@ -192,16 +254,23 @@ public class kNNClusterer extends Clusterer { } } - @Override + @Deprecated public void initializeFromJSON(Project project, JSONObject o) throws Exception { - super.initializeFromJSON(project, o); - _distance = _distances.get(o.getString("function").toLowerCase()); + kNNClustererConfig config = new kNNClustererConfig(); + config.initializeFromJSON(o); + initializeFromConfig(project, config); + } + + public void initializeFromConfig(Project project, kNNClustererConfig config) { + super.initializeFromConfig(project, config); + _distance = config.getDistance(); + _params = config.getParameters(); } @Override public void computeClusters(Engine engine) { //VPTreeClusteringRowVisitor visitor = new VPTreeClusteringRowVisitor(_distance,_config); - BlockingClusteringRowVisitor visitor = new BlockingClusteringRowVisitor(_distance,_config); + BlockingClusteringRowVisitor visitor = new BlockingClusteringRowVisitor(_distance,_params); FilteredRows filteredRows = engine.getAllFilteredRows(); filteredRows.accept(_project, visitor); diff --git a/main/tests/server/src/com/google/refine/tests/clustering/kNNClustererTests.java b/main/tests/server/src/com/google/refine/tests/clustering/kNNClustererTests.java new file mode 100644 index 000000000..6990fd5e9 --- /dev/null +++ b/main/tests/server/src/com/google/refine/tests/clustering/kNNClustererTests.java @@ -0,0 +1,47 @@ +package com.google.refine.tests.clustering; + +import org.json.JSONObject; +import org.testng.annotations.Test; + +import com.google.refine.browsing.Engine; +import com.google.refine.clustering.knn.kNNClusterer; +import com.google.refine.clustering.knn.kNNClusterer.kNNClustererConfig; +import com.google.refine.model.Project; +import com.google.refine.tests.RefineTest; +import com.google.refine.tests.util.TestUtils; + +public class kNNClustererTests extends RefineTest { + + public static String configJson = "{" + + "\"type\":\"knn\"," + + "\"function\":\"PPM\"," + + "\"column\":\"values\"," + + "\"params\":{\"radius\":1,\"blocking-ngram-size\":2}" + + "}"; + public static String clustererJson = "[" + + " [{\"v\":\"ab\",\"c\":1},{\"v\":\"abc\",\"c\":1}]" + + "]"; + + @Test + public void serializekNNClustererConfig() { + kNNClustererConfig config = new kNNClustererConfig(); + config.initializeFromJSON(new JSONObject(configJson)); + TestUtils.isSerializedTo(config, configJson); + } + + @Test + public void serializekNNClusterer() { + Project project = createCSVProject("column\n" + + "ab\n" + + "abc\n" + + "c\n" + + "ĉ\n"); + + kNNClustererConfig config = new kNNClustererConfig(); + config.initializeFromJSON(new JSONObject(configJson)); + kNNClusterer clusterer = config.apply(project); + clusterer.computeClusters(new Engine(project)); + + TestUtils.isSerializedTo(clusterer, clustererJson); + } +} From 9f964af7d46bfc39ffca060e5f30061117677c3f Mon Sep 17 00:00:00 2001 From: Antonin Delpeuch Date: Wed, 5 Sep 2018 17:52:05 +0100 Subject: [PATCH 3/3] Migrate ComputeClustersCommand to use new architecture --- .../refine/clustering/ClustererConfig.java | 35 +++++++++++++++++++ .../clustering/binning/BinningClusterer.java | 7 ---- .../refine/clustering/knn/kNNClusterer.java | 7 ---- .../browsing/ComputeClustersCommand.java | 14 ++++---- 4 files changed, 43 insertions(+), 20 deletions(-) create mode 100644 main/src/com/google/refine/clustering/ClustererConfig.java diff --git a/main/src/com/google/refine/clustering/ClustererConfig.java b/main/src/com/google/refine/clustering/ClustererConfig.java new file mode 100644 index 000000000..d7f5d4842 --- /dev/null +++ b/main/src/com/google/refine/clustering/ClustererConfig.java @@ -0,0 +1,35 @@ +package com.google.refine.clustering; + +import org.json.JSONObject; + +import com.google.refine.Jsonizable; +import com.google.refine.model.Project; + +/** + * Represents the configuration data for a clusterer. + * @author Antonin Delpeuch + * + */ +public abstract class ClustererConfig implements Jsonizable { + + protected String columnName; + + /** + * Reads the configuration from a JSON payload (TODO: delete) + * @param o + */ + public void initializeFromJSON(JSONObject o) { + columnName = o.getString("column"); + } + + public String getColumnName() { + return columnName; + } + + /** + * Instantiate the configuration on a particular project. + * @param project + * @return + */ + public abstract Clusterer apply(Project project); +} diff --git a/main/src/com/google/refine/clustering/binning/BinningClusterer.java b/main/src/com/google/refine/clustering/binning/BinningClusterer.java index 5d7e00ab9..1da042dbb 100644 --- a/main/src/com/google/refine/clustering/binning/BinningClusterer.java +++ b/main/src/com/google/refine/clustering/binning/BinningClusterer.java @@ -238,13 +238,6 @@ public class BinningClusterer extends Clusterer { } } - @Deprecated - public void initializeFromJSON(Project project, JSONObject o) throws Exception { - BinningClustererConfig config = new BinningClustererConfig(); - config.initializeFromJSON(o); - initializeFromConfig(project, config); - } - public void initializeFromConfig(Project project, BinningClustererConfig config) { super.initializeFromConfig(project, config); _keyer = config.getKeyer(); diff --git a/main/src/com/google/refine/clustering/knn/kNNClusterer.java b/main/src/com/google/refine/clustering/knn/kNNClusterer.java index 22ab1a3ff..d80f05f7c 100644 --- a/main/src/com/google/refine/clustering/knn/kNNClusterer.java +++ b/main/src/com/google/refine/clustering/knn/kNNClusterer.java @@ -253,13 +253,6 @@ public class kNNClusterer extends Clusterer { return _clusterer.getClusters(_radius); } } - - @Deprecated - public void initializeFromJSON(Project project, JSONObject o) throws Exception { - kNNClustererConfig config = new kNNClustererConfig(); - config.initializeFromJSON(o); - initializeFromConfig(project, config); - } public void initializeFromConfig(Project project, kNNClustererConfig config) { super.initializeFromConfig(project, config); diff --git a/main/src/com/google/refine/commands/browsing/ComputeClustersCommand.java b/main/src/com/google/refine/commands/browsing/ComputeClustersCommand.java index 0064db9a4..1c80c1863 100644 --- a/main/src/com/google/refine/commands/browsing/ComputeClustersCommand.java +++ b/main/src/com/google/refine/commands/browsing/ComputeClustersCommand.java @@ -45,8 +45,9 @@ import org.slf4j.LoggerFactory; import com.google.refine.browsing.Engine; import com.google.refine.clustering.Clusterer; -import com.google.refine.clustering.binning.BinningClusterer; -import com.google.refine.clustering.knn.kNNClusterer; +import com.google.refine.clustering.ClustererConfig; +import com.google.refine.clustering.binning.BinningClusterer.BinningClustererConfig; +import com.google.refine.clustering.knn.kNNClusterer.kNNClustererConfig; import com.google.refine.commands.Command; import com.google.refine.model.Project; @@ -64,16 +65,17 @@ public class ComputeClustersCommand extends Command { Engine engine = getEngine(request, project); JSONObject clusterer_conf = getJsonParameter(request,"clusterer"); - Clusterer clusterer = null; String type = clusterer_conf.has("type") ? clusterer_conf.getString("type") : "binning"; + ClustererConfig clustererConfig = null; if ("knn".equals(type)) { - clusterer = new kNNClusterer(); + clustererConfig = new kNNClustererConfig(); } else { - clusterer = new BinningClusterer(); + clustererConfig = new BinningClustererConfig(); } - clusterer.initializeFromJSON(project, clusterer_conf); + clustererConfig.initializeFromJSON(clusterer_conf); + Clusterer clusterer = clustererConfig.apply(project); clusterer.computeClusters(engine);