From 31954862e8aab5ba82534c4923681ecc7a12ff8a Mon Sep 17 00:00:00 2001
From: Antonin Delpeuch
Date: Wed, 5 Sep 2018 16:49:01 +0100
Subject: [PATCH] Refactor BinningClusterer for JSON serialization

---
Review notes (this section is ignored by git-am):
 * BinningRowVisitor: an ngramSize of 0 is the "not specified" value produced
   by BinningParameters.reconstruct, so it is no longer forwarded to the keyer;
   _params stays null, as it did when the old code caught the JSONException.
 * BinningClustererTests: the project column is now named "values" so that it
   matches the "column" entry of configJson.
 * The type arguments on the _keyers and _map context lines were missing from
   the copy under review and have been re-typed; check them against the target
   file before applying.
 * Leading whitespace was re-typed as well; use --ignore-whitespace if a hunk
   is rejected. Post-image blob ids on the index lines predate the fixes above.

 .../google/refine/clustering/Clusterer.java   |   8 +-
 .../clustering/binning/BinningClusterer.java  | 106 ++++++++++++++++--
 .../clustering/BinningClustererTests.java     |  60 ++++++++++
 3 files changed, 154 insertions(+), 20 deletions(-)
 create mode 100644 main/tests/server/src/com/google/refine/tests/clustering/BinningClustererTests.java

diff --git a/main/src/com/google/refine/clustering/Clusterer.java b/main/src/com/google/refine/clustering/Clusterer.java
index 672977230..a18e42c81 100644
--- a/main/src/com/google/refine/clustering/Clusterer.java
+++ b/main/src/com/google/refine/clustering/Clusterer.java
@@ -33,8 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 package com.google.refine.clustering;
 
-import org.json.JSONObject;
-
 import com.google.refine.Jsonizable;
 import com.google.refine.browsing.Engine;
 import com.google.refine.model.Column;
@@ -44,15 +42,13 @@ public abstract class Clusterer implements Jsonizable {
 
     protected Project _project;
     protected int _colindex;
-    protected JSONObject _config;
 
     public abstract void computeClusters(Engine engine);
 
-    public void initializeFromJSON(Project project, JSONObject o) throws Exception {
+    protected void initializeFromConfig(Project project, ClustererConfig c) {
         _project = project;
-        _config = o;
 
-        String colname = o.getString("column");
+        String colname = c.getColumnName();
         for (Column column : project.columnModel.columns) {
             if (column.getName().equals(colname)) {
                 _colindex = column.getCellIndex();
diff --git a/main/src/com/google/refine/clustering/binning/BinningClusterer.java b/main/src/com/google/refine/clustering/binning/BinningClusterer.java
index 8a796ab18..5d7e00ab9 100644
--- a/main/src/com/google/refine/clustering/binning/BinningClusterer.java
+++ b/main/src/com/google/refine/clustering/binning/BinningClusterer.java
@@ -50,17 +50,90 @@ import org.json.JSONWriter;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.refine.Jsonizable;
 import com.google.refine.browsing.Engine;
 import com.google.refine.browsing.FilteredRows;
 import com.google.refine.browsing.RowVisitor;
 import com.google.refine.clustering.Clusterer;
+import com.google.refine.clustering.ClustererConfig;
 import com.google.refine.model.Cell;
 import com.google.refine.model.Project;
 import com.google.refine.model.Row;
 
 public class BinningClusterer extends Clusterer {
+
+    public static class BinningClustererConfig extends ClustererConfig {
+
+        private String _keyerName;
+        private Keyer _keyer;
+        private BinningParameters _parameters;
+
+        @Override
+        public void initializeFromJSON(JSONObject o) {
+            super.initializeFromJSON(o);
+            _keyerName = o.getString("function");
+            _keyer = _keyers.get(_keyerName.toLowerCase());
+            if(o.has("params")) {
+                _parameters = BinningParameters.reconstruct(o.getJSONObject("params"));
+            } else {
+                _parameters = null;
+            }
+        }
+
+        public Keyer getKeyer() {
+            return _keyer;
+        }
+
+        public BinningParameters getParameters() {
+            return _parameters;
+        }
+
+        @Override
+        public void write(JSONWriter writer, Properties options)
+                throws JSONException {
+            writer.object();
+            writer.key("function"); writer.value(_keyerName);
+            writer.key("type"); writer.value("binning");
+            writer.key("column"); writer.value(getColumnName());
+            if(_parameters != null) {
+                writer.key("params");
+                _parameters.write(writer, options);
+            }
+            writer.endObject();
+        }
 
-    private Keyer _keyer;
+        @Override
+        public BinningClusterer apply(Project project) {
+            BinningClusterer clusterer = new BinningClusterer();
+            clusterer.initializeFromConfig(project, this);
+            return clusterer;
+        }
+
+    }
+
+    public static class BinningParameters implements Jsonizable {
+        public int ngramSize;
+
+        @Override
+        public void write(JSONWriter writer, Properties options)
+                throws JSONException {
+            writer.object();
+            if(ngramSize > 0) {
+                writer.key("ngram-size");
+                writer.value(ngramSize);
+            }
+            writer.endObject();
+        }
+
+        public static BinningParameters reconstruct(JSONObject o) {
+            BinningParameters parameters = new BinningParameters();
+            parameters.ngramSize = o.has("ngram-size") ? o.getInt("ngram-size") : 0;
+            return parameters;
+        }
+    }
+
+    protected Keyer _keyer;
+    protected BinningParameters _parameters;
 
     static final protected Map<String, Keyer> _keyers = new HashMap<String, Keyer>();
 
@@ -82,21 +155,19 @@ public class BinningClusterer extends Clusterer {
 
         Keyer _keyer;
         Object[] _params;
-        JSONObject _config;
+        BinningParameters _parameters;
 
         Map<String,Map<String,Integer>> _map = new HashMap<String,Map<String,Integer>>();
 
-        public BinningRowVisitor(Keyer k, JSONObject o) {
+        public BinningRowVisitor(Keyer k, BinningParameters parameters) {
             _keyer = k;
-            _config = o;
+            _parameters = parameters;
             if (k instanceof NGramFingerprintKeyer) {
-                try {
-                    int size = _config.getJSONObject("params").getInt("ngram-size");
-                    logger.debug("Using ngram size: {}", size);
+                // ngramSize == 0 means "not specified" (see BinningParameters.reconstruct):
+                // keep _params null in that case, as the JSONException path used to.
+                if (_parameters != null && _parameters.ngramSize > 0) {
                     _params = new Object[1];
-                    _params[0] = size;
-                } catch (JSONException e) {
-                    //Refine.warn("No params specified, using default");
+                    _params[0] = _parameters.ngramSize;
                 }
             }
         }
@@ -169,15 +240,22 @@ public class BinningClusterer extends Clusterer {
         }
     }
 
-    @Override
+    @Deprecated
     public void initializeFromJSON(Project project, JSONObject o) throws Exception {
-        super.initializeFromJSON(project, o);
-        _keyer = _keyers.get(o.getString("function").toLowerCase());
+        BinningClustererConfig config = new BinningClustererConfig();
+        config.initializeFromJSON(o);
+        initializeFromConfig(project, config);
+    }
+
+    public void initializeFromConfig(Project project, BinningClustererConfig config) {
+        super.initializeFromConfig(project, config);
+        _keyer = config.getKeyer();
+        _parameters = config.getParameters();
     }
 
     @Override
     public void computeClusters(Engine engine) {
-        BinningRowVisitor visitor = new BinningRowVisitor(_keyer,_config);
+        BinningRowVisitor visitor = new BinningRowVisitor(_keyer,_parameters);
         FilteredRows filteredRows = engine.getAllFilteredRows();
         filteredRows.accept(_project, visitor);
 
diff --git a/main/tests/server/src/com/google/refine/tests/clustering/BinningClustererTests.java b/main/tests/server/src/com/google/refine/tests/clustering/BinningClustererTests.java
new file mode 100644
index 000000000..340496467
--- /dev/null
+++ b/main/tests/server/src/com/google/refine/tests/clustering/BinningClustererTests.java
@@ -0,0 +1,60 @@
+package com.google.refine.tests.clustering;
+
+import org.json.JSONObject;
+import org.testng.annotations.Test;
+
+import com.google.refine.browsing.Engine;
+import com.google.refine.clustering.binning.BinningClusterer;
+import com.google.refine.clustering.binning.BinningClusterer.BinningClustererConfig;
+import com.google.refine.model.Project;
+import com.google.refine.tests.RefineTest;
+import com.google.refine.tests.util.TestUtils;
+
+public class BinningClustererTests extends RefineTest {
+
+    String configJson = "{"
+            + "\"type\":\"binning\","
+            + "\"function\":\"fingerprint\","
+            + "\"column\":\"values\","
+            + "\"params\":{}}";
+
+    String configNgramJson = "{"
+            + "\"type\":\"binning\","
+            + "\"function\":\"ngram-fingerprint\","
+            + "\"column\":\"values\","
+            + "\"params\":{\"ngram-size\":2}}";
+
+    String clustererJson = "["
+            + " [{\"v\":\"a\",\"c\":1},{\"v\":\"à\",\"c\":1}],"
+            + " [{\"v\":\"c\",\"c\":1},{\"v\":\"ĉ\",\"c\":1}]"
+            + "]";
+
+    @Test
+    public void testSerializeBinningClustererConfig() {
+        BinningClustererConfig config = new BinningClustererConfig();
+        config.initializeFromJSON(new JSONObject(configJson));
+        TestUtils.isSerializedTo(config, configJson);
+    }
+
+    @Test
+    public void testSerializeBinningClustererConfigWithNgrams() {
+        BinningClustererConfig config = new BinningClustererConfig();
+        config.initializeFromJSON(new JSONObject(configNgramJson));
+        TestUtils.isSerializedTo(config, configNgramJson);
+    }
+
+    @Test
+    public void testSerializeBinningClusterer() {
+        // The column is named after the "column" entry of configJson.
+        Project project = createCSVProject("values\n"
+                + "a\n"
+                + "à\n"
+                + "c\n"
+                + "ĉ\n");
+        BinningClustererConfig config = new BinningClustererConfig();
+        config.initializeFromJSON(new JSONObject(configJson));
+        BinningClusterer clusterer = config.apply(project);
+        clusterer.computeClusters(new Engine(project));
+        TestUtils.isSerializedTo(clusterer, clustererJson);
+    }
+}