Merge pull request #1730 from OpenRefine/json-clustering

Refactor clustering serialization in JSON
This commit is contained in:
Antonin Delpeuch 2018-09-16 18:52:11 +01:00 committed by GitHub
commit ec3e9ab1e9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 326 additions and 56 deletions

View File

@ -33,8 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package com.google.refine.clustering; package com.google.refine.clustering;
import org.json.JSONObject;
import com.google.refine.Jsonizable; import com.google.refine.Jsonizable;
import com.google.refine.browsing.Engine; import com.google.refine.browsing.Engine;
import com.google.refine.model.Column; import com.google.refine.model.Column;
@ -44,15 +42,13 @@ public abstract class Clusterer implements Jsonizable {
protected Project _project; protected Project _project;
protected int _colindex; protected int _colindex;
protected JSONObject _config;
public abstract void computeClusters(Engine engine); public abstract void computeClusters(Engine engine);
public void initializeFromJSON(Project project, JSONObject o) throws Exception { protected void initializeFromConfig(Project project, ClustererConfig c) {
_project = project; _project = project;
_config = o;
String colname = o.getString("column"); String colname = c.getColumnName();
for (Column column : project.columnModel.columns) { for (Column column : project.columnModel.columns) {
if (column.getName().equals(colname)) { if (column.getName().equals(colname)) {
_colindex = column.getCellIndex(); _colindex = column.getCellIndex();

View File

@ -0,0 +1,35 @@
package com.google.refine.clustering;
import org.json.JSONObject;
import com.google.refine.Jsonizable;
import com.google.refine.model.Project;
/**
 * Base class for the serializable settings of a clusterer. It holds the name
 * of the column to cluster; concrete subclasses add their own settings and
 * know how to build the matching {@link Clusterer}.
 *
 * @author Antonin Delpeuch
 */
public abstract class ClustererConfig implements Jsonizable {

    /** Name of the column whose values are clustered, as read from the "column" key. */
    protected String columnName;

    /**
     * Populates this configuration from a JSON payload (TODO: delete).
     * The base implementation reads the "column" key only.
     *
     * @param o the JSON object holding the clusterer configuration
     */
    public void initializeFromJSON(JSONObject o) {
        this.columnName = o.getString("column");
    }

    /**
     * @return the column name read by {@link #initializeFromJSON(JSONObject)},
     *         or null if that method has not been called yet
     */
    public String getColumnName() {
        return this.columnName;
    }

    /**
     * Instantiates this configuration on a particular project.
     *
     * @param project the project the clusterer will be bound to
     * @return a clusterer for the given project
     */
    public abstract Clusterer apply(Project project);
}

View File

@ -50,17 +50,90 @@ import org.json.JSONWriter;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.google.refine.Jsonizable;
import com.google.refine.browsing.Engine; import com.google.refine.browsing.Engine;
import com.google.refine.browsing.FilteredRows; import com.google.refine.browsing.FilteredRows;
import com.google.refine.browsing.RowVisitor; import com.google.refine.browsing.RowVisitor;
import com.google.refine.clustering.Clusterer; import com.google.refine.clustering.Clusterer;
import com.google.refine.clustering.ClustererConfig;
import com.google.refine.model.Cell; import com.google.refine.model.Cell;
import com.google.refine.model.Project; import com.google.refine.model.Project;
import com.google.refine.model.Row; import com.google.refine.model.Row;
public class BinningClusterer extends Clusterer { public class BinningClusterer extends Clusterer {
private Keyer _keyer; public static class BinningClustererConfig extends ClustererConfig {
private String _keyerName;
private Keyer _keyer;
private BinningParameters _parameters;
/**
 * Reads the binning settings from JSON: the base-class fields, the keying
 * function under "function" (looked up in the keyer registry by its
 * lower-cased name), and the optional "params" object.
 *
 * @param o the JSON object holding the clusterer configuration
 */
@Override
public void initializeFromJSON(JSONObject o) {
    super.initializeFromJSON(o);
    _keyerName = o.getString("function");
    _keyer = _keyers.get(_keyerName.toLowerCase());
    // "params" is optional; its absence is recorded as null so that
    // serialization can leave the key out again.
    _parameters = o.has("params")
            ? BinningParameters.reconstruct(o.getJSONObject("params"))
            : null;
}
/**
 * @return the keyer resolved from the configured function name; null if the
 *         registry lookup found no keyer under that name
 */
public Keyer getKeyer() {
    return this._keyer;
}
/**
 * @return the optional binning parameters, or null when the configuration
 *         carried no "params" object
 */
public BinningParameters getParameters() {
    return this._parameters;
}
/**
 * Serializes this configuration as a JSON object with the keys "function",
 * "type" (always "binning"), "column" and, only when parameters are set,
 * "params".
 *
 * @param writer  the JSON writer to emit to
 * @param options serialization options, forwarded to the parameters
 */
@Override
public void write(JSONWriter writer, Properties options)
        throws JSONException {
    writer.object();
    writer.key("function").value(_keyerName);
    writer.key("type").value("binning");
    writer.key("column").value(getColumnName());
    if (_parameters != null) {
        writer.key("params");
        _parameters.write(writer, options);
    }
    writer.endObject();
}
/**
 * Builds a new binning clusterer and initializes it from this configuration
 * for the given project.
 *
 * @param project the project to cluster
 * @return the freshly initialized clusterer
 */
@Override
public BinningClusterer apply(Project project) {
    BinningClusterer result = new BinningClusterer();
    result.initializeFromConfig(project, this);
    return result;
}
}
/**
 * Optional parameters of a binning clusterer, serialized under the "params"
 * key of the clusterer configuration.
 */
public static class BinningParameters implements Jsonizable {

    /** N-gram size; a value of 0 or less is left out of the JSON output. */
    public int ngramSize;

    /**
     * Writes the parameters as a JSON object, emitting "ngram-size" only
     * when it is strictly positive.
     */
    @Override
    public void write(JSONWriter writer, Properties options)
            throws JSONException {
        writer.object();
        if (ngramSize > 0) {
            writer.key("ngram-size").value(ngramSize);
        }
        writer.endObject();
    }

    /**
     * Rebuilds the parameters from their JSON form.
     *
     * @param o the JSON object found under "params"
     * @return the parameters, with {@code ngramSize} set to 0 when the
     *         "ngram-size" key is absent
     */
    public static BinningParameters reconstruct(JSONObject o) {
        BinningParameters result = new BinningParameters();
        if (o.has("ngram-size")) {
            result.ngramSize = o.getInt("ngram-size");
        } else {
            result.ngramSize = 0;
        }
        return result;
    }
}
protected Keyer _keyer;
protected BinningParameters _parameters;
static final protected Map<String, Keyer> _keyers = new HashMap<String, Keyer>(); static final protected Map<String, Keyer> _keyers = new HashMap<String, Keyer>();
@ -82,21 +155,17 @@ public class BinningClusterer extends Clusterer {
Keyer _keyer; Keyer _keyer;
Object[] _params; Object[] _params;
JSONObject _config; BinningParameters _parameters;
Map<String,Map<String,Integer>> _map = new HashMap<String,Map<String,Integer>>(); Map<String,Map<String,Integer>> _map = new HashMap<String,Map<String,Integer>>();
public BinningRowVisitor(Keyer k, JSONObject o) { public BinningRowVisitor(Keyer k, BinningParameters parameters) {
_keyer = k; _keyer = k;
_config = o; _parameters = parameters;
if (k instanceof NGramFingerprintKeyer) { if (k instanceof NGramFingerprintKeyer) {
try { if(_parameters != null) {
int size = _config.getJSONObject("params").getInt("ngram-size");
logger.debug("Using ngram size: {}", size);
_params = new Object[1]; _params = new Object[1];
_params[0] = size; _params[0] = _parameters.ngramSize;
} catch (JSONException e) {
//Refine.warn("No params specified, using default");
} }
} }
} }
@ -169,15 +238,15 @@ public class BinningClusterer extends Clusterer {
} }
} }
@Override public void initializeFromConfig(Project project, BinningClustererConfig config) {
public void initializeFromJSON(Project project, JSONObject o) throws Exception { super.initializeFromConfig(project, config);
super.initializeFromJSON(project, o); _keyer = config.getKeyer();
_keyer = _keyers.get(o.getString("function").toLowerCase()); _parameters = config.getParameters();
} }
@Override @Override
public void computeClusters(Engine engine) { public void computeClusters(Engine engine) {
BinningRowVisitor visitor = new BinningRowVisitor(_keyer,_config); BinningRowVisitor visitor = new BinningRowVisitor(_keyer,_parameters);
FilteredRows filteredRows = engine.getAllFilteredRows(); FilteredRows filteredRows = engine.getAllFilteredRows();
filteredRows.accept(_project, visitor); filteredRows.accept(_project, visitor);

View File

@ -51,10 +51,12 @@ import org.json.JSONWriter;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.google.refine.Jsonizable;
import com.google.refine.browsing.Engine; import com.google.refine.browsing.Engine;
import com.google.refine.browsing.FilteredRows; import com.google.refine.browsing.FilteredRows;
import com.google.refine.browsing.RowVisitor; import com.google.refine.browsing.RowVisitor;
import com.google.refine.clustering.Clusterer; import com.google.refine.clustering.Clusterer;
import com.google.refine.clustering.ClustererConfig;
import com.google.refine.model.Cell; import com.google.refine.model.Cell;
import com.google.refine.model.Project; import com.google.refine.model.Project;
import com.google.refine.model.Row; import com.google.refine.model.Row;
@ -73,7 +75,83 @@ import edu.mit.simile.vicino.distances.PPMDistance;
public class kNNClusterer extends Clusterer { public class kNNClusterer extends Clusterer {
/**
 * Serializable configuration of a kNN clusterer: the column to cluster, the
 * distance function and the optional radius / blocking parameters.
 */
public static class kNNClustererConfig extends ClustererConfig {

    /** Distance function name exactly as supplied in the JSON, kept for round-tripping. */
    private String _distanceStr;
    /** Distance resolved from the registry; null if the name is not registered. */
    private Distance _distance;
    /** Parameters as supplied in the JSON; null when no "params" object was given. */
    private kNNClustererConfigParameters _parameters;

    /**
     * Serializes this configuration as a JSON object with the keys
     * "function", "type" (always "knn"), "column" and, only when parameters
     * were supplied, "params".
     *
     * @param writer  the JSON writer to emit to
     * @param options serialization options, forwarded to the parameters
     */
    @Override
    public void write(JSONWriter writer, Properties options)
            throws JSONException {
        writer.object();
        writer.key("function"); writer.value(_distanceStr);
        writer.key("type"); writer.value("knn");
        writer.key("column"); writer.value(getColumnName());
        if (_parameters != null) {
            writer.key("params");
            _parameters.write(writer, options);
        }
        writer.endObject();
    }

    /**
     * Reads the kNN settings from JSON: the base-class fields, the distance
     * function under "function" (looked up in the distance registry by its
     * lower-cased name), and the optional "params" object.
     *
     * @param o the JSON object holding the clusterer configuration
     */
    @Override
    public void initializeFromJSON(JSONObject o) {
        super.initializeFromJSON(o);
        _distanceStr = o.getString("function");
        _distance = _distances.get(_distanceStr.toLowerCase());
        if (o.has("params")) {
            _parameters = kNNClustererConfigParameters.reconstruct(o.getJSONObject("params"));
        } else {
            _parameters = null;
        }
    }

    /**
     * @return the distance resolved from the configured function name
     */
    public Distance getDistance() {
        return _distance;
    }

    /**
     * Returns the clustering parameters, never null.
     *
     * When the configuration carried no "params" object, a fresh instance
     * holding the default radius and blocking n-gram size is returned
     * instead of null: the row visitors read these fields without a null
     * check, so a params-less request would otherwise fail with a
     * NullPointerException. The fallback is not stored, so {@link #write}
     * still omits "params" for such configurations.
     *
     * @return the supplied parameters, or default-valued parameters
     */
    public kNNClustererConfigParameters getParameters() {
        if (_parameters == null) {
            return new kNNClustererConfigParameters();
        }
        return _parameters;
    }

    /**
     * Builds a new kNN clusterer and initializes it from this configuration
     * for the given project.
     *
     * @param project the project to cluster
     * @return the freshly initialized clusterer
     */
    @Override
    public kNNClusterer apply(Project project) {
        kNNClusterer clusterer = new kNNClusterer();
        clusterer.initializeFromConfig(project, this);
        return clusterer;
    }
}
/**
 * Parameters of a kNN clusterer, serialized under the "params" key of the
 * clusterer configuration. Both fields start at their defaults and are
 * always written out.
 */
public static class kNNClustererConfigParameters implements Jsonizable {

    public static final double defaultRadius = 1.0d;
    public static final int defaultBlockingNgramSize = 6;

    /** Clustering radius; serialized as "radius". */
    public double radius = defaultRadius;
    /** N-gram size used for blocking; serialized as "blocking-ngram-size". */
    public int blockingNgramSize = defaultBlockingNgramSize;

    /**
     * Writes both parameters as a JSON object.
     */
    @Override
    public void write(JSONWriter writer, Properties options)
            throws JSONException {
        writer.object();
        writer.key("radius").value(radius);
        writer.key("blocking-ngram-size").value(blockingNgramSize);
        writer.endObject();
    }

    /**
     * Rebuilds the parameters from their JSON form; any key that is absent
     * keeps its default value.
     *
     * @param o the JSON object found under "params"
     * @return the reconstructed parameters
     */
    public static kNNClustererConfigParameters reconstruct(JSONObject o) {
        kNNClustererConfigParameters result = new kNNClustererConfigParameters();
        result.radius = o.has("radius")
                ? o.getDouble("radius")
                : defaultRadius;
        result.blockingNgramSize = o.has("blocking-ngram-size")
                ? o.getInt("blocking-ngram-size")
                : defaultBlockingNgramSize;
        return result;
    }
}
private Distance _distance; private Distance _distance;
private kNNClustererConfigParameters _params;
static final protected Map<String, Distance> _distances = new HashMap<String, Distance>(); static final protected Map<String, Distance> _distances = new HashMap<String, Distance>();
@ -97,20 +175,13 @@ public class kNNClusterer extends Clusterer {
class VPTreeClusteringRowVisitor implements RowVisitor { class VPTreeClusteringRowVisitor implements RowVisitor {
Distance _distance; Distance _distance;
JSONObject _config; kNNClustererConfigParameters _params;
VPTreeClusterer _clusterer; VPTreeClusterer _clusterer;
double _radius = 1.0f;
public VPTreeClusteringRowVisitor(Distance d, JSONObject o) { public VPTreeClusteringRowVisitor(Distance d, kNNClustererConfigParameters params) {
_distance = d; _distance = d;
_config = o;
_clusterer = new VPTreeClusterer(_distance); _clusterer = new VPTreeClusterer(_distance);
try { _params = params;
JSONObject params = o.getJSONObject("params");
_radius = params.getDouble("radius");
} catch (JSONException e) {
//Refine.warn("No parameters found, using defaults");
}
} }
@Override @Override
@ -136,32 +207,23 @@ public class kNNClusterer extends Clusterer {
} }
public List<Set<Serializable>> getClusters() { public List<Set<Serializable>> getClusters() {
return _clusterer.getClusters(_radius); return _clusterer.getClusters(_params.radius);
} }
} }
class BlockingClusteringRowVisitor implements RowVisitor { class BlockingClusteringRowVisitor implements RowVisitor {
Distance _distance; Distance _distance;
JSONObject _config;
double _radius = 1.0d; double _radius = 1.0d;
int _blockingNgramSize = 6; int _blockingNgramSize = 6;
HashSet<String> _data; HashSet<String> _data;
NGramClusterer _clusterer; NGramClusterer _clusterer;
public BlockingClusteringRowVisitor(Distance d, JSONObject o) { public BlockingClusteringRowVisitor(Distance d, kNNClustererConfigParameters params) {
_distance = d; _distance = d;
_config = o;
_data = new HashSet<String>(); _data = new HashSet<String>();
try { _blockingNgramSize = params.blockingNgramSize;
JSONObject params = o.getJSONObject("params"); _radius = params.radius;
_radius = params.getDouble("radius");
logger.debug("Use radius: {}", _radius);
_blockingNgramSize = params.getInt("blocking-ngram-size");
logger.debug("Use blocking ngram size: {}",_blockingNgramSize);
} catch (JSONException e) {
logger.debug("No parameters found, using defaults");
}
_clusterer = new NGramClusterer(_distance, _blockingNgramSize); _clusterer = new NGramClusterer(_distance, _blockingNgramSize);
} }
@ -192,16 +254,16 @@ public class kNNClusterer extends Clusterer {
} }
} }
@Override public void initializeFromConfig(Project project, kNNClustererConfig config) {
public void initializeFromJSON(Project project, JSONObject o) throws Exception { super.initializeFromConfig(project, config);
super.initializeFromJSON(project, o); _distance = config.getDistance();
_distance = _distances.get(o.getString("function").toLowerCase()); _params = config.getParameters();
} }
@Override @Override
public void computeClusters(Engine engine) { public void computeClusters(Engine engine) {
//VPTreeClusteringRowVisitor visitor = new VPTreeClusteringRowVisitor(_distance,_config); //VPTreeClusteringRowVisitor visitor = new VPTreeClusteringRowVisitor(_distance,_config);
BlockingClusteringRowVisitor visitor = new BlockingClusteringRowVisitor(_distance,_config); BlockingClusteringRowVisitor visitor = new BlockingClusteringRowVisitor(_distance,_params);
FilteredRows filteredRows = engine.getAllFilteredRows(); FilteredRows filteredRows = engine.getAllFilteredRows();
filteredRows.accept(_project, visitor); filteredRows.accept(_project, visitor);

View File

@ -45,8 +45,9 @@ import org.slf4j.LoggerFactory;
import com.google.refine.browsing.Engine; import com.google.refine.browsing.Engine;
import com.google.refine.clustering.Clusterer; import com.google.refine.clustering.Clusterer;
import com.google.refine.clustering.binning.BinningClusterer; import com.google.refine.clustering.ClustererConfig;
import com.google.refine.clustering.knn.kNNClusterer; import com.google.refine.clustering.binning.BinningClusterer.BinningClustererConfig;
import com.google.refine.clustering.knn.kNNClusterer.kNNClustererConfig;
import com.google.refine.commands.Command; import com.google.refine.commands.Command;
import com.google.refine.model.Project; import com.google.refine.model.Project;
@ -64,16 +65,17 @@ public class ComputeClustersCommand extends Command {
Engine engine = getEngine(request, project); Engine engine = getEngine(request, project);
JSONObject clusterer_conf = getJsonParameter(request,"clusterer"); JSONObject clusterer_conf = getJsonParameter(request,"clusterer");
Clusterer clusterer = null;
String type = clusterer_conf.has("type") ? clusterer_conf.getString("type") : "binning"; String type = clusterer_conf.has("type") ? clusterer_conf.getString("type") : "binning";
ClustererConfig clustererConfig = null;
if ("knn".equals(type)) { if ("knn".equals(type)) {
clusterer = new kNNClusterer(); clustererConfig = new kNNClustererConfig();
} else { } else {
clusterer = new BinningClusterer(); clustererConfig = new BinningClustererConfig();
} }
clusterer.initializeFromJSON(project, clusterer_conf); clustererConfig.initializeFromJSON(clusterer_conf);
Clusterer clusterer = clustererConfig.apply(project);
clusterer.computeClusters(engine); clusterer.computeClusters(engine);

View File

@ -0,0 +1,59 @@
package com.google.refine.tests.clustering;
import org.json.JSONObject;
import org.testng.annotations.Test;
import com.google.refine.browsing.Engine;
import com.google.refine.clustering.binning.BinningClusterer;
import com.google.refine.clustering.binning.BinningClusterer.BinningClustererConfig;
import com.google.refine.model.Project;
import com.google.refine.tests.RefineTest;
import com.google.refine.tests.util.TestUtils;
/**
 * Serialization tests for the binning clusterer and its configuration.
 */
public class BinningClustererTests extends RefineTest {

    // Configuration using the plain fingerprint keyer and empty parameters.
    String configJson = "{"
            + "\"type\":\"binning\","
            + "\"function\":\"fingerprint\","
            + "\"column\":\"values\","
            + "\"params\":{}}";

    // Configuration using the n-gram fingerprint keyer with an explicit n-gram size.
    String configNgramJson = "{"
            + "\"type\":\"binning\","
            + "\"function\":\"ngram-fingerprint\","
            + "\"column\":\"values\","
            + "\"params\":{\"ngram-size\":2}}";

    // Expected clusters for the project built in testSerializeBinningClusterer.
    String clustererJson = "["
            + "  [{\"v\":\"a\",\"c\":1},{\"v\":\"à\",\"c\":1}],"
            + "  [{\"v\":\"c\",\"c\":1},{\"v\":\"ĉ\",\"c\":1}]"
            + "]";

    /** A configuration read from JSON serializes back to the same JSON. */
    @Test
    public void testSerializeBinningClustererConfig() {
        BinningClustererConfig config = new BinningClustererConfig();
        config.initializeFromJSON(new JSONObject(configJson));
        TestUtils.isSerializedTo(config, configJson);
    }

    /** The optional n-gram parameters survive the JSON round trip. */
    @Test
    public void testSerializeBinningClustererConfigWithNgrams() {
        BinningClustererConfig config = new BinningClustererConfig();
        config.initializeFromJSON(new JSONObject(configNgramJson));
        TestUtils.isSerializedTo(config, configNgramJson);
    }

    /** Clusters computed on a small project serialize to the expected JSON. */
    @Test
    public void testSerializeBinningClusterer() {
        // The CSV header must match the "column" value in configJson so that
        // the clusterer's column lookup is actually exercised, rather than
        // the test passing only because the column index defaults to 0.
        Project project = createCSVProject("values\n"
                + "a\n"
                + "à\n"
                + "c\n"
                + "ĉ\n");
        BinningClustererConfig config = new BinningClustererConfig();
        config.initializeFromJSON(new JSONObject(configJson));
        BinningClusterer clusterer = config.apply(project);
        clusterer.computeClusters(new Engine(project));
        TestUtils.isSerializedTo(clusterer, clustererJson);
    }
}

View File

@ -0,0 +1,47 @@
package com.google.refine.tests.clustering;
import org.json.JSONObject;
import org.testng.annotations.Test;
import com.google.refine.browsing.Engine;
import com.google.refine.clustering.knn.kNNClusterer;
import com.google.refine.clustering.knn.kNNClusterer.kNNClustererConfig;
import com.google.refine.model.Project;
import com.google.refine.tests.RefineTest;
import com.google.refine.tests.util.TestUtils;
/**
 * Serialization tests for the kNN clusterer and its configuration.
 */
public class kNNClustererTests extends RefineTest {

    // Configuration using the PPM distance with explicit radius and blocking size.
    public static String configJson = "{"
            + "\"type\":\"knn\","
            + "\"function\":\"PPM\","
            + "\"column\":\"values\","
            + "\"params\":{\"radius\":1,\"blocking-ngram-size\":2}"
            + "}";

    // Expected clusters for the project built in serializekNNClusterer.
    public static String clustererJson = "["
            + "   [{\"v\":\"ab\",\"c\":1},{\"v\":\"abc\",\"c\":1}]"
            + "]";

    /** A configuration read from JSON serializes back to the same JSON. */
    @Test
    public void serializekNNClustererConfig() {
        kNNClustererConfig config = new kNNClustererConfig();
        config.initializeFromJSON(new JSONObject(configJson));
        TestUtils.isSerializedTo(config, configJson);
    }

    /** Clusters computed on a small project serialize to the expected JSON. */
    @Test
    public void serializekNNClusterer() {
        // The CSV header must match the "column" value in configJson so that
        // the clusterer's column lookup is actually exercised, rather than
        // the test passing only because the column index defaults to 0.
        Project project = createCSVProject("values\n"
                + "ab\n"
                + "abc\n"
                + "c\n"
                + "ĉ\n");
        kNNClustererConfig config = new kNNClustererConfig();
        config.initializeFromJSON(new JSONObject(configJson));
        kNNClusterer clusterer = config.apply(project);
        clusterer.computeClusters(new Engine(project));
        TestUtils.isSerializedTo(clusterer, clustererJson);
    }
}