Track Freebase types of columns added with data from Freebase, so that we can later add more data based on those columns. Fixed minor bug in serialization of data extension records.

git-svn-id: http://google-refine.googlecode.com/svn/trunk@303 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
David Huynh 2010-03-16 06:18:00 +00:00
parent cf95e5b5f6
commit 084a6114d7
8 changed files with 214 additions and 30 deletions

View File

@ -19,14 +19,21 @@ import com.metaweb.gridworks.model.Column;
import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.model.Recon;
import com.metaweb.gridworks.model.ReconCandidate;
import com.metaweb.gridworks.model.ReconStats;
import com.metaweb.gridworks.model.Row;
import com.metaweb.gridworks.model.Recon.Judgment;
import com.metaweb.gridworks.model.recon.DataExtensionReconConfig;
import com.metaweb.gridworks.protograph.FreebaseType;
import com.metaweb.gridworks.util.ParsingUtilities;
import com.metaweb.gridworks.util.FreebaseDataExtensionJob.DataExtension;
public class DataExtensionChange implements Change {
final protected String _baseColumnName;
final protected int _columnInsertIndex;
final protected List<String> _columnNames;
final protected List<FreebaseType> _columnTypes;
final protected List<Integer> _rowIndices;
final protected List<DataExtension> _dataExtensions;
@ -38,20 +45,27 @@ public class DataExtensionChange implements Change {
String baseColumnName,
int columnInsertIndex,
List<String> columnNames,
List<FreebaseType> columnTypes,
List<Integer> rowIndices,
List<DataExtension> dataExtensions
) {
_baseColumnName = baseColumnName;
_columnInsertIndex = columnInsertIndex;
_columnNames = columnNames;
_columnTypes = columnTypes;
_rowIndices = rowIndices;
_dataExtensions = dataExtensions;
}
protected DataExtensionChange(
String baseColumnName,
int columnInsertIndex,
int columnInsertIndex,
List<String> columnNames,
List<FreebaseType> columnTypes,
List<Integer> rowIndices,
List<DataExtension> dataExtensions,
int firstNewCellIndex,
@ -60,7 +74,10 @@ public class DataExtensionChange implements Change {
) {
_baseColumnName = baseColumnName;
_columnInsertIndex = columnInsertIndex;
_columnNames = columnNames;
_columnTypes = columnTypes;
_rowIndices = rowIndices;
_dataExtensions = dataExtensions;
@ -140,9 +157,12 @@ public class DataExtensionChange implements Change {
for (int i = 0; i < _columnNames.size(); i++) {
String name = _columnNames.get(i);
int cellIndex = _firstNewCellIndex + i;
Column column = new Column(cellIndex, name);
column.setReconConfig(new DataExtensionReconConfig(_columnTypes.get(i)));
column.setReconStats(ReconStats.create(project, cellIndex));
Column column = new Column(_firstNewCellIndex + i, name);
project.columnModel.columns.add(_columnInsertIndex + i, column);
}
@ -194,6 +214,17 @@ public class DataExtensionChange implements Change {
for (String name : _columnNames) {
writer.write(name); writer.write('\n');
}
writer.write("columnTypeCount="); writer.write(Integer.toString(_columnTypes.size())); writer.write('\n');
for (FreebaseType type : _columnTypes) {
try {
JSONWriter jsonWriter = new JSONWriter(writer);
type.write(jsonWriter, options);
} catch (JSONException e) {
// ???
}
writer.write('\n');
}
writer.write("rowIndexCount="); writer.write(Integer.toString(_rowIndices.size())); writer.write('\n');
for (Integer rowIndex : _rowIndices) {
writer.write(rowIndex.toString()); writer.write('\n');
@ -201,17 +232,22 @@ public class DataExtensionChange implements Change {
writer.write("dataExtensionCount="); writer.write(Integer.toString(_dataExtensions.size())); writer.write('\n');
for (DataExtension dataExtension : _dataExtensions) {
writer.write(Integer.toString(dataExtension.data.length)); writer.write('\n');
for (Object[] values : dataExtension.data) {
for (Object value : values) {
try {
JSONWriter jsonWriter = new JSONWriter(writer);
if (value instanceof ReconCandidate) {
((ReconCandidate) value).write(jsonWriter, options);
} else {
jsonWriter.value(value);
}
} catch (JSONException e) {
// ???
if (value == null) {
writer.write("null");
} else if (value instanceof ReconCandidate) {
try {
JSONWriter jsonWriter = new JSONWriter(writer);
((ReconCandidate) value).write(jsonWriter, options);
} catch (JSONException e) {
// ???
}
} else if (value instanceof String) {
writer.write(JSONObject.quote((String) value));
} else {
writer.write(value.toString());
}
writer.write('\n');
}
@ -236,13 +272,18 @@ public class DataExtensionChange implements Change {
static public Change load(LineNumberReader reader) throws Exception {
String baseColumnName = null;
int columnInsertIndex = -1;
int firstNewCellIndex = -1;
List<String> columnNames = null;
List<FreebaseType> columnTypes = null;
List<Integer> rowIndices = null;
List<DataExtension> dataExtensions = null;
List<Row> oldRows = null;
List<Row> newRows = null;
int firstNewCellIndex = -1;
String line;
while ((line = reader.readLine()) != null && !"/ec/".equals(line)) {
int equal = line.indexOf('=');
@ -271,6 +312,14 @@ public class DataExtensionChange implements Change {
line = reader.readLine();
columnNames.add(line);
}
} else if ("columnTypeCount".equals(field)) {
int count = Integer.parseInt(value);
columnTypes = new ArrayList<FreebaseType>(count);
for (int i = 0; i < count; i++) {
line = reader.readLine();
columnTypes.add(FreebaseType.load(ParsingUtilities.evaluateJsonStringToObject(line)));
}
} else if ("dataExtensionCount".equals(field)) {
int count = Integer.parseInt(value);
@ -289,10 +338,12 @@ public class DataExtensionChange implements Change {
JSONTokener t = new JSONTokener(line);
Object o = t.nextValue();
if (o instanceof JSONObject) {
row[c] = ReconCandidate.load((JSONObject) o);
} else {
row[c] = o;
if (o != JSONObject.NULL) {
if (o instanceof JSONObject) {
row[c] = ReconCandidate.load((JSONObject) o);
} else {
row[c] = o;
}
}
}
@ -325,6 +376,7 @@ public class DataExtensionChange implements Change {
baseColumnName,
columnInsertIndex,
columnNames,
columnTypes,
rowIndices,
dataExtensions,
firstNewCellIndex,

View File

@ -0,0 +1,67 @@
package com.metaweb.gridworks.model.recon;
import java.util.List;
import java.util.Properties;
import org.json.JSONException;
import org.json.JSONObject;
import org.json.JSONWriter;
import sun.reflect.generics.reflectiveObjects.NotImplementedException;
import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.model.Recon;
import com.metaweb.gridworks.model.Row;
import com.metaweb.gridworks.protograph.FreebaseType;
/**
 * Recon configuration that records the Freebase type of a column.
 *
 * <p>It is serialized with mode {@code "extend"} and a {@code "type"} object
 * carrying the type's {@code id} and {@code name}. This config only carries
 * that type information; the reconciliation operations inherited from
 * {@link StrictReconConfig} are not supported and throw
 * {@link UnsupportedOperationException}.
 */
public class DataExtensionReconConfig extends StrictReconConfig {
    /** The Freebase type associated with the column. */
    final public FreebaseType type;

    /**
     * Rebuilds a config from its JSON form.
     *
     * @param obj JSON object holding a {@code "type"} object with string
     *            {@code "id"} and {@code "name"} entries
     * @return a new {@code DataExtensionReconConfig} for that type
     * @throws Exception if the expected entries are missing or malformed
     */
    static public ReconConfig reconstruct(JSONObject obj) throws Exception {
        JSONObject type = obj.getJSONObject("type");

        return new DataExtensionReconConfig(
            new FreebaseType(
                type.getString("id"),
                type.getString("name")
            )
        );
    }

    /**
     * @param type the Freebase type to record
     */
    public DataExtensionReconConfig(FreebaseType type) {
        this.type = type;
    }

    /**
     * Not supported by this config.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public ReconJob createJob(Project project, int rowIndex, Row row,
            String columnName, Cell cell) {
        // Use the standard JDK exception rather than an internal sun.* class.
        throw new UnsupportedOperationException(
            "createJob is not supported by DataExtensionReconConfig");
    }

    /**
     * Not supported by this config.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public int getBatchSize() {
        throw new UnsupportedOperationException(
            "getBatchSize is not supported by DataExtensionReconConfig");
    }

    /**
     * Writes this config as a JSON object with a {@code "mode"} of
     * {@code "extend"} and the {@code "type"} written by {@link FreebaseType}.
     *
     * @param writer  destination JSON writer
     * @param options passed through to the type's own {@code write}
     * @throws JSONException if the writer rejects the output
     */
    public void write(JSONWriter writer, Properties options)
            throws JSONException {
        writer.object();
        writer.key("mode"); writer.value("extend");
        writer.key("type"); type.write(writer, options);
        writer.endObject();
    }

    /**
     * Not supported by this config.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public List<Recon> batchRecon(List<ReconJob> jobs) {
        throw new UnsupportedOperationException(
            "batchRecon is not supported by DataExtensionReconConfig");
    }

    /**
     * Not supported by this config.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public String getBriefDescription(Project project, String columnName) {
        throw new UnsupportedOperationException(
            "getBriefDescription is not supported by DataExtensionReconConfig");
    }
}

View File

@ -21,6 +21,8 @@ abstract public class ReconConfig implements Jsonizable {
return HeuristicReconConfig.reconstruct(obj);
} else if ("strict".equals(mode)) {
return StrictReconConfig.reconstruct(obj);
} else if ("extend".equals(mode)) {
return DataExtensionReconConfig.reconstruct(obj);
}
return null;
}

View File

@ -26,6 +26,7 @@ import com.metaweb.gridworks.model.changes.CellAtRow;
import com.metaweb.gridworks.model.changes.DataExtensionChange;
import com.metaweb.gridworks.process.LongRunningProcess;
import com.metaweb.gridworks.process.Process;
import com.metaweb.gridworks.protograph.FreebaseType;
import com.metaweb.gridworks.util.FreebaseDataExtensionJob;
import com.metaweb.gridworks.util.FreebaseDataExtensionJob.ColumnInfo;
import com.metaweb.gridworks.util.FreebaseDataExtensionJob.DataExtension;
@ -221,6 +222,11 @@ public class ExtendDataOperation extends EngineDependentOperation {
columnNames.add(StringUtils.join(info.names, " - "));
}
List<FreebaseType> columnTypes = new ArrayList<FreebaseType>();
for (ColumnInfo info : _job.columns) {
columnTypes.add(info.expectedType);
}
HistoryEntry historyEntry = new HistoryEntry(
_project,
_description,
@ -229,6 +235,7 @@ public class ExtendDataOperation extends EngineDependentOperation {
_baseColumnName,
_columnInsertIndex,
columnNames,
columnTypes,
rowIndices,
dataExtensions)
);

View File

@ -1,8 +1,36 @@
package com.metaweb.gridworks.protograph;
public class FreebaseType extends FreebaseTopic {
import java.util.Properties;
import org.json.JSONException;
import org.json.JSONObject;
import org.json.JSONWriter;
import com.metaweb.gridworks.Jsonizable;
/**
 * A Freebase type, described by an id and a name, that can be written to
 * and read back from a JSON object with {@code "id"} and {@code "name"} keys.
 */
public class FreebaseType extends FreebaseTopic implements Jsonizable {
    /**
     * @param id   the type's id
     * @param name the type's name
     */
    public FreebaseType(String id, String name) {
        super(id, name);
    }

    /**
     * Writes this type as a JSON object of the form
     * {@code {"id": ..., "name": ...}}.
     *
     * @param writer  destination JSON writer
     * @param options unused by this implementation
     * @throws JSONException if the writer rejects the output
     */
    public void write(JSONWriter writer, Properties options)
            throws JSONException {
        writer.object()
            .key("id").value(id)
            .key("name").value(name)
            .endObject();
    }

    /**
     * Builds a type from its JSON form.
     *
     * @param obj JSON object with string {@code "id"} and {@code "name"}
     *            entries, or {@code null}
     * @return the loaded type, or {@code null} when {@code obj} is {@code null}
     * @throws Exception if either entry is missing or not a string
     */
    static public FreebaseType load(JSONObject obj) throws Exception {
        return obj == null
            ? null
            : new FreebaseType(obj.getString("id"), obj.getString("name"));
    }
}

View File

@ -23,6 +23,7 @@ import org.json.JSONObject;
import org.json.JSONWriter;
import com.metaweb.gridworks.model.ReconCandidate;
import com.metaweb.gridworks.protograph.FreebaseType;
public class FreebaseDataExtensionJob {
static public class DataExtension {
@ -36,10 +37,12 @@ public class FreebaseDataExtensionJob {
static public class ColumnInfo {
final public List<String> names;
final public List<String> path;
final public FreebaseType expectedType;
protected ColumnInfo(List<String> names, List<String> path) {
protected ColumnInfo(List<String> names, List<String> path, FreebaseType expectedType) {
this.names = names;
this.path = path;
this.expectedType = expectedType;
}
}
@ -130,7 +133,7 @@ public class FreebaseDataExtensionJob {
int startColumnIndex
) throws JSONException {
String propertyID = extNode.getString("id");
String expectedTypeID = extNode.getString("expected");
String expectedTypeID = extNode.getJSONObject("expected").getString("id");
JSONArray a = resultNode != null && resultNode.has(propertyID) && !resultNode.isNull(propertyID) ?
resultNode.getJSONArray(propertyID) : null;
@ -268,7 +271,7 @@ public class FreebaseDataExtensionJob {
static protected void formulateQueryNode(JSONObject node, JSONWriter writer) throws JSONException {
String propertyID = node.getString("id");
String expectedTypeID = node.getString("expected");
String expectedTypeID = node.getJSONObject("expected").getString("id");
writer.key(propertyID);
writer.array();
@ -321,14 +324,20 @@ public class FreebaseDataExtensionJob {
if (obj.has("properties") && !obj.isNull("properties")) {
boolean included = (obj.has("included") && obj.getBoolean("included"));
if (included && columns != null) {
columns.add(new ColumnInfo(names2, path2));
JSONObject expected = obj.getJSONObject("expected");
columns.add(new ColumnInfo(names2, path2,
new FreebaseType(expected.getString("id"), expected.getString("name"))));
}
return (included ? 1 : 0) +
countColumns(obj.getJSONArray("properties"), columns, names2, path2);
} else {
if (columns != null) {
columns.add(new ColumnInfo(names2, path2));
JSONObject expected = obj.getJSONObject("expected");
columns.add(new ColumnInfo(names2, path2,
new FreebaseType(expected.getString("id"), expected.getString("name"))));
}
return 1;
}

View File

@ -64,6 +64,7 @@ ExtendDataPreviewDialog.getAllProperties = function(typeID, onDone) {
"name" : null,
"/type/property/expected_type" : {
"id" : null,
"name" : null,
"/freebase/type_hints/mediator" : []
},
"sort" : "name"
@ -74,6 +75,7 @@ ExtendDataPreviewDialog.getAllProperties = function(typeID, onDone) {
"name" : null,
"/type/property/expected_type" : {
"id" : null,
"name" : null,
"/freebase/type_hints/mediator" : []
},
"sort" : "name"
@ -90,7 +92,10 @@ ExtendDataPreviewDialog.getAllProperties = function(typeID, onDone) {
allProperties.push({
id : property.id,
name : property.name,
expected : expectedType.id
expected : {
id : expectedType.id,
name : expectedType.name
}
});
}
};
@ -137,7 +142,8 @@ ExtendDataPreviewDialog.getAllProperties = function(typeID, onDone) {
"id" : null,
"name" : null,
"/type/property/expected_type" : {
id : null,
"id" : null,
"name" : null,
"/freebase/type_hints/mediator" : []
},
"sort" : "name"
@ -150,15 +156,24 @@ ExtendDataPreviewDialog.getAllProperties = function(typeID, onDone) {
function(o2) {
if ("result" in o2) {
var processCVTProperty = function(parentProperty, properties) {
var parentExpected = parentProperty["/type/property/expected_type"];
$.each(properties, function() {
var expected = this["/type/property/expected_type"];
allProperties.push({
id : parentProperty.id,
name : parentProperty.name,
expected : parentProperty["/type/property/expected_type"].id,
expected : {
id : parentExpected.id,
name : parentExpected.name
},
properties: [{
id : this.id,
name : this.name,
expected : this["/type/property/expected_type"].id
expected : {
id : expected.id,
name : expected.name
}
}]
});
});
@ -204,10 +219,14 @@ ExtendDataPreviewDialog.prototype._show = function(properties) {
};
this._elmts.addPropertyInput.suggestP(suggestConfig).bind("fb-select", function(evt, data) {
var expected = data["/type/property/expected_type"];
self._addProperty({
id : data.id,
name: data.name,
expected: data["/type/property/expected_type"]
expected: {
id: expected.id,
name: expected.name
}
});
});
};

View File

@ -127,7 +127,7 @@
mql_output: JSON.stringify([{
"id" : null,
"name" : null,
"/type/property/expected_type" : null
"/type/property/expected_type" : { "id" : null, "name" : null }
}])
};
if (start) {