Extend data operation is working.

git-svn-id: http://google-refine.googlecode.com/svn/trunk@301 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
David Huynh 2010-03-16 00:24:20 +00:00
parent 2645c864ab
commit c6e7986206
8 changed files with 639 additions and 6 deletions

View File

@ -19,6 +19,7 @@ import com.metaweb.gridworks.commands.edit.ApplyOperationsCommand;
import com.metaweb.gridworks.commands.edit.CreateProjectCommand;
import com.metaweb.gridworks.commands.edit.DeleteProjectCommand;
import com.metaweb.gridworks.commands.edit.ExportProjectCommand;
import com.metaweb.gridworks.commands.edit.ExtendDataCommand;
import com.metaweb.gridworks.commands.edit.ImportProjectCommand;
import com.metaweb.gridworks.commands.edit.TextTransformCommand;
import com.metaweb.gridworks.commands.edit.EditOneCellCommand;
@ -93,6 +94,7 @@ public class GridworksServlet extends HttpServlet {
_commands.put("add-column", new AddColumnCommand());
_commands.put("remove-column", new RemoveColumnCommand());
_commands.put("extend-data", new ExtendDataCommand());
_commands.put("reconcile", new ReconcileCommand());
_commands.put("recon-match-best-candidates", new ReconMatchBestCandidatesCommand());

View File

@ -0,0 +1,31 @@
package com.metaweb.gridworks.commands.edit;
import javax.servlet.http.HttpServletRequest;
import org.json.JSONObject;
import com.metaweb.gridworks.commands.EngineDependentCommand;
import com.metaweb.gridworks.model.AbstractOperation;
import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.operations.ExtendDataOperation;
/**
 * Command that turns the parameters of an HTTP request into an
 * {@link ExtendDataOperation}.
 */
public class ExtendDataCommand extends EngineDependentCommand {

    /**
     * Reads the "baseColumnName", "columnInsertIndex" and "extension" request
     * parameters and builds the operation from them.
     *
     * @param project      the project the operation is created for (not read here)
     * @param request      the HTTP request carrying the parameters
     * @param engineConfig the engine configuration handed on to the operation
     * @return a new {@link ExtendDataOperation}
     * @throws Exception if "columnInsertIndex" is not an integer, or if parsing
     *                   the "extension" JSON fails
     */
    @Override
    protected AbstractOperation createOperation(Project project,
            HttpServletRequest request, JSONObject engineConfig) throws Exception {

        final String baseColumn = request.getParameter("baseColumnName");
        final String insertIndexParam = request.getParameter("columnInsertIndex");
        final int insertIndex = Integer.parseInt(insertIndexParam);

        final String extensionJson = request.getParameter("extension");
        final JSONObject extensionSpec = jsonStringToObject(extensionJson);

        return new ExtendDataOperation(engineConfig, baseColumn, extensionSpec, insertIndex);
    }
}

View File

@ -0,0 +1,338 @@
package com.metaweb.gridworks.model.changes;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Serializable;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import org.json.JSONException;
import org.json.JSONObject;
import org.json.JSONTokener;
import org.json.JSONWriter;
import com.metaweb.gridworks.history.Change;
import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.Column;
import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.model.Recon;
import com.metaweb.gridworks.model.ReconCandidate;
import com.metaweb.gridworks.model.Row;
import com.metaweb.gridworks.model.Recon.Judgment;
import com.metaweb.gridworks.util.FreebaseDataExtensionJob.DataExtension;
/**
 * A history {@link Change} that inserts new columns into a project and fills
 * them with values taken from a list of {@link DataExtension}s, one per
 * affected row. The row lists before and after the change are both kept so the
 * change can be reverted and re-applied.
 */
public class DataExtensionChange implements Change {
// Name of the column whose cell index is used when deciding which following rows can be reused.
final protected String _baseColumnName;
// Position in the column model's column list at which the new columns are inserted.
final protected int _columnInsertIndex;
// Names of the new columns, in insertion order.
final protected List<String> _columnNames;
// Indices of the rows to extend; entry i pairs with _dataExtensions.get(i).
final protected List<Integer> _rowIndices;
// Extension data per entry of _rowIndices; an entry may be null (treated as "no data" by apply()).
final protected List<DataExtension> _dataExtensions;
// Cell index of the first new column; stays -1 until apply() has allocated the cell indices.
protected int _firstNewCellIndex = -1;
// Snapshot of the project's rows taken before the first apply(); restored by revert().
protected List<Row> _oldRows;
// Rows computed by the first apply(); installed into the project on every apply().
protected List<Row> _newRows;
/**
 * Creates a change that has not been applied yet: no cell indices are
 * allocated and no row snapshots exist.
 *
 * @param baseColumnName    name of the column the extension is based on
 * @param columnInsertIndex position at which the new columns are inserted
 * @param columnNames       names of the new columns
 * @param rowIndices        indices of the rows to extend
 * @param dataExtensions    extension data, parallel to {@code rowIndices}
 */
public DataExtensionChange(
    String baseColumnName,
    int columnInsertIndex,
    List<String> columnNames,
    List<Integer> rowIndices,
    List<DataExtension> dataExtensions
) {
    // -1 / null / null is exactly the "not yet applied" state.
    this(
        baseColumnName,
        columnInsertIndex,
        columnNames,
        rowIndices,
        dataExtensions,
        -1,
        null,
        null
    );
}
/**
 * Creates a change with its complete state, including the parts normally
 * computed by {@code apply}. Used by {@code load} to rebuild a saved change.
 *
 * @param baseColumnName    name of the column the extension is based on
 * @param columnInsertIndex position at which the new columns are inserted
 * @param columnNames       names of the new columns
 * @param rowIndices        indices of the rows to extend
 * @param dataExtensions    extension data, parallel to {@code rowIndices}
 * @param firstNewCellIndex cell index of the first new column, or a negative
 *                          value if the change has not been applied yet
 * @param oldRows           rows as they were before the change
 * @param newRows           rows as they are after the change
 */
protected DataExtensionChange(
    String baseColumnName,
    int columnInsertIndex,
    List<String> columnNames,
    List<Integer> rowIndices,
    List<DataExtension> dataExtensions,
    int firstNewCellIndex,
    List<Row> oldRows,
    List<Row> newRows
) {
    // Description of what to extend.
    this._baseColumnName = baseColumnName;
    this._columnInsertIndex = columnInsertIndex;
    this._columnNames = columnNames;
    this._rowIndices = rowIndices;
    this._dataExtensions = dataExtensions;

    // Result of a previous application.
    this._firstNewCellIndex = firstNewCellIndex;
    this._oldRows = oldRows;
    this._newRows = newRows;
}
/**
 * Applies the change to the project.
 * <p>
 * On the first call the new cell indices are allocated, the current rows are
 * snapshotted into {@code _oldRows} and the extended rows are computed into
 * {@code _newRows}. On every call the project's rows are replaced by
 * {@code _newRows} and the new columns are inserted at
 * {@code _columnInsertIndex}.
 * <p>
 * A data extension with several rows spills into the following project rows
 * as long as those rows are blank in both the base column and the key column;
 * otherwise fresh rows are inserted.
 * <p>
 * An empty {@code _rowIndices} list (no row matched) is handled: all rows are
 * copied unchanged and only the new, empty columns are added.
 *
 * @param project the project to modify; it is locked for the duration of the call
 */
public void apply(Project project) {
    synchronized (project) {
        if (_firstNewCellIndex < 0) {
            // Reserve one cell index per new column. The indices are later addressed as
            // _firstNewCellIndex + i, so consecutive allocation is assumed -- TODO confirm.
            _firstNewCellIndex = project.columnModel.allocateNewCellIndex();
            for (int i = 1; i < _columnNames.size(); i++) {
                project.columnModel.allocateNewCellIndex();
            }

            _oldRows = new ArrayList<Row>(project.rows);
            _newRows = new ArrayList<Row>(project.rows.size());

            int cellIndex = project.columnModel.getColumnByName(_baseColumnName).getCellIndex();
            int keyCellIndex = project.columnModel.columns.get(project.columnModel.getKeyColumnIndex()).getCellIndex();
            int index = 0;

            // Guard the first lookup the same way the later ones are guarded: with no
            // rows to extend, rowIndex becomes the row count so every row is just copied.
            int rowIndex = index < _rowIndices.size() ? _rowIndices.get(index) : _oldRows.size();
            DataExtension dataExtension = index < _rowIndices.size() ? _dataExtensions.get(index) : null;

            index++;

            for (int r = 0; r < _oldRows.size(); r++) {
                Row oldRow = _oldRows.get(r);
                if (r < rowIndex) {
                    // Not a row to extend: carry it over.
                    _newRows.add(oldRow.dup());
                    continue;
                }

                if (dataExtension == null || dataExtension.data.length == 0) {
                    // Nothing was found for this row.
                    _newRows.add(oldRow);
                } else {
                    Row firstNewRow = oldRow.dup();
                    extendRow(firstNewRow, dataExtension, 0);
                    _newRows.add(firstNewRow);

                    // r2 walks over the existing rows that directly follow and can absorb
                    // the remaining extension rows.
                    int r2 = r + 1;
                    for (int subR = 1; subR < dataExtension.data.length; subR++) {
                        if (r2 < project.rows.size()) {
                            Row oldRow2 = project.rows.get(r2);
                            if (oldRow2.isCellBlank(cellIndex) &&
                                oldRow2.isCellBlank(keyCellIndex)) {

                                Row newRow = oldRow2.dup();
                                extendRow(newRow, dataExtension, subR);

                                _newRows.add(newRow);
                                r2++;

                                continue;
                            }
                        }

                        // No reusable row: insert a brand new one.
                        Row newRow = new Row(cellIndex + _columnNames.size());
                        extendRow(newRow, dataExtension, subR);

                        _newRows.add(newRow);
                    }

                    r = r2 - 1; // r will be incremented by the for loop anyway
                }

                // Move on to the next row to extend, if any.
                rowIndex = index < _rowIndices.size() ? _rowIndices.get(index) : _oldRows.size();
                dataExtension = index < _rowIndices.size() ? _dataExtensions.get(index) : null;
                index++;
            }
        }

        project.rows.clear();
        project.rows.addAll(_newRows);

        for (int i = 0; i < _columnNames.size(); i++) {
            String name = _columnNames.get(i);

            Column column = new Column(_firstNewCellIndex + i, name);

            project.columnModel.columns.add(_columnInsertIndex + i, column);
        }

        project.columnModel.update();
        project.recomputeRowContextDependencies();
    }
}
/**
 * Writes one row of extension data into the new cells of {@code row}.
 * <p>
 * Value number {@code c} goes to cell index {@code _firstNewCellIndex + c}.
 * A {@link ReconCandidate} becomes a cell showing the candidate's topic name
 * with a recon already judged as matched to that candidate; any other value
 * is stored as-is with no recon.
 *
 * @param row               the row receiving the cells
 * @param dataExtension     the extension holding the values
 * @param extensionRowIndex index into {@code dataExtension.data} of the values to use
 */
protected void extendRow(Row row, DataExtension dataExtension, int extensionRowIndex) {
    final Object[] extensionValues = dataExtension.data[extensionRowIndex];

    int targetCellIndex = _firstNewCellIndex;
    for (Object extensionValue : extensionValues) {
        final Cell newCell;
        if (!(extensionValue instanceof ReconCandidate)) {
            newCell = new Cell((Serializable) extensionValue, null);
        } else {
            ReconCandidate candidate = (ReconCandidate) extensionValue;

            Recon matchedRecon = new Recon();
            matchedRecon.addCandidate(candidate);
            matchedRecon.match = candidate;
            matchedRecon.judgment = Judgment.Matched;

            newCell = new Cell(candidate.topicName, matchedRecon);
        }

        row.setCell(targetCellIndex, newCell);
        targetCellIndex++;
    }
}
/**
 * Undoes the change: puts back the rows saved in {@code _oldRows} and removes
 * the columns that {@code apply} inserted at {@code _columnInsertIndex}.
 *
 * @param project the project to restore; it is locked for the duration of the call
 */
public void revert(Project project) {
    synchronized (project) {
        project.rows.clear();
        project.rows.addAll(_oldRows);

        // The inserted columns sit next to each other, so removing at the same
        // position once per new column takes them all out.
        int columnsLeft = _columnNames.size();
        while (columnsLeft > 0) {
            project.columnModel.columns.remove(_columnInsertIndex);
            columnsLeft--;
        }

        project.columnModel.update();
        project.recomputeRowContextDependencies();
    }
}
/**
 * Writes this change in the line-oriented format understood by {@code load}:
 * {@code key=value} header lines, each "count" header followed by that many
 * item lines, closed by the marker line {@code /ec/}.
 * <p>
 * Each extension value is written as one line of JSON text. A null data
 * extension is written as an extension with zero rows, which {@code apply}
 * treats the same as null.
 *
 * @param writer  destination
 * @param options options handed on to the row and recon candidate writers
 * @throws IOException if writing fails or a recon candidate cannot be serialized
 */
public void save(Writer writer, Properties options) throws IOException {
    writer.write("baseColumnName="); writer.write(_baseColumnName); writer.write('\n');
    writer.write("columnInsertIndex="); writer.write(Integer.toString(_columnInsertIndex)); writer.write('\n');
    writer.write("columnNameCount="); writer.write(Integer.toString(_columnNames.size())); writer.write('\n');
    for (String name : _columnNames) {
        writer.write(name); writer.write('\n');
    }
    writer.write("rowIndexCount="); writer.write(Integer.toString(_rowIndices.size())); writer.write('\n');
    for (Integer rowIndex : _rowIndices) {
        writer.write(rowIndex.toString()); writer.write('\n');
    }
    writer.write("dataExtensionCount="); writer.write(Integer.toString(_dataExtensions.size())); writer.write('\n');
    for (DataExtension dataExtension : _dataExtensions) {
        if (dataExtension == null) {
            // Rows for which nothing was found have a null extension; record it as
            // "0 rows" so the reader needs no special case.
            writer.write("0"); writer.write('\n');
            continue;
        }

        writer.write(Integer.toString(dataExtension.data.length)); writer.write('\n');

        for (Object[] values : dataExtension.data) {
            for (Object value : values) {
                writeExtensionValue(writer, value, options);
                writer.write('\n');
            }
        }
    }

    writer.write("firstNewCellIndex="); writer.write(Integer.toString(_firstNewCellIndex)); writer.write('\n');

    writer.write("newRowCount="); writer.write(Integer.toString(_newRows.size())); writer.write('\n');
    for (Row row : _newRows) {
        row.save(writer, options);
        writer.write('\n');
    }
    writer.write("oldRowCount="); writer.write(Integer.toString(_oldRows.size())); writer.write('\n');
    for (Row row : _oldRows) {
        row.save(writer, options);
        writer.write('\n');
    }
    writer.write("/ec/\n"); // end of change marker
}

/**
 * Writes a single extension value as JSON text, without a trailing newline.
 * <p>
 * Plain values are written directly rather than through a fresh
 * {@link JSONWriter}, because a JSONWriter refuses a bare value outside an
 * object or array.
 */
private static void writeExtensionValue(Writer writer, Object value, Properties options) throws IOException {
    if (value == null) {
        writer.write("null");
    } else if (value instanceof ReconCandidate) {
        try {
            ((ReconCandidate) value).write(new JSONWriter(writer), options);
        } catch (JSONException e) {
            IOException failure = new IOException("Failed to serialize recon candidate of data extension");
            failure.initCause(e);
            throw failure;
        }
    } else if (value instanceof Number || value instanceof Boolean) {
        writer.write(value.toString());
    } else {
        // quote() escapes line breaks, keeping the value on a single line.
        writer.write(JSONObject.quote(value.toString()));
    }
}
/**
 * Reads a change written by {@code save}, consuming lines up to and including
 * the end marker {@code /ec/} (or the end of the stream).
 * <p>
 * Header lines have the form {@code key=value}; lines without an {@code '='}
 * and unknown keys are ignored. The "columnNameCount" section must come before
 * "dataExtensionCount" because the number of values per extension row is taken
 * from the number of column names.
 *
 * @param reader source positioned at the first line of the change
 * @return the reconstructed change
 * @throws Exception if a count or index is not a number, a value is not valid
 *                   JSON, or the sections are in an unusable order
 */
static public Change load(LineNumberReader reader) throws Exception {
    String baseColumnName = null;
    int columnInsertIndex = -1;

    int firstNewCellIndex = -1;
    List<String> columnNames = null;
    List<Integer> rowIndices = null;
    List<DataExtension> dataExtensions = null;
    List<Row> oldRows = null;
    List<Row> newRows = null;

    String line;
    while ((line = reader.readLine()) != null && !"/ec/".equals(line)) {
        int equal = line.indexOf('=');
        if (equal < 0) {
            // Not a header line (for example a stray blank line): nothing to read from it.
            continue;
        }

        String field = line.substring(0, equal);
        String value = line.substring(equal + 1);

        if ("baseColumnName".equals(field)) {
            baseColumnName = value;
        } else if ("columnInsertIndex".equals(field)) {
            columnInsertIndex = Integer.parseInt(value);
        } else if ("firstNewCellIndex".equals(field)) {
            firstNewCellIndex = Integer.parseInt(value);
        } else if ("rowIndexCount".equals(field)) {
            int count = Integer.parseInt(value);

            rowIndices = new ArrayList<Integer>(count);
            for (int i = 0; i < count; i++) {
                line = reader.readLine();
                rowIndices.add(Integer.parseInt(line));
            }
        } else if ("columnNameCount".equals(field)) {
            int count = Integer.parseInt(value);

            columnNames = new ArrayList<String>(count);
            for (int i = 0; i < count; i++) {
                line = reader.readLine();
                columnNames.add(line);
            }
        } else if ("dataExtensionCount".equals(field)) {
            if (columnNames == null) {
                throw new IOException(
                    "dataExtensionCount found before columnNameCount at line " + reader.getLineNumber());
            }

            int count = Integer.parseInt(value);

            dataExtensions = new ArrayList<DataExtension>(count);
            for (int i = 0; i < count; i++) {
                line = reader.readLine();
                if (line == null || line.length() == 0) {
                    // No row count recorded: there is no extension for this entry.
                    dataExtensions.add(null);
                    continue;
                }

                int rowCount = Integer.parseInt(line);
                Object[][] data = new Object[rowCount][];

                for (int r = 0; r < rowCount; r++) {
                    Object[] row = new Object[columnNames.size()];
                    for (int c = 0; c < columnNames.size(); c++) {
                        row[c] = parseExtensionValue(reader.readLine());
                    }
                    data[r] = row;
                }

                dataExtensions.add(new DataExtension(data));
            }
        } else if ("oldRowCount".equals(field)) {
            int count = Integer.parseInt(value);

            oldRows = new ArrayList<Row>(count);
            for (int i = 0; i < count; i++) {
                line = reader.readLine();
                oldRows.add(Row.load(line));
            }
        } else if ("newRowCount".equals(field)) {
            int count = Integer.parseInt(value);

            newRows = new ArrayList<Row>(count);
            for (int i = 0; i < count; i++) {
                line = reader.readLine();
                newRows.add(Row.load(line));
            }
        }
    }

    DataExtensionChange change = new DataExtensionChange(
        baseColumnName,
        columnInsertIndex,
        columnNames,
        rowIndices,
        dataExtensions,
        firstNewCellIndex,
        oldRows,
        newRows
    );

    return change;
}

/**
 * Parses one saved extension value. A missing or blank line and the JSON
 * literal {@code null} both yield {@code null}; a JSON object is read as a
 * {@link ReconCandidate}; anything else is returned as parsed.
 */
private static Object parseExtensionValue(String line) throws Exception {
    if (line == null || line.length() == 0) {
        return null;
    }

    Object parsed = new JSONTokener(line).nextValue();
    if (parsed instanceof JSONObject) {
        return ReconCandidate.load((JSONObject) parsed);
    }
    if (parsed == JSONObject.NULL) {
        // Keep JSON's null placeholder out of the cell data.
        return null;
    }
    return parsed;
}
}

View File

@ -0,0 +1,241 @@
package com.metaweb.gridworks.operations;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import org.json.JSONException;
import org.json.JSONObject;
import org.json.JSONWriter;
import com.metaweb.gridworks.browsing.Engine;
import com.metaweb.gridworks.browsing.FilteredRows;
import com.metaweb.gridworks.browsing.RowVisitor;
import com.metaweb.gridworks.history.HistoryEntry;
import com.metaweb.gridworks.model.AbstractOperation;
import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.Column;
import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.model.Row;
import com.metaweb.gridworks.model.changes.CellAtRow;
import com.metaweb.gridworks.model.changes.DataExtensionChange;
import com.metaweb.gridworks.process.LongRunningProcess;
import com.metaweb.gridworks.process.Process;
import com.metaweb.gridworks.util.FreebaseDataExtensionJob;
import com.metaweb.gridworks.util.FreebaseDataExtensionJob.ColumnInfo;
import com.metaweb.gridworks.util.FreebaseDataExtensionJob.DataExtension;
/**
 * Operation that adds columns to a project, filled with data fetched through
 * a {@link FreebaseDataExtensionJob} for the matched cells of a base column.
 * The work is done by the long-running {@link ExtendDataProcess}.
 */
public class ExtendDataOperation extends EngineDependentOperation {
// Name of the column whose matched cells drive the extension.
final protected String _baseColumnName;
// Extension specification, handed unchanged to FreebaseDataExtensionJob.
final protected JSONObject _extension;
// Column position at which the new columns are to be inserted.
final protected int _columnInsertIndex;
/**
 * Rebuilds an operation from JSON holding the keys "engineConfig",
 * "baseColumnName", "extension" and "columnInsertIndex".
 *
 * @param project the project the operation belongs to (not read here)
 * @param obj     the JSON form of the operation
 * @return the reconstructed operation
 * @throws Exception if a key is missing or has the wrong type
 */
static public AbstractOperation reconstruct(Project project, JSONObject obj) throws Exception {
    final JSONObject engineConfig = obj.getJSONObject("engineConfig");
    final String baseColumnName = obj.getString("baseColumnName");
    final JSONObject extension = obj.getJSONObject("extension");
    final int columnInsertIndex = obj.getInt("columnInsertIndex");

    return new ExtendDataOperation(engineConfig, baseColumnName, extension, columnInsertIndex);
}
/**
 * @param engineConfig      engine configuration passed to the superclass
 * @param baseColumnName    name of the column whose matched cells are extended
 * @param extension         extension specification for the data extension job
 * @param columnInsertIndex column position at which the new columns are inserted
 */
public ExtendDataOperation(
    JSONObject engineConfig,
    String baseColumnName,
    JSONObject extension,
    int columnInsertIndex
) {
    super(engineConfig);

    this._baseColumnName = baseColumnName;
    this._columnInsertIndex = columnInsertIndex;
    this._extension = extension;
}
/**
 * Writes this operation as a JSON object with the keys "op", "description",
 * "engineConfig", "columnInsertIndex", "baseColumnName" and "extension".
 *
 * @param writer  destination JSON writer
 * @param options not used by this method
 * @throws JSONException if the writer rejects a key or value
 */
public void write(JSONWriter writer, Properties options)
        throws JSONException {

    writer.object();
    writer.key("op").value(OperationRegistry.s_opClassToName.get(this.getClass()));
    writer.key("description").value(getBriefDescription(null));
    writer.key("engineConfig").value(getEngineConfig());
    writer.key("columnInsertIndex").value(_columnInsertIndex);
    writer.key("baseColumnName").value(_baseColumnName);
    writer.key("extension").value(_extension);
    writer.endObject();
}
/**
 * Returns a one-line description naming the insert index and the base column.
 *
 * @param project not used; callers in this class pass {@code null}
 */
protected String getBriefDescription(Project project) {
    StringBuilder description = new StringBuilder("Extend data at index ");
    description.append(_columnInsertIndex);
    description.append(" based on column ");
    description.append(_baseColumnName);
    return description.toString();
}
/**
 * Returns a description that names the insert index, the given column and the
 * number of entries in {@code cellsAtRows}.
 *
 * @param column      the column the extension is based on
 * @param cellsAtRows the cells counted in the description
 */
protected String createDescription(Column column, List<CellAtRow> cellsAtRows) {
    StringBuilder description = new StringBuilder("Extend data at index ");
    description.append(_columnInsertIndex);
    description.append(" based on column ");
    description.append(column.getName());
    description.append(" by filling ");
    description.append(cellsAtRows.size());
    return description.toString();
}
/**
 * Creates the long-running process that performs this operation.
 *
 * @param project the project to extend
 * @param options not used by this method
 * @return a new {@link ExtendDataProcess}
 * @throws Exception if the process cannot be created
 */
public Process createProcess(Project project, Properties options) throws Exception {
    final JSONObject engineConfig = getEngineConfig();
    final String description = getBriefDescription(null);

    return new ExtendDataProcess(project, engineConfig, description);
}
/**
 * Long-running process that collects the matched rows of the base column,
 * fetches their data extensions in batches and finally records a
 * {@link DataExtensionChange} in the project's history.
 */
public class ExtendDataProcess extends LongRunningProcess implements Runnable {
// Project being extended.
final protected Project _project;
// Engine configuration used to select the rows to visit.
final protected JSONObject _engineConfig;
// Cell index of the base column; set by populateRowsWithMatches().
protected int _cellIndex;
// Job that fetches the extension data; built from the enclosing operation's _extension.
protected FreebaseDataExtensionJob _job;
/**
 * @param project      the project to extend
 * @param engineConfig engine configuration used to select rows
 * @param description  description passed to the superclass
 * @throws JSONException if the data extension job cannot be built from the
 *                       enclosing operation's extension specification
 */
public ExtendDataProcess(
    Project project,
    JSONObject engineConfig,
    String description
) throws JSONException {
    super(description);

    this._project = project;
    this._engineConfig = engineConfig;
    this._job = new FreebaseDataExtensionJob(_extension);
}
/**
 * Writes the state of this process as a JSON object with the keys "id",
 * "description", "immediate", "status" and "progress". The status is
 * "pending" while no thread exists, "running" while the thread is alive and
 * "done" otherwise.
 *
 * @param writer  destination JSON writer
 * @param options not used by this method
 * @throws JSONException if the writer rejects a key or value
 */
public void write(JSONWriter writer, Properties options)
        throws JSONException {

    writer.object();
    writer.key("id").value(hashCode());
    writer.key("description").value(_description);
    writer.key("immediate").value(false);

    writer.key("status");
    final String status;
    if (_thread == null) {
        status = "pending";
    } else if (_thread.isAlive()) {
        status = "running";
    } else {
        status = "done";
    }
    writer.value(status);

    writer.key("progress").value(_progress);
    writer.endObject();
}
/**
 * Returns the work to execute for this process: the process itself, whose
 * {@code run()} method performs the extension.
 */
protected Runnable getRunnable() {
return this;
}
/**
 * Collects the indices of the filtered rows whose cell in the base column has
 * a recon with a match, and stores the base column's cell index in
 * {@code _cellIndex}. Rows visited as contextual rows are ignored.
 *
 * @param rowIndices list to which the matching row indices are appended
 * @throws Exception if the base column does not exist or the engine cannot be
 *                   initialized from its configuration
 */
protected void populateRowsWithMatches(List<Integer> rowIndices) throws Exception {
    Engine engine = new Engine(_project);
    engine.initializeFromJSON(_engineConfig);

    Column baseColumn = _project.columnModel.getColumnByName(_baseColumnName);
    if (baseColumn == null) {
        throw new Exception("No column named " + _baseColumnName);
    }

    _cellIndex = baseColumn.getCellIndex();

    // The visitor appends straight into the caller's list.
    final List<Integer> matchedRows = rowIndices;

    FilteredRows filteredRows = engine.getAllFilteredRows(false);
    filteredRows.accept(_project, new RowVisitor() {
        public boolean visit(Project project, int rowIndex, Row row, boolean includeContextual, boolean includeDependent) {
            if (includeContextual) {
                return false;
            }

            Cell cell = row.getCell(_cellIndex);
            boolean hasMatch = cell != null && cell.recon != null && cell.recon.match != null;
            if (hasMatch) {
                matchedRows.add(rowIndex);
            }
            return false;
        }
    });
}
/**
 * Fetches the data extensions for one batch of rows and appends them to
 * {@code dataExtensions}, one entry per row in batch order.
 * <p>
 * Rows are identified by the GUID of the topic their base-column cell is
 * matched to. A row whose GUID has no result gets a {@code null} entry. If
 * the lookup itself fails, the failure is reported on standard error and
 * every row of the batch gets {@code null}, so that later batches can still
 * be tried.
 *
 * @param rowIndices     indices of all rows to extend
 * @param dataExtensions list receiving one entry per processed row
 * @param from           first position in {@code rowIndices} to process (inclusive)
 * @param to             end position in {@code rowIndices} (exclusive)
 */
protected void extendRows(List<Integer> rowIndices, List<DataExtension> dataExtensions, int from, int to) {
    // Read each row's GUID exactly once so the lookup and the result
    // assignment below cannot disagree about which topic a row refers to.
    List<String> guidsInRowOrder = new ArrayList<String>();
    for (int i = from; i < to; i++) {
        int index = rowIndices.get(i);
        Row row = _project.rows.get(index);
        Cell cell = row.getCell(_cellIndex);

        guidsInRowOrder.add(cell.recon.match.topicGUID);
    }

    Set<String> guids = new HashSet<String>(guidsInRowOrder);

    Map<String, DataExtension> map = null;
    try {
        map = _job.extend(guids);
    } catch (Exception e) {
        // Best effort: keep going with no data for this batch, but leave a trace.
        e.printStackTrace();
        map = new HashMap<String, DataExtension>();
    }

    for (String guid : guidsInRowOrder) {
        // get() yields null for GUIDs without a result.
        dataExtensions.add(map.get(guid));
    }
}
/**
 * Performs the extension: finds the matched rows, fetches their data
 * extensions in batches of 20 with a short pause between batches, and, unless
 * the process was canceled, records a {@link DataExtensionChange} in the
 * project's history.
 * <p>
 * If the rows to extend cannot be determined (for example because the base
 * column does not exist), no history entry is recorded.
 */
public void run() {
    List<Integer> rowIndices = new ArrayList<Integer>();
    List<DataExtension> dataExtensions = new ArrayList<DataExtension>();

    try {
        populateRowsWithMatches(rowIndices);
    } catch (Exception e2) {
        // Without the list of rows there is nothing meaningful to record, so
        // stop here rather than adding a change built from incomplete data.
        e2.printStackTrace();

        // NOTE(review): assumes onDoneProcess is also the right way to retire a
        // failed process, as it is for a completed one below -- confirm.
        _project.processManager.onDoneProcess(this);
        return;
    }

    int start = 0;
    while (start < rowIndices.size()) {
        int end = Math.min(start + 20, rowIndices.size());

        extendRows(rowIndices, dataExtensions, start, end);
        start = end;

        _progress = end * 100 / rowIndices.size();

        try {
            // Pause between batches.
            Thread.sleep(200);
        } catch (InterruptedException e) {
            // An interrupt only ends the loop when the process was canceled.
            if (_canceled) {
                break;
            }
        }
    }

    if (!_canceled) {
        List<String> columnNames = new ArrayList<String>();
        for (ColumnInfo info : _job.columns) {
            columnNames.add(StringUtils.join(info.names, " - "));
        }

        HistoryEntry historyEntry = new HistoryEntry(
            _project,
            _description,
            ExtendDataOperation.this,
            new DataExtensionChange(
                _baseColumnName,
                _columnInsertIndex,
                columnNames,
                rowIndices,
                dataExtensions)
        );

        _project.history.addEntry(historyEntry);
        _project.processManager.onDoneProcess(this);
    }
}
}
}

View File

@ -34,6 +34,7 @@ public abstract class OperationRegistry {
register("column-addition", ColumnAdditionOperation.class);
register("column-removal", ColumnRemovalOperation.class);
register("extend-data", ExtendDataOperation.class);
register("row-star", RowStarOperation.class);

View File

@ -28,7 +28,7 @@ public class FreebaseDataExtensionJob {
static public class DataExtension {
final public Object[][] data;
protected DataExtension(Object[][] data) {
public DataExtension(Object[][] data) {
this.data = data;
}
}

View File

@ -1,5 +1,6 @@
function ExtendDataPreviewDialog(column, rowIndices, onDone) {
function ExtendDataPreviewDialog(column, columnIndex, rowIndices, onDone) {
this._column = column;
this._columnIndex = columnIndex;
this._rowIndices = rowIndices;
this._onDone = onDone;
this._extension = { properties: [] };
@ -33,15 +34,21 @@ function ExtendDataPreviewDialog(column, rowIndices, onDone) {
this._elmts = DOM.bind(html);
$('<button></button>').html("&nbsp;&nbsp;OK&nbsp;&nbsp;").click(function() {
DialogSystem.dismissUntil(self._level - 1);
self._onDone(self._previewWidget.getExpression(true));
if (self._extension.properties.length === 0) {
alert("Please add some properties first.");
} else {
DialogSystem.dismissUntil(self._level - 1);
self._onDone(self._extension);
}
}).appendTo(footer);
$('<button></button>').text("Cancel").click(function() {
DialogSystem.dismissUntil(self._level - 1);
}).appendTo(footer);
var dismissBusy = DialogSystem.showBusy();
ExtendDataPreviewDialog.getAllProperties(column.reconConfig.type.id, function(properties) {
dismissBusy();
self._show(properties);
});
};

View File

@ -821,11 +821,24 @@ DataTableColumnHeaderUI.prototype._doAddColumn = function(initialExpression) {
DataTableColumnHeaderUI.prototype._doAddColumnFromFreebase = function() {
if ("reconConfig" in this._column && "type" in this._column.reconConfig) {
var o = DataTableView.sampleVisibleRows(this._column);
var self = this;
new ExtendDataPreviewDialog(
this._column,
this._columnIndex,
o.rowIndices,
function() {}
function(extension) {
Gridworks.postProcess(
"extend-data",
{
baseColumnName: self._column.name,
columnInsertIndex: self._columnIndex + 1
},
{
extension: JSON.stringify(extension)
},
{ rowsChanged: true, modelsChanged: true }
);
}
);
} else {
alert("This column has not been reconciled yet.");