From 1f69fba43c58e533645745ed2c889fdf720632fe Mon Sep 17 00:00:00 2001 From: David Huynh Date: Sun, 22 Aug 2010 23:55:07 +0000 Subject: [PATCH] Added command Add Column by Fetching URLs. git-svn-id: http://google-refine.googlecode.com/svn/trunk@1203 7d457c2a-affb-35e4-300a-418c747d4874 --- .../AddColumnByFetchingURLsCommand.java | 36 +++ .../operations/OperationRegistry.java | 2 + ...ColumnAdditionByFetchingURLsOperation.java | 291 ++++++++++++++++++ .../webapp/modules/core/MOD-INF/controller.js | 1 + .../add-column-by-fetching-urls-dialog.html | 25 ++ .../views/data-table/menu-edit-column.js | 50 +++ 6 files changed, 405 insertions(+) create mode 100644 main/src/com/google/gridworks/commands/column/AddColumnByFetchingURLsCommand.java create mode 100644 main/src/com/google/gridworks/operations/column/ColumnAdditionByFetchingURLsOperation.java create mode 100644 main/webapp/modules/core/scripts/views/data-table/add-column-by-fetching-urls-dialog.html diff --git a/main/src/com/google/gridworks/commands/column/AddColumnByFetchingURLsCommand.java b/main/src/com/google/gridworks/commands/column/AddColumnByFetchingURLsCommand.java new file mode 100644 index 000000000..b78e01b49 --- /dev/null +++ b/main/src/com/google/gridworks/commands/column/AddColumnByFetchingURLsCommand.java @@ -0,0 +1,36 @@ +package com.google.gridworks.commands.column; + +import javax.servlet.http.HttpServletRequest; + +import org.json.JSONObject; + +import com.google.gridworks.commands.EngineDependentCommand; +import com.google.gridworks.model.AbstractOperation; +import com.google.gridworks.model.Project; +import com.google.gridworks.operations.cell.TextTransformOperation; +import com.google.gridworks.operations.column.ColumnAdditionByFetchingURLsOperation; + +public class AddColumnByFetchingURLsCommand extends EngineDependentCommand { + @Override + protected AbstractOperation createOperation(Project project, + HttpServletRequest request, JSONObject engineConfig) throws Exception { + + String 
baseColumnName = request.getParameter("baseColumnName"); + String urlExpression = request.getParameter("urlExpression"); + String newColumnName = request.getParameter("newColumnName"); + int columnInsertIndex = Integer.parseInt(request.getParameter("columnInsertIndex")); + int delay = Integer.parseInt(request.getParameter("delay")); + String onError = request.getParameter("onError"); + + return new ColumnAdditionByFetchingURLsOperation( + engineConfig, + baseColumnName, + urlExpression, + TextTransformOperation.stringToOnError(onError), + newColumnName, + columnInsertIndex, + delay + ); + } + +} diff --git a/main/src/com/google/gridworks/operations/OperationRegistry.java b/main/src/com/google/gridworks/operations/OperationRegistry.java index e661bc0b5..25a07b769 100644 --- a/main/src/com/google/gridworks/operations/OperationRegistry.java +++ b/main/src/com/google/gridworks/operations/OperationRegistry.java @@ -16,6 +16,7 @@ import com.google.gridworks.operations.cell.MultiValuedCellSplitOperation; import com.google.gridworks.operations.cell.TextTransformOperation; import com.google.gridworks.operations.cell.TransposeColumnsIntoRowsOperation; import com.google.gridworks.operations.cell.TransposeRowsIntoColumnsOperation; +import com.google.gridworks.operations.column.ColumnAdditionByFetchingURLsOperation; import com.google.gridworks.operations.column.ColumnAdditionOperation; import com.google.gridworks.operations.column.ColumnMoveOperation; import com.google.gridworks.operations.column.ColumnRemovalOperation; @@ -66,6 +67,7 @@ public abstract class OperationRegistry { register("column-move", ColumnMoveOperation.class); register("column-split", ColumnSplitOperation.class); register("extend-data", ExtendDataOperation.class); + register("column-addition-by-fetching-urls", ColumnAdditionByFetchingURLsOperation.class); register("row-removal", RowRemovalOperation.class); register("row-star", RowStarOperation.class); diff --git 
a/main/src/com/google/gridworks/operations/column/ColumnAdditionByFetchingURLsOperation.java b/main/src/com/google/gridworks/operations/column/ColumnAdditionByFetchingURLsOperation.java new file mode 100644 index 000000000..8c895bdec --- /dev/null +++ b/main/src/com/google/gridworks/operations/column/ColumnAdditionByFetchingURLsOperation.java @@ -0,0 +1,291 @@ +package com.google.gridworks.operations.column; + +import java.io.InputStream; +import java.io.Serializable; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; + +import org.json.JSONException; +import org.json.JSONObject; +import org.json.JSONWriter; + +import com.google.gridworks.browsing.Engine; +import com.google.gridworks.browsing.FilteredRows; +import com.google.gridworks.browsing.RowVisitor; +import com.google.gridworks.expr.EvalError; +import com.google.gridworks.expr.Evaluable; +import com.google.gridworks.expr.ExpressionUtils; +import com.google.gridworks.expr.MetaParser; +import com.google.gridworks.expr.WrappedCell; +import com.google.gridworks.history.HistoryEntry; +import com.google.gridworks.model.AbstractOperation; +import com.google.gridworks.model.Cell; +import com.google.gridworks.model.Column; +import com.google.gridworks.model.Project; +import com.google.gridworks.model.Row; +import com.google.gridworks.model.changes.CellAtRow; +import com.google.gridworks.model.changes.ColumnAdditionChange; +import com.google.gridworks.operations.EngineDependentOperation; +import com.google.gridworks.operations.OnError; +import com.google.gridworks.operations.OperationRegistry; +import com.google.gridworks.operations.cell.TextTransformOperation; +import com.google.gridworks.process.LongRunningProcess; +import com.google.gridworks.process.Process; +import com.google.gridworks.util.ParsingUtilities; + +public class ColumnAdditionByFetchingURLsOperation extends EngineDependentOperation { + final protected String 
_baseColumnName; + final protected String _urlExpression; + final protected OnError _onError; + + final protected String _newColumnName; + final protected int _columnInsertIndex; + final protected int _delay; + + static public AbstractOperation reconstruct(Project project, JSONObject obj) throws Exception { + JSONObject engineConfig = obj.getJSONObject("engineConfig"); + + return new ColumnAdditionByFetchingURLsOperation( + engineConfig, + obj.getString("baseColumnName"), + obj.getString("urlExpression"), + TextTransformOperation.stringToOnError(obj.getString("onError")), + obj.getString("newColumnName"), + obj.getInt("columnInsertIndex"), + obj.getInt("delay") + ); + } + + public ColumnAdditionByFetchingURLsOperation( + JSONObject engineConfig, + String baseColumnName, + String urlExpression, + OnError onError, + String newColumnName, + int columnInsertIndex, + int delay + ) { + super(engineConfig); + + _baseColumnName = baseColumnName; + _urlExpression = urlExpression; + _onError = onError; + + _newColumnName = newColumnName; + _columnInsertIndex = columnInsertIndex; + + _delay = delay; + } + + public void write(JSONWriter writer, Properties options) + throws JSONException { + + writer.object(); + writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass())); + writer.key("description"); writer.value(getBriefDescription(null)); + writer.key("engineConfig"); writer.value(getEngineConfig()); + writer.key("newColumnName"); writer.value(_newColumnName); + writer.key("columnInsertIndex"); writer.value(_columnInsertIndex); + writer.key("baseColumnName"); writer.value(_baseColumnName); + writer.key("urlExpression"); writer.value(_urlExpression); + writer.key("onError"); writer.value(TextTransformOperation.onErrorToString(_onError)); + writer.key("delay"); writer.value(_delay); + writer.endObject(); + } + + protected String getBriefDescription(Project project) { + return "Create column " + _newColumnName + + " at index " + _columnInsertIndex + + 
" by fetching URLs based on column " + _baseColumnName + + " using expression " + _urlExpression; + } + + protected String createDescription(Column column, List cellsAtRows) { + return "Create new column " + _newColumnName + + ", filling " + cellsAtRows.size() + + " rows by fetching URLs based on column " + column.getName() + + " and formulated as " + _urlExpression; + } + + + public Process createProcess(Project project, Properties options) throws Exception { + Column column = project.columnModel.getColumnByName(_baseColumnName); + if (column == null) { + throw new Exception("No column named " + _baseColumnName); + } + if (project.columnModel.getColumnByName(_newColumnName) != null) { + throw new Exception("Another column already named " + _newColumnName); + } + + Engine engine = createEngine(project); + engine.initializeFromJSON(_engineConfig); + + Evaluable eval = MetaParser.parse(_urlExpression); + + return new ColumnAdditionByFetchingURLsProcess( + project, + engine, + eval, + getBriefDescription(null) + ); + } + + public class ColumnAdditionByFetchingURLsProcess extends LongRunningProcess implements Runnable { + final protected Project _project; + final protected Engine _engine; + final protected Evaluable _eval; + final protected long _historyEntryID; + protected int _cellIndex; + + public ColumnAdditionByFetchingURLsProcess( + Project project, + Engine engine, + Evaluable eval, + String description + ) throws JSONException { + super(description); + _project = project; + _engine = engine; + _eval = eval; + _historyEntryID = HistoryEntry.allocateID(); + } + + public void write(JSONWriter writer, Properties options) + throws JSONException { + + writer.object(); + writer.key("id"); writer.value(hashCode()); + writer.key("description"); writer.value(_description); + writer.key("immediate"); writer.value(false); + writer.key("status"); writer.value(_thread == null ? "pending" : (_thread.isAlive() ? 
"running" : "done")); + writer.key("progress"); writer.value(_progress); + writer.endObject(); + } + + protected Runnable getRunnable() { + return this; + } + + public void run() { + List urls = new ArrayList(_project.rows.size()); + + FilteredRows filteredRows = _engine.getAllFilteredRows(); + filteredRows.accept(_project, createRowVisitor(urls)); + + List responseBodies = new ArrayList(urls.size()); + for (int i = 0; i < urls.size(); i++) { + CellAtRow urlData = urls.get(i); + CellAtRow cellAtRow = fetch(urlData); + if (cellAtRow != null) { + responseBodies.add(cellAtRow); + } + + _progress = i * 100 / urls.size(); + try { + Thread.sleep(_delay); + } catch (InterruptedException e) { + if (_canceled) { + break; + } + } + } + + if (!_canceled) { + + HistoryEntry historyEntry = new HistoryEntry( + _historyEntryID, + _project, + _description, + ColumnAdditionByFetchingURLsOperation.this, + new ColumnAdditionChange( + _newColumnName, + _columnInsertIndex, + responseBodies) + ); + + _project.history.addEntry(historyEntry); + _project.processManager.onDoneProcess(this); + } + } + + CellAtRow fetch(CellAtRow urlData) { + String urlString = urlData.cell.value.toString(); + URL url = null; + + try { + url = new URL(urlString); + } catch (MalformedURLException e) { + return null; + } + + try { + InputStream is = url.openStream(); + try { + return new CellAtRow(urlData.row, new Cell(ParsingUtilities.inputStreamToString(is), null)); + } finally { + is.close(); + } + } catch (Exception e) { + return _onError == OnError.StoreError ? 
+ new CellAtRow(urlData.row, new Cell(new EvalError(e.getMessage()), null)) : null; + } + } + + RowVisitor createRowVisitor(List cellsAtRows) { + return new RowVisitor() { + int cellIndex; + Properties bindings; + List cellsAtRows; + + public RowVisitor init(List cellsAtRows) { + Column column = _project.columnModel.getColumnByName(_baseColumnName); + + this.cellIndex = column.getCellIndex(); + this.bindings = ExpressionUtils.createBindings(_project); + this.cellsAtRows = cellsAtRows; + return this; + } + + @Override + public void start(Project project) { + // nothing to do + } + + @Override + public void end(Project project) { + // nothing to do + } + + public boolean visit(Project project, int rowIndex, Row row) { + Cell cell = row.getCell(cellIndex); + Cell newCell = null; + + ExpressionUtils.bind(bindings, row, rowIndex, _baseColumnName, cell); + + Object o = _eval.evaluate(bindings); + if (o != null) { + if (o instanceof Cell) { + newCell = (Cell) o; + } else if (o instanceof WrappedCell) { + newCell = ((WrappedCell) o).cell; + } else { + Serializable v = ExpressionUtils.wrapStorable(o); + if (ExpressionUtils.isNonBlankData(v)) { + newCell = new Cell(v.toString(), null); + } + } + } + + if (newCell != null) { + cellsAtRows.add(new CellAtRow(rowIndex, newCell)); + } + + return false; + } + }.init(cellsAtRows); + } + } +} diff --git a/main/webapp/modules/core/MOD-INF/controller.js b/main/webapp/modules/core/MOD-INF/controller.js index 033ac1789..32627b9c9 100644 --- a/main/webapp/modules/core/MOD-INF/controller.js +++ b/main/webapp/modules/core/MOD-INF/controller.js @@ -55,6 +55,7 @@ function registerCommands() { GS.registerCommand(module, "move-column", new Packages.com.google.gridworks.commands.column.MoveColumnCommand()); GS.registerCommand(module, "split-column", new Packages.com.google.gridworks.commands.column.SplitColumnCommand()); GS.registerCommand(module, "extend-data", new Packages.com.google.gridworks.commands.column.ExtendDataCommand()); + 
GS.registerCommand(module, "add-column-by-fetching-urls", new Packages.com.google.gridworks.commands.column.AddColumnByFetchingURLsCommand()); GS.registerCommand(module, "denormalize", new Packages.com.google.gridworks.commands.row.DenormalizeCommand()); diff --git a/main/webapp/modules/core/scripts/views/data-table/add-column-by-fetching-urls-dialog.html b/main/webapp/modules/core/scripts/views/data-table/add-column-by-fetching-urls-dialog.html new file mode 100644 index 000000000..db1587ad2 --- /dev/null +++ b/main/webapp/modules/core/scripts/views/data-table/add-column-by-fetching-urls-dialog.html @@ -0,0 +1,25 @@ +
+
+
+
+ + + + + + + + + + + + + +
New column nameThrottle delay milliseconds
On error set to blank + store error

Formulate the URLs to fetch:

$EXPRESSION_PREVIEW_WIDGET$
+
+ +
// Shows the "Add column by fetching URLs" dialog for `column` and, on OK, posts the
// "add-column-by-fetching-urls" process with the values entered by the user.
// `column` and `columnIndex` come from the enclosing scope.
var doAddColumnByFetchingURLs = function() {
    var frame = $(
        DOM.loadHTML("core", "scripts/views/data-table/add-column-by-fetching-urls-dialog.html")
            .replace("$EXPRESSION_PREVIEW_WIDGET$", ExpressionPreviewDialog.generateWidgetHtml()));

    var elmts = DOM.bind(frame);
    elmts.dialogHeader.text("Add column by fetching URLs based on column " + column.name);

    var level = DialogSystem.showDialog(frame);
    var dismiss = function() { DialogSystem.dismissUntil(level - 1); };

    // Created before the handlers are wired so the OK handler never depends on
    // declaration hoisting to see the widget.
    var o = DataTableView.sampleVisibleRows(column);
    var previewWidget = new ExpressionPreviewDialog.Widget(
        elmts,
        column.cellIndex,
        o.rowIndices,
        o.values,
        null
    );

    elmts.cancelButton.click(dismiss);
    elmts.okButton.click(function() {
        var columnName = $.trim(elmts.columnNameInput[0].value);
        if (!columnName.length) {
            alert("You must enter a column name.");
            return;
        }

        // The server parses the delay as an integer, so reject anything else here
        // instead of letting the request fail.
        var delayText = $.trim(elmts.throttleDelayInput[0].value);
        if (!/^\d+$/.test(delayText)) {
            alert("The throttle delay must be a non-negative whole number of milliseconds.");
            return;
        }

        // Indexing [0] on an empty selection would throw, so check first.
        var checkedChoice = $('input[name="dialog-onerror-choice"]:checked');
        if (!checkedChoice.length) {
            alert("You must choose what to do on error.");
            return;
        }

        Gridworks.postCoreProcess(
            "add-column-by-fetching-urls",
            {
                baseColumnName: column.name,
                urlExpression: previewWidget.getExpression(true),
                newColumnName: columnName,
                columnInsertIndex: columnIndex + 1,
                delay: parseInt(delayText, 10),
                onError: checkedChoice[0].value
            },
            null,
            { modelsChanged: true }
        );
        dismiss();
    });
};
Columns ...", click: doSplitColumn }, + {}, { label: "Add Column Based on This Column ...", click: doAddColumn @@ -192,6 +238,10 @@ DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) { label: "Add Columns From Freebase ...", click: doAddColumnFromFreebase }, + { + label: "Add Column By Fetching URLs ...", + click: doAddColumnByFetchingURLs + }, {}, { label: "Rename This Column",