From 5e9be8c258435cf112b269a0010b5bbf5b94566f Mon Sep 17 00:00:00 2001 From: David Huynh Date: Mon, 22 Feb 2010 22:15:48 +0000 Subject: [PATCH] Support reusing newly created topics for cells with the same content. git-svn-id: http://google-refine.googlecode.com/svn/trunk@121 7d457c2a-affb-35e4-300a-418c747d4874 --- .../recon/ReconJudgeSimilarCellsCommand.java | 9 +- .../recon/ReconMarkNewTopicsCommand.java | 8 +- .../com/metaweb/gridworks/model/Recon.java | 5 ++ .../ReconJudgeSimilarCellsOperation.java | 87 +++++++++++-------- .../ReconMarkNewTopicsOperation.java | 52 ++++++++--- .../protograph/transpose/Transposer.java | 11 ++- .../TripleLoaderTransposedNodeFactory.java | 40 ++++++--- .../scripts/project/data-table-cell-ui.js | 2 +- .../project/data-table-column-header-ui.js | 8 +- 9 files changed, 149 insertions(+), 73 deletions(-) diff --git a/src/main/java/com/metaweb/gridworks/commands/recon/ReconJudgeSimilarCellsCommand.java b/src/main/java/com/metaweb/gridworks/commands/recon/ReconJudgeSimilarCellsCommand.java index 05a23b302..7eeef1e57 100644 --- a/src/main/java/com/metaweb/gridworks/commands/recon/ReconJudgeSimilarCellsCommand.java +++ b/src/main/java/com/metaweb/gridworks/commands/recon/ReconJudgeSimilarCellsCommand.java @@ -6,7 +6,9 @@ import org.json.JSONObject; import com.metaweb.gridworks.commands.EngineDependentCommand; import com.metaweb.gridworks.model.AbstractOperation; +import com.metaweb.gridworks.model.Recon; import com.metaweb.gridworks.model.ReconCandidate; +import com.metaweb.gridworks.model.Recon.Judgment; import com.metaweb.gridworks.operations.ReconJudgeSimilarCellsOperation; public class ReconJudgeSimilarCellsCommand extends EngineDependentCommand { @@ -17,7 +19,7 @@ public class ReconJudgeSimilarCellsCommand extends EngineDependentCommand { String columnName = request.getParameter("columnName"); String similarValue = request.getParameter("similarValue"); - String judgment = request.getParameter("judgment"); + Judgment judgment = Recon.stringToJudgment(request.getParameter("judgment")); ReconCandidate match = null; String topicID = request.getParameter("topicID"); @@ -33,12 +35,15 @@ public class ReconJudgeSimilarCellsCommand extends EngineDependentCommand { ); } + String shareNewTopics = request.getParameter("shareNewTopics"); + return new ReconJudgeSimilarCellsOperation( engineConfig, columnName, similarValue, judgment, - match + match, + "true".equals(shareNewTopics) ); } } diff --git a/src/main/java/com/metaweb/gridworks/commands/recon/ReconMarkNewTopicsCommand.java b/src/main/java/com/metaweb/gridworks/commands/recon/ReconMarkNewTopicsCommand.java index 164932c1d..c9fe69781 100644 --- a/src/main/java/com/metaweb/gridworks/commands/recon/ReconMarkNewTopicsCommand.java +++ b/src/main/java/com/metaweb/gridworks/commands/recon/ReconMarkNewTopicsCommand.java @@ -14,8 +14,10 @@ public class ReconMarkNewTopicsCommand extends EngineDependentCommand { protected AbstractOperation createOperation(HttpServletRequest request, JSONObject engineConfig) throws Exception { - String columnName = request.getParameter("columnName"); - - return new ReconMarkNewTopicsOperation(engineConfig, columnName); + return new ReconMarkNewTopicsOperation( + engineConfig, + request.getParameter("columnName"), + "true".equals(request.getParameter("shareNewTopics")) + ); } } diff --git a/src/main/java/com/metaweb/gridworks/model/Recon.java b/src/main/java/com/metaweb/gridworks/model/Recon.java index 4bc75618e..aecbf21e8 100644 --- a/src/main/java/com/metaweb/gridworks/model/Recon.java +++ b/src/main/java/com/metaweb/gridworks/model/Recon.java @@ -58,11 +58,16 @@ public class Recon implements Serializable, HasFields, Jsonizable { s_featureMap.put("nameWordDistance", Feature_nameWordDistance); } + final public long id; public Object[] features = new Object[Feature_max]; public List candidates = new LinkedList(); public Judgment judgment = Judgment.None; public ReconCandidate match = null; + public Recon() { + id = System.currentTimeMillis() * 1000000 + Math.round(Math.random() * 1000000); + } + public Recon dup() { Recon r = new Recon(); diff --git a/src/main/java/com/metaweb/gridworks/operations/ReconJudgeSimilarCellsOperation.java b/src/main/java/com/metaweb/gridworks/operations/ReconJudgeSimilarCellsOperation.java index db0779081..5ceb9f829 100644 --- a/src/main/java/com/metaweb/gridworks/operations/ReconJudgeSimilarCellsOperation.java +++ b/src/main/java/com/metaweb/gridworks/operations/ReconJudgeSimilarCellsOperation.java @@ -1,6 +1,8 @@ package com.metaweb.gridworks.operations; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Properties; import org.json.JSONArray; @@ -26,6 +28,7 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper final protected String _similarValue; final protected Judgment _judgment; final protected ReconCandidate _match; + final protected boolean _shareNewTopics; static public AbstractOperation reconstruct(Project project, JSONObject obj) throws Exception { JSONObject engineConfig = obj.getJSONObject("engineConfig"); @@ -59,7 +62,8 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper obj.getString("columnName"), obj.getString("similarValue"), judgment, - match + match, + obj.has("shareNewTopics") ? obj.getBoolean("shareNewTopics") : false ); } @@ -68,25 +72,14 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper String columnName, String similarValue, Judgment judgment, - ReconCandidate match + ReconCandidate match, + boolean shareNewTopics ) { super(engineConfig, columnName, false); this._similarValue = similarValue; this._judgment = judgment; this._match = match; - } - - public ReconJudgeSimilarCellsOperation( - JSONObject engineConfig, - String columnName, - String similarValue, - String judgmentString, - ReconCandidate match - ) { - super(engineConfig, columnName, false); - this._similarValue = similarValue; - this._judgment = Recon.stringToJudgment(judgmentString); - this._match = match; + this._shareNewTopics = shareNewTopics; } public void write(JSONWriter writer, Properties options) @@ -102,6 +95,7 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper if (_match != null) { writer.key("match"); _match.write(writer, options); } + writer.key("shareNewTopics"); writer.value(_shareNewTopics); writer.endObject(); } @@ -111,8 +105,13 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper return "Discard recon judgments for cells containing \"" + _similarValue + "\" in column " + _columnName; } else if (_judgment == Judgment.New) { - return "Mark to create new topics for cells containing \"" + - _similarValue + "\" in column " + _columnName; + if (_shareNewTopics) { + return "Mark to create one single new topic for all cells containing \"" + + _similarValue + "\" in column " + _columnName; + } else { + return "Mark to create one new topic for each cell containing \"" + + _similarValue + "\" in column " + _columnName; + } } else if (_judgment == Judgment.Matched) { return "Match topic " + _match.topicName + " (" + @@ -129,8 +128,13 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper return "Discard recon judgments for " + cellChanges.size() + " cells containing \"" + _similarValue + "\" in column " + _columnName; } else if (_judgment == Judgment.New) { - return "Mark to create new topics for " + cellChanges.size() + " cells containing \"" + - _similarValue + "\" in column " + _columnName; + if (_shareNewTopics) { + return "Mark to create one single new topic for " + cellChanges.size() + " cells containing \"" + + _similarValue + "\" in column " + _columnName; + } else { + return "Mark to create one new topic for each of " + cellChanges.size() + " cells containing \"" + + _similarValue + "\" in column " + _columnName; + } } else if (_judgment == Judgment.Matched) { return "Match topic " + _match.topicName + " (" + @@ -145,8 +149,9 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper Column column = project.columnModel.getColumnByName(_columnName); return new RowVisitor() { - int _cellIndex; - List _cellChanges; + int _cellIndex; + List _cellChanges; + Map _sharedRecons = new HashMap(); public RowVisitor init(int cellIndex, List cellChanges) { _cellIndex = cellIndex; @@ -160,20 +165,32 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper !ExpressionUtils.isBlank(cell.value) && _similarValue.equals(cell.value)) { - Cell newCell = new Cell( - cell.value, - cell.recon == null ? new Recon() : cell.recon.dup() - ); - - if (_judgment == Judgment.Matched) { - newCell.recon.judgment = Recon.Judgment.Matched; - newCell.recon.match = _match; - } else if (_judgment == Judgment.New) { - newCell.recon.judgment = Recon.Judgment.New; - } else if (_judgment == Judgment.None) { - newCell.recon.judgment = Recon.Judgment.None; - newCell.recon.match = null; - } + Recon recon = null; + if (_judgment == Judgment.New && _shareNewTopics) { + String s = cell.value.toString(); + if (_sharedRecons.containsKey(s)) { + recon = _sharedRecons.get(s); + } else { + recon = new Recon(); + recon.judgment = Judgment.New; + + _sharedRecons.put(s, recon); + } + } else { + recon = cell.recon == null ? new Recon() : cell.recon.dup(); + if (_judgment == Judgment.Matched) { + recon.judgment = Recon.Judgment.Matched; + recon.match = _match; + } else if (_judgment == Judgment.New) { + recon.judgment = Recon.Judgment.New; + recon.match = null; + } else if (_judgment == Judgment.None) { + recon.judgment = Recon.Judgment.None; + recon.match = null; + } + } + + Cell newCell = new Cell(cell.value, recon); CellChange cellChange = new CellChange(rowIndex, _cellIndex, cell, newCell); _cellChanges.add(cellChange); diff --git a/src/main/java/com/metaweb/gridworks/operations/ReconMarkNewTopicsOperation.java b/src/main/java/com/metaweb/gridworks/operations/ReconMarkNewTopicsOperation.java index 96f62e669..215621745 100644 --- a/src/main/java/com/metaweb/gridworks/operations/ReconMarkNewTopicsOperation.java +++ b/src/main/java/com/metaweb/gridworks/operations/ReconMarkNewTopicsOperation.java @@ -1,6 +1,8 @@ package com.metaweb.gridworks.operations; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Properties; import org.json.JSONException; @@ -20,18 +22,21 @@ import com.metaweb.gridworks.model.changes.CellChange; public class ReconMarkNewTopicsOperation extends EngineDependentMassCellOperation { private static final long serialVersionUID = -5205694623711144436L; + final protected boolean _shareNewTopics; + static public AbstractOperation reconstruct(Project project, JSONObject obj) throws Exception { JSONObject engineConfig = obj.getJSONObject("engineConfig"); - String columnName = obj.getString("columnName"); return new ReconMarkNewTopicsOperation( engineConfig, - columnName + obj.getString("columnName"), + obj.has("shareNewTopics") ? obj.getBoolean("shareNewTopics") : false ); } - public ReconMarkNewTopicsOperation(JSONObject engineConfig, String columnName) { + public ReconMarkNewTopicsOperation(JSONObject engineConfig, String columnName, boolean shareNewTopics) { super(engineConfig, columnName, false); + _shareNewTopics = shareNewTopics; } public void write(JSONWriter writer, Properties options) @@ -42,18 +47,25 @@ public class ReconMarkNewTopicsOperation extends EngineDependentMassCellOperatio writer.key("description"); writer.value(getBriefDescription()); writer.key("engineConfig"); writer.value(getEngineConfig()); writer.key("columnName"); writer.value(_columnName); + writer.key("shareNewTopics"); writer.value(_shareNewTopics); writer.endObject(); } protected String getBriefDescription() { - return "Mark to create new topics for cells in column " + _columnName; + return "Mark to create new topics for cells in column " + _columnName + + (_shareNewTopics ? + ", one topic for each group of similar cells" : + ", one topic for each cell"); } protected String createDescription(Column column, List cellChanges) { return "Mark to create new topics for " + cellChanges.size() + - " cells in column " + column.getHeaderLabel(); + " cells in column " + column.getHeaderLabel() + + (_shareNewTopics ? + ", one topic for each group of similar cells" : + ", one topic for each cell"); } protected RowVisitor createRowVisitor(Project project, List cellChanges) throws Exception { @@ -62,6 +74,7 @@ public class ReconMarkNewTopicsOperation extends EngineDependentMassCellOperatio return new RowVisitor() { int cellIndex; List cellChanges; + Map _sharedRecons = new HashMap(); public RowVisitor init(int cellIndex, List cellChanges) { this.cellIndex = cellIndex; @@ -70,15 +83,26 @@ public class ReconMarkNewTopicsOperation extends EngineDependentMassCellOperatio } public boolean visit(Project project, int rowIndex, Row row, boolean contextual) { - if (cellIndex < row.cells.size()) { - Cell cell = row.cells.get(cellIndex); - - Cell newCell = new Cell( - cell.value, - cell.recon != null ? cell.recon.dup() : new Recon() - ); - newCell.recon.match = null; - newCell.recon.judgment = Judgment.New; + Cell cell = row.getCell(cellIndex); + if (cell != null) { + Recon recon = null; + if (_shareNewTopics) { + String s = cell.value == null ? "" : cell.value.toString(); + if (_sharedRecons.containsKey(s)) { + recon = _sharedRecons.get(s); + } else { + recon = new Recon(); + recon.judgment = Judgment.New; + + _sharedRecons.put(s, recon); + } + } else { + recon = cell.recon == null ? new Recon() : cell.recon.dup(); + recon.match = null; + recon.judgment = Judgment.New; + } + + Cell newCell = new Cell(cell.value, recon); CellChange cellChange = new CellChange(rowIndex, cellIndex, cell, newCell); cellChanges.add(cellChange); diff --git a/src/main/java/com/metaweb/gridworks/protograph/transpose/Transposer.java b/src/main/java/com/metaweb/gridworks/protograph/transpose/Transposer.java index c5f462d4d..fe0f39fb2 100644 --- a/src/main/java/com/metaweb/gridworks/protograph/transpose/Transposer.java +++ b/src/main/java/com/metaweb/gridworks/protograph/transpose/Transposer.java @@ -8,8 +8,10 @@ import com.metaweb.gridworks.model.Cell; import com.metaweb.gridworks.model.Column; import com.metaweb.gridworks.model.Project; import com.metaweb.gridworks.model.Row; +import com.metaweb.gridworks.model.Recon.Judgment; import com.metaweb.gridworks.protograph.AnonymousNode; import com.metaweb.gridworks.protograph.CellNode; +import com.metaweb.gridworks.protograph.CellTopicNode; import com.metaweb.gridworks.protograph.FreebaseProperty; import com.metaweb.gridworks.protograph.FreebaseTopicNode; import com.metaweb.gridworks.protograph.Link; @@ -25,7 +27,7 @@ public class Transposer { Node rootNode, TransposedNodeFactory nodeFactory ) { - Context rootContext = new Context(rootNode, null, null, 5); + Context rootContext = new Context(rootNode, null, null, 20); for (Row row : project.rows) { descend(project, protograph, nodeFactory, row, rootNode, rootContext); @@ -53,6 +55,13 @@ public class Transposer { Column column = project.columnModel.getColumnByName(node2.columnName); Cell cell = row.getCell(column.getCellIndex()); if (cell != null && !ExpressionUtils.isBlank(cell.value)) { + if (node2 instanceof CellTopicNode) { + if (!((CellTopicNode) node2).createForNoReconMatch && + (cell.recon == null || cell.recon.judgment == Judgment.None)) { + return; + } + } + context.count++; if (context.limit > 0 && context.count > context.limit) { return; diff --git a/src/main/java/com/metaweb/gridworks/protograph/transpose/TripleLoaderTransposedNodeFactory.java b/src/main/java/com/metaweb/gridworks/protograph/transpose/TripleLoaderTransposedNodeFactory.java index 1361aa59d..5b3b1ef3a 100644 --- a/src/main/java/com/metaweb/gridworks/protograph/transpose/TripleLoaderTransposedNodeFactory.java +++ b/src/main/java/com/metaweb/gridworks/protograph/transpose/TripleLoaderTransposedNodeFactory.java @@ -9,6 +9,7 @@ import org.json.JSONObject; import com.metaweb.gridworks.model.Cell; import com.metaweb.gridworks.model.Recon; +import com.metaweb.gridworks.model.Recon.Judgment; import com.metaweb.gridworks.protograph.AnonymousNode; import com.metaweb.gridworks.protograph.CellKeyNode; import com.metaweb.gridworks.protograph.CellNode; @@ -22,6 +23,7 @@ public class TripleLoaderTransposedNodeFactory implements TransposedNodeFactory protected List rootNodes = new LinkedList(); protected StringBuffer stringBuffer; protected Map varPool = new HashMap(); + protected Map newTopicVars = new HashMap(); public String getLoad() { stringBuffer = new StringBuffer(); @@ -38,7 +40,7 @@ public class TripleLoaderTransposedNodeFactory implements TransposedNodeFactory stringBuffer.append(line); } protected void writeLine(String subject, String predicate, String object) { - if (subject != null) { + if (subject != null && object != null) { writeLine("{ 's' : '" + subject + "', 'p' : '" + predicate + "', 'o' : " + object + " }"); } } @@ -110,21 +112,33 @@ public class TripleLoaderTransposedNodeFactory implements TransposedNodeFactory public String write(String subject, String predicate) { String id = null; - if (cell.recon != null && + if (cell.recon != null && cell.recon.judgment == Recon.Judgment.Matched && cell.recon.match != null) { - id = cell.recon.match.topicID; - } else { - long var = 0; - if (varPool.containsKey(node.columnName)) { - var = varPool.get(node.columnName); - } - varPool.put(node.columnName, var + 1); - - id = "$" + node.columnName.replaceAll("\\W+", "_") + "_" + var; - writeLine("{ 's' : '" + id + "', 'p' : 'type', 'o' : '" + node.type.id + "' }"); - writeLine("{ 's' : '" + id + "', 'p' : 'name', 'o' : " + JSONObject.quote(cell.value.toString()) + " }"); + id = cell.recon.match.topicID; + } else if (node.createForNoReconMatch || + (cell.recon != null && cell.recon.judgment == Judgment.New)) { + if (cell.recon != null && newTopicVars.containsKey(cell.recon.id)) { + id = newTopicVars.get(cell.recon.id); + } else { + long var = 0; + if (varPool.containsKey(node.columnName)) { + var = varPool.get(node.columnName); + } + varPool.put(node.columnName, var + 1); + + id = "$" + node.columnName.replaceAll("\\W+", "_") + "_" + var; + + writeLine("{ 's' : '" + id + "', 'p' : 'type', 'o' : '" + node.type.id + "' }"); + writeLine("{ 's' : '" + id + "', 'p' : 'name', 'o' : " + JSONObject.quote(cell.value.toString()) + " }"); + + if (cell.recon != null) { + newTopicVars.put(cell.recon.id, id); + } + } + } else { + return null; } if (subject != null) { diff --git a/src/main/webapp/scripts/project/data-table-cell-ui.js b/src/main/webapp/scripts/project/data-table-cell-ui.js index 9978725d6..7038a6851 100644 --- a/src/main/webapp/scripts/project/data-table-cell-ui.js +++ b/src/main/webapp/scripts/project/data-table-cell-ui.js @@ -130,7 +130,7 @@ DataTableCellUI.prototype._doMatchNewTopicToOneCell = function() { }; DataTableCellUI.prototype._doMatchNewTopicToSimilarCells = function() { - this._doJudgmentForSimilarCells("new"); + this._doJudgmentForSimilarCells("new", { shareNewTopics: true }); }; DataTableCellUI.prototype._doMatchTopicToOneCell = function(candidate) { diff --git a/src/main/webapp/scripts/project/data-table-column-header-ui.js b/src/main/webapp/scripts/project/data-table-column-header-ui.js index 1ee90eceb..781c01b02 100644 --- a/src/main/webapp/scripts/project/data-table-column-header-ui.js +++ b/src/main/webapp/scripts/project/data-table-column-header-ui.js @@ -209,8 +209,8 @@ DataTableColumnHeaderUI.prototype._createMenuForColumnHeader = function(elmt) { } }, { - label: "Create One New Topic for All Cells", - tooltip: "Mark to create one new, common topic for all cells in this column for all current filtered rows", + label: "Create One New Topic for Similar Cells", + tooltip: "Mark to create one new topic for each group of similar cells in this column for all current filtered rows", click: function() { self._doReconMarkNewTopics(true); } @@ -459,10 +459,10 @@ DataTableColumnHeaderUI.prototype._doReconMatchBestCandidates = function() { ); }; -DataTableColumnHeaderUI.prototype._doReconMarkNewTopics = function() { +DataTableColumnHeaderUI.prototype._doReconMarkNewTopics = function(shareNewTopics) { this._dataTableView.doPostThenUpdate( "recon-mark-new-topics", - { columnName: this._column.headerLabel } + { columnName: this._column.headerLabel, shareNewTopics: shareNewTopics } ); };