Support reusing newly created topics for cells with the same content.

git-svn-id: http://google-refine.googlecode.com/svn/trunk@121 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
David Huynh 2010-02-22 22:15:48 +00:00
parent e4b01cb36c
commit 5e9be8c258
9 changed files with 149 additions and 73 deletions

View File

@ -6,7 +6,9 @@ import org.json.JSONObject;
import com.metaweb.gridworks.commands.EngineDependentCommand; import com.metaweb.gridworks.commands.EngineDependentCommand;
import com.metaweb.gridworks.model.AbstractOperation; import com.metaweb.gridworks.model.AbstractOperation;
import com.metaweb.gridworks.model.Recon;
import com.metaweb.gridworks.model.ReconCandidate; import com.metaweb.gridworks.model.ReconCandidate;
import com.metaweb.gridworks.model.Recon.Judgment;
import com.metaweb.gridworks.operations.ReconJudgeSimilarCellsOperation; import com.metaweb.gridworks.operations.ReconJudgeSimilarCellsOperation;
public class ReconJudgeSimilarCellsCommand extends EngineDependentCommand { public class ReconJudgeSimilarCellsCommand extends EngineDependentCommand {
@ -17,7 +19,7 @@ public class ReconJudgeSimilarCellsCommand extends EngineDependentCommand {
String columnName = request.getParameter("columnName"); String columnName = request.getParameter("columnName");
String similarValue = request.getParameter("similarValue"); String similarValue = request.getParameter("similarValue");
String judgment = request.getParameter("judgment"); Judgment judgment = Recon.stringToJudgment(request.getParameter("judgment"));
ReconCandidate match = null; ReconCandidate match = null;
String topicID = request.getParameter("topicID"); String topicID = request.getParameter("topicID");
@ -33,12 +35,15 @@ public class ReconJudgeSimilarCellsCommand extends EngineDependentCommand {
); );
} }
String shareNewTopics = request.getParameter("shareNewTopics");
return new ReconJudgeSimilarCellsOperation( return new ReconJudgeSimilarCellsOperation(
engineConfig, engineConfig,
columnName, columnName,
similarValue, similarValue,
judgment, judgment,
match match,
"true".equals(shareNewTopics)
); );
} }
} }

View File

@ -14,8 +14,10 @@ public class ReconMarkNewTopicsCommand extends EngineDependentCommand {
protected AbstractOperation createOperation(HttpServletRequest request, protected AbstractOperation createOperation(HttpServletRequest request,
JSONObject engineConfig) throws Exception { JSONObject engineConfig) throws Exception {
String columnName = request.getParameter("columnName"); return new ReconMarkNewTopicsOperation(
engineConfig,
return new ReconMarkNewTopicsOperation(engineConfig, columnName); request.getParameter("columnName"),
"true".equals(request.getParameter("shareNewTopics"))
);
} }
} }

View File

@ -58,11 +58,16 @@ public class Recon implements Serializable, HasFields, Jsonizable {
s_featureMap.put("nameWordDistance", Feature_nameWordDistance); s_featureMap.put("nameWordDistance", Feature_nameWordDistance);
} }
final public long id;
public Object[] features = new Object[Feature_max]; public Object[] features = new Object[Feature_max];
public List<ReconCandidate> candidates = new LinkedList<ReconCandidate>(); public List<ReconCandidate> candidates = new LinkedList<ReconCandidate>();
public Judgment judgment = Judgment.None; public Judgment judgment = Judgment.None;
public ReconCandidate match = null; public ReconCandidate match = null;
public Recon() {
id = System.currentTimeMillis() * 1000000 + Math.round(Math.random() * 1000000);
}
public Recon dup() { public Recon dup() {
Recon r = new Recon(); Recon r = new Recon();

View File

@ -1,6 +1,8 @@
package com.metaweb.gridworks.operations; package com.metaweb.gridworks.operations;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Properties; import java.util.Properties;
import org.json.JSONArray; import org.json.JSONArray;
@ -26,6 +28,7 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper
final protected String _similarValue; final protected String _similarValue;
final protected Judgment _judgment; final protected Judgment _judgment;
final protected ReconCandidate _match; final protected ReconCandidate _match;
final protected boolean _shareNewTopics;
static public AbstractOperation reconstruct(Project project, JSONObject obj) throws Exception { static public AbstractOperation reconstruct(Project project, JSONObject obj) throws Exception {
JSONObject engineConfig = obj.getJSONObject("engineConfig"); JSONObject engineConfig = obj.getJSONObject("engineConfig");
@ -59,7 +62,8 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper
obj.getString("columnName"), obj.getString("columnName"),
obj.getString("similarValue"), obj.getString("similarValue"),
judgment, judgment,
match match,
obj.has("shareNewTopics") ? obj.getBoolean("shareNewTopics") : false
); );
} }
@ -68,25 +72,14 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper
String columnName, String columnName,
String similarValue, String similarValue,
Judgment judgment, Judgment judgment,
ReconCandidate match ReconCandidate match,
boolean shareNewTopics
) { ) {
super(engineConfig, columnName, false); super(engineConfig, columnName, false);
this._similarValue = similarValue; this._similarValue = similarValue;
this._judgment = judgment; this._judgment = judgment;
this._match = match; this._match = match;
} this._shareNewTopics = shareNewTopics;
public ReconJudgeSimilarCellsOperation(
JSONObject engineConfig,
String columnName,
String similarValue,
String judgmentString,
ReconCandidate match
) {
super(engineConfig, columnName, false);
this._similarValue = similarValue;
this._judgment = Recon.stringToJudgment(judgmentString);
this._match = match;
} }
public void write(JSONWriter writer, Properties options) public void write(JSONWriter writer, Properties options)
@ -102,6 +95,7 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper
if (_match != null) { if (_match != null) {
writer.key("match"); _match.write(writer, options); writer.key("match"); _match.write(writer, options);
} }
writer.key("shareNewTopics"); writer.value(_shareNewTopics);
writer.endObject(); writer.endObject();
} }
@ -111,8 +105,13 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper
return "Discard recon judgments for cells containing \"" + return "Discard recon judgments for cells containing \"" +
_similarValue + "\" in column " + _columnName; _similarValue + "\" in column " + _columnName;
} else if (_judgment == Judgment.New) { } else if (_judgment == Judgment.New) {
return "Mark to create new topics for cells containing \"" + if (_shareNewTopics) {
_similarValue + "\" in column " + _columnName; return "Mark to create one single new topic for all cells containing \"" +
_similarValue + "\" in column " + _columnName;
} else {
return "Mark to create one new topic for each cell containing \"" +
_similarValue + "\" in column " + _columnName;
}
} else if (_judgment == Judgment.Matched) { } else if (_judgment == Judgment.Matched) {
return "Match topic " + return "Match topic " +
_match.topicName + " (" + _match.topicName + " (" +
@ -129,8 +128,13 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper
return "Discard recon judgments for " + cellChanges.size() + " cells containing \"" + return "Discard recon judgments for " + cellChanges.size() + " cells containing \"" +
_similarValue + "\" in column " + _columnName; _similarValue + "\" in column " + _columnName;
} else if (_judgment == Judgment.New) { } else if (_judgment == Judgment.New) {
return "Mark to create new topics for " + cellChanges.size() + " cells containing \"" + if (_shareNewTopics) {
_similarValue + "\" in column " + _columnName; return "Mark to create one single new topic for " + cellChanges.size() + " cells containing \"" +
_similarValue + "\" in column " + _columnName;
} else {
return "Mark to create one new topic for each of " + cellChanges.size() + " cells containing \"" +
_similarValue + "\" in column " + _columnName;
}
} else if (_judgment == Judgment.Matched) { } else if (_judgment == Judgment.Matched) {
return "Match topic " + return "Match topic " +
_match.topicName + " (" + _match.topicName + " (" +
@ -145,8 +149,9 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper
Column column = project.columnModel.getColumnByName(_columnName); Column column = project.columnModel.getColumnByName(_columnName);
return new RowVisitor() { return new RowVisitor() {
int _cellIndex; int _cellIndex;
List<CellChange> _cellChanges; List<CellChange> _cellChanges;
Map<String, Recon> _sharedRecons = new HashMap<String, Recon>();
public RowVisitor init(int cellIndex, List<CellChange> cellChanges) { public RowVisitor init(int cellIndex, List<CellChange> cellChanges) {
_cellIndex = cellIndex; _cellIndex = cellIndex;
@ -160,20 +165,32 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper
!ExpressionUtils.isBlank(cell.value) && !ExpressionUtils.isBlank(cell.value) &&
_similarValue.equals(cell.value)) { _similarValue.equals(cell.value)) {
Cell newCell = new Cell( Recon recon = null;
cell.value, if (_judgment == Judgment.New && _shareNewTopics) {
cell.recon == null ? new Recon() : cell.recon.dup() String s = cell.value.toString();
); if (_sharedRecons.containsKey(s)) {
recon = _sharedRecons.get(s);
} else {
recon = new Recon();
recon.judgment = Judgment.New;
if (_judgment == Judgment.Matched) { _sharedRecons.put(s, recon);
newCell.recon.judgment = Recon.Judgment.Matched; }
newCell.recon.match = _match; } else {
} else if (_judgment == Judgment.New) { recon = cell.recon == null ? new Recon() : cell.recon.dup();
newCell.recon.judgment = Recon.Judgment.New; if (_judgment == Judgment.Matched) {
} else if (_judgment == Judgment.None) { recon.judgment = Recon.Judgment.Matched;
newCell.recon.judgment = Recon.Judgment.None; recon.match = _match;
newCell.recon.match = null; } else if (_judgment == Judgment.New) {
} recon.judgment = Recon.Judgment.New;
recon.match = null;
} else if (_judgment == Judgment.None) {
recon.judgment = Recon.Judgment.None;
recon.match = null;
}
}
Cell newCell = new Cell(cell.value, recon);
CellChange cellChange = new CellChange(rowIndex, _cellIndex, cell, newCell); CellChange cellChange = new CellChange(rowIndex, _cellIndex, cell, newCell);
_cellChanges.add(cellChange); _cellChanges.add(cellChange);

View File

@ -1,6 +1,8 @@
package com.metaweb.gridworks.operations; package com.metaweb.gridworks.operations;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Properties; import java.util.Properties;
import org.json.JSONException; import org.json.JSONException;
@ -20,18 +22,21 @@ import com.metaweb.gridworks.model.changes.CellChange;
public class ReconMarkNewTopicsOperation extends EngineDependentMassCellOperation { public class ReconMarkNewTopicsOperation extends EngineDependentMassCellOperation {
private static final long serialVersionUID = -5205694623711144436L; private static final long serialVersionUID = -5205694623711144436L;
final protected boolean _shareNewTopics;
static public AbstractOperation reconstruct(Project project, JSONObject obj) throws Exception { static public AbstractOperation reconstruct(Project project, JSONObject obj) throws Exception {
JSONObject engineConfig = obj.getJSONObject("engineConfig"); JSONObject engineConfig = obj.getJSONObject("engineConfig");
String columnName = obj.getString("columnName");
return new ReconMarkNewTopicsOperation( return new ReconMarkNewTopicsOperation(
engineConfig, engineConfig,
columnName obj.getString("columnName"),
obj.has("shareNewTopics") ? obj.getBoolean("shareNewTopics") : false
); );
} }
public ReconMarkNewTopicsOperation(JSONObject engineConfig, String columnName) { public ReconMarkNewTopicsOperation(JSONObject engineConfig, String columnName, boolean shareNewTopics) {
super(engineConfig, columnName, false); super(engineConfig, columnName, false);
_shareNewTopics = shareNewTopics;
} }
public void write(JSONWriter writer, Properties options) public void write(JSONWriter writer, Properties options)
@ -42,18 +47,25 @@ public class ReconMarkNewTopicsOperation extends EngineDependentMassCellOperatio
writer.key("description"); writer.value(getBriefDescription()); writer.key("description"); writer.value(getBriefDescription());
writer.key("engineConfig"); writer.value(getEngineConfig()); writer.key("engineConfig"); writer.value(getEngineConfig());
writer.key("columnName"); writer.value(_columnName); writer.key("columnName"); writer.value(_columnName);
writer.key("shareNewTopics"); writer.value(_shareNewTopics);
writer.endObject(); writer.endObject();
} }
protected String getBriefDescription() { protected String getBriefDescription() {
return "Mark to create new topics for cells in column " + _columnName; return "Mark to create new topics for cells in column " + _columnName +
(_shareNewTopics ?
", one topic for each group of similar cells" :
", one topic for each cell");
} }
protected String createDescription(Column column, protected String createDescription(Column column,
List<CellChange> cellChanges) { List<CellChange> cellChanges) {
return "Mark to create new topics for " + cellChanges.size() + return "Mark to create new topics for " + cellChanges.size() +
" cells in column " + column.getHeaderLabel(); " cells in column " + column.getHeaderLabel() +
(_shareNewTopics ?
", one topic for each group of similar cells" :
", one topic for each cell");
} }
protected RowVisitor createRowVisitor(Project project, List<CellChange> cellChanges) throws Exception { protected RowVisitor createRowVisitor(Project project, List<CellChange> cellChanges) throws Exception {
@ -62,6 +74,7 @@ public class ReconMarkNewTopicsOperation extends EngineDependentMassCellOperatio
return new RowVisitor() { return new RowVisitor() {
int cellIndex; int cellIndex;
List<CellChange> cellChanges; List<CellChange> cellChanges;
Map<String, Recon> _sharedRecons = new HashMap<String, Recon>();
public RowVisitor init(int cellIndex, List<CellChange> cellChanges) { public RowVisitor init(int cellIndex, List<CellChange> cellChanges) {
this.cellIndex = cellIndex; this.cellIndex = cellIndex;
@ -70,15 +83,26 @@ public class ReconMarkNewTopicsOperation extends EngineDependentMassCellOperatio
} }
public boolean visit(Project project, int rowIndex, Row row, boolean contextual) { public boolean visit(Project project, int rowIndex, Row row, boolean contextual) {
if (cellIndex < row.cells.size()) { Cell cell = row.getCell(cellIndex);
Cell cell = row.cells.get(cellIndex); if (cell != null) {
Recon recon = null;
if (_shareNewTopics) {
String s = cell.value == null ? "" : cell.value.toString();
if (_sharedRecons.containsKey(s)) {
recon = _sharedRecons.get(s);
} else {
recon = new Recon();
recon.judgment = Judgment.New;
Cell newCell = new Cell( _sharedRecons.put(s, recon);
cell.value, }
cell.recon != null ? cell.recon.dup() : new Recon() } else {
); recon = cell.recon == null ? new Recon() : cell.recon.dup();
newCell.recon.match = null; recon.match = null;
newCell.recon.judgment = Judgment.New; recon.judgment = Judgment.New;
}
Cell newCell = new Cell(cell.value, recon);
CellChange cellChange = new CellChange(rowIndex, cellIndex, cell, newCell); CellChange cellChange = new CellChange(rowIndex, cellIndex, cell, newCell);
cellChanges.add(cellChange); cellChanges.add(cellChange);

View File

@ -8,8 +8,10 @@ import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.Column; import com.metaweb.gridworks.model.Column;
import com.metaweb.gridworks.model.Project; import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.model.Row; import com.metaweb.gridworks.model.Row;
import com.metaweb.gridworks.model.Recon.Judgment;
import com.metaweb.gridworks.protograph.AnonymousNode; import com.metaweb.gridworks.protograph.AnonymousNode;
import com.metaweb.gridworks.protograph.CellNode; import com.metaweb.gridworks.protograph.CellNode;
import com.metaweb.gridworks.protograph.CellTopicNode;
import com.metaweb.gridworks.protograph.FreebaseProperty; import com.metaweb.gridworks.protograph.FreebaseProperty;
import com.metaweb.gridworks.protograph.FreebaseTopicNode; import com.metaweb.gridworks.protograph.FreebaseTopicNode;
import com.metaweb.gridworks.protograph.Link; import com.metaweb.gridworks.protograph.Link;
@ -25,7 +27,7 @@ public class Transposer {
Node rootNode, Node rootNode,
TransposedNodeFactory nodeFactory TransposedNodeFactory nodeFactory
) { ) {
Context rootContext = new Context(rootNode, null, null, 5); Context rootContext = new Context(rootNode, null, null, 20);
for (Row row : project.rows) { for (Row row : project.rows) {
descend(project, protograph, nodeFactory, row, rootNode, rootContext); descend(project, protograph, nodeFactory, row, rootNode, rootContext);
@ -53,6 +55,13 @@ public class Transposer {
Column column = project.columnModel.getColumnByName(node2.columnName); Column column = project.columnModel.getColumnByName(node2.columnName);
Cell cell = row.getCell(column.getCellIndex()); Cell cell = row.getCell(column.getCellIndex());
if (cell != null && !ExpressionUtils.isBlank(cell.value)) { if (cell != null && !ExpressionUtils.isBlank(cell.value)) {
if (node2 instanceof CellTopicNode) {
if (!((CellTopicNode) node2).createForNoReconMatch &&
(cell.recon == null || cell.recon.judgment == Judgment.None)) {
return;
}
}
context.count++; context.count++;
if (context.limit > 0 && context.count > context.limit) { if (context.limit > 0 && context.count > context.limit) {
return; return;

View File

@ -9,6 +9,7 @@ import org.json.JSONObject;
import com.metaweb.gridworks.model.Cell; import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.Recon; import com.metaweb.gridworks.model.Recon;
import com.metaweb.gridworks.model.Recon.Judgment;
import com.metaweb.gridworks.protograph.AnonymousNode; import com.metaweb.gridworks.protograph.AnonymousNode;
import com.metaweb.gridworks.protograph.CellKeyNode; import com.metaweb.gridworks.protograph.CellKeyNode;
import com.metaweb.gridworks.protograph.CellNode; import com.metaweb.gridworks.protograph.CellNode;
@ -22,6 +23,7 @@ public class TripleLoaderTransposedNodeFactory implements TransposedNodeFactory
protected List<WritingTransposedNode> rootNodes = new LinkedList<WritingTransposedNode>(); protected List<WritingTransposedNode> rootNodes = new LinkedList<WritingTransposedNode>();
protected StringBuffer stringBuffer; protected StringBuffer stringBuffer;
protected Map<String, Long> varPool = new HashMap<String, Long>(); protected Map<String, Long> varPool = new HashMap<String, Long>();
protected Map<Long, String> newTopicVars = new HashMap<Long, String>();
public String getLoad() { public String getLoad() {
stringBuffer = new StringBuffer(); stringBuffer = new StringBuffer();
@ -38,7 +40,7 @@ public class TripleLoaderTransposedNodeFactory implements TransposedNodeFactory
stringBuffer.append(line); stringBuffer.append(line);
} }
protected void writeLine(String subject, String predicate, String object) { protected void writeLine(String subject, String predicate, String object) {
if (subject != null) { if (subject != null && object != null) {
writeLine("{ 's' : '" + subject + "', 'p' : '" + predicate + "', 'o' : " + object + " }"); writeLine("{ 's' : '" + subject + "', 'p' : '" + predicate + "', 'o' : " + object + " }");
} }
} }
@ -113,18 +115,30 @@ public class TripleLoaderTransposedNodeFactory implements TransposedNodeFactory
if (cell.recon != null && if (cell.recon != null &&
cell.recon.judgment == Recon.Judgment.Matched && cell.recon.judgment == Recon.Judgment.Matched &&
cell.recon.match != null) { cell.recon.match != null) {
id = cell.recon.match.topicID; id = cell.recon.match.topicID;
} else if (node.createForNoReconMatch ||
(cell.recon != null && cell.recon.judgment == Judgment.New)) {
if (cell.recon != null && newTopicVars.containsKey(cell.recon.id)) {
id = newTopicVars.get(cell.recon.id);
} else {
long var = 0;
if (varPool.containsKey(node.columnName)) {
var = varPool.get(node.columnName);
}
varPool.put(node.columnName, var + 1);
id = "$" + node.columnName.replaceAll("\\W+", "_") + "_" + var;
writeLine("{ 's' : '" + id + "', 'p' : 'type', 'o' : '" + node.type.id + "' }");
writeLine("{ 's' : '" + id + "', 'p' : 'name', 'o' : " + JSONObject.quote(cell.value.toString()) + " }");
if (cell.recon != null) {
newTopicVars.put(cell.recon.id, id);
}
}
} else { } else {
long var = 0; return null;
if (varPool.containsKey(node.columnName)) {
var = varPool.get(node.columnName);
}
varPool.put(node.columnName, var + 1);
id = "$" + node.columnName.replaceAll("\\W+", "_") + "_" + var;
writeLine("{ 's' : '" + id + "', 'p' : 'type', 'o' : '" + node.type.id + "' }");
writeLine("{ 's' : '" + id + "', 'p' : 'name', 'o' : " + JSONObject.quote(cell.value.toString()) + " }");
} }
if (subject != null) { if (subject != null) {

View File

@ -130,7 +130,7 @@ DataTableCellUI.prototype._doMatchNewTopicToOneCell = function() {
}; };
DataTableCellUI.prototype._doMatchNewTopicToSimilarCells = function() { DataTableCellUI.prototype._doMatchNewTopicToSimilarCells = function() {
this._doJudgmentForSimilarCells("new"); this._doJudgmentForSimilarCells("new", { shareNewTopics: true });
}; };
DataTableCellUI.prototype._doMatchTopicToOneCell = function(candidate) { DataTableCellUI.prototype._doMatchTopicToOneCell = function(candidate) {

View File

@ -209,8 +209,8 @@ DataTableColumnHeaderUI.prototype._createMenuForColumnHeader = function(elmt) {
} }
}, },
{ {
label: "Create One New Topic for All Cells", label: "Create One New Topic for Similar Cells",
tooltip: "Mark to create one new, common topic for all cells in this column for all current filtered rows", tooltip: "Mark to create one new topic for each group of similar cells in this column for all current filtered rows",
click: function() { click: function() {
self._doReconMarkNewTopics(true); self._doReconMarkNewTopics(true);
} }
@ -459,10 +459,10 @@ DataTableColumnHeaderUI.prototype._doReconMatchBestCandidates = function() {
); );
}; };
DataTableColumnHeaderUI.prototype._doReconMarkNewTopics = function() { DataTableColumnHeaderUI.prototype._doReconMarkNewTopics = function(shareNewTopics) {
this._dataTableView.doPostThenUpdate( this._dataTableView.doPostThenUpdate(
"recon-mark-new-topics", "recon-mark-new-topics",
{ columnName: this._column.headerLabel } { columnName: this._column.headerLabel, shareNewTopics: shareNewTopics }
); );
}; };