Support reusing newly created topics for cells with the same content.

git-svn-id: http://google-refine.googlecode.com/svn/trunk@121 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
David Huynh 2010-02-22 22:15:48 +00:00
parent e4b01cb36c
commit 5e9be8c258
9 changed files with 149 additions and 73 deletions

View File

@ -6,7 +6,9 @@ import org.json.JSONObject;
import com.metaweb.gridworks.commands.EngineDependentCommand;
import com.metaweb.gridworks.model.AbstractOperation;
import com.metaweb.gridworks.model.Recon;
import com.metaweb.gridworks.model.ReconCandidate;
import com.metaweb.gridworks.model.Recon.Judgment;
import com.metaweb.gridworks.operations.ReconJudgeSimilarCellsOperation;
public class ReconJudgeSimilarCellsCommand extends EngineDependentCommand {
@ -17,7 +19,7 @@ public class ReconJudgeSimilarCellsCommand extends EngineDependentCommand {
String columnName = request.getParameter("columnName");
String similarValue = request.getParameter("similarValue");
String judgment = request.getParameter("judgment");
Judgment judgment = Recon.stringToJudgment(request.getParameter("judgment"));
ReconCandidate match = null;
String topicID = request.getParameter("topicID");
@ -33,12 +35,15 @@ public class ReconJudgeSimilarCellsCommand extends EngineDependentCommand {
);
}
String shareNewTopics = request.getParameter("shareNewTopics");
return new ReconJudgeSimilarCellsOperation(
engineConfig,
columnName,
similarValue,
judgment,
match
match,
"true".equals(shareNewTopics)
);
}
}

View File

@ -14,8 +14,10 @@ public class ReconMarkNewTopicsCommand extends EngineDependentCommand {
protected AbstractOperation createOperation(HttpServletRequest request,
JSONObject engineConfig) throws Exception {
String columnName = request.getParameter("columnName");
return new ReconMarkNewTopicsOperation(engineConfig, columnName);
return new ReconMarkNewTopicsOperation(
engineConfig,
request.getParameter("columnName"),
"true".equals(request.getParameter("shareNewTopics"))
);
}
}

View File

@ -58,11 +58,16 @@ public class Recon implements Serializable, HasFields, Jsonizable {
s_featureMap.put("nameWordDistance", Feature_nameWordDistance);
}
final public long id;
public Object[] features = new Object[Feature_max];
public List<ReconCandidate> candidates = new LinkedList<ReconCandidate>();
public Judgment judgment = Judgment.None;
public ReconCandidate match = null;
/**
 * Creates a recon and assigns its {@code id}.
 *
 * The id is the current wall-clock time in milliseconds multiplied by
 * 1,000,000, plus a random offset obtained by rounding
 * {@code Math.random() * 1000000} (so the offset lies in 0..1,000,000).
 *
 * NOTE(review): this does not guarantee uniqueness -- two instances created
 * within the same millisecond can draw the same random offset. Confirm that
 * callers keying on {@code id} can tolerate a collision.
 */
public Recon() {
id = System.currentTimeMillis() * 1000000 + Math.round(Math.random() * 1000000);
}
public Recon dup() {
Recon r = new Recon();

View File

@ -1,6 +1,8 @@
package com.metaweb.gridworks.operations;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import org.json.JSONArray;
@ -26,6 +28,7 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper
final protected String _similarValue;
final protected Judgment _judgment;
final protected ReconCandidate _match;
final protected boolean _shareNewTopics;
static public AbstractOperation reconstruct(Project project, JSONObject obj) throws Exception {
JSONObject engineConfig = obj.getJSONObject("engineConfig");
@ -59,7 +62,8 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper
obj.getString("columnName"),
obj.getString("similarValue"),
judgment,
match
match,
obj.has("shareNewTopics") ? obj.getBoolean("shareNewTopics") : false
);
}
@ -68,25 +72,14 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper
String columnName,
String similarValue,
Judgment judgment,
ReconCandidate match
ReconCandidate match,
boolean shareNewTopics
) {
super(engineConfig, columnName, false);
this._similarValue = similarValue;
this._judgment = judgment;
this._match = match;
}
public ReconJudgeSimilarCellsOperation(
JSONObject engineConfig,
String columnName,
String similarValue,
String judgmentString,
ReconCandidate match
) {
super(engineConfig, columnName, false);
this._similarValue = similarValue;
this._judgment = Recon.stringToJudgment(judgmentString);
this._match = match;
this._shareNewTopics = shareNewTopics;
}
public void write(JSONWriter writer, Properties options)
@ -102,6 +95,7 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper
if (_match != null) {
writer.key("match"); _match.write(writer, options);
}
writer.key("shareNewTopics"); writer.value(_shareNewTopics);
writer.endObject();
}
@ -111,8 +105,13 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper
return "Discard recon judgments for cells containing \"" +
_similarValue + "\" in column " + _columnName;
} else if (_judgment == Judgment.New) {
return "Mark to create new topics for cells containing \"" +
_similarValue + "\" in column " + _columnName;
if (_shareNewTopics) {
return "Mark to create one single new topic for all cells containing \"" +
_similarValue + "\" in column " + _columnName;
} else {
return "Mark to create one new topic for each cell containing \"" +
_similarValue + "\" in column " + _columnName;
}
} else if (_judgment == Judgment.Matched) {
return "Match topic " +
_match.topicName + " (" +
@ -129,8 +128,13 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper
return "Discard recon judgments for " + cellChanges.size() + " cells containing \"" +
_similarValue + "\" in column " + _columnName;
} else if (_judgment == Judgment.New) {
return "Mark to create new topics for " + cellChanges.size() + " cells containing \"" +
_similarValue + "\" in column " + _columnName;
if (_shareNewTopics) {
return "Mark to create one single new topic for " + cellChanges.size() + " cells containing \"" +
_similarValue + "\" in column " + _columnName;
} else {
return "Mark to create one new topic for each of " + cellChanges.size() + " cells containing \"" +
_similarValue + "\" in column " + _columnName;
}
} else if (_judgment == Judgment.Matched) {
return "Match topic " +
_match.topicName + " (" +
@ -145,8 +149,9 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper
Column column = project.columnModel.getColumnByName(_columnName);
return new RowVisitor() {
int _cellIndex;
List<CellChange> _cellChanges;
int _cellIndex;
List<CellChange> _cellChanges;
Map<String, Recon> _sharedRecons = new HashMap<String, Recon>();
public RowVisitor init(int cellIndex, List<CellChange> cellChanges) {
_cellIndex = cellIndex;
@ -160,20 +165,32 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper
!ExpressionUtils.isBlank(cell.value) &&
_similarValue.equals(cell.value)) {
Cell newCell = new Cell(
cell.value,
cell.recon == null ? new Recon() : cell.recon.dup()
);
Recon recon = null;
if (_judgment == Judgment.New && _shareNewTopics) {
String s = cell.value.toString();
if (_sharedRecons.containsKey(s)) {
recon = _sharedRecons.get(s);
} else {
recon = new Recon();
recon.judgment = Judgment.New;
if (_judgment == Judgment.Matched) {
newCell.recon.judgment = Recon.Judgment.Matched;
newCell.recon.match = _match;
} else if (_judgment == Judgment.New) {
newCell.recon.judgment = Recon.Judgment.New;
} else if (_judgment == Judgment.None) {
newCell.recon.judgment = Recon.Judgment.None;
newCell.recon.match = null;
}
_sharedRecons.put(s, recon);
}
} else {
recon = cell.recon == null ? new Recon() : cell.recon.dup();
if (_judgment == Judgment.Matched) {
recon.judgment = Recon.Judgment.Matched;
recon.match = _match;
} else if (_judgment == Judgment.New) {
recon.judgment = Recon.Judgment.New;
recon.match = null;
} else if (_judgment == Judgment.None) {
recon.judgment = Recon.Judgment.None;
recon.match = null;
}
}
Cell newCell = new Cell(cell.value, recon);
CellChange cellChange = new CellChange(rowIndex, _cellIndex, cell, newCell);
_cellChanges.add(cellChange);

View File

@ -1,6 +1,8 @@
package com.metaweb.gridworks.operations;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import org.json.JSONException;
@ -20,18 +22,21 @@ import com.metaweb.gridworks.model.changes.CellChange;
public class ReconMarkNewTopicsOperation extends EngineDependentMassCellOperation {
private static final long serialVersionUID = -5205694623711144436L;
final protected boolean _shareNewTopics;
static public AbstractOperation reconstruct(Project project, JSONObject obj) throws Exception {
JSONObject engineConfig = obj.getJSONObject("engineConfig");
String columnName = obj.getString("columnName");
return new ReconMarkNewTopicsOperation(
engineConfig,
columnName
obj.getString("columnName"),
obj.has("shareNewTopics") ? obj.getBoolean("shareNewTopics") : false
);
}
public ReconMarkNewTopicsOperation(JSONObject engineConfig, String columnName) {
public ReconMarkNewTopicsOperation(JSONObject engineConfig, String columnName, boolean shareNewTopics) {
super(engineConfig, columnName, false);
_shareNewTopics = shareNewTopics;
}
public void write(JSONWriter writer, Properties options)
@ -42,18 +47,25 @@ public class ReconMarkNewTopicsOperation extends EngineDependentMassCellOperatio
writer.key("description"); writer.value(getBriefDescription());
writer.key("engineConfig"); writer.value(getEngineConfig());
writer.key("columnName"); writer.value(_columnName);
writer.key("shareNewTopics"); writer.value(_shareNewTopics);
writer.endObject();
}
protected String getBriefDescription() {
return "Mark to create new topics for cells in column " + _columnName;
return "Mark to create new topics for cells in column " + _columnName +
(_shareNewTopics ?
", one topic for each group of similar cells" :
", one topic for each cell");
}
protected String createDescription(Column column,
List<CellChange> cellChanges) {
return "Mark to create new topics for " + cellChanges.size() +
" cells in column " + column.getHeaderLabel();
" cells in column " + column.getHeaderLabel() +
(_shareNewTopics ?
", one topic for each group of similar cells" :
", one topic for each cell");
}
protected RowVisitor createRowVisitor(Project project, List<CellChange> cellChanges) throws Exception {
@ -62,6 +74,7 @@ public class ReconMarkNewTopicsOperation extends EngineDependentMassCellOperatio
return new RowVisitor() {
int cellIndex;
List<CellChange> cellChanges;
Map<String, Recon> _sharedRecons = new HashMap<String, Recon>();
public RowVisitor init(int cellIndex, List<CellChange> cellChanges) {
this.cellIndex = cellIndex;
@ -70,15 +83,26 @@ public class ReconMarkNewTopicsOperation extends EngineDependentMassCellOperatio
}
public boolean visit(Project project, int rowIndex, Row row, boolean contextual) {
if (cellIndex < row.cells.size()) {
Cell cell = row.cells.get(cellIndex);
Cell cell = row.getCell(cellIndex);
if (cell != null) {
Recon recon = null;
if (_shareNewTopics) {
String s = cell.value == null ? "" : cell.value.toString();
if (_sharedRecons.containsKey(s)) {
recon = _sharedRecons.get(s);
} else {
recon = new Recon();
recon.judgment = Judgment.New;
Cell newCell = new Cell(
cell.value,
cell.recon != null ? cell.recon.dup() : new Recon()
);
newCell.recon.match = null;
newCell.recon.judgment = Judgment.New;
_sharedRecons.put(s, recon);
}
} else {
recon = cell.recon == null ? new Recon() : cell.recon.dup();
recon.match = null;
recon.judgment = Judgment.New;
}
Cell newCell = new Cell(cell.value, recon);
CellChange cellChange = new CellChange(rowIndex, cellIndex, cell, newCell);
cellChanges.add(cellChange);

View File

@ -8,8 +8,10 @@ import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.Column;
import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.model.Row;
import com.metaweb.gridworks.model.Recon.Judgment;
import com.metaweb.gridworks.protograph.AnonymousNode;
import com.metaweb.gridworks.protograph.CellNode;
import com.metaweb.gridworks.protograph.CellTopicNode;
import com.metaweb.gridworks.protograph.FreebaseProperty;
import com.metaweb.gridworks.protograph.FreebaseTopicNode;
import com.metaweb.gridworks.protograph.Link;
@ -25,7 +27,7 @@ public class Transposer {
Node rootNode,
TransposedNodeFactory nodeFactory
) {
Context rootContext = new Context(rootNode, null, null, 5);
Context rootContext = new Context(rootNode, null, null, 20);
for (Row row : project.rows) {
descend(project, protograph, nodeFactory, row, rootNode, rootContext);
@ -53,6 +55,13 @@ public class Transposer {
Column column = project.columnModel.getColumnByName(node2.columnName);
Cell cell = row.getCell(column.getCellIndex());
if (cell != null && !ExpressionUtils.isBlank(cell.value)) {
if (node2 instanceof CellTopicNode) {
if (!((CellTopicNode) node2).createForNoReconMatch &&
(cell.recon == null || cell.recon.judgment == Judgment.None)) {
return;
}
}
context.count++;
if (context.limit > 0 && context.count > context.limit) {
return;

View File

@ -9,6 +9,7 @@ import org.json.JSONObject;
import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.Recon;
import com.metaweb.gridworks.model.Recon.Judgment;
import com.metaweb.gridworks.protograph.AnonymousNode;
import com.metaweb.gridworks.protograph.CellKeyNode;
import com.metaweb.gridworks.protograph.CellNode;
@ -22,6 +23,7 @@ public class TripleLoaderTransposedNodeFactory implements TransposedNodeFactory
protected List<WritingTransposedNode> rootNodes = new LinkedList<WritingTransposedNode>();
protected StringBuffer stringBuffer;
protected Map<String, Long> varPool = new HashMap<String, Long>();
protected Map<Long, String> newTopicVars = new HashMap<Long, String>();
public String getLoad() {
stringBuffer = new StringBuffer();
@ -38,7 +40,7 @@ public class TripleLoaderTransposedNodeFactory implements TransposedNodeFactory
stringBuffer.append(line);
}
protected void writeLine(String subject, String predicate, String object) {
if (subject != null) {
if (subject != null && object != null) {
writeLine("{ 's' : '" + subject + "', 'p' : '" + predicate + "', 'o' : " + object + " }");
}
}
@ -113,18 +115,30 @@ public class TripleLoaderTransposedNodeFactory implements TransposedNodeFactory
if (cell.recon != null &&
cell.recon.judgment == Recon.Judgment.Matched &&
cell.recon.match != null) {
id = cell.recon.match.topicID;
} else if (node.createForNoReconMatch ||
(cell.recon != null && cell.recon.judgment == Judgment.New)) {
if (cell.recon != null && newTopicVars.containsKey(cell.recon.id)) {
id = newTopicVars.get(cell.recon.id);
} else {
long var = 0;
if (varPool.containsKey(node.columnName)) {
var = varPool.get(node.columnName);
}
varPool.put(node.columnName, var + 1);
id = "$" + node.columnName.replaceAll("\\W+", "_") + "_" + var;
writeLine("{ 's' : '" + id + "', 'p' : 'type', 'o' : '" + node.type.id + "' }");
writeLine("{ 's' : '" + id + "', 'p' : 'name', 'o' : " + JSONObject.quote(cell.value.toString()) + " }");
if (cell.recon != null) {
newTopicVars.put(cell.recon.id, id);
}
}
} else {
long var = 0;
if (varPool.containsKey(node.columnName)) {
var = varPool.get(node.columnName);
}
varPool.put(node.columnName, var + 1);
id = "$" + node.columnName.replaceAll("\\W+", "_") + "_" + var;
writeLine("{ 's' : '" + id + "', 'p' : 'type', 'o' : '" + node.type.id + "' }");
writeLine("{ 's' : '" + id + "', 'p' : 'name', 'o' : " + JSONObject.quote(cell.value.toString()) + " }");
return null;
}
if (subject != null) {

View File

@ -130,7 +130,7 @@ DataTableCellUI.prototype._doMatchNewTopicToOneCell = function() {
};
DataTableCellUI.prototype._doMatchNewTopicToSimilarCells = function() {
this._doJudgmentForSimilarCells("new");
this._doJudgmentForSimilarCells("new", { shareNewTopics: true });
};
DataTableCellUI.prototype._doMatchTopicToOneCell = function(candidate) {

View File

@ -209,8 +209,8 @@ DataTableColumnHeaderUI.prototype._createMenuForColumnHeader = function(elmt) {
}
},
{
label: "Create One New Topic for All Cells",
tooltip: "Mark to create one new, common topic for all cells in this column for all current filtered rows",
label: "Create One New Topic for Similar Cells",
tooltip: "Mark to create one new topic for each group of similar cells in this column for all current filtered rows",
click: function() {
self._doReconMarkNewTopics(true);
}
@ -459,10 +459,10 @@ DataTableColumnHeaderUI.prototype._doReconMatchBestCandidates = function() {
);
};
DataTableColumnHeaderUI.prototype._doReconMarkNewTopics = function() {
DataTableColumnHeaderUI.prototype._doReconMarkNewTopics = function(shareNewTopics) {
this._dataTableView.doPostThenUpdate(
"recon-mark-new-topics",
{ columnName: this._column.headerLabel }
{ columnName: this._column.headerLabel, shareNewTopics: shareNewTopics }
);
};