Added more metadata to recon objects.

Tried to minimize the number of unique recon objects created when calling Recon.dup().

git-svn-id: http://google-refine.googlecode.com/svn/trunk@560 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
David Huynh 2010-04-27 23:17:18 +00:00
parent e77b99e58b
commit 15c188ad7a
15 changed files with 229 additions and 58 deletions

View File

@ -134,17 +134,32 @@ public class ReconJudgeOneCellCommand extends Command {
", containing \"" + cell.value + "\""; ", containing \"" + cell.value + "\"";
String description = null; String description = null;
newCell.recon.matchRank = -1;
newCell.recon.judgmentAction = "single";
newCell.recon.judgmentBatchSize = 1;
if (judgment == Judgment.None) { if (judgment == Judgment.None) {
newCell.recon.judgment = Recon.Judgment.None; newCell.recon.judgment = Recon.Judgment.None;
newCell.recon.match = null; newCell.recon.match = null;
description = "Discard recon judgment for " + cellDescription; description = "Discard recon judgment for " + cellDescription;
} else if (judgment == Judgment.New) { } else if (judgment == Judgment.New) {
newCell.recon.judgment = Recon.Judgment.New; newCell.recon.judgment = Recon.Judgment.New;
newCell.recon.match = null;
description = "Mark to create new topic for " + cellDescription; description = "Mark to create new topic for " + cellDescription;
} else { } else {
newCell.recon.judgment = Recon.Judgment.Matched; newCell.recon.judgment = Recon.Judgment.Matched;
newCell.recon.match = this.match; newCell.recon.match = this.match;
for (int m = 0; m < newCell.recon.candidates.size(); m++) {
if (newCell.recon.candidates.get(m).topicGUID.equals(this.match.topicGUID)) {
newCell.recon.matchRank = m;
break;
}
}
description = "Match " + this.match.topicName + description = "Match " + this.match.topicName +
" (" + match.topicID + ") to " + " (" + match.topicID + ") to " +
cellDescription; cellDescription;

View File

@ -132,6 +132,8 @@ public class ExcelImporter implements Importer {
* Now process the data rows * Now process the data rows
*/ */
int rowsWithData = 0; int rowsWithData = 0;
Map<String, Recon> reconMap = new HashMap<String, Recon>();
for (; r <= lastRow; r++) { for (; r <= lastRow; r++) {
org.apache.poi.ss.usermodel.Row row = sheet.getRow(r); org.apache.poi.ss.usermodel.Row row = sheet.getRow(r);
if (row == null) { if (row == null) {
@ -206,10 +208,22 @@ public class ExcelImporter implements Importer {
id = id.substring(0, h); id = id.substring(0, h);
} }
recon = new Recon(); if (reconMap.containsKey(id)) {
recon.judgment = Judgment.Matched; recon = reconMap.get(id);
recon.match = new ReconCandidate(id, "", value.toString(), new String[0], 100); recon.judgmentBatchSize++;
recon.addCandidate(recon.match); } else {
recon = new Recon();
recon.service = "import";
recon.match = new ReconCandidate(id, "", value.toString(), new String[0], 100);
recon.matchRank = 0;
recon.judgment = Judgment.Matched;
recon.judgmentAction = "auto";
recon.judgmentBatchSize = 1;
recon.addCandidate(recon.match);
reconMap.put(id, recon);
}
} }
} }
} }

View File

@ -60,9 +60,16 @@ public class Recon implements HasFields, Jsonizable {
final public long id; final public long id;
public Object[] features = new Object[Feature_max]; public Object[] features = new Object[Feature_max];
public String service = "unknown";
public List<ReconCandidate> candidates; public List<ReconCandidate> candidates;
public Judgment judgment = Judgment.None; public Judgment judgment = Judgment.None;
public String judgmentAction = "unknown";
public long judgmentHistoryEntry = -1;
public int judgmentBatchSize = 0;
public ReconCandidate match = null; public ReconCandidate match = null;
public int matchRank = -1;
public Recon() { public Recon() {
id = System.currentTimeMillis() * 1000000 + Math.round(Math.random() * 1000000); id = System.currentTimeMillis() * 1000000 + Math.round(Math.random() * 1000000);
@ -81,8 +88,16 @@ public class Recon implements HasFields, Jsonizable {
r.candidates = new ArrayList<ReconCandidate>(candidates); r.candidates = new ArrayList<ReconCandidate>(candidates);
} }
r.service = service;
r.judgment = judgment; r.judgment = judgment;
r.judgmentAction = judgmentAction;
r.judgmentHistoryEntry = judgmentHistoryEntry;
r.judgmentBatchSize = judgmentBatchSize;
r.match = match; r.match = match;
r.matchRank = matchRank;
return r; return r;
} }
@ -190,6 +205,15 @@ public class Recon implements HasFields, Jsonizable {
writer.value(o); writer.value(o);
} }
writer.endArray(); writer.endArray();
writer.key("service"); writer.value(service);
writer.key("judgmentAction"); writer.value(judgmentAction);
writer.key("judgmentHistoryEntry"); writer.value(judgmentHistoryEntry);
writer.key("judgmentBatchSize"); writer.value(judgmentBatchSize);
if (match != null) {
writer.key("matchRank"); writer.value(matchRank);
}
} }
writer.endObject(); writer.endObject();
@ -268,6 +292,16 @@ public class Recon implements HasFields, Jsonizable {
recon.addCandidate(ReconCandidate.loadStreaming(jp)); recon.addCandidate(ReconCandidate.loadStreaming(jp));
} }
} }
} else if ("service".equals(fieldName)) {
recon.service = jp.getText();
} else if ("judgmentAction".equals(fieldName)) {
recon.judgmentAction = jp.getText();
} else if ("judgmentHistoryEntry".equals(fieldName)) {
recon.judgmentHistoryEntry = jp.getLongValue();
} else if ("judgmentBatchSize".equals(fieldName)) {
recon.judgmentBatchSize = jp.getIntValue();
} else if ("matchRank".equals(fieldName)) {
recon.matchRank = jp.getIntValue();
} }
} }

View File

@ -181,8 +181,12 @@ public class DataExtensionChange implements Change {
ReconCandidate rc = (ReconCandidate) value; ReconCandidate rc = (ReconCandidate) value;
Recon recon = new Recon(); Recon recon = new Recon();
recon.addCandidate(rc); recon.addCandidate(rc);
recon.service = "mql";
recon.match = rc; recon.match = rc;
recon.matchRank = 0;
recon.judgment = Judgment.Matched; recon.judgment = Judgment.Matched;
recon.judgmentAction = "auto";
recon.judgmentBatchSize = 1;
cell = new Cell(rc.topicName, recon); cell = new Cell(rc.topicName, recon);
} else { } else {

View File

@ -149,8 +149,11 @@ public class GuidBasedReconConfig extends StrictReconConfig {
Recon recon = new Recon(); Recon recon = new Recon();
recon.addCandidate(candidate); recon.addCandidate(candidate);
recon.match = candidate; recon.service = "mql";
recon.judgment = Judgment.Matched; recon.judgment = Judgment.Matched;
recon.judgmentAction = "auto";
recon.match = candidate;
recon.matchRank = 0;
guidToRecon.put(guid, recon); guidToRecon.put(guid, recon);
} }

View File

@ -256,6 +256,7 @@ public class HeuristicReconConfig extends ReconConfig {
recon = new Recon(); recon = new Recon();
} }
recon.service = "recon";
recons.add(recon); recons.add(recon);
} }
} finally { } finally {
@ -314,7 +315,9 @@ public class HeuristicReconConfig extends ReconConfig {
candidate.score / recon.candidates.get(1).score >= 1.5) { candidate.score / recon.candidates.get(1).score >= 1.5) {
recon.match = candidate; recon.match = candidate;
recon.matchRank = 0;
recon.judgment = Judgment.Matched; recon.judgment = Judgment.Matched;
recon.judgmentAction = "auto";
} }
} }
break; break;
@ -361,6 +364,7 @@ public class HeuristicReconConfig extends ReconConfig {
if (recon == null) { if (recon == null) {
recon = new Recon(); recon = new Recon();
} }
recon.service = "recon";
recons.add(recon); recons.add(recon);
} }
@ -408,7 +412,9 @@ public class HeuristicReconConfig extends ReconConfig {
recon.setFeature(Recon.Feature_typeMatch, true); recon.setFeature(Recon.Feature_typeMatch, true);
if (autoMatch && result.has("match") && result.getBoolean("match")) { if (autoMatch && result.has("match") && result.getBoolean("match")) {
recon.match = candidate; recon.match = candidate;
recon.matchRank = 0;
recon.judgment = Judgment.Matched; recon.judgment = Judgment.Matched;
recon.judgmentAction = "auto";
} }
break; break;
} }

View File

@ -153,8 +153,11 @@ public class IdBasedReconConfig extends StrictReconConfig {
Recon recon = new Recon(); Recon recon = new Recon();
recon.addCandidate(candidate); recon.addCandidate(candidate);
recon.match = candidate; recon.service = "mql";
recon.judgment = Judgment.Matched; recon.judgment = Judgment.Matched;
recon.judgmentAction = "auto";
recon.match = candidate;
recon.matchRank = 0;
idToRecon.put(id, recon); idToRecon.put(id, recon);
} }

View File

@ -167,8 +167,11 @@ public class KeyBasedReconConfig extends StrictReconConfig {
Recon recon = new Recon(); Recon recon = new Recon();
recon.addCandidate(candidate); recon.addCandidate(candidate);
recon.match = candidate; recon.service = "mql";
recon.judgment = Judgment.Matched; recon.judgment = Judgment.Matched;
recon.judgmentAction = "auto";
recon.match = candidate;
recon.matchRank = 0;
keyToRecon.put(key, recon); keyToRecon.put(key, recon);
} }

View File

@ -37,7 +37,11 @@ abstract public class EngineDependentMassCellOperation extends EngineDependentOp
List<CellChange> cellChanges = new ArrayList<CellChange>(project.rows.size()); List<CellChange> cellChanges = new ArrayList<CellChange>(project.rows.size());
FilteredRows filteredRows = engine.getAllFilteredRows(false); FilteredRows filteredRows = engine.getAllFilteredRows(false);
filteredRows.accept(project, createRowVisitor(project, cellChanges)); try {
filteredRows.accept(project, createRowVisitor(project, cellChanges));
} catch (Exception e) {
e.printStackTrace();
}
String description = createDescription(column, cellChanges); String description = createDescription(column, cellChanges);

View File

@ -1,6 +1,8 @@
package com.metaweb.gridworks.operations; package com.metaweb.gridworks.operations;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Properties; import java.util.Properties;
import org.json.JSONException; import org.json.JSONException;
@ -62,6 +64,7 @@ public class ReconDiscardJudgmentsOperation extends EngineDependentMassCellOpera
return new RowVisitor() { return new RowVisitor() {
int cellIndex; int cellIndex;
List<CellChange> cellChanges; List<CellChange> cellChanges;
Map<Long, Recon> dupReconMap = new HashMap<Long, Recon>();
public RowVisitor init(int cellIndex, List<CellChange> cellChanges) { public RowVisitor init(int cellIndex, List<CellChange> cellChanges) {
this.cellIndex = cellIndex; this.cellIndex = cellIndex;
@ -72,11 +75,22 @@ public class ReconDiscardJudgmentsOperation extends EngineDependentMassCellOpera
public boolean visit(Project project, int rowIndex, Row row, boolean includeContextual, boolean includeDependent) { public boolean visit(Project project, int rowIndex, Row row, boolean includeContextual, boolean includeDependent) {
Cell cell = row.getCell(cellIndex); Cell cell = row.getCell(cellIndex);
if (cell != null && cell.recon != null) { if (cell != null && cell.recon != null) {
Recon recon = cell.recon.dup(); Recon newRecon;
recon.judgment = Judgment.None; if (dupReconMap.containsKey(cell.recon.id)) {
recon.match = null; newRecon = dupReconMap.get(cell.recon.id);
newRecon.judgmentBatchSize++;
Cell newCell = new Cell(cell.value, recon); } else {
newRecon = cell.recon.dup();
newRecon.match = null;
newRecon.matchRank = -1;
newRecon.judgment = Judgment.None;
newRecon.judgmentAction = "mass";
newRecon.judgmentBatchSize = 1;
dupReconMap.put(cell.recon.id, newRecon);
}
Cell newCell = new Cell(cell.value, newRecon);
CellChange cellChange = new CellChange(rowIndex, cellIndex, cell, newCell); CellChange cellChange = new CellChange(rowIndex, cellIndex, cell, newCell);
cellChanges.add(cellChange); cellChanges.add(cellChange);

View File

@ -150,8 +150,9 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper
return new RowVisitor() { return new RowVisitor() {
int _cellIndex; int _cellIndex;
List<CellChange> _cellChanges; List<CellChange> _cellChanges;
Map<String, Recon> _sharedRecons = new HashMap<String, Recon>(); Recon _sharedNewRecon = null;
Map<Long, Recon> _dupReconMap = new HashMap<Long, Recon>();
public RowVisitor init(int cellIndex, List<CellChange> cellChanges) { public RowVisitor init(int cellIndex, List<CellChange> cellChanges) {
_cellIndex = cellIndex; _cellIndex = cellIndex;
@ -167,27 +168,47 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper
Recon recon = null; Recon recon = null;
if (_judgment == Judgment.New && _shareNewTopics) { if (_judgment == Judgment.New && _shareNewTopics) {
String s = cell.value.toString(); if (_sharedNewRecon == null) {
if (_sharedRecons.containsKey(s)) { _sharedNewRecon = new Recon();
recon = _sharedRecons.get(s); _sharedNewRecon.judgment = Judgment.New;
} else { _sharedNewRecon.judgmentBatchSize = 0;
recon = new Recon(); _sharedNewRecon.judgmentAction = "similar";
recon.judgment = Judgment.New; }
_sharedNewRecon.judgmentBatchSize++;
_sharedRecons.put(s, recon);
} recon = _sharedNewRecon;
} else { } else {
recon = cell.recon == null ? new Recon() : cell.recon.dup(); if (_dupReconMap.containsKey(cell.recon.id)) {
if (_judgment == Judgment.Matched) { recon = _dupReconMap.get(cell.recon.id);
recon.judgment = Recon.Judgment.Matched; recon.judgmentBatchSize++;
recon.match = _match; } else {
} else if (_judgment == Judgment.New) { recon = cell.recon.dup();
recon.judgment = Recon.Judgment.New; recon.judgmentBatchSize = 1;
recon.match = null; recon.matchRank = -1;
} else if (_judgment == Judgment.None) { recon.judgmentAction = "similar";
recon.judgment = Recon.Judgment.None;
recon.match = null; if (_judgment == Judgment.Matched) {
} recon.judgment = Recon.Judgment.Matched;
recon.match = _match;
if (recon.candidates != null) {
for (int m = 0; m < recon.candidates.size(); m++) {
if (recon.candidates.get(m).topicGUID.equals(_match.topicGUID)) {
recon.matchRank = m;
break;
}
}
}
} else if (_judgment == Judgment.New) {
recon.judgment = Recon.Judgment.New;
recon.match = null;
} else if (_judgment == Judgment.None) {
recon.judgment = Recon.Judgment.None;
recon.match = null;
}
_dupReconMap.put(cell.recon.id, recon);
}
} }
Cell newCell = new Cell(cell.value, recon); Cell newCell = new Cell(cell.value, recon);

View File

@ -72,9 +72,10 @@ public class ReconMarkNewTopicsOperation extends EngineDependentMassCellOperatio
Column column = project.columnModel.getColumnByName(_columnName); Column column = project.columnModel.getColumnByName(_columnName);
return new RowVisitor() { return new RowVisitor() {
int cellIndex; int cellIndex;
List<CellChange> cellChanges; List<CellChange> cellChanges;
Map<String, Recon> _sharedRecons = new HashMap<String, Recon>(); Map<String, Recon> sharedRecons = new HashMap<String, Recon>();
Map<Long, Recon> dupReconMap = new HashMap<Long, Recon>();
public RowVisitor init(int cellIndex, List<CellChange> cellChanges) { public RowVisitor init(int cellIndex, List<CellChange> cellChanges) {
this.cellIndex = cellIndex; this.cellIndex = cellIndex;
@ -88,18 +89,32 @@ public class ReconMarkNewTopicsOperation extends EngineDependentMassCellOperatio
Recon recon = null; Recon recon = null;
if (_shareNewTopics) { if (_shareNewTopics) {
String s = cell.value == null ? "" : cell.value.toString(); String s = cell.value == null ? "" : cell.value.toString();
if (_sharedRecons.containsKey(s)) { if (sharedRecons.containsKey(s)) {
recon = _sharedRecons.get(s); recon = sharedRecons.get(s);
recon.judgmentBatchSize++;
} else { } else {
recon = new Recon(); recon = new Recon();
recon.judgment = Judgment.New; recon.judgment = Judgment.New;
recon.judgmentBatchSize = 1;
recon.judgmentAction = "mass";
_sharedRecons.put(s, recon); sharedRecons.put(s, recon);
} }
} else { } else {
recon = cell.recon == null ? new Recon() : cell.recon.dup(); long reconID = cell.recon == null ? 0 : cell.recon.id;
recon.match = null; if (dupReconMap.containsKey(reconID)) {
recon.judgment = Judgment.New; recon = dupReconMap.get(reconID);
recon.judgmentBatchSize++;
} else {
recon = cell.recon == null ? new Recon() : cell.recon.dup();
recon.match = null;
recon.matchRank = -1;
recon.judgment = Judgment.New;
recon.judgmentBatchSize = 1;
recon.judgmentAction = "mass";
dupReconMap.put(reconID, recon);
}
} }
Cell newCell = new Cell(cell.value, recon); Cell newCell = new Cell(cell.value, recon);

View File

@ -1,6 +1,8 @@
package com.metaweb.gridworks.operations; package com.metaweb.gridworks.operations;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Properties; import java.util.Properties;
import org.json.JSONException; import org.json.JSONException;
@ -13,6 +15,7 @@ import com.metaweb.gridworks.model.AbstractOperation;
import com.metaweb.gridworks.model.Cell; import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.Column; import com.metaweb.gridworks.model.Column;
import com.metaweb.gridworks.model.Project; import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.model.Recon;
import com.metaweb.gridworks.model.ReconCandidate; import com.metaweb.gridworks.model.ReconCandidate;
import com.metaweb.gridworks.model.Row; import com.metaweb.gridworks.model.Row;
import com.metaweb.gridworks.model.Recon.Judgment; import com.metaweb.gridworks.model.Recon.Judgment;
@ -60,8 +63,9 @@ public class ReconMatchBestCandidatesOperation extends EngineDependentMassCellOp
Column column = project.columnModel.getColumnByName(_columnName); Column column = project.columnModel.getColumnByName(_columnName);
return new RowVisitor() { return new RowVisitor() {
int cellIndex; int cellIndex;
List<CellChange> cellChanges; List<CellChange> cellChanges;
Map<Long, Recon> dupReconMap = new HashMap<Long, Recon>();
public RowVisitor init(int cellIndex, List<CellChange> cellChanges) { public RowVisitor init(int cellIndex, List<CellChange> cellChanges) {
this.cellIndex = cellIndex; this.cellIndex = cellIndex;
@ -75,12 +79,24 @@ public class ReconMatchBestCandidatesOperation extends EngineDependentMassCellOp
if (cell != null && cell.recon != null) { if (cell != null && cell.recon != null) {
ReconCandidate candidate = cell.recon.getBestCandidate(); ReconCandidate candidate = cell.recon.getBestCandidate();
if (candidate != null) { if (candidate != null) {
Recon newRecon;
if (dupReconMap.containsKey(cell.recon.id)) {
newRecon = dupReconMap.get(cell.recon.id);
newRecon.judgmentBatchSize++;
} else {
newRecon = cell.recon.dup();
newRecon.judgmentBatchSize = 1;
newRecon.match = candidate;
newRecon.matchRank = 0;
newRecon.judgment = Judgment.Matched;
newRecon.judgmentAction = "mass";
dupReconMap.put(cell.recon.id, newRecon);
}
Cell newCell = new Cell( Cell newCell = new Cell(
cell.value, cell.value,
cell.recon.dup() newRecon
); );
newCell.recon.match = candidate;
newCell.recon.judgment = Judgment.Matched;
CellChange cellChange = new CellChange(rowIndex, cellIndex, cell, newCell); CellChange cellChange = new CellChange(rowIndex, cellIndex, cell, newCell);
cellChanges.add(cellChange); cellChanges.add(cellChange);

View File

@ -1,6 +1,8 @@
package com.metaweb.gridworks.operations; package com.metaweb.gridworks.operations;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Properties; import java.util.Properties;
import org.json.JSONArray; import org.json.JSONArray;
@ -96,6 +98,7 @@ public class ReconMatchSpecificTopicOperation extends EngineDependentMassCellOpe
return new RowVisitor() { return new RowVisitor() {
int cellIndex; int cellIndex;
List<CellChange> cellChanges; List<CellChange> cellChanges;
Map<Long, Recon> dupReconMap = new HashMap<Long, Recon>();
public RowVisitor init(int cellIndex, List<CellChange> cellChanges) { public RowVisitor init(int cellIndex, List<CellChange> cellChanges) {
this.cellIndex = cellIndex; this.cellIndex = cellIndex;
@ -104,15 +107,29 @@ public class ReconMatchSpecificTopicOperation extends EngineDependentMassCellOpe
} }
public boolean visit(Project project, int rowIndex, Row row, boolean includeContextual, boolean includeDependent) { public boolean visit(Project project, int rowIndex, Row row, boolean includeContextual, boolean includeDependent) {
if (cellIndex < row.cells.size()) { Cell cell = row.getCell(cellIndex);
Cell cell = row.cells.get(cellIndex); if (cell != null) {
long reconID = cell.recon != null ? cell.recon.id : 0;
Recon newRecon;
if (dupReconMap.containsKey(reconID)) {
newRecon = dupReconMap.get(reconID);
newRecon.judgmentBatchSize++;
} else {
newRecon = cell.recon != null ? cell.recon.dup() : new Recon();
newRecon.match = match;
newRecon.matchRank = -1;
newRecon.judgment = Judgment.Matched;
newRecon.judgmentAction = "mass";
newRecon.judgmentBatchSize = 1;
dupReconMap.put(reconID, newRecon);
}
Cell newCell = new Cell( Cell newCell = new Cell(
cell.value, cell.value,
cell.recon != null ? cell.recon.dup() : new Recon() newRecon
); );
newCell.recon.match = match;
newCell.recon.judgment = Judgment.Matched;
CellChange cellChange = new CellChange(rowIndex, cellIndex, cell, newCell); CellChange cellChange = new CellChange(rowIndex, cellIndex, cell, newCell);
cellChanges.add(cellChange); cellChanges.add(cellChange);

View File

@ -229,11 +229,13 @@ public class ReconOperation extends EngineDependentOperation {
List<Recon> recons = _reconConfig.batchRecon(jobs); List<Recon> recons = _reconConfig.batchRecon(jobs);
for (int j = i; j < to; j++) { for (int j = i; j < to; j++) {
Recon recon = recons.get(j - i); Recon recon = recons.get(j - i);
if (recon == null) { List<ReconEntry> entries = groups.get(j).entries;
recon = new Recon();
if (recon != null) {
recon.judgmentBatchSize = entries.size();
} }
for (ReconEntry entry : groups.get(j).entries) { for (ReconEntry entry : entries) {
Cell oldCell = entry.cell; Cell oldCell = entry.cell;
Cell newCell = new Cell(oldCell.value, recon); Cell newCell = new Cell(oldCell.value, recon);