Added more metadata to recon objects.

Tried to minimize the number of unique recon objects created when calling Recon.dup().

git-svn-id: http://google-refine.googlecode.com/svn/trunk@560 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
David Huynh 2010-04-27 23:17:18 +00:00
parent e77b99e58b
commit 15c188ad7a
15 changed files with 229 additions and 58 deletions

View File

@ -134,17 +134,32 @@ public class ReconJudgeOneCellCommand extends Command {
", containing \"" + cell.value + "\"";
String description = null;
newCell.recon.matchRank = -1;
newCell.recon.judgmentAction = "single";
newCell.recon.judgmentBatchSize = 1;
if (judgment == Judgment.None) {
newCell.recon.judgment = Recon.Judgment.None;
newCell.recon.match = null;
description = "Discard recon judgment for " + cellDescription;
} else if (judgment == Judgment.New) {
newCell.recon.judgment = Recon.Judgment.New;
newCell.recon.match = null;
description = "Mark to create new topic for " + cellDescription;
} else {
newCell.recon.judgment = Recon.Judgment.Matched;
newCell.recon.match = this.match;
for (int m = 0; m < newCell.recon.candidates.size(); m++) {
if (newCell.recon.candidates.get(m).topicGUID.equals(this.match.topicGUID)) {
newCell.recon.matchRank = m;
break;
}
}
description = "Match " + this.match.topicName +
" (" + match.topicID + ") to " +
cellDescription;

View File

@ -132,6 +132,8 @@ public class ExcelImporter implements Importer {
* Now process the data rows
*/
int rowsWithData = 0;
Map<String, Recon> reconMap = new HashMap<String, Recon>();
for (; r <= lastRow; r++) {
org.apache.poi.ss.usermodel.Row row = sheet.getRow(r);
if (row == null) {
@ -206,10 +208,22 @@ public class ExcelImporter implements Importer {
id = id.substring(0, h);
}
recon = new Recon();
recon.judgment = Judgment.Matched;
recon.match = new ReconCandidate(id, "", value.toString(), new String[0], 100);
recon.addCandidate(recon.match);
if (reconMap.containsKey(id)) {
recon = reconMap.get(id);
recon.judgmentBatchSize++;
} else {
recon = new Recon();
recon.service = "import";
recon.match = new ReconCandidate(id, "", value.toString(), new String[0], 100);
recon.matchRank = 0;
recon.judgment = Judgment.Matched;
recon.judgmentAction = "auto";
recon.judgmentBatchSize = 1;
recon.addCandidate(recon.match);
reconMap.put(id, recon);
}
}
}
}

View File

@ -60,9 +60,16 @@ public class Recon implements HasFields, Jsonizable {
final public long id;
public Object[] features = new Object[Feature_max];
public String service = "unknown";
public List<ReconCandidate> candidates;
public Judgment judgment = Judgment.None;
public String judgmentAction = "unknown";
public long judgmentHistoryEntry = -1;
public int judgmentBatchSize = 0;
public ReconCandidate match = null;
public int matchRank = -1;
public Recon() {
id = System.currentTimeMillis() * 1000000 + Math.round(Math.random() * 1000000);
@ -81,8 +88,16 @@ public class Recon implements HasFields, Jsonizable {
r.candidates = new ArrayList<ReconCandidate>(candidates);
}
r.service = service;
r.judgment = judgment;
r.judgmentAction = judgmentAction;
r.judgmentHistoryEntry = judgmentHistoryEntry;
r.judgmentBatchSize = judgmentBatchSize;
r.match = match;
r.matchRank = matchRank;
return r;
}
@ -190,6 +205,15 @@ public class Recon implements HasFields, Jsonizable {
writer.value(o);
}
writer.endArray();
writer.key("service"); writer.value(service);
writer.key("judgmentAction"); writer.value(judgmentAction);
writer.key("judgmentHistoryEntry"); writer.value(judgmentHistoryEntry);
writer.key("judgmentBatchSize"); writer.value(judgmentBatchSize);
if (match != null) {
writer.key("matchRank"); writer.value(matchRank);
}
}
writer.endObject();
@ -268,6 +292,16 @@ public class Recon implements HasFields, Jsonizable {
recon.addCandidate(ReconCandidate.loadStreaming(jp));
}
}
} else if ("service".equals(fieldName)) {
recon.service = jp.getText();
} else if ("judgmentAction".equals(fieldName)) {
recon.judgmentAction = jp.getText();
} else if ("judgmentHistoryEntry".equals(fieldName)) {
recon.judgmentHistoryEntry = jp.getLongValue();
} else if ("judgmentBatchSize".equals(fieldName)) {
recon.judgmentBatchSize = jp.getIntValue();
} else if ("matchRank".equals(fieldName)) {
recon.matchRank = jp.getIntValue();
}
}

View File

@ -181,8 +181,12 @@ public class DataExtensionChange implements Change {
ReconCandidate rc = (ReconCandidate) value;
Recon recon = new Recon();
recon.addCandidate(rc);
recon.service = "mql";
recon.match = rc;
recon.matchRank = 0;
recon.judgment = Judgment.Matched;
recon.judgmentAction = "auto";
recon.judgmentBatchSize = 1;
cell = new Cell(rc.topicName, recon);
} else {

View File

@ -149,8 +149,11 @@ public class GuidBasedReconConfig extends StrictReconConfig {
Recon recon = new Recon();
recon.addCandidate(candidate);
recon.match = candidate;
recon.service = "mql";
recon.judgment = Judgment.Matched;
recon.judgmentAction = "auto";
recon.match = candidate;
recon.matchRank = 0;
guidToRecon.put(guid, recon);
}

View File

@ -256,6 +256,7 @@ public class HeuristicReconConfig extends ReconConfig {
recon = new Recon();
}
recon.service = "recon";
recons.add(recon);
}
} finally {
@ -314,7 +315,9 @@ public class HeuristicReconConfig extends ReconConfig {
candidate.score / recon.candidates.get(1).score >= 1.5) {
recon.match = candidate;
recon.matchRank = 0;
recon.judgment = Judgment.Matched;
recon.judgmentAction = "auto";
}
}
break;
@ -361,6 +364,7 @@ public class HeuristicReconConfig extends ReconConfig {
if (recon == null) {
recon = new Recon();
}
recon.service = "recon";
recons.add(recon);
}
@ -408,7 +412,9 @@ public class HeuristicReconConfig extends ReconConfig {
recon.setFeature(Recon.Feature_typeMatch, true);
if (autoMatch && result.has("match") && result.getBoolean("match")) {
recon.match = candidate;
recon.matchRank = 0;
recon.judgment = Judgment.Matched;
recon.judgmentAction = "auto";
}
break;
}

View File

@ -153,8 +153,11 @@ public class IdBasedReconConfig extends StrictReconConfig {
Recon recon = new Recon();
recon.addCandidate(candidate);
recon.match = candidate;
recon.service = "mql";
recon.judgment = Judgment.Matched;
recon.judgmentAction = "auto";
recon.match = candidate;
recon.matchRank = 0;
idToRecon.put(id, recon);
}

View File

@ -167,8 +167,11 @@ public class KeyBasedReconConfig extends StrictReconConfig {
Recon recon = new Recon();
recon.addCandidate(candidate);
recon.match = candidate;
recon.service = "mql";
recon.judgment = Judgment.Matched;
recon.judgmentAction = "auto";
recon.match = candidate;
recon.matchRank = 0;
keyToRecon.put(key, recon);
}

View File

@ -37,7 +37,11 @@ abstract public class EngineDependentMassCellOperation extends EngineDependentOp
List<CellChange> cellChanges = new ArrayList<CellChange>(project.rows.size());
FilteredRows filteredRows = engine.getAllFilteredRows(false);
filteredRows.accept(project, createRowVisitor(project, cellChanges));
try {
filteredRows.accept(project, createRowVisitor(project, cellChanges));
} catch (Exception e) {
e.printStackTrace();
}
String description = createDescription(column, cellChanges);

View File

@ -1,6 +1,8 @@
package com.metaweb.gridworks.operations;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import org.json.JSONException;
@ -62,6 +64,7 @@ public class ReconDiscardJudgmentsOperation extends EngineDependentMassCellOpera
return new RowVisitor() {
int cellIndex;
List<CellChange> cellChanges;
Map<Long, Recon> dupReconMap = new HashMap<Long, Recon>();
public RowVisitor init(int cellIndex, List<CellChange> cellChanges) {
this.cellIndex = cellIndex;
@ -72,11 +75,22 @@ public class ReconDiscardJudgmentsOperation extends EngineDependentMassCellOpera
public boolean visit(Project project, int rowIndex, Row row, boolean includeContextual, boolean includeDependent) {
Cell cell = row.getCell(cellIndex);
if (cell != null && cell.recon != null) {
Recon recon = cell.recon.dup();
recon.judgment = Judgment.None;
recon.match = null;
Cell newCell = new Cell(cell.value, recon);
Recon newRecon;
if (dupReconMap.containsKey(cell.recon.id)) {
newRecon = dupReconMap.get(cell.recon.id);
newRecon.judgmentBatchSize++;
} else {
newRecon = cell.recon.dup();
newRecon.match = null;
newRecon.matchRank = -1;
newRecon.judgment = Judgment.None;
newRecon.judgmentAction = "mass";
newRecon.judgmentBatchSize = 1;
dupReconMap.put(cell.recon.id, newRecon);
}
Cell newCell = new Cell(cell.value, newRecon);
CellChange cellChange = new CellChange(rowIndex, cellIndex, cell, newCell);
cellChanges.add(cellChange);

View File

@ -150,8 +150,9 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper
return new RowVisitor() {
int _cellIndex;
List<CellChange> _cellChanges;
Map<String, Recon> _sharedRecons = new HashMap<String, Recon>();
List<CellChange> _cellChanges;
Recon _sharedNewRecon = null;
Map<Long, Recon> _dupReconMap = new HashMap<Long, Recon>();
public RowVisitor init(int cellIndex, List<CellChange> cellChanges) {
_cellIndex = cellIndex;
@ -167,27 +168,47 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper
Recon recon = null;
if (_judgment == Judgment.New && _shareNewTopics) {
String s = cell.value.toString();
if (_sharedRecons.containsKey(s)) {
recon = _sharedRecons.get(s);
} else {
recon = new Recon();
recon.judgment = Judgment.New;
_sharedRecons.put(s, recon);
}
if (_sharedNewRecon == null) {
_sharedNewRecon = new Recon();
_sharedNewRecon.judgment = Judgment.New;
_sharedNewRecon.judgmentBatchSize = 0;
_sharedNewRecon.judgmentAction = "similar";
}
_sharedNewRecon.judgmentBatchSize++;
recon = _sharedNewRecon;
} else {
recon = cell.recon == null ? new Recon() : cell.recon.dup();
if (_judgment == Judgment.Matched) {
recon.judgment = Recon.Judgment.Matched;
recon.match = _match;
} else if (_judgment == Judgment.New) {
recon.judgment = Recon.Judgment.New;
recon.match = null;
} else if (_judgment == Judgment.None) {
recon.judgment = Recon.Judgment.None;
recon.match = null;
}
if (_dupReconMap.containsKey(cell.recon.id)) {
recon = _dupReconMap.get(cell.recon.id);
recon.judgmentBatchSize++;
} else {
recon = cell.recon.dup();
recon.judgmentBatchSize = 1;
recon.matchRank = -1;
recon.judgmentAction = "similar";
if (_judgment == Judgment.Matched) {
recon.judgment = Recon.Judgment.Matched;
recon.match = _match;
if (recon.candidates != null) {
for (int m = 0; m < recon.candidates.size(); m++) {
if (recon.candidates.get(m).topicGUID.equals(_match.topicGUID)) {
recon.matchRank = m;
break;
}
}
}
} else if (_judgment == Judgment.New) {
recon.judgment = Recon.Judgment.New;
recon.match = null;
} else if (_judgment == Judgment.None) {
recon.judgment = Recon.Judgment.None;
recon.match = null;
}
_dupReconMap.put(cell.recon.id, recon);
}
}
Cell newCell = new Cell(cell.value, recon);

View File

@ -72,9 +72,10 @@ public class ReconMarkNewTopicsOperation extends EngineDependentMassCellOperatio
Column column = project.columnModel.getColumnByName(_columnName);
return new RowVisitor() {
int cellIndex;
List<CellChange> cellChanges;
Map<String, Recon> _sharedRecons = new HashMap<String, Recon>();
int cellIndex;
List<CellChange> cellChanges;
Map<String, Recon> sharedRecons = new HashMap<String, Recon>();
Map<Long, Recon> dupReconMap = new HashMap<Long, Recon>();
public RowVisitor init(int cellIndex, List<CellChange> cellChanges) {
this.cellIndex = cellIndex;
@ -88,18 +89,32 @@ public class ReconMarkNewTopicsOperation extends EngineDependentMassCellOperatio
Recon recon = null;
if (_shareNewTopics) {
String s = cell.value == null ? "" : cell.value.toString();
if (_sharedRecons.containsKey(s)) {
recon = _sharedRecons.get(s);
if (sharedRecons.containsKey(s)) {
recon = sharedRecons.get(s);
recon.judgmentBatchSize++;
} else {
recon = new Recon();
recon.judgment = Judgment.New;
recon.judgmentBatchSize = 1;
recon.judgmentAction = "mass";
_sharedRecons.put(s, recon);
sharedRecons.put(s, recon);
}
} else {
recon = cell.recon == null ? new Recon() : cell.recon.dup();
recon.match = null;
recon.judgment = Judgment.New;
long reconID = cell.recon == null ? 0 : cell.recon.id;
if (dupReconMap.containsKey(reconID)) {
recon = dupReconMap.get(reconID);
recon.judgmentBatchSize++;
} else {
recon = cell.recon == null ? new Recon() : cell.recon.dup();
recon.match = null;
recon.matchRank = -1;
recon.judgment = Judgment.New;
recon.judgmentBatchSize = 1;
recon.judgmentAction = "mass";
dupReconMap.put(reconID, recon);
}
}
Cell newCell = new Cell(cell.value, recon);

View File

@ -1,6 +1,8 @@
package com.metaweb.gridworks.operations;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import org.json.JSONException;
@ -13,6 +15,7 @@ import com.metaweb.gridworks.model.AbstractOperation;
import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.Column;
import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.model.Recon;
import com.metaweb.gridworks.model.ReconCandidate;
import com.metaweb.gridworks.model.Row;
import com.metaweb.gridworks.model.Recon.Judgment;
@ -60,8 +63,9 @@ public class ReconMatchBestCandidatesOperation extends EngineDependentMassCellOp
Column column = project.columnModel.getColumnByName(_columnName);
return new RowVisitor() {
int cellIndex;
List<CellChange> cellChanges;
int cellIndex;
List<CellChange> cellChanges;
Map<Long, Recon> dupReconMap = new HashMap<Long, Recon>();
public RowVisitor init(int cellIndex, List<CellChange> cellChanges) {
this.cellIndex = cellIndex;
@ -75,12 +79,24 @@ public class ReconMatchBestCandidatesOperation extends EngineDependentMassCellOp
if (cell != null && cell.recon != null) {
ReconCandidate candidate = cell.recon.getBestCandidate();
if (candidate != null) {
Recon newRecon;
if (dupReconMap.containsKey(cell.recon.id)) {
newRecon = dupReconMap.get(cell.recon.id);
newRecon.judgmentBatchSize++;
} else {
newRecon = cell.recon.dup();
newRecon.judgmentBatchSize = 1;
newRecon.match = candidate;
newRecon.matchRank = 0;
newRecon.judgment = Judgment.Matched;
newRecon.judgmentAction = "mass";
dupReconMap.put(cell.recon.id, newRecon);
}
Cell newCell = new Cell(
cell.value,
cell.recon.dup()
newRecon
);
newCell.recon.match = candidate;
newCell.recon.judgment = Judgment.Matched;
CellChange cellChange = new CellChange(rowIndex, cellIndex, cell, newCell);
cellChanges.add(cellChange);

View File

@ -1,6 +1,8 @@
package com.metaweb.gridworks.operations;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import org.json.JSONArray;
@ -96,6 +98,7 @@ public class ReconMatchSpecificTopicOperation extends EngineDependentMassCellOpe
return new RowVisitor() {
int cellIndex;
List<CellChange> cellChanges;
Map<Long, Recon> dupReconMap = new HashMap<Long, Recon>();
public RowVisitor init(int cellIndex, List<CellChange> cellChanges) {
this.cellIndex = cellIndex;
@ -104,15 +107,29 @@ public class ReconMatchSpecificTopicOperation extends EngineDependentMassCellOpe
}
public boolean visit(Project project, int rowIndex, Row row, boolean includeContextual, boolean includeDependent) {
if (cellIndex < row.cells.size()) {
Cell cell = row.cells.get(cellIndex);
Cell cell = row.getCell(cellIndex);
if (cell != null) {
long reconID = cell.recon != null ? cell.recon.id : 0;
Recon newRecon;
if (dupReconMap.containsKey(reconID)) {
newRecon = dupReconMap.get(reconID);
newRecon.judgmentBatchSize++;
} else {
newRecon = cell.recon != null ? cell.recon.dup() : new Recon();
newRecon.match = match;
newRecon.matchRank = -1;
newRecon.judgment = Judgment.Matched;
newRecon.judgmentAction = "mass";
newRecon.judgmentBatchSize = 1;
dupReconMap.put(reconID, newRecon);
}
Cell newCell = new Cell(
cell.value,
cell.recon != null ? cell.recon.dup() : new Recon()
newRecon
);
newCell.recon.match = match;
newCell.recon.judgment = Judgment.Matched;
CellChange cellChange = new CellChange(rowIndex, cellIndex, cell, newCell);
cellChanges.add(cellChange);

View File

@ -229,11 +229,13 @@ public class ReconOperation extends EngineDependentOperation {
List<Recon> recons = _reconConfig.batchRecon(jobs);
for (int j = i; j < to; j++) {
Recon recon = recons.get(j - i);
if (recon == null) {
recon = new Recon();
List<ReconEntry> entries = groups.get(j).entries;
if (recon != null) {
recon.judgmentBatchSize = entries.size();
}
for (ReconEntry entry : groups.get(j).entries) {
for (ReconEntry entry : entries) {
Cell oldCell = entry.cell;
Cell newCell = new Cell(oldCell.value, recon);