Tweaked recon type guessing heuristic: remove "role" and "annotation" types, and rank types by result order rather than by relevance score.
git-svn-id: http://google-refine.googlecode.com/svn/trunk@114 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
b730dfd8f9
commit
b4935f576c
@ -57,6 +57,8 @@ public class GuessTypesOfColumnCommand extends Command {
|
|||||||
writer.object();
|
writer.object();
|
||||||
writer.key("id"); writer.value(tg.id);
|
writer.key("id"); writer.value(tg.id);
|
||||||
writer.key("name"); writer.value(tg.name);
|
writer.key("name"); writer.value(tg.name);
|
||||||
|
writer.key("score"); writer.value(tg.score);
|
||||||
|
writer.key("count"); writer.value(tg.count);
|
||||||
writer.endObject();
|
writer.endObject();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -72,12 +74,14 @@ public class GuessTypesOfColumnCommand extends Command {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final static int s_sampleSize = 20;
|
||||||
|
|
||||||
protected List<TypeGroup> guessTypes(Project project, Column column) {
|
protected List<TypeGroup> guessTypes(Project project, Column column) {
|
||||||
Map<String, TypeGroup> map = new HashMap<String, TypeGroup>();
|
Map<String, TypeGroup> map = new HashMap<String, TypeGroup>();
|
||||||
|
|
||||||
int cellIndex = column.getCellIndex();
|
int cellIndex = column.getCellIndex();
|
||||||
|
|
||||||
List<String> samples = new ArrayList<String>(10);
|
List<String> samples = new ArrayList<String>(s_sampleSize);
|
||||||
Set<String> sampleSet = new HashSet<String>();
|
Set<String> sampleSet = new HashSet<String>();
|
||||||
|
|
||||||
for (Row row : project.rows) {
|
for (Row row : project.rows) {
|
||||||
@ -87,7 +91,7 @@ public class GuessTypesOfColumnCommand extends Command {
|
|||||||
if (!sampleSet.contains(s)) {
|
if (!sampleSet.contains(s)) {
|
||||||
samples.add(s);
|
samples.add(s);
|
||||||
sampleSet.add(s);
|
sampleSet.add(s);
|
||||||
if (samples.size() >= 10) {
|
if (samples.size() >= s_sampleSize) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -105,8 +109,6 @@ public class GuessTypesOfColumnCommand extends Command {
|
|||||||
|
|
||||||
jsonWriter.key("query"); jsonWriter.value(samples.get(i));
|
jsonWriter.key("query"); jsonWriter.value(samples.get(i));
|
||||||
jsonWriter.key("limit"); jsonWriter.value(3);
|
jsonWriter.key("limit"); jsonWriter.value(3);
|
||||||
jsonWriter.key("type_exclude"); jsonWriter.value("/common/image");
|
|
||||||
jsonWriter.key("domain_exclude"); jsonWriter.value("/freebase");
|
|
||||||
|
|
||||||
jsonWriter.endObject();
|
jsonWriter.endObject();
|
||||||
}
|
}
|
||||||
@ -142,7 +144,7 @@ public class GuessTypesOfColumnCommand extends Command {
|
|||||||
|
|
||||||
for (int j = 0; j < count; j++) {
|
for (int j = 0; j < count; j++) {
|
||||||
JSONObject result = results.getJSONObject(j);
|
JSONObject result = results.getJSONObject(j);
|
||||||
double score = result.getDouble("relevance:score");
|
double score = 1.0 / (1 + j); //result.getDouble("relevance:score");
|
||||||
|
|
||||||
JSONArray types = result.getJSONArray("type");
|
JSONArray types = result.getJSONArray("type");
|
||||||
int typeCount = types.length();
|
int typeCount = types.length();
|
||||||
@ -151,14 +153,18 @@ public class GuessTypesOfColumnCommand extends Command {
|
|||||||
JSONObject type = types.getJSONObject(t);
|
JSONObject type = types.getJSONObject(t);
|
||||||
String id = type.getString("id");
|
String id = type.getString("id");
|
||||||
if (id.equals("/common/topic") ||
|
if (id.equals("/common/topic") ||
|
||||||
|
id.equals("/base/ontologies/ontology_instance") ||
|
||||||
(id.startsWith("/base/") && id.endsWith("/topic")) ||
|
(id.startsWith("/base/") && id.endsWith("/topic")) ||
|
||||||
id.startsWith("/user/")
|
id.startsWith("/user/") ||
|
||||||
|
id.startsWith("/freebase/")
|
||||||
) {
|
) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (map.containsKey(id)) {
|
if (map.containsKey(id)) {
|
||||||
map.get(id).score += score;
|
TypeGroup tg = map.get(id);
|
||||||
|
tg.score += score;
|
||||||
|
tg.count++;
|
||||||
} else {
|
} else {
|
||||||
map.put(id, new TypeGroup(id, type.getString("name"), score));
|
map.put(id, new TypeGroup(id, type.getString("name"), score));
|
||||||
}
|
}
|
||||||
@ -175,7 +181,11 @@ public class GuessTypesOfColumnCommand extends Command {
|
|||||||
List<TypeGroup> types = new ArrayList<TypeGroup>(map.values());
|
List<TypeGroup> types = new ArrayList<TypeGroup>(map.values());
|
||||||
Collections.sort(types, new Comparator<TypeGroup>() {
|
Collections.sort(types, new Comparator<TypeGroup>() {
|
||||||
public int compare(TypeGroup o1, TypeGroup o2) {
|
public int compare(TypeGroup o1, TypeGroup o2) {
|
||||||
return (int) Math.signum(o2.score - o1.score);
|
int c = Math.min(s_sampleSize, o2.count) - Math.min(s_sampleSize, o1.count);
|
||||||
|
if (c != 0) {
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
return (int) Math.signum(o2.score / o2.count - o1.score / o1.count);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -185,12 +195,14 @@ public class GuessTypesOfColumnCommand extends Command {
|
|||||||
static protected class TypeGroup {
|
static protected class TypeGroup {
|
||||||
String id;
|
String id;
|
||||||
String name;
|
String name;
|
||||||
|
int count;
|
||||||
double score;
|
double score;
|
||||||
|
|
||||||
TypeGroup(String id, String name, double score) {
|
TypeGroup(String id, String name, double score) {
|
||||||
this.id = id;
|
this.id = id;
|
||||||
this.name = name;
|
this.name = name;
|
||||||
this.score = score;
|
this.score = score;
|
||||||
|
this.count = 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -393,8 +393,45 @@ DataTableColumnHeaderUI.prototype._doReconcile = function() {
|
|||||||
"/command/guess-types-of-column?" + $.param({ project: theProject.id, columnName: this._column.headerLabel }),
|
"/command/guess-types-of-column?" + $.param({ project: theProject.id, columnName: this._column.headerLabel }),
|
||||||
null,
|
null,
|
||||||
function(data) {
|
function(data) {
|
||||||
dismissBusy();
|
if (data.code != "ok") {
|
||||||
new ReconDialog(self._column, data.code == "ok" ? data.types : []);
|
dismissBusy();
|
||||||
|
new ReconDialog(self._column, []);
|
||||||
|
} else {
|
||||||
|
data.types = data.types.slice(0, 20);
|
||||||
|
|
||||||
|
var ids = $.map(data.types, function(elmt) { return elmt.id; });
|
||||||
|
var query = [{
|
||||||
|
"id|=" : ids,
|
||||||
|
"id" : null,
|
||||||
|
"/freebase/type_profile/kind" : []
|
||||||
|
}];
|
||||||
|
$.getJSON(
|
||||||
|
"http://api.freebase.com/api/service/mqlread?" + $.param({ "query" : JSON.stringify({ "query" : query }) }) + "&callback=?",
|
||||||
|
null,
|
||||||
|
function(o) {
|
||||||
|
dismissBusy();
|
||||||
|
|
||||||
|
var kindMap = {};
|
||||||
|
$.each(o.result, function() {
|
||||||
|
var m = kindMap[this.id] = {};
|
||||||
|
$.each(this["/freebase/type_profile/kind"], function() {
|
||||||
|
m[this] = true;
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
new ReconDialog(self._column, $.map(data.types, function(type) {
|
||||||
|
if (type.id in kindMap) {
|
||||||
|
var m = kindMap[type.id];
|
||||||
|
if (!("Role" in m) && !("Annotation" in m)) {
|
||||||
|
return type;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}));
|
||||||
|
},
|
||||||
|
"jsonp"
|
||||||
|
);
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"json"
|
"json"
|
||||||
);
|
);
|
||||||
|
@ -7,13 +7,13 @@ function ReconDialog(column, types) {
|
|||||||
ReconDialog.prototype._createDialog = function() {
|
ReconDialog.prototype._createDialog = function() {
|
||||||
var self = this;
|
var self = this;
|
||||||
var frame = DialogSystem.createDialog();
|
var frame = DialogSystem.createDialog();
|
||||||
frame.width("400px");
|
frame.width("500px");
|
||||||
|
|
||||||
var header = $('<div></div>').addClass("dialog-header").text("Reconcile column " + this._column.headerLabel).appendTo(frame);
|
var header = $('<div></div>').addClass("dialog-header").text("Reconcile column " + this._column.headerLabel).appendTo(frame);
|
||||||
var body = $('<div></div>').addClass("dialog-body").appendTo(frame);
|
var body = $('<div></div>').addClass("dialog-body").appendTo(frame);
|
||||||
var footer = $('<div></div>').addClass("dialog-footer").appendTo(frame);
|
var footer = $('<div></div>').addClass("dialog-footer").appendTo(frame);
|
||||||
|
|
||||||
$('<p></p>').text("Reconcile cell values to topics of type:").appendTo(body);
|
$('<p>').text("Reconcile cell values to Freebase topics of type:").appendTo(body);
|
||||||
|
|
||||||
if (this._types.length > 0) {
|
if (this._types.length > 0) {
|
||||||
var createTypeChoice = function(type) {
|
var createTypeChoice = function(type) {
|
||||||
@ -35,11 +35,14 @@ ReconDialog.prototype._createDialog = function() {
|
|||||||
.attr("value", "")
|
.attr("value", "")
|
||||||
.appendTo(divCustom);
|
.appendTo(divCustom);
|
||||||
|
|
||||||
$('<span></span>').text(" Other:").appendTo(divCustom);
|
$('<span></span>').text(" Other: ").appendTo(divCustom);
|
||||||
|
|
||||||
|
var input = $('<input />').appendTo(divCustom);
|
||||||
|
} else {
|
||||||
|
var input = $('<input />').appendTo($('<p></p>').appendTo(body));
|
||||||
}
|
}
|
||||||
|
|
||||||
var type = null;
|
var type = null;
|
||||||
var input = $('<input />').appendTo($('<p></p>').appendTo(body));
|
|
||||||
input.suggest({ type : '/type/type' }).bind("fb-select", function(e, data) {
|
input.suggest({ type : '/type/type' }).bind("fb-select", function(e, data) {
|
||||||
type = {
|
type = {
|
||||||
id: data.id,
|
id: data.id,
|
||||||
@ -48,6 +51,11 @@ ReconDialog.prototype._createDialog = function() {
|
|||||||
$('input[name="recon-dialog-type-choice"][value=""]').attr("checked", "true");
|
$('input[name="recon-dialog-type-choice"][value=""]').attr("checked", "true");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
var optionDiv = $('<p>').appendTo(body);
|
||||||
|
var autoMatchCheckbox = $('<input type="checkbox" checked />').appendTo(optionDiv);
|
||||||
|
$('<span>').text(" Auto-match correctly-typed candidates scoring at least ").appendTo(optionDiv);
|
||||||
|
var minScoreInput = $('<input/>').attr("value", "100").appendTo(optionDiv);
|
||||||
|
|
||||||
$('<button></button>').text("Start Reconciling").click(function() {
|
$('<button></button>').text("Start Reconciling").click(function() {
|
||||||
var choices = $('input[name="recon-dialog-type-choice"]:checked');
|
var choices = $('input[name="recon-dialog-type-choice"]:checked');
|
||||||
if (choices != null && choices.length > 0 && choices[0].value != "") {
|
if (choices != null && choices.length > 0 && choices[0].value != "") {
|
||||||
@ -62,7 +70,14 @@ ReconDialog.prototype._createDialog = function() {
|
|||||||
} else {
|
} else {
|
||||||
DialogSystem.dismissUntil(level - 1);
|
DialogSystem.dismissUntil(level - 1);
|
||||||
$.post(
|
$.post(
|
||||||
"/command/reconcile?" + $.param({ project: theProject.id, columnName: self._column.headerLabel, typeID: type.id, typeName: type.name }),
|
"/command/reconcile?" + $.param({
|
||||||
|
project: theProject.id,
|
||||||
|
columnName: self._column.headerLabel,
|
||||||
|
typeID: type.id,
|
||||||
|
typeName: type.name,
|
||||||
|
autoMatch: autoMatchCheckbox[0].checked,
|
||||||
|
minScore: minScoreInput[0].value
|
||||||
|
}),
|
||||||
{ engine: JSON.stringify(ui.browsingEngine.getJSON()) },
|
{ engine: JSON.stringify(ui.browsingEngine.getJSON()) },
|
||||||
function(data) {
|
function(data) {
|
||||||
if (data.code != "error") {
|
if (data.code != "error") {
|
||||||
|
Loading…
Reference in New Issue
Block a user