Automatically guess types to reconcile a column, using Stefano's trick in his "cupid" acre app.
git-svn-id: http://google-refine.googlecode.com/svn/trunk@104 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
4b2e48614b
commit
28a86dfe0f
@ -39,6 +39,7 @@ import com.metaweb.gridworks.commands.recon.DiscardReconcileCommand;
|
|||||||
import com.metaweb.gridworks.commands.recon.JudgeOneCellCommand;
|
import com.metaweb.gridworks.commands.recon.JudgeOneCellCommand;
|
||||||
import com.metaweb.gridworks.commands.recon.ReconcileCommand;
|
import com.metaweb.gridworks.commands.recon.ReconcileCommand;
|
||||||
import com.metaweb.gridworks.commands.util.GetExpressionLanguageInfoCommand;
|
import com.metaweb.gridworks.commands.util.GetExpressionLanguageInfoCommand;
|
||||||
|
import com.metaweb.gridworks.commands.util.GuessTypesOfColumnCommand;
|
||||||
import com.metaweb.gridworks.commands.util.PreviewExpressionCommand;
|
import com.metaweb.gridworks.commands.util.PreviewExpressionCommand;
|
||||||
import com.metaweb.gridworks.commands.util.PreviewProtographCommand;
|
import com.metaweb.gridworks.commands.util.PreviewProtographCommand;
|
||||||
|
|
||||||
@ -82,6 +83,7 @@ public class GridworksServlet extends HttpServlet {
|
|||||||
_commands.put("preview-expression", new PreviewExpressionCommand());
|
_commands.put("preview-expression", new PreviewExpressionCommand());
|
||||||
_commands.put("get-expression-language-info", new GetExpressionLanguageInfoCommand());
|
_commands.put("get-expression-language-info", new GetExpressionLanguageInfoCommand());
|
||||||
_commands.put("preview-protograph", new PreviewProtographCommand());
|
_commands.put("preview-protograph", new PreviewProtographCommand());
|
||||||
|
_commands.put("guess-types-of-column", new GuessTypesOfColumnCommand());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -0,0 +1,188 @@
|
|||||||
|
package com.metaweb.gridworks.commands.util;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.StringWriter;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.net.URLConnection;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import javax.servlet.ServletException;
|
||||||
|
import javax.servlet.http.HttpServletRequest;
|
||||||
|
import javax.servlet.http.HttpServletResponse;
|
||||||
|
|
||||||
|
import org.json.JSONArray;
|
||||||
|
import org.json.JSONObject;
|
||||||
|
import org.json.JSONWriter;
|
||||||
|
|
||||||
|
|
||||||
|
import com.metaweb.gridworks.commands.Command;
|
||||||
|
import com.metaweb.gridworks.expr.ExpressionUtils;
|
||||||
|
import com.metaweb.gridworks.model.Column;
|
||||||
|
import com.metaweb.gridworks.model.Project;
|
||||||
|
import com.metaweb.gridworks.model.Row;
|
||||||
|
import com.metaweb.gridworks.util.ParsingUtilities;
|
||||||
|
|
||||||
|
public class GuessTypesOfColumnCommand extends Command {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void doPost(HttpServletRequest request, HttpServletResponse response)
|
||||||
|
throws ServletException, IOException {
|
||||||
|
|
||||||
|
try {
|
||||||
|
Project project = getProject(request);
|
||||||
|
String columnName = request.getParameter("columnName");
|
||||||
|
|
||||||
|
JSONWriter writer = new JSONWriter(response.getWriter());
|
||||||
|
writer.object();
|
||||||
|
|
||||||
|
Column column = project.columnModel.getColumnByName(columnName);
|
||||||
|
if (column == null) {
|
||||||
|
writer.key("code"); writer.value("error");
|
||||||
|
writer.key("message"); writer.value("No such column");
|
||||||
|
} else {
|
||||||
|
try {
|
||||||
|
writer.key("code"); writer.value("ok");
|
||||||
|
writer.key("types"); writer.array();
|
||||||
|
|
||||||
|
List<TypeGroup> typeGroups = guessTypes(project, column);
|
||||||
|
for (TypeGroup tg : typeGroups) {
|
||||||
|
writer.object();
|
||||||
|
writer.key("id"); writer.value(tg.id);
|
||||||
|
writer.key("name"); writer.value(tg.name);
|
||||||
|
writer.endObject();
|
||||||
|
}
|
||||||
|
|
||||||
|
writer.endArray();
|
||||||
|
} catch (Exception e) {
|
||||||
|
writer.key("code"); writer.value("error");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
writer.endObject();
|
||||||
|
} catch (Exception e) {
|
||||||
|
respondException(response, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected List<TypeGroup> guessTypes(Project project, Column column) {
|
||||||
|
Map<String, TypeGroup> map = new HashMap<String, TypeGroup>();
|
||||||
|
|
||||||
|
int cellIndex = column.getCellIndex();
|
||||||
|
|
||||||
|
List<String> samples = new ArrayList<String>(10);
|
||||||
|
for (Row row : project.rows) {
|
||||||
|
Object value = row.getCellValue(cellIndex);
|
||||||
|
if (!ExpressionUtils.isBlank(value)) {
|
||||||
|
samples.add(value.toString());
|
||||||
|
if (samples.size() >= 10) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
StringWriter stringWriter = new StringWriter();
|
||||||
|
JSONWriter jsonWriter = new JSONWriter(stringWriter);
|
||||||
|
|
||||||
|
jsonWriter.object();
|
||||||
|
for (int i = 0; i < samples.size(); i++) {
|
||||||
|
jsonWriter.key("q" + i + ":search");
|
||||||
|
jsonWriter.object();
|
||||||
|
|
||||||
|
jsonWriter.key("query"); jsonWriter.value(samples.get(i));
|
||||||
|
jsonWriter.key("limit"); jsonWriter.value(3);
|
||||||
|
jsonWriter.key("type_exclude"); jsonWriter.value("/common/image");
|
||||||
|
jsonWriter.key("domain_exclude"); jsonWriter.value("/freebase");
|
||||||
|
|
||||||
|
jsonWriter.endObject();
|
||||||
|
}
|
||||||
|
jsonWriter.endObject();
|
||||||
|
|
||||||
|
StringBuffer sb = new StringBuffer();
|
||||||
|
sb.append("http://api.freebase.com/api/service/search?indent=1&queries=");
|
||||||
|
sb.append(ParsingUtilities.encode(stringWriter.toString()));
|
||||||
|
|
||||||
|
URL url = new URL(sb.toString());
|
||||||
|
URLConnection connection = url.openConnection();
|
||||||
|
connection.setConnectTimeout(5000);
|
||||||
|
connection.connect();
|
||||||
|
|
||||||
|
InputStream is = connection.getInputStream();
|
||||||
|
try {
|
||||||
|
String s = ParsingUtilities.inputStreamToString(is);
|
||||||
|
JSONObject o = ParsingUtilities.evaluateJsonStringToObject(s);
|
||||||
|
|
||||||
|
for (int i = 0; i < samples.size(); i++) {
|
||||||
|
String key = "q" + i + ":search";
|
||||||
|
if (!o.has(key)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
JSONObject o2 = o.getJSONObject(key);
|
||||||
|
if (!(o2.has("result"))) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
JSONArray results = o2.getJSONArray("result");
|
||||||
|
int count = results.length();
|
||||||
|
|
||||||
|
for (int j = 0; j < count; j++) {
|
||||||
|
JSONObject result = results.getJSONObject(j);
|
||||||
|
double score = result.getDouble("relevance:score");
|
||||||
|
|
||||||
|
JSONArray types = result.getJSONArray("type");
|
||||||
|
int typeCount = types.length();
|
||||||
|
|
||||||
|
for (int t = 0; t < typeCount; t++) {
|
||||||
|
JSONObject type = types.getJSONObject(t);
|
||||||
|
String id = type.getString("id");
|
||||||
|
if (id.equals("/common/topic") ||
|
||||||
|
(id.startsWith("/base/") && id.endsWith("/topic")) ||
|
||||||
|
id.startsWith("/user/")
|
||||||
|
) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (map.containsKey(id)) {
|
||||||
|
map.get(id).score += score;
|
||||||
|
} else {
|
||||||
|
map.put(id, new TypeGroup(id, type.getString("name"), score));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
is.close();
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
|
||||||
|
List<TypeGroup> types = new ArrayList<TypeGroup>(map.values());
|
||||||
|
Collections.sort(types, new Comparator<TypeGroup>() {
|
||||||
|
public int compare(TypeGroup o1, TypeGroup o2) {
|
||||||
|
return (int) Math.signum(o2.score - o1.score);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
return types;
|
||||||
|
}
|
||||||
|
|
||||||
|
static protected class TypeGroup {
|
||||||
|
String id;
|
||||||
|
String name;
|
||||||
|
double score;
|
||||||
|
|
||||||
|
TypeGroup(String id, String name, double score) {
|
||||||
|
this.id = id;
|
||||||
|
this.name = name;
|
||||||
|
this.score = score;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -187,7 +187,7 @@ DataTableColumnHeaderUI.prototype._createMenuForColumnHeader = function(elmt) {
|
|||||||
label: "Start Reconciling ...",
|
label: "Start Reconciling ...",
|
||||||
tooltip: "Reconcile text in this column with topics on Freebase",
|
tooltip: "Reconcile text in this column with topics on Freebase",
|
||||||
click: function() {
|
click: function() {
|
||||||
new ReconDialog(self._column);
|
self._doReconcile();
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{},
|
{},
|
||||||
@ -375,6 +375,19 @@ DataTableColumnHeaderUI.prototype._doTextTransformPrompt = function() {
|
|||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
 *  Asks the server to guess candidate types for this column, then opens the
 *  reconciliation dialog with those suggestions. If the guess fails for any
 *  reason (server error, network failure, unparsable response), the dialog
 *  still opens, just without suggestions, so the user can pick a type by hand.
 */
DataTableColumnHeaderUI.prototype._doReconcile = function() {
    var self = this;
    var openDialog = function(types) {
        new ReconDialog(self._column, types);
    };

    $.ajax({
        type: "POST",
        url: "/command/guess-types-of-column?" + $.param({ project: theProject.id, columnName: this._column.headerLabel }),
        dataType: "json",
        success: function(data) {
            var ok = data && data.code == "ok" && data.types;
            openDialog(ok ? data.types : []);
        },
        error: function() {
            // Guessing is only a convenience; never block reconciliation on it.
            openDialog([]);
        }
    });
};
|
||||||
|
|
||||||
DataTableColumnHeaderUI.prototype._doDiscardReconResults = function() {
|
DataTableColumnHeaderUI.prototype._doDiscardReconResults = function() {
|
||||||
this._dataTableView.doPostThenUpdate(
|
this._dataTableView.doPostThenUpdate(
|
||||||
"discard-reconcile",
|
"discard-reconcile",
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
/*
 *  Dialog for starting reconciliation of a column.
 *
 *  column: the column to reconcile.
 *  types:  optional array of suggested types, each with "id" and "name";
 *          defaults to no suggestions so that the one-argument form
 *          new ReconDialog(column) keeps working.
 */
function ReconDialog(column, types) {
    this._column = column;
    this._types = types || [];
    this._createDialog();
}
|
||||||
|
|
||||||
@ -14,13 +15,44 @@ ReconDialog.prototype._createDialog = function() {
|
|||||||
|
|
||||||
$('<p></p>').text("Reconcile cell values to topics of type:").appendTo(body);
|
$('<p></p>').text("Reconcile cell values to topics of type:").appendTo(body);
|
||||||
|
|
||||||
|
if (this._types.length > 0) {
|
||||||
|
var createTypeChoice = function(type) {
|
||||||
|
var div = $('<div>').appendTo(body);
|
||||||
|
$('<input type="radio" name="recon-dialog-type-choice">')
|
||||||
|
.attr("value", type.id)
|
||||||
|
.appendTo(div);
|
||||||
|
|
||||||
|
$('<span></span>').text(" " + type.name).appendTo(div);
|
||||||
|
$('<span></span>').text(" (" + type.id + ")").appendTo(div);
|
||||||
|
};
|
||||||
|
for (var i = 0; i < this._types.length && i < 7; i++) {
|
||||||
|
createTypeChoice(this._types[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
var divCustom = $('<div>').appendTo(body);
|
||||||
|
$('<input type="radio" name="recon-dialog-type-choice">')
|
||||||
|
.attr("value", "")
|
||||||
|
.appendTo(divCustom);
|
||||||
|
|
||||||
|
$('<span></span>').text(" Other:").appendTo(divCustom);
|
||||||
|
}
|
||||||
|
|
||||||
var type = null;
|
var type = null;
|
||||||
var input = $('<input />').appendTo($('<p></p>').appendTo(body));
|
var input = $('<input />').appendTo($('<p></p>').appendTo(body));
|
||||||
input.suggest({ type : '/type/type' }).bind("fb-select", function(e, data) {
|
input.suggest({ type : '/type/type' }).bind("fb-select", function(e, data) {
|
||||||
type = data.id;
|
type = data.id;
|
||||||
|
$('input[name="recon-dialog-type-choice"][value=""]').attr("checked", "true");
|
||||||
});
|
});
|
||||||
|
|
||||||
$('<button></button>').text("Start Reconciling").click(function() {
|
$('<button></button>').text("Start Reconciling").click(function() {
|
||||||
|
var choices = $('input[name="recon-dialog-type-choice"]:checked');
|
||||||
|
if (choices != null && choices.length > 0 && choices[0].value != "") {
|
||||||
|
type = choices[0].value;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (type == null) {
|
||||||
|
alert("Please specify a type.");
|
||||||
|
} else {
|
||||||
DialogSystem.dismissUntil(level - 1);
|
DialogSystem.dismissUntil(level - 1);
|
||||||
$.post(
|
$.post(
|
||||||
"/command/reconcile?" + $.param({ project: theProject.id, columnName: self._column.headerLabel, type: type }),
|
"/command/reconcile?" + $.param({ project: theProject.id, columnName: self._column.headerLabel, type: type }),
|
||||||
@ -34,6 +66,7 @@ ReconDialog.prototype._createDialog = function() {
|
|||||||
},
|
},
|
||||||
"json"
|
"json"
|
||||||
);
|
);
|
||||||
|
}
|
||||||
}).appendTo(footer);
|
}).appendTo(footer);
|
||||||
|
|
||||||
$('<button></button>').text("Cancel").click(function() {
|
$('<button></button>').text("Cancel").click(function() {
|
||||||
@ -44,3 +77,4 @@ ReconDialog.prototype._createDialog = function() {
|
|||||||
|
|
||||||
input[0].focus();
|
input[0].focus();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user