Recon process seems to be working. We can now filter rows by recon features using custom expressions.
git-svn-id: http://google-refine.googlecode.com/svn/trunk@19 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
fe8810aa07
commit
58f2dd8f60
@ -30,9 +30,20 @@ public class ExpressionEqualRowFilter implements RowFilter {
|
|||||||
|
|
||||||
Object value = _evaluable.evaluate(bindings);
|
Object value = _evaluable.evaluate(bindings);
|
||||||
if (value != null) {
|
if (value != null) {
|
||||||
for (Object match : _matches) {
|
if (value.getClass().isArray()) {
|
||||||
if (match.equals(value)) {
|
Object[] a = (Object[]) value;
|
||||||
return true;
|
for (Object v : a) {
|
||||||
|
for (Object match : _matches) {
|
||||||
|
if (match.equals(v)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (Object match : _matches) {
|
||||||
|
if (match.equals(value)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -190,22 +190,23 @@ public class Parser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
String identifier = _token.text;
|
String identifier = _token.text;
|
||||||
Function f = functionTable.get(identifier);
|
|
||||||
if (f == null) {
|
|
||||||
throw makeException("Unknown function " + identifier);
|
|
||||||
}
|
|
||||||
next();
|
next();
|
||||||
|
|
||||||
if (_token == null || _token.type != TokenType.Delimiter || !_token.text.equals("(")) {
|
if (_token != null && _token.type == TokenType.Delimiter && _token.text.equals("(")) {
|
||||||
throw makeException("Missing (");
|
next(); // swallow (
|
||||||
|
|
||||||
|
Function f = functionTable.get(identifier);
|
||||||
|
if (f == null) {
|
||||||
|
throw makeException("Unknown function " + identifier);
|
||||||
|
}
|
||||||
|
|
||||||
|
List<Evaluable> args = parseExpressionList(")");
|
||||||
|
args.add(0, eval);
|
||||||
|
|
||||||
|
eval = new FunctionCallExpr(makeArray(args), f);
|
||||||
|
} else {
|
||||||
|
eval = new FieldAccessorExpr(eval, identifier);
|
||||||
}
|
}
|
||||||
next();
|
|
||||||
|
|
||||||
List<Evaluable> args = parseExpressionList(")");
|
|
||||||
args.add(0, eval);
|
|
||||||
|
|
||||||
eval = new FunctionCallExpr(makeArray(args), f);
|
|
||||||
|
|
||||||
} else if (_token.type == TokenType.Delimiter && _token.text.equals("[")) {
|
} else if (_token.type == TokenType.Delimiter && _token.text.equals("[")) {
|
||||||
next(); // swallow [
|
next(); // swallow [
|
||||||
|
|
||||||
|
@ -31,8 +31,8 @@ public class Cell implements Serializable, HasFields, Jsonizable {
|
|||||||
writer.key("v");
|
writer.key("v");
|
||||||
writer.value(value);
|
writer.value(value);
|
||||||
|
|
||||||
if (recon != null && options.containsKey("cell-recon")) {
|
if (recon != null) {
|
||||||
writer.key("recon");
|
writer.key("r");
|
||||||
recon.write(writer, options);
|
recon.write(writer, options);
|
||||||
}
|
}
|
||||||
writer.endObject();
|
writer.endObject();
|
||||||
|
@ -39,6 +39,13 @@ public class Recon implements Serializable, HasFields, Jsonizable {
|
|||||||
return judgment == Judgment.New;
|
return judgment == Judgment.New;
|
||||||
} else if ("match".equals(name)) {
|
} else if ("match".equals(name)) {
|
||||||
return match;
|
return match;
|
||||||
|
} else if ("features".equals(name)) {
|
||||||
|
return new HasFields() {
|
||||||
|
@Override
|
||||||
|
public Object getField(String name, Properties bindings) {
|
||||||
|
return features.get(name);
|
||||||
|
}
|
||||||
|
};
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
@ -67,6 +74,18 @@ public class Recon implements Serializable, HasFields, Jsonizable {
|
|||||||
writer.object();
|
writer.object();
|
||||||
writer.key("j");
|
writer.key("j");
|
||||||
writer.value(judgmentToString());
|
writer.value(judgmentToString());
|
||||||
|
|
||||||
|
writer.key("c"); writer.array();
|
||||||
|
for (ReconCandidate c : candidates) {
|
||||||
|
c.write(writer, options);
|
||||||
|
}
|
||||||
|
writer.endArray();
|
||||||
|
|
||||||
|
if (match != null) {
|
||||||
|
writer.key("m");
|
||||||
|
match.write(writer, options);
|
||||||
|
}
|
||||||
|
|
||||||
writer.endObject();
|
writer.endObject();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -40,15 +40,17 @@ public class ReconCandidate implements Serializable, HasFields, Jsonizable {
|
|||||||
|
|
||||||
writer.object();
|
writer.object();
|
||||||
writer.key("id"); writer.value(topicID);
|
writer.key("id"); writer.value(topicID);
|
||||||
writer.key("guid"); writer.value(topicGUID);
|
//writer.key("guid"); writer.value(topicGUID);
|
||||||
writer.key("name"); writer.value(topicName);
|
writer.key("name"); writer.value(topicName);
|
||||||
writer.key("score"); writer.value(score);
|
//writer.key("score"); writer.value(score);
|
||||||
|
|
||||||
|
/*
|
||||||
writer.key("types"); writer.array();
|
writer.key("types"); writer.array();
|
||||||
for (String typeID : typeIDs) {
|
for (String typeID : typeIDs) {
|
||||||
writer.value(typeID);
|
writer.value(typeID);
|
||||||
}
|
}
|
||||||
writer.endArray();
|
writer.endArray();
|
||||||
|
*/
|
||||||
|
|
||||||
writer.endObject();
|
writer.endObject();
|
||||||
}
|
}
|
||||||
|
@ -1,13 +1,33 @@
|
|||||||
package com.metaweb.gridlock.process;
|
package com.metaweb.gridlock.process;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.StringWriter;
|
||||||
|
import java.io.UnsupportedEncodingException;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.net.URLConnection;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
import org.json.JSONArray;
|
||||||
|
import org.json.JSONException;
|
||||||
|
import org.json.JSONObject;
|
||||||
|
import org.json.JSONWriter;
|
||||||
|
|
||||||
|
import com.metaweb.gridlock.history.CellChange;
|
||||||
|
import com.metaweb.gridlock.history.HistoryEntry;
|
||||||
|
import com.metaweb.gridlock.history.MassCellChange;
|
||||||
import com.metaweb.gridlock.model.Cell;
|
import com.metaweb.gridlock.model.Cell;
|
||||||
import com.metaweb.gridlock.model.Project;
|
import com.metaweb.gridlock.model.Project;
|
||||||
|
import com.metaweb.gridlock.model.Recon;
|
||||||
|
import com.metaweb.gridlock.model.ReconCandidate;
|
||||||
|
import com.metaweb.gridlock.util.ParsingUtilities;
|
||||||
|
|
||||||
public class ReconProcess extends LongRunningProcess implements Runnable {
|
public class ReconProcess extends LongRunningProcess implements Runnable {
|
||||||
static public class ReconEntry {
|
static public class ReconEntry {
|
||||||
@ -56,9 +76,16 @@ public class ReconProcess extends LongRunningProcess implements Runnable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
List<CellChange> cellChanges = new ArrayList<CellChange>(_entries.size());
|
||||||
List<String> values = new ArrayList<String>(valueToEntries.keySet());
|
List<String> values = new ArrayList<String>(valueToEntries.keySet());
|
||||||
for (int i = 0; i < values.size(); i += 20) {
|
for (int i = 0; i < values.size(); i += 20) {
|
||||||
|
try {
|
||||||
|
recon(valueToEntries, values, i, Math.min(i + 20, values.size()), cellChanges);
|
||||||
|
} catch (JSONException e1) {
|
||||||
|
e1.printStackTrace();
|
||||||
|
}
|
||||||
_progress = i * 100 / values.size();
|
_progress = i * 100 / values.size();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
Thread.sleep(100);
|
Thread.sleep(100);
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
@ -68,6 +95,179 @@ public class ReconProcess extends LongRunningProcess implements Runnable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
MassCellChange massCellChange = new MassCellChange(cellChanges);
|
||||||
|
HistoryEntry historyEntry = new HistoryEntry(_project, _description, massCellChange);
|
||||||
|
|
||||||
|
_project.history.addEntry(historyEntry);
|
||||||
|
|
||||||
_project.processManager.onDoneProcess(this);
|
_project.processManager.onDoneProcess(this);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected void recon(
|
||||||
|
Map<String, List<ReconEntry>> valueToEntries,
|
||||||
|
List<String> values,
|
||||||
|
int from,
|
||||||
|
int to,
|
||||||
|
List<CellChange> cellChanges
|
||||||
|
) throws JSONException {
|
||||||
|
|
||||||
|
StringWriter stringWriter = new StringWriter();
|
||||||
|
JSONWriter jsonWriter = new JSONWriter(stringWriter);
|
||||||
|
|
||||||
|
jsonWriter.object();
|
||||||
|
for (int i = 0; from + i < to; i++) {
|
||||||
|
jsonWriter.key("q" + i + ":search");
|
||||||
|
|
||||||
|
jsonWriter.object();
|
||||||
|
|
||||||
|
jsonWriter.key("query"); jsonWriter.value(values.get(from + i));
|
||||||
|
jsonWriter.key("limit"); jsonWriter.value(5);
|
||||||
|
jsonWriter.key("type"); jsonWriter.value(_typeID);
|
||||||
|
jsonWriter.key("type_strict"); jsonWriter.value("should");
|
||||||
|
jsonWriter.key("indent"); jsonWriter.value(1);
|
||||||
|
|
||||||
|
jsonWriter.endObject();
|
||||||
|
}
|
||||||
|
jsonWriter.endObject();
|
||||||
|
|
||||||
|
StringBuffer sb = new StringBuffer();
|
||||||
|
sb.append("http://api.freebase.com/api/service/search?indent=1&queries=");
|
||||||
|
sb.append(ParsingUtilities.encode(stringWriter.toString()));
|
||||||
|
|
||||||
|
try {
|
||||||
|
URL url = new URL(sb.toString());
|
||||||
|
URLConnection connection = url.openConnection();
|
||||||
|
connection.setConnectTimeout(5000);
|
||||||
|
connection.connect();
|
||||||
|
|
||||||
|
InputStream is = connection.getInputStream();
|
||||||
|
try {
|
||||||
|
String s = ParsingUtilities.inputStreamToString(is);
|
||||||
|
JSONObject o = ParsingUtilities.evaluateJsonStringToObject(s);
|
||||||
|
|
||||||
|
for (int i = 0; from + i < to; i++) {
|
||||||
|
String value = values.get(from + i);
|
||||||
|
String key = "q" + i + ":search";
|
||||||
|
if (!o.has(key)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
JSONObject o2 = o.getJSONObject(key);
|
||||||
|
if (!(o2.has("result"))) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
JSONArray results = o2.getJSONArray("result");
|
||||||
|
|
||||||
|
Recon recon = createRecon(value, results);
|
||||||
|
for (ReconEntry entry : valueToEntries.get(value)) {
|
||||||
|
Cell oldCell = entry.cell;
|
||||||
|
|
||||||
|
Cell newCell = new Cell();
|
||||||
|
newCell.value = oldCell.value;
|
||||||
|
newCell.recon = recon;
|
||||||
|
|
||||||
|
CellChange cellChange = new CellChange(
|
||||||
|
entry.rowIndex,
|
||||||
|
_cellIndex,
|
||||||
|
oldCell,
|
||||||
|
newCell
|
||||||
|
);
|
||||||
|
cellChanges.add(cellChange);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
is.close();
|
||||||
|
}
|
||||||
|
} catch (UnsupportedEncodingException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
} catch (IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected Recon createRecon(String text, JSONArray results) throws JSONException {
|
||||||
|
Recon recon = new Recon();
|
||||||
|
|
||||||
|
int length = results.length();
|
||||||
|
for (int i = 0; i < length && recon.candidates.size() < 3; i++) {
|
||||||
|
JSONObject result = results.getJSONObject(i);
|
||||||
|
if (!result.has("name")) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
ReconCandidate candidate = new ReconCandidate();
|
||||||
|
|
||||||
|
candidate.topicID = result.getString("id");
|
||||||
|
candidate.topicGUID = result.getString("guid");
|
||||||
|
candidate.topicName = result.getString("name");
|
||||||
|
candidate.score = result.getDouble("relevance:score");
|
||||||
|
|
||||||
|
JSONArray types = result.getJSONArray("type");
|
||||||
|
candidate.typeIDs = new String[types.length()];
|
||||||
|
for (int j = 0; j < candidate.typeIDs.length; j++) {
|
||||||
|
candidate.typeIDs[j] = types.getJSONObject(j).getString("id");
|
||||||
|
}
|
||||||
|
|
||||||
|
// best match
|
||||||
|
if (i == 0) {
|
||||||
|
recon.features.put("nameMatch", text.equalsIgnoreCase(candidate.topicName));
|
||||||
|
recon.features.put("nameLevenshtein", StringUtils.getLevenshteinDistance(text, candidate.topicName));
|
||||||
|
recon.features.put("nameWordDistance", wordDistance(text, candidate.topicName));
|
||||||
|
|
||||||
|
recon.features.put("typeMatch", false);
|
||||||
|
for (String typeID : candidate.typeIDs) {
|
||||||
|
if (_typeID.equals(typeID)) {
|
||||||
|
recon.features.put("typeMatch", true);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
recon.candidates.add(candidate);
|
||||||
|
}
|
||||||
|
|
||||||
|
return recon;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected double wordDistance(String s1, String s2) {
|
||||||
|
Set<String> words1 = breakWords(s1);
|
||||||
|
Set<String> words2 = breakWords(s2);
|
||||||
|
return words1.size() >= words2.size() ? wordDistance(words1, words2) : wordDistance(words2, words1);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected double wordDistance(Set<String> longWords, Set<String> shortWords) {
|
||||||
|
double common = 0;
|
||||||
|
for (String word : shortWords) {
|
||||||
|
if (longWords.contains(word)) {
|
||||||
|
common++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return common / longWords.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Lower-case words that {@code breakWords} leaves out of its result. */
static protected Set<String> s_stopWords = new HashSet<String>();
static {
    String[] stopWords = { "the", "a", "and", "of", "on", "in", "at", "by" };
    for (String stopWord : stopWords) {
        s_stopWords.add(stopWord);
    }
}
|
||||||
|
|
||||||
|
protected Set<String> breakWords(String s) {
|
||||||
|
String[] words = s.toLowerCase().split("\\s+");
|
||||||
|
|
||||||
|
Set<String> set = new HashSet<String>(words.length);
|
||||||
|
for (String word : words) {
|
||||||
|
if (!s_stopWords.contains(word)) {
|
||||||
|
set.add(word);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return set;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,59 @@
|
|||||||
|
package com.metaweb.gridlock.util;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.UnsupportedEncodingException;
|
||||||
|
|
||||||
|
import org.apache.commons.codec.net.URLCodec;
|
||||||
|
import org.json.JSONArray;
|
||||||
|
import org.json.JSONException;
|
||||||
|
import org.json.JSONObject;
|
||||||
|
import org.json.JSONTokener;
|
||||||
|
|
||||||
|
public class ParsingUtilities {
|
||||||
|
static public String inputStreamToString(InputStream is) throws IOException {
|
||||||
|
Reader reader = new InputStreamReader(is, "UTF-8");
|
||||||
|
try {
|
||||||
|
return readerToString(reader);
|
||||||
|
} finally {
|
||||||
|
reader.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static public String readerToString(Reader reader) throws IOException {
|
||||||
|
StringBuffer sb = new StringBuffer();
|
||||||
|
|
||||||
|
char[] chars = new char[8192];
|
||||||
|
int c;
|
||||||
|
|
||||||
|
while ((c = reader.read(chars)) > 0) {
|
||||||
|
sb.insert(sb.length(), chars, 0, c);
|
||||||
|
}
|
||||||
|
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
static public JSONObject evaluateJsonStringToObject(String s) throws JSONException {
|
||||||
|
JSONTokener t = new JSONTokener(s);
|
||||||
|
JSONObject o = (JSONObject) t.nextValue();
|
||||||
|
return o;
|
||||||
|
}
|
||||||
|
|
||||||
|
static public JSONArray evaluateJsonStringToArray(String s) throws JSONException {
|
||||||
|
JSONTokener t = new JSONTokener(s);
|
||||||
|
JSONArray a = (JSONArray) t.nextValue();
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static final URLCodec codec = new URLCodec();
|
||||||
|
static public String encode(String s) {
|
||||||
|
try {
|
||||||
|
return codec.encode(s, "UTF-8");
|
||||||
|
} catch (UnsupportedEncodingException e) {
|
||||||
|
return s; // should not happen
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -1,6 +1,7 @@
|
|||||||
function DataTableView(div) {
|
function DataTableView(div) {
|
||||||
this._div = div;
|
this._div = div;
|
||||||
this._pageSize = 20;
|
this._pageSize = 20;
|
||||||
|
this._showRecon = true;
|
||||||
this._showRows(0);
|
this._showRows(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -81,6 +82,29 @@ DataTableView.prototype.render = function() {
|
|||||||
createColumnHeader(columns[i], i);
|
createColumnHeader(columns[i], i);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Renders one cell into its table cell element: the value first, then, when the
// cell carries recon data and recon display is switched on, a list of links to
// the candidate topics.
var renderCell = function(cell, td) {
    if (cell.v == null) {
        return;
    }

    // Cell values come from the data being edited, so insert them as text;
    // .html() would let markup inside a value be interpreted by the browser.
    $(td).text(cell.v);

    if ("r" in cell && self._showRecon) {
        var candidates = cell.r.c;
        var ul = $('<ul></ul>').appendTo(td);

        for (var i = 0; i < candidates.length; i++) {
            var candidate = candidates[i];
            var li = $('<li></li>').appendTo(ul);
            $('<a></a>')
                .attr("href", "http://www.freebase.com/view" + candidate.id)
                .attr("target", "_blank")
                .text(candidate.name)
                .appendTo(li);
        }
    }
};
|
||||||
|
|
||||||
var rows = theProject.rowModel.rows;
|
var rows = theProject.rowModel.rows;
|
||||||
for (var r = 0; r < rows.length; r++) {
|
for (var r = 0; r < rows.length; r++) {
|
||||||
var row = rows[r];
|
var row = rows[r];
|
||||||
@ -99,9 +123,7 @@ DataTableView.prototype.render = function() {
|
|||||||
td.innerHTML = " ";
|
td.innerHTML = " ";
|
||||||
} else if (column.cellIndex < cells.length) {
|
} else if (column.cellIndex < cells.length) {
|
||||||
var cell = cells[column.cellIndex];
|
var cell = cells[column.cellIndex];
|
||||||
if (cell.v != null) {
|
renderCell(cell, td);
|
||||||
$(td).html(cell.v);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user