From 512cd1638154aa158264aaa78b06deec3a5609a8 Mon Sep 17 00:00:00 2001 From: David Huynh Date: Tue, 2 Mar 2010 18:19:20 +0000 Subject: [PATCH] Implemented recon by keys, guids, and ids. git-svn-id: http://google-refine.googlecode.com/svn/trunk@165 7d457c2a-affb-35e4-300a-418c747d4874 --- .../model/recon/GuidBasedReconConfig.java | 174 +++++++++++++++++ .../model/recon/IdBasedReconConfig.java | 178 ++++++++++++++++++ .../model/recon/KeyBasedReconConfig.java | 14 +- .../model/recon/StrictReconConfig.java | 6 + .../webapp/scripts/dialogs/recon-dialog.js | 63 ++++++- .../views/data-table-column-header-ui.js | 63 ++++--- tests/presidents.tsv | 44 +++++ 7 files changed, 504 insertions(+), 38 deletions(-) create mode 100644 src/main/java/com/metaweb/gridworks/model/recon/GuidBasedReconConfig.java create mode 100644 src/main/java/com/metaweb/gridworks/model/recon/IdBasedReconConfig.java create mode 100644 tests/presidents.tsv diff --git a/src/main/java/com/metaweb/gridworks/model/recon/GuidBasedReconConfig.java b/src/main/java/com/metaweb/gridworks/model/recon/GuidBasedReconConfig.java new file mode 100644 index 000000000..750519b92 --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/model/recon/GuidBasedReconConfig.java @@ -0,0 +1,174 @@ +package com.metaweb.gridworks.model.recon; + +import java.io.InputStream; +import java.io.StringWriter; +import java.net.URL; +import java.net.URLConnection; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; + +import org.json.JSONArray; +import org.json.JSONException; +import org.json.JSONObject; +import org.json.JSONWriter; + +import com.metaweb.gridworks.model.Cell; +import com.metaweb.gridworks.model.Project; +import com.metaweb.gridworks.model.Recon; +import com.metaweb.gridworks.model.ReconCandidate; +import com.metaweb.gridworks.model.Row; +import com.metaweb.gridworks.model.Recon.Judgment; +import com.metaweb.gridworks.util.ParsingUtilities; + 
+/**
+ * Strict reconciliation config that treats each cell's text as a Freebase GUID
+ * and resolves a whole batch of cells with a single mqlread request.
+ */
+public class GuidBasedReconConfig extends StrictReconConfig {
+    private static final long serialVersionUID = 1857895989346775294L;
+
+    /**
+     * Recreates the config from its serialized form. The JSON object is not
+     * inspected because this config has no settings of its own.
+     */
+    static public ReconConfig reconstruct(JSONObject obj) throws Exception {
+        return new GuidBasedReconConfig();
+    }
+
+    public GuidBasedReconConfig() {
+    }
+
+    /** Reconciliation job carrying the normalized ("#"-prefixed) GUID of one cell. */
+    static protected class GuidBasedReconJob extends ReconJob {
+        String guid;
+
+        public int getKey() {
+            return guid.hashCode();
+        }
+    }
+
+    /**
+     * Builds a job from the cell's value, normalizing the text to the "#..."
+     * GUID form: a leading "/guid/" is replaced by "#", and any other value
+     * that does not already start with "#" gets one prepended.
+     */
+    @Override
+    public ReconJob createJob(Project project, int rowIndex, Row row,
+            String columnName, Cell cell) {
+
+        GuidBasedReconJob job = new GuidBasedReconJob();
+        // NOTE(review): assumes cell.value is non-null here - confirm against the caller.
+        String s = cell.value.toString();
+
+        if (s.startsWith("/guid/")) {
+            s = "#" + s.substring(6); // 6 == "/guid/".length()
+        } else if (!s.startsWith("#")) {
+            s = "#" + s;
+        }
+
+        job.guid = s;
+
+        return job;
+    }
+
+    /** Number of jobs handed to one {@link #batchRecon} call. */
+    @Override
+    public int getBatchSize() {
+        return 10;
+    }
+
+    @Override
+    public String getBriefDescription(Project project, String columnName) {
+        // Says "GUIDs" so this operation is distinguishable from the id-based one.
+        return "Reconcile cells in column " + columnName + " as Freebase GUIDs";
+    }
+
+    /**
+     * Serializes this config. The "match" value must be "guid": that is the
+     * value StrictReconConfig.reconstruct dispatches to this class on, whereas
+     * "id" would bring the config back as an IdBasedReconConfig.
+     */
+    public void write(JSONWriter writer, Properties options)
+            throws JSONException {
+
+        writer.object();
+        writer.key("mode"); writer.value("strict");
+        writer.key("match"); writer.value("guid");
+        writer.endObject();
+    }
+
+    /**
+     * Looks up the GUIDs of all jobs with one mqlread query and returns one
+     * entry per job, in job order. A job with no matching result gets a null
+     * entry; any failure is printed to stderr and leaves the jobs not yet
+     * resolved unmatched (null).
+     */
+    @Override
+    public List<Recon> batchRecon(List<ReconJob> jobs) {
+        List<Recon> recons = new ArrayList<Recon>(jobs.size());
+        Map<String, Recon> guidToRecon = new HashMap<String, Recon>();
+
+        try {
+            String query = null;
+            {
+                StringWriter stringWriter = new StringWriter();
+                JSONWriter jsonWriter = new JSONWriter(stringWriter);
+
+                // { "query" : [{ "id", "name", "guid", "type" : [], "guid|=" : [ ...job guids... ] }] }
+                jsonWriter.object();
+                jsonWriter.key("query");
+                    jsonWriter.array();
+                    jsonWriter.object();
+
+                        jsonWriter.key("id"); jsonWriter.value(null);
+                        jsonWriter.key("name"); jsonWriter.value(null);
+                        jsonWriter.key("guid"); jsonWriter.value(null);
+                        jsonWriter.key("type"); jsonWriter.array(); jsonWriter.endArray();
+
+                        jsonWriter.key("guid|=");
+                            jsonWriter.array();
+                            for (ReconJob job : jobs) {
+                                jsonWriter.value(((GuidBasedReconJob) job).guid);
+                            }
+                            jsonWriter.endArray();
+
+                    jsonWriter.endObject();
+                    jsonWriter.endArray();
+                jsonWriter.endObject();
+
+                query = stringWriter.toString();
+            }
+
+            StringBuilder sb = new StringBuilder();
+            sb.append(s_mqlreadService + "?query=");
+            sb.append(ParsingUtilities.encode(query));
+
+            URL url = new URL(sb.toString());
+            URLConnection connection = url.openConnection();
+            connection.setConnectTimeout(5000);
+            connection.connect();
+
+            InputStream is = connection.getInputStream();
+            try {
+                String s = ParsingUtilities.inputStreamToString(is);
+                JSONObject o = ParsingUtilities.evaluateJsonStringToObject(s);
+                JSONArray results = o.getJSONArray("result");
+                int count = results.length();
+
+                for (int i = 0; i < count; i++) {
+                    JSONObject result = results.getJSONObject(i);
+
+                    String guid = result.getString("guid");
+
+                    JSONArray types = result.getJSONArray("type");
+                    String[] typeIDs = new String[types.length()];
+                    for (int j = 0; j < typeIDs.length; j++) {
+                        typeIDs[j] = types.getString(j);
+                    }
+
+                    // The trailing 100 is presumably the candidate's score - TODO confirm
+                    // against the ReconCandidate constructor.
+                    ReconCandidate candidate = new ReconCandidate(
+                        result.getString("id"),
+                        guid,
+                        result.getString("name"),
+                        typeIDs,
+                        100
+                    );
+
+                    // A strict lookup either hits or misses, so a hit is recorded as matched.
+                    Recon recon = new Recon();
+                    recon.addCandidate(candidate);
+                    recon.match = candidate;
+                    recon.judgment = Judgment.Matched;
+
+                    guidToRecon.put(guid, recon);
+                }
+            } finally {
+                is.close();
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+
+        // Emit results in job order; jobs without a result contribute null.
+        for (int i = 0; i < jobs.size(); i++) {
+            String guid = ((GuidBasedReconJob) jobs.get(i)).guid;
+            Recon recon = guidToRecon.get(guid);
+            recons.add(recon);
+        }
+
+        return recons;
+    }
+
+}
diff --git a/src/main/java/com/metaweb/gridworks/model/recon/IdBasedReconConfig.java b/src/main/java/com/metaweb/gridworks/model/recon/IdBasedReconConfig.java
new file mode 100644
index 000000000..0c5d2cc66
--- /dev/null
+++ b/src/main/java/com/metaweb/gridworks/model/recon/IdBasedReconConfig.java
@@ -0,0 +1,178 @@
+package com.metaweb.gridworks.model.recon;
+
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.net.URL;
+import 
java.net.URLConnection;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+
+import org.json.JSONArray;
+import org.json.JSONException;
+import org.json.JSONObject;
+import org.json.JSONWriter;
+
+import com.metaweb.gridworks.model.Cell;
+import com.metaweb.gridworks.model.Project;
+import com.metaweb.gridworks.model.Recon;
+import com.metaweb.gridworks.model.ReconCandidate;
+import com.metaweb.gridworks.model.Row;
+import com.metaweb.gridworks.model.Recon.Judgment;
+import com.metaweb.gridworks.util.ParsingUtilities;
+
+/**
+ * Strict reconciliation config that treats each cell's text as a Freebase id
+ * and resolves a whole batch of cells with a single mqlread request.
+ */
+public class IdBasedReconConfig extends StrictReconConfig {
+    private static final long serialVersionUID = 1857895989346775294L;
+
+    /**
+     * Recreates the config from its serialized form. The JSON object is not
+     * inspected because this config has no settings of its own.
+     */
+    static public ReconConfig reconstruct(JSONObject obj) throws Exception {
+        return new IdBasedReconConfig();
+    }
+
+    public IdBasedReconConfig() {
+    }
+
+    /** Reconciliation job carrying the normalized id of one cell. */
+    static protected class IdBasedReconJob extends ReconJob {
+        String id;
+
+        public int getKey() {
+            return id.hashCode();
+        }
+    }
+
+    /**
+     * Builds a job from the cell's value, normalizing the text to an absolute
+     * id. A value that already starts with "/" is used unchanged. Otherwise a
+     * value starting with "92" is placed under "/guid/", a value containing no
+     * "/" is placed under "/en/", and anything else just gets a leading "/".
+     */
+    @Override
+    public ReconJob createJob(Project project, int rowIndex, Row row,
+            String columnName, Cell cell) {
+
+        IdBasedReconJob job = new IdBasedReconJob();
+        // NOTE(review): assumes cell.value is non-null here - confirm against the caller.
+        String s = cell.value.toString();
+
+        if (!s.startsWith("/")) {
+            if (s.startsWith("92")) {
+                s = "/guid/" + s;
+            } else if (!s.contains("/")){
+                s = "/en/" + s;
+            } else {
+                s = "/" + s;
+            }
+        }
+
+        job.id = s;
+
+        return job;
+    }
+
+    /** Number of jobs handed to one {@link #batchRecon} call. */
+    @Override
+    public int getBatchSize() {
+        return 10;
+    }
+
+    @Override
+    public String getBriefDescription(Project project, String columnName) {
+        return "Reconcile cells in column " + columnName + " as Freebase IDs";
+    }
+
+    /** Serializes this config as a strict-mode config that matches on "id". */
+    public void write(JSONWriter writer, Properties options)
+            throws JSONException {
+
+        writer.object();
+        writer.key("mode"); writer.value("strict");
+        writer.key("match"); writer.value("id");
+        writer.endObject();
+    }
+
+    /**
+     * Looks up the ids of all jobs with one mqlread query and returns one
+     * entry per job, in job order. A job with no matching result gets a null
+     * entry; any failure is printed to stderr and leaves the jobs not yet
+     * resolved unmatched (null).
+     */
+    @Override
+    public List<Recon> batchRecon(List<ReconJob> jobs) {
+        List<Recon> recons = new ArrayList<Recon>(jobs.size());
+        Map<String, Recon> idToRecon = new HashMap<String, Recon>();
+
+        try {
+            String query = null;
+            {
+                StringWriter stringWriter = new StringWriter();
+                JSONWriter jsonWriter = new JSONWriter(stringWriter);
+
+                // { "query" : [{ "id", "name", "guid", "type" : [], "id|=" : [ ...job ids... ] }] }
+                jsonWriter.object();
+                jsonWriter.key("query");
+                    jsonWriter.array();
+                    jsonWriter.object();
+
+                        jsonWriter.key("id"); jsonWriter.value(null);
+                        jsonWriter.key("name"); jsonWriter.value(null);
+                        jsonWriter.key("guid"); jsonWriter.value(null);
+                        jsonWriter.key("type"); jsonWriter.array(); jsonWriter.endArray();
+
+                        jsonWriter.key("id|=");
+                            jsonWriter.array();
+                            for (ReconJob job : jobs) {
+                                jsonWriter.value(((IdBasedReconJob) job).id);
+                            }
+                            jsonWriter.endArray();
+
+                    jsonWriter.endObject();
+                    jsonWriter.endArray();
+                jsonWriter.endObject();
+
+                query = stringWriter.toString();
+            }
+
+            StringBuilder sb = new StringBuilder();
+            sb.append(s_mqlreadService + "?query=");
+            sb.append(ParsingUtilities.encode(query));
+
+            URL url = new URL(sb.toString());
+            URLConnection connection = url.openConnection();
+            connection.setConnectTimeout(5000);
+            connection.connect();
+
+            InputStream is = connection.getInputStream();
+            try {
+                String s = ParsingUtilities.inputStreamToString(is);
+                JSONObject o = ParsingUtilities.evaluateJsonStringToObject(s);
+                JSONArray results = o.getJSONArray("result");
+                int count = results.length();
+
+                for (int i = 0; i < count; i++) {
+                    JSONObject result = results.getJSONObject(i);
+
+                    // NOTE(review): results are keyed by the id the service returns; if that
+                    // differs from the id that was requested (e.g. a "/guid/..." request
+                    // answered with another id), the lookup below will miss - confirm.
+                    String id = result.getString("id");
+
+                    JSONArray types = result.getJSONArray("type");
+                    String[] typeIDs = new String[types.length()];
+                    for (int j = 0; j < typeIDs.length; j++) {
+                        typeIDs[j] = types.getString(j);
+                    }
+
+                    // The trailing 100 is presumably the candidate's score - TODO confirm
+                    // against the ReconCandidate constructor.
+                    ReconCandidate candidate = new ReconCandidate(
+                        id,
+                        result.getString("guid"),
+                        result.getString("name"),
+                        typeIDs,
+                        100
+                    );
+
+                    // A strict lookup either hits or misses, so a hit is recorded as matched.
+                    Recon recon = new Recon();
+                    recon.addCandidate(candidate);
+                    recon.match = candidate;
+                    recon.judgment = Judgment.Matched;
+
+                    idToRecon.put(id, recon);
+                }
+            } finally {
+                is.close();
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+
+        // Emit results in job order; jobs without a result contribute null.
+        for (int i = 0; i < jobs.size(); i++) {
+            String id = ((IdBasedReconJob) jobs.get(i)).id;
+            Recon recon = idToRecon.get(id);
+            recons.add(recon);
+        }
+
+        return recons;
+    }
+
+}
diff --git a/src/main/java/com/metaweb/gridworks/model/recon/KeyBasedReconConfig.java b/src/main/java/com/metaweb/gridworks/model/recon/KeyBasedReconConfig.java index 427a323fc..f6087f0f1 100644 --- a/src/main/java/com/metaweb/gridworks/model/recon/KeyBasedReconConfig.java +++ b/src/main/java/com/metaweb/gridworks/model/recon/KeyBasedReconConfig.java @@ -15,14 +15,12 @@ import org.json.JSONException; import org.json.JSONObject; import org.json.JSONWriter; -import com.metaweb.gridworks.expr.ExpressionUtils; import com.metaweb.gridworks.model.Cell; import com.metaweb.gridworks.model.Project; import com.metaweb.gridworks.model.Recon; import com.metaweb.gridworks.model.ReconCandidate; import com.metaweb.gridworks.model.Row; import com.metaweb.gridworks.model.Recon.Judgment; -import com.metaweb.gridworks.model.recon.HeuristicReconConfig.ColumnDetail; import com.metaweb.gridworks.protograph.FreebaseTopic; import com.metaweb.gridworks.util.ParsingUtilities; @@ -72,7 +70,7 @@ public class KeyBasedReconConfig extends StrictReconConfig { @Override public String getBriefDescription(Project project, String columnName) { - return "Reconcile cells in column " + columnName + " topics with keys in namespace " + namespace.id; + return "Reconcile cells in column " + columnName + " to topics with keys in namespace " + namespace.id; } public void write(JSONWriter writer, Properties options) @@ -85,8 +83,6 @@ public class KeyBasedReconConfig extends StrictReconConfig { writer.endObject(); } - final static String s_mqlreadService = "http://api.freebase.com/api/service/mqlread"; - @Override public List batchRecon(List jobs) { List recons = new ArrayList(jobs.size()); @@ -171,7 +167,7 @@ public class KeyBasedReconConfig extends StrictReconConfig { ); Recon recon = new Recon(); - recon.candidates.add(candidate); + 
recon.addCandidate(candidate); recon.match = candidate; recon.judgment = Judgment.Matched; @@ -184,8 +180,10 @@ public class KeyBasedReconConfig extends StrictReconConfig { e.printStackTrace(); } - while (recons.size() < jobs.size()) { - recons.add(new Recon()); + for (int i = 0; i < jobs.size(); i++) { + String key = ((KeyBasedReconJob) jobs.get(i)).key; + Recon recon = keyToRecon.get(key); + recons.add(recon); } return recons; diff --git a/src/main/java/com/metaweb/gridworks/model/recon/StrictReconConfig.java b/src/main/java/com/metaweb/gridworks/model/recon/StrictReconConfig.java index ebba7f97e..cb16935b7 100644 --- a/src/main/java/com/metaweb/gridworks/model/recon/StrictReconConfig.java +++ b/src/main/java/com/metaweb/gridworks/model/recon/StrictReconConfig.java @@ -5,10 +5,16 @@ import org.json.JSONObject; abstract public class StrictReconConfig extends ReconConfig { private static final long serialVersionUID = 4454059850557793074L; + final static protected String s_mqlreadService = "http://api.freebase.com/api/service/mqlread"; + static public ReconConfig reconstruct(JSONObject obj) throws Exception { String match = obj.getString("match"); if ("key".equals(match)) { return KeyBasedReconConfig.reconstruct(obj); + } else if ("id".equals(match)) { + return IdBasedReconConfig.reconstruct(obj); + } else if ("guid".equals(match)) { + return GuidBasedReconConfig.reconstruct(obj); } return null; } diff --git a/src/main/webapp/scripts/dialogs/recon-dialog.js b/src/main/webapp/scripts/dialogs/recon-dialog.js index e91a44361..a5719f067 100644 --- a/src/main/webapp/scripts/dialogs/recon-dialog.js +++ b/src/main/webapp/scripts/dialogs/recon-dialog.js @@ -82,7 +82,7 @@ ReconDialog.prototype._createDialog = function() { '' + '' + '' + - '' + + '' + '' + '' + '' + @@ -246,4 +246,65 @@ ReconDialog.prototype._onDoHeuristic = function() { this._dismiss(); } +}; + +ReconDialog.prototype._onDoStrict = function() { + var bodyParams; + + var match = 
$('input[name="recon-dialog-strict-choice"]:checked')[0].value; + if (match == "key") { + var namespaceChoice = $('input[name="recon-dialog-strict-namespace-choice"]:checked')[0]; + var namespace; + + if (namespaceChoice.value == "other") { + var suggest = this._elmts.strictNamespaceInput.data("data.suggest"); + if (suggest == null) { + alert("Please specify a namespace."); + return; + } + namespace = { + id: suggest.id, + name: suggest.name + }; + } else { + namespace = { + id: namespaceChoice.value, + name: namespaceChoice.getAttribute("nsName") + }; + } + + bodyParams = { + columnName: this._column.headerLabel, + config: JSON.stringify({ + mode: "strict", + match: "key", + namespace: namespace + }) + }; + } else if (match == "id") { + bodyParams = { + columnName: this._column.headerLabel, + config: JSON.stringify({ + mode: "strict", + match: "id" + }) + }; + } else if (match == "guid") { + bodyParams = { + columnName: this._column.headerLabel, + config: JSON.stringify({ + mode: "strict", + match: "guid" + }) + }; + } + + Gridworks.postProcess( + "reconcile", + {}, + bodyParams, + { cellsChanged: true, columnStatsChanged: true } + ); + + this._dismiss(); }; \ No newline at end of file diff --git a/src/main/webapp/scripts/views/data-table-column-header-ui.js b/src/main/webapp/scripts/views/data-table-column-header-ui.js index 169a2b41a..fb4ca4397 100644 --- a/src/main/webapp/scripts/views/data-table-column-header-ui.js +++ b/src/main/webapp/scripts/views/data-table-column-header-ui.js @@ -434,37 +434,42 @@ DataTableColumnHeaderUI.prototype._doReconcile = function() { data.types = data.types.slice(0, 20); var ids = $.map(data.types, function(elmt) { return elmt.id; }); - var query = [{ - "id|=" : ids, - "id" : null, - "/freebase/type_profile/kind" : [] - }]; - $.getJSON( - "http://api.freebase.com/api/service/mqlread?" 
+ $.param({ "query" : JSON.stringify({ "query" : query }) }) + "&callback=?", - null, - function(o) { - dismissBusy(); - - var kindMap = {}; - $.each(o.result, function() { - var m = kindMap[this.id] = {}; - $.each(this["/freebase/type_profile/kind"], function() { - m[this] = true; + if (ids.length == 0) { + dismissBusy(); + new ReconDialog(self._column, []); + } else { + var query = [{ + "id|=" : ids, + "id" : null, + "/freebase/type_profile/kind" : [] + }]; + $.getJSON( + "http://api.freebase.com/api/service/mqlread?" + $.param({ "query" : JSON.stringify({ "query" : query }) }) + "&callback=?", + null, + function(o) { + dismissBusy(); + + var kindMap = {}; + $.each(o.result, function() { + var m = kindMap[this.id] = {}; + $.each(this["/freebase/type_profile/kind"], function() { + m[this] = true; + }); }); - }); - - new ReconDialog(self._column, $.map(data.types, function(type) { - if (type.id in kindMap) { - var m = kindMap[type.id]; - if (!("Role" in m) && !("Annotation" in m)) { - return type; + + new ReconDialog(self._column, $.map(data.types, function(type) { + if (type.id in kindMap) { + var m = kindMap[type.id]; + if (!("Role" in m) && !("Annotation" in m)) { + return type; + } } - } - return null; - })); - }, - "jsonp" - ); + return null; + })); + }, + "jsonp" + ); + } } }, "json" diff --git a/tests/presidents.tsv b/tests/presidents.tsv new file mode 100644 index 000000000..081bada23 --- /dev/null +++ b/tests/presidents.tsv @@ -0,0 +1,44 @@ +name guid id key +Abraham Lincoln #9202a8c04000641f8000000000003bcf /en/abraham_lincoln Abraham_Lincoln +Andrew Jackson #9202a8c04000641f8000000000005e5e /en/andrew_jackson Andrew_Jackson +Andrew Johnson #9202a8c04000641f8000000000005e6e /en/andrew_johnson Andrew_Johnson +Bill Clinton #9202a8c04000641f80000000000094f3 /en/bill_clinton Bill_Clinton +Benjamin Harrison #9202a8c04000641f800000000000a8c1 /en/benjamin_harrison Benjamin_Harrison +Chester A. 
Arthur #9202a8c04000641f800000000000e942 /en/chester_a_arthur Chester_A$002E_Arthur +Calvin Coolidge #9202a8c04000641f800000000000e952 /en/calvin_coolidge Calvin_Coolidge +Dwight D. Eisenhower #9202a8c04000641f80000000000122f1 /en/dwight_d_eisenhower Dwight_D$002E_Eisenhower +Franklin D. Roosevelt #9202a8c04000641f80000000000177a8 /en/franklin_d_roosevelt Franklin_D$002E_Roosevelt +Franklin Pierce #9202a8c04000641f80000000000178a1 /en/franklin_pierce Franklin_Pierce +George H. W. Bush #9202a8c04000641f8000000000019258 /en/george_h_w_bush George_H$002E_W$002E_Bush +George Washington #9202a8c04000641f80000000000192ec /en/george_washington George_Washington +Grover Cleveland #9202a8c04000641f800000000001a368 /en/grover_cleveland Grover_Cleveland +Herbert Hoover #9202a8c04000641f800000000001c592 /en/herbert_hoover Herbert_Hoover +John Adams #9202a8c04000641f800000000001fe18 /en/john_adams John_Adams +John Quincy Adams #9202a8c04000641f800000000001fe96 /en/john_quincy_adams John_Quincy_Adams +James Madison #9202a8c04000641f8000000000020893 /en/james_madison James_Madison +James Monroe #9202a8c04000641f8000000000020981 /en/james_monroe James_Monroe +John Tyler #9202a8c04000641f8000000000020991 /en/john_tyler John_Tyler +James K. Polk #9202a8c04000641f80000000000209a1 /en/james_k_polk James_K$002E_Polk +James Buchanan #9202a8c04000641f80000000000209b1 /en/james_buchanan James_Buchanan +Jimmy Carter #9202a8c04000641f8000000000020a2e /en/jimmy_carter Jimmy_Carter +Martin Van Buren #9202a8c04000641f8000000000027c13 /en/martin_van_buren Martin_Van_Buren +Millard Fillmore #9202a8c04000641f8000000000027c23 /en/millard_fillmore Millard_Fillmore +Ronald Reagan #9202a8c04000641f8000000000032c10 /en/ronald_reagan Ronald_Reagan +Richard Nixon #9202a8c04000641f8000000000032d27 /en/richard_nixon Richard_Nixon +Rutherford B. 
Hayes #9202a8c04000641f8000000000033882 /en/rutherford_b_hayes Rutherford_B$002E_Hayes +Thomas Jefferson #9202a8c04000641f800000000003ad58 /en/thomas_jefferson Thomas_Jefferson +Theodore Roosevelt #9202a8c04000641f800000000003bfb1 /en/theodore_roosevelt Theodore_Roosevelt +Ulysses S. Grant #9202a8c04000641f800000000003e451 /en/ulysses_s_grant Ulysses_S$002E_Grant +Warren G. Harding #9202a8c04000641f8000000000040726 /en/warren_g_harding Warren_G$002E_Harding +William Henry Harrison #9202a8c04000641f8000000000040cb6 /en/william_henry_harrison William_Henry_Harrison +William McKinley #9202a8c04000641f8000000000040ea7 /en/william_mckinley William_McKinley +William Howard Taft #9202a8c04000641f8000000000040eb7 /en/william_howard_taft William_Howard_Taft +Woodrow Wilson #9202a8c04000641f8000000000040ec7 /en/woodrow_wilson Woodrow_Wilson +Zachary Taylor #9202a8c04000641f80000000000424a9 /en/zachary_taylor Zachary_Taylor +James Garfield #9202a8c04000641f800000000005085b /en/james_garfield James_Garfield +Lyndon B. Johnson #9202a8c04000641f8000000000069dbd /en/lyndon_b_johnson Lyndon_B$002E_Johnson +Barack Obama #9202a8c04000641f800000000029c277 /en/barack_obama Barack_Obama +George W. Bush #9202a8c04000641f8000000000951bd7 /en/george_w_bush George_W$002E_Bush +Harry S. Truman #9202a8c04000641f8000000000953892 /en/harry_s_truman Harry_S$002E_Truman +Gerald Ford #9202a8c04000641f8000000000bfcd9f /en/gerald_ford Gerald_Ford +John F. Kennedy #9202a8c04000641f8000000000c1c424 /en/john_f_kennedy John_F$002E_Kennedy \ No newline at end of file
a Freebase key in
the Wikipedia English namespace