From 99ae6109d8e2d8c872690975dc37c244e9534ecd Mon Sep 17 00:00:00 2001
From: David Huynh
Date: Tue, 2 Mar 2010 03:31:58 +0000
Subject: [PATCH] Started work on key-based recon.

git-svn-id: http://google-refine.googlecode.com/svn/trunk@164 7d457c2a-affb-35e4-300a-418c747d4874
---
 .../model/recon/KeyBasedReconConfig.java      | 231 ++++++++++++++++++
 .../gridworks/model/recon/ReconConfig.java    |   2 +
 .../model/recon/StrictReconConfig.java        |  27 ++
 3 files changed, 260 insertions(+)
 create mode 100644 src/main/java/com/metaweb/gridworks/model/recon/KeyBasedReconConfig.java
 create mode 100644 src/main/java/com/metaweb/gridworks/model/recon/StrictReconConfig.java

diff --git a/src/main/java/com/metaweb/gridworks/model/recon/KeyBasedReconConfig.java b/src/main/java/com/metaweb/gridworks/model/recon/KeyBasedReconConfig.java
new file mode 100644
index 000000000..427a323fc
--- /dev/null
+++ b/src/main/java/com/metaweb/gridworks/model/recon/KeyBasedReconConfig.java
@@ -0,0 +1,231 @@
+package com.metaweb.gridworks.model.recon;
+
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.net.URL;
+import java.net.URLConnection;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+
+import org.json.JSONArray;
+import org.json.JSONException;
+import org.json.JSONObject;
+import org.json.JSONWriter;
+
+import com.metaweb.gridworks.expr.ExpressionUtils;
+import com.metaweb.gridworks.model.Cell;
+import com.metaweb.gridworks.model.Project;
+import com.metaweb.gridworks.model.Recon;
+import com.metaweb.gridworks.model.ReconCandidate;
+import com.metaweb.gridworks.model.Row;
+import com.metaweb.gridworks.model.Recon.Judgment;
+import com.metaweb.gridworks.model.recon.HeuristicReconConfig.ColumnDetail;
+import com.metaweb.gridworks.protograph.FreebaseTopic;
+import com.metaweb.gridworks.util.ParsingUtilities;
+
+/**
+ * Strict reconciliation configuration that matches cell values against keys
+ * in a single Freebase namespace.
+ */
+public class KeyBasedReconConfig extends StrictReconConfig {
+    private static final long serialVersionUID = 2363754609522023900L;
+
+    /** Namespace whose keys the cell values are matched against. */
+    final public FreebaseTopic namespace;
+
+    /**
+     * Rebuilds a configuration from its JSON form.
+     *
+     * @param obj JSON object with a "namespace" object holding "id" and "name"
+     * @return the reconstructed configuration
+     * @throws Exception if a required field is missing or has the wrong type
+     */
+    static public ReconConfig reconstruct(JSONObject obj) throws Exception {
+        JSONObject ns = obj.getJSONObject("namespace");
+
+        return new KeyBasedReconConfig(
+            new FreebaseTopic(
+                ns.getString("id"),
+                ns.getString("name")
+            )
+        );
+    }
+
+    public KeyBasedReconConfig(FreebaseTopic namespace) {
+        this.namespace = namespace;
+    }
+
+    /** Reconciliation job carrying the key derived from one cell. */
+    static protected class KeyBasedReconJob extends ReconJob {
+        String key;
+
+        public int getKey() {
+            return key.hashCode();
+        }
+    }
+
+    /**
+     * Creates a job whose key is the cell's value rendered as a string, with
+     * spaces replaced by underscores.
+     */
+    @Override
+    public ReconJob createJob(Project project, int rowIndex, Row row,
+            String columnName, Cell cell) {
+
+        KeyBasedReconJob job = new KeyBasedReconJob();
+
+        job.key = cell.value.toString().replace(' ', '_');
+
+        return job;
+    }
+
+    @Override
+    public int getBatchSize() {
+        return 10;
+    }
+
+    @Override
+    public String getBriefDescription(Project project, String columnName) {
+        return "Reconcile cells in column " + columnName + " topics with keys in namespace " + namespace.id;
+    }
+
+    /**
+     * Writes this configuration as a JSON object with the mode ("strict"),
+     * the match kind ("key") and the namespace.
+     */
+    public void write(JSONWriter writer, Properties options)
+        throws JSONException {
+
+        writer.object();
+        writer.key("mode"); writer.value("strict");
+        writer.key("match"); writer.value("key");
+        writer.key("namespace"); namespace.write(writer, options);
+        writer.endObject();
+    }
+
+    final static String s_mqlreadService = "http://api.freebase.com/api/service/mqlread";
+
+    /**
+     * Looks up the keys of all jobs with one mqlread query and returns one
+     * {@link Recon} per job, in job order.
+     *
+     * A job whose key is returned by the service gets a recon already matched
+     * to the returned topic. Every other job gets an empty recon, including
+     * jobs left unresolved because the request or response parsing failed.
+     *
+     * @param jobs jobs created by {@link #createJob}
+     * @return a list with exactly one recon for each element of {@code jobs}
+     */
+    @Override
+    public List<Recon> batchRecon(List<ReconJob> jobs) {
+        List<Recon> recons = new ArrayList<Recon>(jobs.size());
+        Map<String, Recon> keyToRecon = new HashMap<String, Recon>();
+
+        try {
+            String query = null;
+            {
+                StringWriter stringWriter = new StringWriter();
+                JSONWriter jsonWriter = new JSONWriter(stringWriter);
+
+                jsonWriter.object();
+                jsonWriter.key("query");
+                jsonWriter.array();
+                jsonWriter.object();
+
+                jsonWriter.key("id"); jsonWriter.value(null);
+                jsonWriter.key("name"); jsonWriter.value(null);
+                jsonWriter.key("guid"); jsonWriter.value(null);
+                jsonWriter.key("type"); jsonWriter.array(); jsonWriter.endArray();
+
+                jsonWriter.key("key");
+                jsonWriter.array();
+                jsonWriter.object();
+
+                jsonWriter.key("namespace");
+                jsonWriter.object();
+                jsonWriter.key("id"); jsonWriter.value(namespace.id);
+                jsonWriter.endObject();
+
+                jsonWriter.key("value"); jsonWriter.value(null);
+                jsonWriter.key("value|=");
+                jsonWriter.array();
+                for (ReconJob job : jobs) {
+                    jsonWriter.value(((KeyBasedReconJob) job).key);
+                }
+                jsonWriter.endArray();
+
+                jsonWriter.endObject();
+                jsonWriter.endArray();
+
+                jsonWriter.endObject();
+                jsonWriter.endArray();
+                jsonWriter.endObject();
+
+                query = stringWriter.toString();
+            }
+
+            StringBuffer sb = new StringBuffer();
+            sb.append(s_mqlreadService + "?query=");
+            sb.append(ParsingUtilities.encode(query));
+
+            URL url = new URL(sb.toString());
+            URLConnection connection = url.openConnection();
+            connection.setConnectTimeout(5000);
+            connection.connect();
+
+            InputStream is = connection.getInputStream();
+            try {
+                String s = ParsingUtilities.inputStreamToString(is);
+                JSONObject o = ParsingUtilities.evaluateJsonStringToObject(s);
+                JSONArray results = o.getJSONArray("result");
+                int count = results.length();
+
+                for (int i = 0; i < count; i++) {
+                    JSONObject result = results.getJSONObject(i);
+
+                    // Only the first key returned for a topic is recorded.
+                    String key = result.getJSONArray("key").getJSONObject(0).getString("value");
+
+                    JSONArray types = result.getJSONArray("type");
+                    String[] typeIDs = new String[types.length()];
+                    for (int j = 0; j < typeIDs.length; j++) {
+                        typeIDs[j] = types.getString(j);
+                    }
+
+                    ReconCandidate candidate = new ReconCandidate(
+                        result.getString("id"),
+                        result.getString("guid"),
+                        result.getString("name"),
+                        typeIDs,
+                        100
+                    );
+
+                    Recon recon = new Recon();
+                    recon.candidates.add(candidate);
+                    recon.match = candidate;
+                    recon.judgment = Judgment.Matched;
+
+                    keyToRecon.put(key, recon);
+                }
+            } finally {
+                is.close();
+            }
+        } catch (Exception e) {
+            // Best effort: a failed lookup leaves the affected jobs unmatched.
+            e.printStackTrace();
+        }
+
+        // Pair each job with the recon found for its key, or with an empty
+        // recon if none was found. Jobs with equal keys share one instance.
+        for (ReconJob job : jobs) {
+            Recon recon = keyToRecon.get(((KeyBasedReconJob) job).key);
+            recons.add(recon != null ? recon : new Recon());
+        }
+
+        return recons;
+    }
+
+}
diff --git a/src/main/java/com/metaweb/gridworks/model/recon/ReconConfig.java b/src/main/java/com/metaweb/gridworks/model/recon/ReconConfig.java
index 68c9c9e3e..1659b2dbd 100644
--- a/src/main/java/com/metaweb/gridworks/model/recon/ReconConfig.java
+++ b/src/main/java/com/metaweb/gridworks/model/recon/ReconConfig.java
@@ -18,6 +18,8 @@ abstract public class ReconConfig implements Serializable, Jsonizable {
         String mode = obj.getString("mode");
         if ("heuristic".equals(mode)) {
             return HeuristicReconConfig.reconstruct(obj);
+        } else if ("strict".equals(mode)) {
+            return StrictReconConfig.reconstruct(obj);
         }
         return null;
     }
diff --git a/src/main/java/com/metaweb/gridworks/model/recon/StrictReconConfig.java b/src/main/java/com/metaweb/gridworks/model/recon/StrictReconConfig.java
new file mode 100644
index 000000000..ebba7f97e
--- /dev/null
+++ b/src/main/java/com/metaweb/gridworks/model/recon/StrictReconConfig.java
@@ -0,0 +1,27 @@
+package com.metaweb.gridworks.model.recon;
+
+import org.json.JSONObject;
+
+/**
+ * Base class of the "strict" reconciliation configurations, which are
+ * told apart by the "match" field of their JSON form.
+ */
+abstract public class StrictReconConfig extends ReconConfig {
+    private static final long serialVersionUID = 4454059850557793074L;
+
+    /**
+     * Rebuilds a strict configuration from its JSON form.
+     *
+     * @param obj JSON object with a "match" field
+     * @return the configuration for the given match kind, or {@code null}
+     *         when the match kind is not recognized
+     * @throws Exception if "match" is missing or reconstruction fails
+     */
+    static public ReconConfig reconstruct(JSONObject obj) throws Exception {
+        String match = obj.getString("match");
+        if ("key".equals(match)) {
+            return KeyBasedReconConfig.reconstruct(obj);
+        }
+        return null;
+    }
+}