diff --git a/src/main/java/com/metaweb/gridworks/commands/recon/ReconcileCommand.java b/src/main/java/com/metaweb/gridworks/commands/recon/ReconcileCommand.java index 2dd9904f9..a1f67cf9a 100644 --- a/src/main/java/com/metaweb/gridworks/commands/recon/ReconcileCommand.java +++ b/src/main/java/com/metaweb/gridworks/commands/recon/ReconcileCommand.java @@ -3,9 +3,11 @@ package com.metaweb.gridworks.commands.recon; import javax.servlet.http.HttpServletRequest; import org.json.JSONObject; +import org.json.JSONTokener; import com.metaweb.gridworks.commands.EngineDependentCommand; import com.metaweb.gridworks.model.AbstractOperation; +import com.metaweb.gridworks.model.recon.ReconConfig; import com.metaweb.gridworks.operations.ReconOperation; public class ReconcileCommand extends EngineDependentCommand { @@ -15,11 +17,11 @@ public class ReconcileCommand extends EngineDependentCommand { JSONObject engineConfig) throws Exception { String columnName = request.getParameter("columnName"); - String typeID = request.getParameter("typeID"); - String typeName = request.getParameter("typeName"); - boolean autoMatch = "true".equals(request.getParameter("autoMatch")); - double minScore = autoMatch ? Double.parseDouble(request.getParameter("minScore")) : 0; + String configString = request.getParameter("config"); - return new ReconOperation(engineConfig, columnName, typeID, typeName, autoMatch, minScore); + JSONTokener t = new JSONTokener(configString); + JSONObject config = (JSONObject) t.nextValue(); + + return new ReconOperation(engineConfig, columnName, ReconConfig.reconstruct(config)); } } diff --git a/src/main/java/com/metaweb/gridworks/model/AbstractOperation.java b/src/main/java/com/metaweb/gridworks/model/AbstractOperation.java index 8bb96262b..0dcd40b8f 100644 --- a/src/main/java/com/metaweb/gridworks/model/AbstractOperation.java +++ b/src/main/java/com/metaweb/gridworks/model/AbstractOperation.java @@ -18,7 +18,7 @@ abstract public class AbstractOperation implements Serializable, Jsonizable { private static final long serialVersionUID = 3916055862440019600L; public Process createProcess(Project project, Properties options) throws Exception { - return new QuickHistoryEntryProcess(project, getBriefDescription()) { + return new QuickHistoryEntryProcess(project, getBriefDescription(null)) { @Override protected HistoryEntry createHistoryEntry() throws Exception { return AbstractOperation.this.createHistoryEntry(_project); @@ -30,7 +30,7 @@ abstract public class AbstractOperation implements Serializable, Jsonizable { throw new NotImplementedException(); } - protected String getBriefDescription() { + protected String getBriefDescription(Project project) { throw new NotImplementedException(); } } diff --git a/src/main/java/com/metaweb/gridworks/model/Column.java b/src/main/java/com/metaweb/gridworks/model/Column.java index b73e84ec6..0589a2c9e 100644 --- a/src/main/java/com/metaweb/gridworks/model/Column.java +++ b/src/main/java/com/metaweb/gridworks/model/Column.java @@ -9,6 +9,7 @@ import org.json.JSONException; import org.json.JSONWriter; import com.metaweb.gridworks.Jsonizable; +import com.metaweb.gridworks.model.recon.ReconConfig; public class Column implements Serializable, Jsonizable { private static final long serialVersionUID = -1063342490951563563L; diff --git a/src/main/java/com/metaweb/gridworks/model/ReconConfig.java b/src/main/java/com/metaweb/gridworks/model/ReconConfig.java deleted file mode 100644 index 426bf0d96..000000000 --- a/src/main/java/com/metaweb/gridworks/model/ReconConfig.java +++ /dev/null @@ -1,33 +0,0 @@ -package com.metaweb.gridworks.model; - -import java.io.Serializable; -import java.util.Properties; - -import org.json.JSONException; -import org.json.JSONWriter; - -import com.metaweb.gridworks.Jsonizable; - -public class ReconConfig implements Serializable, Jsonizable { - private static final long serialVersionUID = -4831409797104437854L; - - final public String typeID; - final public String typeName; - - public ReconConfig(String typeID, String typeName) { - this.typeID = typeID; - this.typeName = typeName; - } - - public void write(JSONWriter writer, Properties options) - throws JSONException { - - writer.object(); - writer.key("type"); - writer.object(); - writer.key("id"); writer.value(typeID); - writer.key("name"); writer.value(typeName); - writer.endObject(); - writer.endObject(); - } -} diff --git a/src/main/java/com/metaweb/gridworks/model/changes/ReconChange.java b/src/main/java/com/metaweb/gridworks/model/changes/ReconChange.java index 41e4afb49..391b6e032 100644 --- a/src/main/java/com/metaweb/gridworks/model/changes/ReconChange.java +++ b/src/main/java/com/metaweb/gridworks/model/changes/ReconChange.java @@ -7,8 +7,8 @@ import java.util.List; import com.metaweb.gridworks.model.Column; import com.metaweb.gridworks.model.Project; -import com.metaweb.gridworks.model.ReconConfig; import com.metaweb.gridworks.model.ReconStats; +import com.metaweb.gridworks.model.recon.ReconConfig; public class ReconChange extends MassCellChange { private static final long serialVersionUID = 7048806528587330543L; diff --git a/src/main/java/com/metaweb/gridworks/model/recon/HeuristicReconConfig.java b/src/main/java/com/metaweb/gridworks/model/recon/HeuristicReconConfig.java new file mode 100644 index 000000000..1a053458d --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/model/recon/HeuristicReconConfig.java @@ -0,0 +1,399 @@ +package com.metaweb.gridworks.model.recon; + +import java.io.InputStream; +import java.io.Serializable; +import java.io.StringWriter; +import java.net.URL; +import java.net.URLConnection; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Properties; +import java.util.Set; + +import org.apache.commons.lang.StringUtils; +import org.json.JSONArray; +import org.json.JSONException; +import org.json.JSONObject; +import org.json.JSONWriter; + +import com.metaweb.gridworks.model.Cell; +import com.metaweb.gridworks.model.Project; +import com.metaweb.gridworks.model.Recon; +import com.metaweb.gridworks.model.ReconCandidate; +import com.metaweb.gridworks.model.Row; +import com.metaweb.gridworks.model.Recon.Judgment; +import com.metaweb.gridworks.protograph.FreebaseProperty; +import com.metaweb.gridworks.util.ParsingUtilities; + +public class HeuristicReconConfig extends ReconConfig { + private static final long serialVersionUID = 423145327938373362L; + + static public class ColumnDetail implements Serializable { + private static final long serialVersionUID = -8996704822460155543L; + + final public String columnName; + final public FreebaseProperty property; + + public ColumnDetail(String columnName, FreebaseProperty property) { + this.columnName = columnName; + this.property = property; + } + } + + static public ReconConfig reconstruct(JSONObject obj) throws Exception { + List columnDetails = null; + if (obj.has("columnDetails")) { + JSONArray columnDetailsA = obj.getJSONArray("columnDetails"); + int l = columnDetailsA.length(); + + columnDetails = new ArrayList(l); + for (int i = 0; i < l; i++) { + JSONObject o = columnDetailsA.getJSONObject(i); + JSONObject p = o.getJSONObject("property"); + + columnDetails.add(new ColumnDetail( + o.getString("column"), + new FreebaseProperty( + p.getString("id"), + p.getString("name") + ) + )); + } + } else { + columnDetails = new ArrayList(); + } + + JSONObject t = obj.getJSONObject("type"); + + return new HeuristicReconConfig( + obj.getString("service"), + t.getString("id"), + t.getString("name"), + obj.getBoolean("autoMatch"), + columnDetails + ); + } + + static protected class HeuristicReconJob extends ReconJob { + String text; + + public int getKey() { + return text.hashCode(); + } + } + + final public String service; // either "recon" or "relevance" + final public String typeID; + final public String typeName; + final public boolean autoMatch; + final public List columnDetails; + + public HeuristicReconConfig( + String service, + String typeID, + String typeName, + boolean autoMatch, + List columnDetails + ) { + this.service = service; + this.typeID = typeID; + this.typeName = typeName; + this.autoMatch = autoMatch; + this.columnDetails = new ArrayList(); + } + + public void write(JSONWriter writer, Properties options) + throws JSONException { + + writer.object(); + writer.key("mode"); writer.value("heuristic"); + writer.key("service"); writer.value(service); + writer.key("type"); + writer.object(); + writer.key("id"); writer.value(typeID); + writer.key("name"); writer.value(typeName); + writer.endObject(); + writer.key("autoMatch"); writer.value(autoMatch); + writer.key("columnDetails"); + writer.array(); + for (ColumnDetail c : columnDetails) { + writer.object(); + writer.key("column"); writer.value(c.columnName); + writer.key("property"); c.property.write(writer, options); + writer.endObject(); + } + writer.endArray(); + writer.endObject(); + } + + @Override + public int getBatchSize() { + return 10; + } + + @Override + public String getBriefDescription(Project project, String columnName) { + return "Reconcile cells in column " + columnName + " to type " + typeID; + } + + @Override + public ReconJob createJob(Project project, int rowIndex, Row row, + String columnName, Cell cell) { + + HeuristicReconJob job = new HeuristicReconJob(); + + job.text = cell.value.toString(); + + return job; + } + + @Override + public List batchRecon(List jobs) { + if ("relevance".equals(service)) { + return batchReconUsingRelevance(jobs); + } else { + return batchReconUsingReconService(jobs); + } + } + + protected List batchReconUsingRelevance(List jobs) { + List recons = new ArrayList(jobs.size()); + + try { + StringWriter stringWriter = new StringWriter(); + JSONWriter jsonWriter = new JSONWriter(stringWriter); + + jsonWriter.object(); + for (int i = 0; i < jobs.size(); i++) { + HeuristicReconJob job = (HeuristicReconJob) jobs.get(i); + + jsonWriter.key("q" + i + ":search"); + + jsonWriter.object(); + jsonWriter.key("query"); jsonWriter.value(job.text); + jsonWriter.key("limit"); jsonWriter.value(3); + jsonWriter.key("type"); jsonWriter.value(typeID); + jsonWriter.key("type_strict"); jsonWriter.value("should"); + jsonWriter.key("type_exclude"); jsonWriter.value("/common/image"); + jsonWriter.key("domain_exclude"); jsonWriter.value("/freebase"); + jsonWriter.endObject(); + } + jsonWriter.endObject(); + + StringBuffer sb = new StringBuffer(); + sb.append("http://api.freebase.com/api/service/search?indent=1&queries="); + sb.append(ParsingUtilities.encode(stringWriter.toString())); + + URL url = new URL(sb.toString()); + URLConnection connection = url.openConnection(); + connection.setConnectTimeout(5000); + connection.connect(); + + InputStream is = connection.getInputStream(); + try { + String s = ParsingUtilities.inputStreamToString(is); + JSONObject o = ParsingUtilities.evaluateJsonStringToObject(s); + + for (int i = 0; i < jobs.size(); i++) { + HeuristicReconJob job = (HeuristicReconJob) jobs.get(i); + + String text = job.text; + String key = "q" + i + ":search"; + if (!o.has(key)) { + continue; + } + + Recon recon = null; + + JSONObject o2 = o.getJSONObject(key); + if (o2.has("result")) { + JSONArray results = o2.getJSONArray("result"); + + recon = createReconFromRelevanceResults(text, results); + } else { + recon = new Recon(); + } + + recons.add(recon); + } + } finally { + is.close(); + } + } catch (Exception e) { + e.printStackTrace(); + } + + System.gc(); + + return recons; + } + + protected Recon createReconFromRelevanceResults(String text, JSONArray results) { + Recon recon = new Recon(); + try { + int length = results.length(); + int count = 0; + for (int i = 0; i < length && count < 3; i++) { + JSONObject result = results.getJSONObject(i); + if (!result.has("name")) { + continue; + } + + JSONArray types = result.getJSONArray("type"); + String[] typeIDs = new String[types.length()]; + for (int j = 0; j < typeIDs.length; j++) { + typeIDs[j] = types.getJSONObject(j).getString("id"); + } + + double score = result.getDouble("relevance:score"); + ReconCandidate candidate = new ReconCandidate( + result.getString("id"), + result.getString("guid"), + result.getString("name"), + typeIDs, + score + ); + + // best match + if (i == 0) { + recon.setFeature(Recon.Feature_nameMatch, text.equalsIgnoreCase(candidate.topicName)); + recon.setFeature(Recon.Feature_nameLevenshtein, StringUtils.getLevenshteinDistance(text, candidate.topicName)); + recon.setFeature(Recon.Feature_nameWordDistance, wordDistance(text, candidate.topicName)); + + recon.setFeature(Recon.Feature_typeMatch, false); + for (String typeID : candidate.typeIDs) { + if (typeID.equals(typeID)) { + recon.setFeature(Recon.Feature_typeMatch, true); + if (autoMatch && score >= 100) { + recon.match = candidate; + recon.judgment = Judgment.Matched; + } + break; + } + } + } + + recon.addCandidate(candidate); + count++; + } + } catch (JSONException e) { + e.printStackTrace(); + } + return recon; + } + + protected List batchReconUsingReconService(List jobs) { + List recons = new ArrayList(jobs.size()); + + try { + StringWriter stringWriter = new StringWriter(); + JSONWriter jsonWriter = new JSONWriter(stringWriter); + + jsonWriter.object(); + for (int i = 0; i < jobs.size(); i++) { + HeuristicReconJob job = (HeuristicReconJob) jobs.get(i); + + jsonWriter.key("q" + i + ":search"); + + jsonWriter.object(); + jsonWriter.key("query"); jsonWriter.value(job.text); + jsonWriter.key("limit"); jsonWriter.value(3); + jsonWriter.key("type"); jsonWriter.value(typeID); + jsonWriter.key("type_strict"); jsonWriter.value("should"); + jsonWriter.key("type_exclude"); jsonWriter.value("/common/image"); + jsonWriter.key("domain_exclude"); jsonWriter.value("/freebase"); + jsonWriter.endObject(); + } + jsonWriter.endObject(); + + StringBuffer sb = new StringBuffer(); + sb.append("http://api.freebase.com/api/service/search?indent=1&queries="); + sb.append(ParsingUtilities.encode(stringWriter.toString())); + + URL url = new URL(sb.toString()); + URLConnection connection = url.openConnection(); + connection.setConnectTimeout(5000); + connection.connect(); + + InputStream is = connection.getInputStream(); + try { + String s = ParsingUtilities.inputStreamToString(is); + JSONObject o = ParsingUtilities.evaluateJsonStringToObject(s); + + for (int i = 0; i < jobs.size(); i++) { + HeuristicReconJob job = (HeuristicReconJob) jobs.get(i); + + String text = job.text; + String key = "q" + i + ":search"; + if (!o.has(key)) { + continue; + } + + Recon recon = null; + + JSONObject o2 = o.getJSONObject(key); + if (o2.has("result")) { + JSONArray results = o2.getJSONArray("result"); + + recon = createReconFromRelevanceResults(text, results); + } else { + recon = new Recon(); + } + + recons.add(recon); + } + } finally { + is.close(); + } + } catch (Exception e) { + e.printStackTrace(); + } + + System.gc(); + + return recons; + } + + + static protected double wordDistance(String s1, String s2) { + Set words1 = breakWords(s1); + Set words2 = breakWords(s2); + return words1.size() >= words2.size() ? wordDistance(words1, words2) : wordDistance(words2, words1); + } + + static protected double wordDistance(Set longWords, Set shortWords) { + double common = 0; + for (String word : shortWords) { + if (longWords.contains(word)) { + common++; + } + } + return common / longWords.size(); + } + + static protected Set s_stopWords; + static { + s_stopWords = new HashSet(); + s_stopWords.add("the"); + s_stopWords.add("a"); + s_stopWords.add("and"); + s_stopWords.add("of"); + s_stopWords.add("on"); + s_stopWords.add("in"); + s_stopWords.add("at"); + s_stopWords.add("by"); + } + + static protected Set breakWords(String s) { + String[] words = s.toLowerCase().split("\\s+"); + + Set set = new HashSet(words.length); + for (String word : words) { + if (!s_stopWords.contains(word)) { + set.add(word); + } + } + return set; + } +} diff --git a/src/main/java/com/metaweb/gridworks/model/recon/ReconConfig.java b/src/main/java/com/metaweb/gridworks/model/recon/ReconConfig.java new file mode 100644 index 000000000..68c9c9e3e --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/model/recon/ReconConfig.java @@ -0,0 +1,38 @@ +package com.metaweb.gridworks.model.recon; + +import java.io.Serializable; +import java.util.List; + +import org.json.JSONObject; + +import com.metaweb.gridworks.Jsonizable; +import com.metaweb.gridworks.model.Cell; +import com.metaweb.gridworks.model.Project; +import com.metaweb.gridworks.model.Recon; +import com.metaweb.gridworks.model.Row; + +abstract public class ReconConfig implements Serializable, Jsonizable { + private static final long serialVersionUID = -4831409797104437854L; + + static public ReconConfig reconstruct(JSONObject obj) throws Exception { + String mode = obj.getString("mode"); + if ("heuristic".equals(mode)) { + return HeuristicReconConfig.reconstruct(obj); + } + return null; + } + + abstract public int getBatchSize(); + + abstract public String getBriefDescription(Project project, String columnName); + + abstract public ReconJob createJob( + Project project, + int rowIndex, + Row row, + String columnName, + Cell cell + ); + + abstract public List batchRecon(List jobs); +} diff --git a/src/main/java/com/metaweb/gridworks/model/recon/ReconJob.java b/src/main/java/com/metaweb/gridworks/model/recon/ReconJob.java new file mode 100644 index 000000000..7e8becbe8 --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/model/recon/ReconJob.java @@ -0,0 +1,5 @@ +package com.metaweb.gridworks.model.recon; + +abstract public class ReconJob { + abstract public int getKey(); +} diff --git a/src/main/java/com/metaweb/gridworks/operations/ColumnAdditionOperation.java b/src/main/java/com/metaweb/gridworks/operations/ColumnAdditionOperation.java index c4b09f556..b012c6d1c 100644 --- a/src/main/java/com/metaweb/gridworks/operations/ColumnAdditionOperation.java +++ b/src/main/java/com/metaweb/gridworks/operations/ColumnAdditionOperation.java @@ -66,7 +66,7 @@ public class ColumnAdditionOperation extends EngineDependentOperation { writer.object(); writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass())); - writer.key("description"); writer.value(getBriefDescription()); + writer.key("description"); writer.value(getBriefDescription(null)); writer.key("engineConfig"); writer.value(getEngineConfig()); writer.key("headerLabel"); writer.value(_headerLabel); writer.key("columnInsertIndex"); writer.value(_columnInsertIndex); @@ -75,7 +75,7 @@ public class ColumnAdditionOperation extends EngineDependentOperation { writer.endObject(); } - protected String getBriefDescription() { + protected String getBriefDescription(Project project) { return "Create column " + _headerLabel + " at index " + _columnInsertIndex + " based on column " + _baseColumnName + diff --git a/src/main/java/com/metaweb/gridworks/operations/ColumnRemovalOperation.java b/src/main/java/com/metaweb/gridworks/operations/ColumnRemovalOperation.java index 40d72bb6d..d2dc08a28 100644 --- a/src/main/java/com/metaweb/gridworks/operations/ColumnRemovalOperation.java +++ b/src/main/java/com/metaweb/gridworks/operations/ColumnRemovalOperation.java @@ -41,7 +41,7 @@ public class ColumnRemovalOperation extends AbstractOperation { } - protected String getBriefDescription() { + protected String getBriefDescription(Project project) { return "Remove column " + _columnName; } diff --git a/src/main/java/com/metaweb/gridworks/operations/MultiValuedCellJoinOperation.java b/src/main/java/com/metaweb/gridworks/operations/MultiValuedCellJoinOperation.java index 6cf0a4e22..a983f5431 100644 --- a/src/main/java/com/metaweb/gridworks/operations/MultiValuedCellJoinOperation.java +++ b/src/main/java/com/metaweb/gridworks/operations/MultiValuedCellJoinOperation.java @@ -47,14 +47,14 @@ public class MultiValuedCellJoinOperation extends AbstractOperation { writer.object(); writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass())); - writer.key("description"); writer.value(getBriefDescription()); + writer.key("description"); writer.value(getBriefDescription(null)); writer.key("columnName"); writer.value(_columnName); writer.key("keyColumnName"); writer.value(_keyColumnName); writer.key("separator"); writer.value(_separator); writer.endObject(); } - protected String getBriefDescription() { + protected String getBriefDescription(Project project) { return "Join multi-valued cells in column " + _columnName; } @@ -121,7 +121,7 @@ public class MultiValuedCellJoinOperation extends AbstractOperation { return new HistoryEntry( project, - getBriefDescription(), + getBriefDescription(null), this, new MassRowChange(newRows) ); diff --git a/src/main/java/com/metaweb/gridworks/operations/MultiValuedCellSplitOperation.java b/src/main/java/com/metaweb/gridworks/operations/MultiValuedCellSplitOperation.java index 946dcbcb9..132d300d2 100644 --- a/src/main/java/com/metaweb/gridworks/operations/MultiValuedCellSplitOperation.java +++ b/src/main/java/com/metaweb/gridworks/operations/MultiValuedCellSplitOperation.java @@ -59,7 +59,7 @@ public class MultiValuedCellSplitOperation extends AbstractOperation { writer.endObject(); } - protected String getBriefDescription() { + protected String getBriefDescription(Project project) { return "Split multi-valued cells in column " + _columnName; } @@ -139,7 +139,7 @@ public class MultiValuedCellSplitOperation extends AbstractOperation { return new HistoryEntry( project, - getBriefDescription(), + getBriefDescription(null), this, new MassRowChange(newRows) ); diff --git a/src/main/java/com/metaweb/gridworks/operations/ReconDiscardJudgmentsOperation.java b/src/main/java/com/metaweb/gridworks/operations/ReconDiscardJudgmentsOperation.java index 257e89d31..71fefe7ad 100644 --- a/src/main/java/com/metaweb/gridworks/operations/ReconDiscardJudgmentsOperation.java +++ b/src/main/java/com/metaweb/gridworks/operations/ReconDiscardJudgmentsOperation.java @@ -41,13 +41,13 @@ public class ReconDiscardJudgmentsOperation extends EngineDependentMassCellOpera writer.object(); writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass())); - writer.key("description"); writer.value(getBriefDescription()); + writer.key("description"); writer.value(getBriefDescription(null)); writer.key("engineConfig"); writer.value(getEngineConfig()); writer.key("columnName"); writer.value(_columnName); writer.endObject(); } - protected String getBriefDescription() { + protected String getBriefDescription(Project project) { return "Discard recon judgments for cells in column " + _columnName; } diff --git a/src/main/java/com/metaweb/gridworks/operations/ReconJudgeSimilarCellsOperation.java b/src/main/java/com/metaweb/gridworks/operations/ReconJudgeSimilarCellsOperation.java index 33cd2634f..eb5a9d75e 100644 --- a/src/main/java/com/metaweb/gridworks/operations/ReconJudgeSimilarCellsOperation.java +++ b/src/main/java/com/metaweb/gridworks/operations/ReconJudgeSimilarCellsOperation.java @@ -89,7 +89,7 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper writer.object(); writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass())); - writer.key("description"); writer.value(getBriefDescription()); + writer.key("description"); writer.value(getBriefDescription(null)); writer.key("engineConfig"); writer.value(getEngineConfig()); writer.key("columnName"); writer.value(_columnName); writer.key("similarValue"); writer.value(_similarValue); @@ -102,7 +102,7 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper writer.endObject(); } - protected String getBriefDescription() { + protected String getBriefDescription(Project project) { if (_judgment == Judgment.None) { return "Discard recon judgments for cells containing \"" + _similarValue + "\" in column " + _columnName; diff --git a/src/main/java/com/metaweb/gridworks/operations/ReconMarkNewTopicsOperation.java b/src/main/java/com/metaweb/gridworks/operations/ReconMarkNewTopicsOperation.java index 25b61a7c8..a5d9277a3 100644 --- a/src/main/java/com/metaweb/gridworks/operations/ReconMarkNewTopicsOperation.java +++ b/src/main/java/com/metaweb/gridworks/operations/ReconMarkNewTopicsOperation.java @@ -46,14 +46,14 @@ public class ReconMarkNewTopicsOperation extends EngineDependentMassCellOperatio writer.object(); writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass())); - writer.key("description"); writer.value(getBriefDescription()); + writer.key("description"); writer.value(getBriefDescription(null)); writer.key("engineConfig"); writer.value(getEngineConfig()); writer.key("columnName"); writer.value(_columnName); writer.key("shareNewTopics"); writer.value(_shareNewTopics); writer.endObject(); } - protected String getBriefDescription() { + protected String getBriefDescription(Project project) { return "Mark to create new topics for cells in column " + _columnName + (_shareNewTopics ? ", one topic for each group of similar cells" : diff --git a/src/main/java/com/metaweb/gridworks/operations/ReconMatchBestCandidatesOperation.java b/src/main/java/com/metaweb/gridworks/operations/ReconMatchBestCandidatesOperation.java index 9fb80239f..5cbbbfc97 100644 --- a/src/main/java/com/metaweb/gridworks/operations/ReconMatchBestCandidatesOperation.java +++ b/src/main/java/com/metaweb/gridworks/operations/ReconMatchBestCandidatesOperation.java @@ -41,13 +41,13 @@ public class ReconMatchBestCandidatesOperation extends EngineDependentMassCellOp writer.object(); writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass())); - writer.key("description"); writer.value(getBriefDescription()); + writer.key("description"); writer.value(getBriefDescription(null)); writer.key("engineConfig"); writer.value(getEngineConfig()); writer.key("columnName"); writer.value(_columnName); writer.endObject(); } - protected String getBriefDescription() { + protected String getBriefDescription(Project project) { return "Match each cell to its best recon candidate in column " + _columnName; } diff --git a/src/main/java/com/metaweb/gridworks/operations/ReconMatchSpecificTopicOperation.java b/src/main/java/com/metaweb/gridworks/operations/ReconMatchSpecificTopicOperation.java index a490e2e26..332dfa0d6 100644 --- a/src/main/java/com/metaweb/gridworks/operations/ReconMatchSpecificTopicOperation.java +++ b/src/main/java/com/metaweb/gridworks/operations/ReconMatchSpecificTopicOperation.java @@ -60,7 +60,7 @@ public class ReconMatchSpecificTopicOperation extends EngineDependentMassCellOpe writer.object(); writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass())); - writer.key("description"); writer.value(getBriefDescription()); + writer.key("description"); writer.value(getBriefDescription(null)); writer.key("engineConfig"); writer.value(getEngineConfig()); writer.key("columnName"); writer.value(_columnName); writer.key("match"); @@ -78,7 +78,7 @@ public class ReconMatchSpecificTopicOperation extends EngineDependentMassCellOpe writer.endObject(); } - protected String getBriefDescription() { + protected String getBriefDescription(Project project) { return "Match specific topic " + match.topicName + " (" + match.topicID + ") to cells in column " + _columnName; diff --git a/src/main/java/com/metaweb/gridworks/operations/ReconOperation.java b/src/main/java/com/metaweb/gridworks/operations/ReconOperation.java index af1e92c88..5d0c22a73 100644 --- a/src/main/java/com/metaweb/gridworks/operations/ReconOperation.java +++ b/src/main/java/com/metaweb/gridworks/operations/ReconOperation.java @@ -1,20 +1,11 @@ package com.metaweb.gridworks.operations; -import java.io.InputStream; -import java.io.StringWriter; -import java.net.URL; -import java.net.URLConnection; import java.util.ArrayList; import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Properties; -import java.util.Set; -import org.apache.commons.lang.StringUtils; -import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; import org.json.JSONWriter; @@ -30,24 +21,19 @@ import com.metaweb.gridworks.model.Cell; import com.metaweb.gridworks.model.Column; import com.metaweb.gridworks.model.Project; import com.metaweb.gridworks.model.Recon; -import com.metaweb.gridworks.model.ReconCandidate; -import com.metaweb.gridworks.model.ReconConfig; import com.metaweb.gridworks.model.Row; -import com.metaweb.gridworks.model.Recon.Judgment; import com.metaweb.gridworks.model.changes.CellChange; import com.metaweb.gridworks.model.changes.ReconChange; +import com.metaweb.gridworks.model.recon.ReconConfig; +import com.metaweb.gridworks.model.recon.ReconJob; import com.metaweb.gridworks.process.LongRunningProcess; import com.metaweb.gridworks.process.Process; -import com.metaweb.gridworks.util.ParsingUtilities; public class ReconOperation extends EngineDependentOperation { private static final long serialVersionUID = 838795186905314865L; final protected String _columnName; - final protected String _typeID; - final protected String _typeName; - final protected boolean _autoMatch; - final protected double _minScore; + final protected ReconConfig _reconConfig; static public AbstractOperation reconstruct(Project project, JSONObject obj) throws Exception { JSONObject engineConfig = obj.getJSONObject("engineConfig"); @@ -55,39 +41,30 @@ public class ReconOperation extends EngineDependentOperation { return new ReconOperation( engineConfig, obj.getString("columnName"), - obj.getString("typeID"), - obj.getString("typeName"), - obj.getBoolean("autoMatch"), - obj.getDouble("minScore") + ReconConfig.reconstruct(obj.getJSONObject("config")) ); } public ReconOperation( JSONObject engineConfig, String columnName, - String typeID, - String typeName, - boolean autoMatch, - double minScore + ReconConfig reconConfig ) { super(engineConfig); _columnName = columnName; - _typeID = typeID; - _typeName = typeName; - _autoMatch = autoMatch; - _minScore = minScore; + _reconConfig = reconConfig; } public Process createProcess(Project project, Properties options) throws Exception { return new ReconProcess( project, getEngineConfig(), - getBriefDescription() + getBriefDescription(null) ); } - protected String getBriefDescription() { - return "Reconcile cells in column " + _columnName + " to type " + _typeID; + protected String getBriefDescription(Project project) { + return _reconConfig.getBriefDescription(project, _columnName); } public void write(JSONWriter writer, Properties options) @@ -95,12 +72,9 @@ public class ReconOperation extends EngineDependentOperation { writer.object(); writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass())); - writer.key("description"); writer.value(getBriefDescription()); + writer.key("description"); writer.value(getBriefDescription(null)); writer.key("columnName"); writer.value(_columnName); - writer.key("typeID"); writer.value(_typeID); - writer.key("typeName"); writer.value(_typeName); - writer.key("autoMatch"); writer.value(_autoMatch); - writer.key("minScore"); writer.value(_minScore); + writer.key("config"); _reconConfig.write(writer, options); writer.key("engineConfig"); writer.value(getEngineConfig()); writer.endObject(); } @@ -114,6 +88,14 @@ public class ReconOperation extends EngineDependentOperation { this.cell = cell; } } + static protected class JobGroup { + final public ReconJob job; + final public List entries = new ArrayList(); + + public JobGroup(ReconJob job) { + this.job = job; + } + } public class ReconProcess extends LongRunningProcess implements Runnable { final protected Project _project; @@ -169,31 +151,60 @@ public class ReconOperation extends EngineDependentOperation { e2.printStackTrace(); } - Map> valueToEntries = new HashMap>(); + Map jobKeyToGroup = new HashMap(); for (ReconEntry entry : _entries) { - Object value = entry.cell.value; - if (value != null && value instanceof String) { - List entries2; - if (valueToEntries.containsKey(value)) { - entries2 = valueToEntries.get(value); - } else { - entries2 = new LinkedList(); - valueToEntries.put((String) value, entries2); - } - entries2.add(entry); + ReconJob job = _reconConfig.createJob( + _project, + entry.rowIndex, + _project.rows.get(entry.rowIndex), + _columnName, + entry.cell + ); + + int key = job.getKey(); + JobGroup group = jobKeyToGroup.get(key); + if (group == null) { + group = new JobGroup(job); + jobKeyToGroup.put(key, group); } + group.entries.add(entry); } List cellChanges = new ArrayList(_entries.size()); - List values = new ArrayList(valueToEntries.keySet()); + List groups = new ArrayList(jobKeyToGroup.values()); - final int batchSize = 10; - for (int i = 0; i < values.size(); i += batchSize) { - recon(valueToEntries, values, i, Math.min(i + batchSize, values.size()), cellChanges); + int batchSize = _reconConfig.getBatchSize(); + for (int i = 0; i < groups.size(); i += batchSize) { + int to = Math.min(i + batchSize, groups.size()); - _progress = i * 100 / values.size(); + List jobs = new ArrayList(to - i); + for (int j = i; j < to; j++) { + jobs.add(groups.get(j).job); + } + List recons = _reconConfig.batchRecon(jobs); + for (int j = i; j < to; j++) { + Recon recon = recons.get(j - i); + if (recon == null) { + recon = new Recon(); + } + + for (ReconEntry entry : groups.get(j).entries) { + Cell oldCell = entry.cell; + Cell newCell = new Cell(oldCell.value, recon); + + CellChange cellChange = new CellChange( + entry.rowIndex, + _cellIndex, + oldCell, + newCell + ); + cellChanges.add(cellChange); + } + } + + _progress = i * 100 / groups.size(); try { Thread.sleep(50); } catch (InterruptedException e) { @@ -206,7 +217,7 @@ public class ReconOperation extends EngineDependentOperation { Change reconChange = new ReconChange( cellChanges, _columnName, - new ReconConfig(_typeID, _typeName), + _reconConfig, null ); @@ -220,204 +231,5 @@ public class ReconOperation extends EngineDependentOperation { _project.history.addEntry(historyEntry); _project.processManager.onDoneProcess(this); } - - protected void recon( - Map> valueToEntries, - List values, - int from, - int to, - List cellChanges - ) { - try { - StringWriter stringWriter = new StringWriter(); - JSONWriter jsonWriter = new JSONWriter(stringWriter); - - jsonWriter.object(); - for (int i = 0; from + i < to; i++) { - jsonWriter.key("q" + i + ":search"); - - jsonWriter.object(); - - jsonWriter.key("query"); jsonWriter.value(values.get(from + i)); - jsonWriter.key("limit"); jsonWriter.value(3); - jsonWriter.key("type"); jsonWriter.value(_typeID); - jsonWriter.key("type_strict"); jsonWriter.value("should"); - //jsonWriter.key("indent"); jsonWriter.value(1); - jsonWriter.key("type_exclude"); jsonWriter.value("/common/image"); - jsonWriter.key("domain_exclude"); jsonWriter.value("/freebase"); - - jsonWriter.endObject(); - } - jsonWriter.endObject(); - - StringBuffer sb = new StringBuffer(); - sb.append("http://api.freebase.com/api/service/search?indent=1&queries="); - sb.append(ParsingUtilities.encode(stringWriter.toString())); - - URL url = new URL(sb.toString()); - URLConnection connection = url.openConnection(); - connection.setConnectTimeout(5000); - connection.connect(); - - InputStream is = connection.getInputStream(); - try { - String s = ParsingUtilities.inputStreamToString(is); - JSONObject o = ParsingUtilities.evaluateJsonStringToObject(s); - - for (int i = 0; from + i < to; i++) { - String value = values.get(from + i); - String key = "q" + i + ":search"; - if (!o.has(key)) { - continue; - } - - Recon recon; - - JSONObject o2 = o.getJSONObject(key); - if (o2.has("result")) { - JSONArray results = o2.getJSONArray("result"); - - recon = createRecon(value, results); - } else { - recon = new Recon(); - } - - for (ReconEntry entry : valueToEntries.get(value)) { - Cell oldCell = entry.cell; - - Cell newCell = new Cell(oldCell.value, recon); - - CellChange cellChange = new CellChange( - entry.rowIndex, - _cellIndex, - oldCell, - newCell - ); - cellChanges.add(cellChange); - } - - valueToEntries.remove(value); - } - } finally { - is.close(); - } - } catch (Exception e) { - e.printStackTrace(); - } - - for (List entries : valueToEntries.values()) { - Recon recon = new Recon(); - - for (ReconEntry entry : entries) { - Cell oldCell = entry.cell; - Cell newCell = new Cell(oldCell.value, recon); - - CellChange cellChange = new CellChange( - entry.rowIndex, - _cellIndex, - oldCell, - newCell - ); - cellChanges.add(cellChange); - } - } - - System.gc(); - } - - protected Recon createRecon(String text, JSONArray results) { - Recon recon = new Recon(); - try { - int length = results.length(); - int count = 0; - for (int i = 0; i < length && count < 3; i++) { - JSONObject result = results.getJSONObject(i); - if (!result.has("name")) { - continue; - } - - JSONArray types = result.getJSONArray("type"); - String[] typeIDs = new String[types.length()]; - for (int j = 0; j < typeIDs.length; j++) { - typeIDs[j] = types.getJSONObject(j).getString("id"); - } - - double score = result.getDouble("relevance:score"); - ReconCandidate candidate = new ReconCandidate( - result.getString("id"), - result.getString("guid"), - result.getString("name"), - typeIDs, - score - ); - - // best match - if (i == 0) { - recon.setFeature(Recon.Feature_nameMatch, text.equalsIgnoreCase(candidate.topicName)); - recon.setFeature(Recon.Feature_nameLevenshtein, StringUtils.getLevenshteinDistance(text, candidate.topicName)); - recon.setFeature(Recon.Feature_nameWordDistance, wordDistance(text, candidate.topicName)); - - recon.setFeature(Recon.Feature_typeMatch, false); - for (String typeID : candidate.typeIDs) { - if (_typeID.equals(typeID)) { - recon.setFeature(Recon.Feature_typeMatch, true); - if (_autoMatch && score >= _minScore) { - recon.match = candidate; - recon.judgment = Judgment.Matched; - } - break; - } - } - } - - recon.addCandidate(candidate); - count++; - } - } catch (JSONException e) { - e.printStackTrace(); - } - return recon; - } - } - - static protected double wordDistance(String s1, String s2) { - Set words1 = breakWords(s1); - Set words2 = breakWords(s2); - return words1.size() >= words2.size() ? wordDistance(words1, words2) : wordDistance(words2, words1); - } - - static protected double wordDistance(Set longWords, Set shortWords) { - double common = 0; - for (String word : shortWords) { - if (longWords.contains(word)) { - common++; - } - } - return common / longWords.size(); - } - - static protected Set s_stopWords; - static { - s_stopWords = new HashSet(); - s_stopWords.add("the"); - s_stopWords.add("a"); - s_stopWords.add("and"); - s_stopWords.add("of"); - s_stopWords.add("on"); - s_stopWords.add("in"); - s_stopWords.add("at"); - s_stopWords.add("by"); - } - - static protected Set breakWords(String s) { - String[] words = s.toLowerCase().split("\\s+"); - - Set set = new HashSet(words.length); - for (String word : words) { - if (!s_stopWords.contains(word)) { - set.add(word); - } - } - return set; } } diff --git a/src/main/java/com/metaweb/gridworks/operations/RowStarOperation.java b/src/main/java/com/metaweb/gridworks/operations/RowStarOperation.java index 8dcea557d..331184218 100644 --- a/src/main/java/com/metaweb/gridworks/operations/RowStarOperation.java +++ b/src/main/java/com/metaweb/gridworks/operations/RowStarOperation.java @@ -44,13 +44,13 @@ public class RowStarOperation extends EngineDependentOperation { writer.object(); writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass())); - writer.key("description"); writer.value(getBriefDescription()); + writer.key("description"); writer.value(getBriefDescription(null)); writer.key("engineConfig"); writer.value(getEngineConfig()); writer.key("starred"); writer.value(_starred); writer.endObject(); } - protected String getBriefDescription() { + protected String getBriefDescription(Project project) { return (_starred ? "Star rows" : "Unstar rows"); } diff --git a/src/main/java/com/metaweb/gridworks/operations/SaveProtographOperation.java b/src/main/java/com/metaweb/gridworks/operations/SaveProtographOperation.java index ac9251424..5bad8b721 100644 --- a/src/main/java/com/metaweb/gridworks/operations/SaveProtographOperation.java +++ b/src/main/java/com/metaweb/gridworks/operations/SaveProtographOperation.java @@ -39,7 +39,7 @@ public class SaveProtographOperation extends AbstractOperation { writer.endObject(); } - protected String getBriefDescription() { + protected String getBriefDescription(Project project) { return "Save schema skeleton"; } diff --git a/src/main/java/com/metaweb/gridworks/operations/TextTransformOperation.java b/src/main/java/com/metaweb/gridworks/operations/TextTransformOperation.java index 0a60ab0a0..d803c7da3 100644 --- a/src/main/java/com/metaweb/gridworks/operations/TextTransformOperation.java +++ b/src/main/java/com/metaweb/gridworks/operations/TextTransformOperation.java @@ -43,14 +43,14 @@ public class TextTransformOperation extends EngineDependentMassCellOperation { writer.object(); writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass())); - writer.key("description"); writer.value(getBriefDescription()); + writer.key("description"); writer.value(getBriefDescription(null)); writer.key("engineConfig"); writer.value(getEngineConfig()); writer.key("columnName"); writer.value(_columnName); writer.key("expression"); writer.value(_expression); writer.endObject(); } - protected String getBriefDescription() { + protected String getBriefDescription(Project project) { return "Text transform on cells in column " + _columnName + " using expression " + _expression; } diff --git a/src/main/webapp/scripts/dialogs/recon-dialog.js b/src/main/webapp/scripts/dialogs/recon-dialog.js index 0242417cf..82dc46bb1 100644 --- a/src/main/webapp/scripts/dialogs/recon-dialog.js +++ b/src/main/webapp/scripts/dialogs/recon-dialog.js @@ -59,11 +59,17 @@ ReconDialog.prototype._createDialog = function() { '
' + '' + '' + + '' + + '' + + ' Auto-match correctly-typed candidates scoring' + + '' + + '' + + 'Use ' + + ' recon service ' + + ' relevance service ' + + '' + + '' + '' + - '

' + - ' Auto-match correctly-typed candidates scoring at least ' + - '' + - '

' + '' + '