Refactored recon code on the server side to prepare for supporting other modes of recon.
git-svn-id: http://google-refine.googlecode.com/svn/trunk@162 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
a17882c44f
commit
f16727c20c
@ -3,9 +3,11 @@ package com.metaweb.gridworks.commands.recon;
|
|||||||
import javax.servlet.http.HttpServletRequest;
|
import javax.servlet.http.HttpServletRequest;
|
||||||
|
|
||||||
import org.json.JSONObject;
|
import org.json.JSONObject;
|
||||||
|
import org.json.JSONTokener;
|
||||||
|
|
||||||
import com.metaweb.gridworks.commands.EngineDependentCommand;
|
import com.metaweb.gridworks.commands.EngineDependentCommand;
|
||||||
import com.metaweb.gridworks.model.AbstractOperation;
|
import com.metaweb.gridworks.model.AbstractOperation;
|
||||||
|
import com.metaweb.gridworks.model.recon.ReconConfig;
|
||||||
import com.metaweb.gridworks.operations.ReconOperation;
|
import com.metaweb.gridworks.operations.ReconOperation;
|
||||||
|
|
||||||
public class ReconcileCommand extends EngineDependentCommand {
|
public class ReconcileCommand extends EngineDependentCommand {
|
||||||
@ -15,11 +17,11 @@ public class ReconcileCommand extends EngineDependentCommand {
|
|||||||
JSONObject engineConfig) throws Exception {
|
JSONObject engineConfig) throws Exception {
|
||||||
|
|
||||||
String columnName = request.getParameter("columnName");
|
String columnName = request.getParameter("columnName");
|
||||||
String typeID = request.getParameter("typeID");
|
String configString = request.getParameter("config");
|
||||||
String typeName = request.getParameter("typeName");
|
|
||||||
boolean autoMatch = "true".equals(request.getParameter("autoMatch"));
|
|
||||||
double minScore = autoMatch ? Double.parseDouble(request.getParameter("minScore")) : 0;
|
|
||||||
|
|
||||||
return new ReconOperation(engineConfig, columnName, typeID, typeName, autoMatch, minScore);
|
JSONTokener t = new JSONTokener(configString);
|
||||||
|
JSONObject config = (JSONObject) t.nextValue();
|
||||||
|
|
||||||
|
return new ReconOperation(engineConfig, columnName, ReconConfig.reconstruct(config));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -18,7 +18,7 @@ abstract public class AbstractOperation implements Serializable, Jsonizable {
|
|||||||
private static final long serialVersionUID = 3916055862440019600L;
|
private static final long serialVersionUID = 3916055862440019600L;
|
||||||
|
|
||||||
public Process createProcess(Project project, Properties options) throws Exception {
|
public Process createProcess(Project project, Properties options) throws Exception {
|
||||||
return new QuickHistoryEntryProcess(project, getBriefDescription()) {
|
return new QuickHistoryEntryProcess(project, getBriefDescription(null)) {
|
||||||
@Override
|
@Override
|
||||||
protected HistoryEntry createHistoryEntry() throws Exception {
|
protected HistoryEntry createHistoryEntry() throws Exception {
|
||||||
return AbstractOperation.this.createHistoryEntry(_project);
|
return AbstractOperation.this.createHistoryEntry(_project);
|
||||||
@ -30,7 +30,7 @@ abstract public class AbstractOperation implements Serializable, Jsonizable {
|
|||||||
throw new NotImplementedException();
|
throw new NotImplementedException();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getBriefDescription() {
|
protected String getBriefDescription(Project project) {
|
||||||
throw new NotImplementedException();
|
throw new NotImplementedException();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -9,6 +9,7 @@ import org.json.JSONException;
|
|||||||
import org.json.JSONWriter;
|
import org.json.JSONWriter;
|
||||||
|
|
||||||
import com.metaweb.gridworks.Jsonizable;
|
import com.metaweb.gridworks.Jsonizable;
|
||||||
|
import com.metaweb.gridworks.model.recon.ReconConfig;
|
||||||
|
|
||||||
public class Column implements Serializable, Jsonizable {
|
public class Column implements Serializable, Jsonizable {
|
||||||
private static final long serialVersionUID = -1063342490951563563L;
|
private static final long serialVersionUID = -1063342490951563563L;
|
||||||
|
@ -1,33 +0,0 @@
|
|||||||
package com.metaweb.gridworks.model;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.Properties;
|
|
||||||
|
|
||||||
import org.json.JSONException;
|
|
||||||
import org.json.JSONWriter;
|
|
||||||
|
|
||||||
import com.metaweb.gridworks.Jsonizable;
|
|
||||||
|
|
||||||
public class ReconConfig implements Serializable, Jsonizable {
|
|
||||||
private static final long serialVersionUID = -4831409797104437854L;
|
|
||||||
|
|
||||||
final public String typeID;
|
|
||||||
final public String typeName;
|
|
||||||
|
|
||||||
public ReconConfig(String typeID, String typeName) {
|
|
||||||
this.typeID = typeID;
|
|
||||||
this.typeName = typeName;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void write(JSONWriter writer, Properties options)
|
|
||||||
throws JSONException {
|
|
||||||
|
|
||||||
writer.object();
|
|
||||||
writer.key("type");
|
|
||||||
writer.object();
|
|
||||||
writer.key("id"); writer.value(typeID);
|
|
||||||
writer.key("name"); writer.value(typeName);
|
|
||||||
writer.endObject();
|
|
||||||
writer.endObject();
|
|
||||||
}
|
|
||||||
}
|
|
@ -7,8 +7,8 @@ import java.util.List;
|
|||||||
|
|
||||||
import com.metaweb.gridworks.model.Column;
|
import com.metaweb.gridworks.model.Column;
|
||||||
import com.metaweb.gridworks.model.Project;
|
import com.metaweb.gridworks.model.Project;
|
||||||
import com.metaweb.gridworks.model.ReconConfig;
|
|
||||||
import com.metaweb.gridworks.model.ReconStats;
|
import com.metaweb.gridworks.model.ReconStats;
|
||||||
|
import com.metaweb.gridworks.model.recon.ReconConfig;
|
||||||
|
|
||||||
public class ReconChange extends MassCellChange {
|
public class ReconChange extends MassCellChange {
|
||||||
private static final long serialVersionUID = 7048806528587330543L;
|
private static final long serialVersionUID = 7048806528587330543L;
|
||||||
|
@ -0,0 +1,399 @@
|
|||||||
|
package com.metaweb.gridworks.model.recon;
|
||||||
|
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.io.StringWriter;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.net.URLConnection;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Properties;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
import org.json.JSONArray;
|
||||||
|
import org.json.JSONException;
|
||||||
|
import org.json.JSONObject;
|
||||||
|
import org.json.JSONWriter;
|
||||||
|
|
||||||
|
import com.metaweb.gridworks.model.Cell;
|
||||||
|
import com.metaweb.gridworks.model.Project;
|
||||||
|
import com.metaweb.gridworks.model.Recon;
|
||||||
|
import com.metaweb.gridworks.model.ReconCandidate;
|
||||||
|
import com.metaweb.gridworks.model.Row;
|
||||||
|
import com.metaweb.gridworks.model.Recon.Judgment;
|
||||||
|
import com.metaweb.gridworks.protograph.FreebaseProperty;
|
||||||
|
import com.metaweb.gridworks.util.ParsingUtilities;
|
||||||
|
|
||||||
|
public class HeuristicReconConfig extends ReconConfig {
|
||||||
|
private static final long serialVersionUID = 423145327938373362L;
|
||||||
|
|
||||||
|
static public class ColumnDetail implements Serializable {
|
||||||
|
private static final long serialVersionUID = -8996704822460155543L;
|
||||||
|
|
||||||
|
final public String columnName;
|
||||||
|
final public FreebaseProperty property;
|
||||||
|
|
||||||
|
public ColumnDetail(String columnName, FreebaseProperty property) {
|
||||||
|
this.columnName = columnName;
|
||||||
|
this.property = property;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static public ReconConfig reconstruct(JSONObject obj) throws Exception {
|
||||||
|
List<ColumnDetail> columnDetails = null;
|
||||||
|
if (obj.has("columnDetails")) {
|
||||||
|
JSONArray columnDetailsA = obj.getJSONArray("columnDetails");
|
||||||
|
int l = columnDetailsA.length();
|
||||||
|
|
||||||
|
columnDetails = new ArrayList<ColumnDetail>(l);
|
||||||
|
for (int i = 0; i < l; i++) {
|
||||||
|
JSONObject o = columnDetailsA.getJSONObject(i);
|
||||||
|
JSONObject p = o.getJSONObject("property");
|
||||||
|
|
||||||
|
columnDetails.add(new ColumnDetail(
|
||||||
|
o.getString("column"),
|
||||||
|
new FreebaseProperty(
|
||||||
|
p.getString("id"),
|
||||||
|
p.getString("name")
|
||||||
|
)
|
||||||
|
));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
columnDetails = new ArrayList<ColumnDetail>();
|
||||||
|
}
|
||||||
|
|
||||||
|
JSONObject t = obj.getJSONObject("type");
|
||||||
|
|
||||||
|
return new HeuristicReconConfig(
|
||||||
|
obj.getString("service"),
|
||||||
|
t.getString("id"),
|
||||||
|
t.getString("name"),
|
||||||
|
obj.getBoolean("autoMatch"),
|
||||||
|
columnDetails
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
static protected class HeuristicReconJob extends ReconJob {
|
||||||
|
String text;
|
||||||
|
|
||||||
|
public int getKey() {
|
||||||
|
return text.hashCode();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
final public String service; // either "recon" or "relevance"
|
||||||
|
final public String typeID;
|
||||||
|
final public String typeName;
|
||||||
|
final public boolean autoMatch;
|
||||||
|
final public List<ColumnDetail> columnDetails;
|
||||||
|
|
||||||
|
/**
 * Creates a heuristic recon configuration.
 *
 * @param service       which lookup service to use; "relevance" selects the
 *                      relevance search, any other value the recon service
 * @param typeID        id of the type to reconcile against
 * @param typeName      display name of that type
 * @param autoMatch     whether sufficiently high-scoring candidates are
 *                      matched automatically
 * @param columnDetails extra column/property pairs; {@code null} is treated
 *                      as an empty list
 */
public HeuristicReconConfig(
    String service,
    String typeID,
    String typeName,
    boolean autoMatch,
    List<ColumnDetail> columnDetails
) {
    this.service = service;
    this.typeID = typeID;
    this.typeName = typeName;
    this.autoMatch = autoMatch;

    // Keep the details that were passed in (copied, so later changes to the
    // caller's list cannot alter this configuration).
    this.columnDetails = columnDetails == null
        ? new ArrayList<ColumnDetail>()
        : new ArrayList<ColumnDetail>(columnDetails);
}
|
||||||
|
|
||||||
|
public void write(JSONWriter writer, Properties options)
|
||||||
|
throws JSONException {
|
||||||
|
|
||||||
|
writer.object();
|
||||||
|
writer.key("mode"); writer.value("heuristic");
|
||||||
|
writer.key("service"); writer.value(service);
|
||||||
|
writer.key("type");
|
||||||
|
writer.object();
|
||||||
|
writer.key("id"); writer.value(typeID);
|
||||||
|
writer.key("name"); writer.value(typeName);
|
||||||
|
writer.endObject();
|
||||||
|
writer.key("autoMatch"); writer.value(autoMatch);
|
||||||
|
writer.key("columnDetails");
|
||||||
|
writer.array();
|
||||||
|
for (ColumnDetail c : columnDetails) {
|
||||||
|
writer.object();
|
||||||
|
writer.key("column"); writer.value(c.columnName);
|
||||||
|
writer.key("property"); c.property.write(writer, options);
|
||||||
|
writer.endObject();
|
||||||
|
}
|
||||||
|
writer.endArray();
|
||||||
|
writer.endObject();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getBatchSize() {
|
||||||
|
return 10;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getBriefDescription(Project project, String columnName) {
|
||||||
|
return "Reconcile cells in column " + columnName + " to type " + typeID;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ReconJob createJob(Project project, int rowIndex, Row row,
|
||||||
|
String columnName, Cell cell) {
|
||||||
|
|
||||||
|
HeuristicReconJob job = new HeuristicReconJob();
|
||||||
|
|
||||||
|
job.text = cell.value.toString();
|
||||||
|
|
||||||
|
return job;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Recon> batchRecon(List<ReconJob> jobs) {
|
||||||
|
if ("relevance".equals(service)) {
|
||||||
|
return batchReconUsingRelevance(jobs);
|
||||||
|
} else {
|
||||||
|
return batchReconUsingReconService(jobs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected List<Recon> batchReconUsingRelevance(List<ReconJob> jobs) {
|
||||||
|
List<Recon> recons = new ArrayList<Recon>(jobs.size());
|
||||||
|
|
||||||
|
try {
|
||||||
|
StringWriter stringWriter = new StringWriter();
|
||||||
|
JSONWriter jsonWriter = new JSONWriter(stringWriter);
|
||||||
|
|
||||||
|
jsonWriter.object();
|
||||||
|
for (int i = 0; i < jobs.size(); i++) {
|
||||||
|
HeuristicReconJob job = (HeuristicReconJob) jobs.get(i);
|
||||||
|
|
||||||
|
jsonWriter.key("q" + i + ":search");
|
||||||
|
|
||||||
|
jsonWriter.object();
|
||||||
|
jsonWriter.key("query"); jsonWriter.value(job.text);
|
||||||
|
jsonWriter.key("limit"); jsonWriter.value(3);
|
||||||
|
jsonWriter.key("type"); jsonWriter.value(typeID);
|
||||||
|
jsonWriter.key("type_strict"); jsonWriter.value("should");
|
||||||
|
jsonWriter.key("type_exclude"); jsonWriter.value("/common/image");
|
||||||
|
jsonWriter.key("domain_exclude"); jsonWriter.value("/freebase");
|
||||||
|
jsonWriter.endObject();
|
||||||
|
}
|
||||||
|
jsonWriter.endObject();
|
||||||
|
|
||||||
|
StringBuffer sb = new StringBuffer();
|
||||||
|
sb.append("http://api.freebase.com/api/service/search?indent=1&queries=");
|
||||||
|
sb.append(ParsingUtilities.encode(stringWriter.toString()));
|
||||||
|
|
||||||
|
URL url = new URL(sb.toString());
|
||||||
|
URLConnection connection = url.openConnection();
|
||||||
|
connection.setConnectTimeout(5000);
|
||||||
|
connection.connect();
|
||||||
|
|
||||||
|
InputStream is = connection.getInputStream();
|
||||||
|
try {
|
||||||
|
String s = ParsingUtilities.inputStreamToString(is);
|
||||||
|
JSONObject o = ParsingUtilities.evaluateJsonStringToObject(s);
|
||||||
|
|
||||||
|
for (int i = 0; i < jobs.size(); i++) {
|
||||||
|
HeuristicReconJob job = (HeuristicReconJob) jobs.get(i);
|
||||||
|
|
||||||
|
String text = job.text;
|
||||||
|
String key = "q" + i + ":search";
|
||||||
|
if (!o.has(key)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
Recon recon = null;
|
||||||
|
|
||||||
|
JSONObject o2 = o.getJSONObject(key);
|
||||||
|
if (o2.has("result")) {
|
||||||
|
JSONArray results = o2.getJSONArray("result");
|
||||||
|
|
||||||
|
recon = createReconFromRelevanceResults(text, results);
|
||||||
|
} else {
|
||||||
|
recon = new Recon();
|
||||||
|
}
|
||||||
|
|
||||||
|
recons.add(recon);
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
is.close();
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
|
||||||
|
System.gc();
|
||||||
|
|
||||||
|
return recons;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected Recon createReconFromRelevanceResults(String text, JSONArray results) {
|
||||||
|
Recon recon = new Recon();
|
||||||
|
try {
|
||||||
|
int length = results.length();
|
||||||
|
int count = 0;
|
||||||
|
for (int i = 0; i < length && count < 3; i++) {
|
||||||
|
JSONObject result = results.getJSONObject(i);
|
||||||
|
if (!result.has("name")) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
JSONArray types = result.getJSONArray("type");
|
||||||
|
String[] typeIDs = new String[types.length()];
|
||||||
|
for (int j = 0; j < typeIDs.length; j++) {
|
||||||
|
typeIDs[j] = types.getJSONObject(j).getString("id");
|
||||||
|
}
|
||||||
|
|
||||||
|
double score = result.getDouble("relevance:score");
|
||||||
|
ReconCandidate candidate = new ReconCandidate(
|
||||||
|
result.getString("id"),
|
||||||
|
result.getString("guid"),
|
||||||
|
result.getString("name"),
|
||||||
|
typeIDs,
|
||||||
|
score
|
||||||
|
);
|
||||||
|
|
||||||
|
// best match
|
||||||
|
if (i == 0) {
|
||||||
|
recon.setFeature(Recon.Feature_nameMatch, text.equalsIgnoreCase(candidate.topicName));
|
||||||
|
recon.setFeature(Recon.Feature_nameLevenshtein, StringUtils.getLevenshteinDistance(text, candidate.topicName));
|
||||||
|
recon.setFeature(Recon.Feature_nameWordDistance, wordDistance(text, candidate.topicName));
|
||||||
|
|
||||||
|
recon.setFeature(Recon.Feature_typeMatch, false);
|
||||||
|
for (String typeID : candidate.typeIDs) {
|
||||||
|
if (typeID.equals(typeID)) {
|
||||||
|
recon.setFeature(Recon.Feature_typeMatch, true);
|
||||||
|
if (autoMatch && score >= 100) {
|
||||||
|
recon.match = candidate;
|
||||||
|
recon.judgment = Judgment.Matched;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
recon.addCandidate(candidate);
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
} catch (JSONException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
return recon;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected List<Recon> batchReconUsingReconService(List<ReconJob> jobs) {
|
||||||
|
List<Recon> recons = new ArrayList<Recon>(jobs.size());
|
||||||
|
|
||||||
|
try {
|
||||||
|
StringWriter stringWriter = new StringWriter();
|
||||||
|
JSONWriter jsonWriter = new JSONWriter(stringWriter);
|
||||||
|
|
||||||
|
jsonWriter.object();
|
||||||
|
for (int i = 0; i < jobs.size(); i++) {
|
||||||
|
HeuristicReconJob job = (HeuristicReconJob) jobs.get(i);
|
||||||
|
|
||||||
|
jsonWriter.key("q" + i + ":search");
|
||||||
|
|
||||||
|
jsonWriter.object();
|
||||||
|
jsonWriter.key("query"); jsonWriter.value(job.text);
|
||||||
|
jsonWriter.key("limit"); jsonWriter.value(3);
|
||||||
|
jsonWriter.key("type"); jsonWriter.value(typeID);
|
||||||
|
jsonWriter.key("type_strict"); jsonWriter.value("should");
|
||||||
|
jsonWriter.key("type_exclude"); jsonWriter.value("/common/image");
|
||||||
|
jsonWriter.key("domain_exclude"); jsonWriter.value("/freebase");
|
||||||
|
jsonWriter.endObject();
|
||||||
|
}
|
||||||
|
jsonWriter.endObject();
|
||||||
|
|
||||||
|
StringBuffer sb = new StringBuffer();
|
||||||
|
sb.append("http://api.freebase.com/api/service/search?indent=1&queries=");
|
||||||
|
sb.append(ParsingUtilities.encode(stringWriter.toString()));
|
||||||
|
|
||||||
|
URL url = new URL(sb.toString());
|
||||||
|
URLConnection connection = url.openConnection();
|
||||||
|
connection.setConnectTimeout(5000);
|
||||||
|
connection.connect();
|
||||||
|
|
||||||
|
InputStream is = connection.getInputStream();
|
||||||
|
try {
|
||||||
|
String s = ParsingUtilities.inputStreamToString(is);
|
||||||
|
JSONObject o = ParsingUtilities.evaluateJsonStringToObject(s);
|
||||||
|
|
||||||
|
for (int i = 0; i < jobs.size(); i++) {
|
||||||
|
HeuristicReconJob job = (HeuristicReconJob) jobs.get(i);
|
||||||
|
|
||||||
|
String text = job.text;
|
||||||
|
String key = "q" + i + ":search";
|
||||||
|
if (!o.has(key)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
Recon recon = null;
|
||||||
|
|
||||||
|
JSONObject o2 = o.getJSONObject(key);
|
||||||
|
if (o2.has("result")) {
|
||||||
|
JSONArray results = o2.getJSONArray("result");
|
||||||
|
|
||||||
|
recon = createReconFromRelevanceResults(text, results);
|
||||||
|
} else {
|
||||||
|
recon = new Recon();
|
||||||
|
}
|
||||||
|
|
||||||
|
recons.add(recon);
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
is.close();
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
|
||||||
|
System.gc();
|
||||||
|
|
||||||
|
return recons;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Measures how many words two strings share, ignoring case and stop words.
 *
 * @param s1 first string
 * @param s2 second string
 * @return the number of shared words divided by the size of the larger word
 *         set, a value in [0, 1]; 0 when neither string contains a
 *         non-stop word
 */
static protected double wordDistance(String s1, String s2) {
    Set<String> words1 = breakWords(s1);
    Set<String> words2 = breakWords(s2);
    return words1.size() >= words2.size() ? wordDistance(words1, words2) : wordDistance(words2, words1);
}

/**
 * Fraction of {@code longWords} that also occurs in {@code shortWords}.
 *
 * @param longWords  the larger (or equally sized) word set
 * @param shortWords the smaller word set
 * @return shared word count divided by {@code longWords.size()}, or 0 when
 *         {@code longWords} is empty
 */
static protected double wordDistance(Set<String> longWords, Set<String> shortWords) {
    if (longWords.isEmpty()) {
        // Nothing to compare; dividing would yield NaN (0.0 / 0).
        return 0;
    }

    double common = 0;
    for (String word : shortWords) {
        if (longWords.contains(word)) {
            common++;
        }
    }
    return common / longWords.size();
}

/** Lower-case words that are left out when comparing names. */
static protected Set<String> s_stopWords;
static {
    s_stopWords = new HashSet<String>();
    s_stopWords.add("the");
    s_stopWords.add("a");
    s_stopWords.add("and");
    s_stopWords.add("of");
    s_stopWords.add("on");
    s_stopWords.add("in");
    s_stopWords.add("at");
    s_stopWords.add("by");
}

/**
 * Splits a string on whitespace into its distinct lower-cased words,
 * dropping stop words.
 *
 * @param s text to split
 * @return the set of remaining words; empty when the text is blank or holds
 *         only stop words
 */
static protected Set<String> breakWords(String s) {
    String[] words = s.toLowerCase().split("\\s+");

    Set<String> set = new HashSet<String>(words.length);
    for (String word : words) {
        // split() yields an empty first token for leading whitespace or
        // empty input; that is not a word.
        if (word.length() > 0 && !s_stopWords.contains(word)) {
            set.add(word);
        }
    }
    return set;
}
|
||||||
|
}
|
@ -0,0 +1,38 @@
|
|||||||
|
package com.metaweb.gridworks.model.recon;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.json.JSONObject;
|
||||||
|
|
||||||
|
import com.metaweb.gridworks.Jsonizable;
|
||||||
|
import com.metaweb.gridworks.model.Cell;
|
||||||
|
import com.metaweb.gridworks.model.Project;
|
||||||
|
import com.metaweb.gridworks.model.Recon;
|
||||||
|
import com.metaweb.gridworks.model.Row;
|
||||||
|
|
||||||
|
abstract public class ReconConfig implements Serializable, Jsonizable {
|
||||||
|
private static final long serialVersionUID = -4831409797104437854L;
|
||||||
|
|
||||||
|
static public ReconConfig reconstruct(JSONObject obj) throws Exception {
|
||||||
|
String mode = obj.getString("mode");
|
||||||
|
if ("heuristic".equals(mode)) {
|
||||||
|
return HeuristicReconConfig.reconstruct(obj);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
abstract public int getBatchSize();
|
||||||
|
|
||||||
|
abstract public String getBriefDescription(Project project, String columnName);
|
||||||
|
|
||||||
|
abstract public ReconJob createJob(
|
||||||
|
Project project,
|
||||||
|
int rowIndex,
|
||||||
|
Row row,
|
||||||
|
String columnName,
|
||||||
|
Cell cell
|
||||||
|
);
|
||||||
|
|
||||||
|
abstract public List<Recon> batchRecon(List<ReconJob> jobs);
|
||||||
|
}
|
@ -0,0 +1,5 @@
|
|||||||
|
package com.metaweb.gridworks.model.recon;
|
||||||
|
|
||||||
|
/**
 * Base type for the jobs that a recon configuration creates and later
 * receives in batches.
 */
public abstract class ReconJob {
    /**
     * @return an integer key for this job
     */
    public abstract int getKey();
}
|
@ -66,7 +66,7 @@ public class ColumnAdditionOperation extends EngineDependentOperation {
|
|||||||
|
|
||||||
writer.object();
|
writer.object();
|
||||||
writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass()));
|
writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass()));
|
||||||
writer.key("description"); writer.value(getBriefDescription());
|
writer.key("description"); writer.value(getBriefDescription(null));
|
||||||
writer.key("engineConfig"); writer.value(getEngineConfig());
|
writer.key("engineConfig"); writer.value(getEngineConfig());
|
||||||
writer.key("headerLabel"); writer.value(_headerLabel);
|
writer.key("headerLabel"); writer.value(_headerLabel);
|
||||||
writer.key("columnInsertIndex"); writer.value(_columnInsertIndex);
|
writer.key("columnInsertIndex"); writer.value(_columnInsertIndex);
|
||||||
@ -75,7 +75,7 @@ public class ColumnAdditionOperation extends EngineDependentOperation {
|
|||||||
writer.endObject();
|
writer.endObject();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getBriefDescription() {
|
protected String getBriefDescription(Project project) {
|
||||||
return "Create column " + _headerLabel +
|
return "Create column " + _headerLabel +
|
||||||
" at index " + _columnInsertIndex +
|
" at index " + _columnInsertIndex +
|
||||||
" based on column " + _baseColumnName +
|
" based on column " + _baseColumnName +
|
||||||
|
@ -41,7 +41,7 @@ public class ColumnRemovalOperation extends AbstractOperation {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
protected String getBriefDescription() {
|
protected String getBriefDescription(Project project) {
|
||||||
return "Remove column " + _columnName;
|
return "Remove column " + _columnName;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -47,14 +47,14 @@ public class MultiValuedCellJoinOperation extends AbstractOperation {
|
|||||||
|
|
||||||
writer.object();
|
writer.object();
|
||||||
writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass()));
|
writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass()));
|
||||||
writer.key("description"); writer.value(getBriefDescription());
|
writer.key("description"); writer.value(getBriefDescription(null));
|
||||||
writer.key("columnName"); writer.value(_columnName);
|
writer.key("columnName"); writer.value(_columnName);
|
||||||
writer.key("keyColumnName"); writer.value(_keyColumnName);
|
writer.key("keyColumnName"); writer.value(_keyColumnName);
|
||||||
writer.key("separator"); writer.value(_separator);
|
writer.key("separator"); writer.value(_separator);
|
||||||
writer.endObject();
|
writer.endObject();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getBriefDescription() {
|
protected String getBriefDescription(Project project) {
|
||||||
return "Join multi-valued cells in column " + _columnName;
|
return "Join multi-valued cells in column " + _columnName;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -121,7 +121,7 @@ public class MultiValuedCellJoinOperation extends AbstractOperation {
|
|||||||
|
|
||||||
return new HistoryEntry(
|
return new HistoryEntry(
|
||||||
project,
|
project,
|
||||||
getBriefDescription(),
|
getBriefDescription(null),
|
||||||
this,
|
this,
|
||||||
new MassRowChange(newRows)
|
new MassRowChange(newRows)
|
||||||
);
|
);
|
||||||
|
@ -59,7 +59,7 @@ public class MultiValuedCellSplitOperation extends AbstractOperation {
|
|||||||
writer.endObject();
|
writer.endObject();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getBriefDescription() {
|
protected String getBriefDescription(Project project) {
|
||||||
return "Split multi-valued cells in column " + _columnName;
|
return "Split multi-valued cells in column " + _columnName;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -139,7 +139,7 @@ public class MultiValuedCellSplitOperation extends AbstractOperation {
|
|||||||
|
|
||||||
return new HistoryEntry(
|
return new HistoryEntry(
|
||||||
project,
|
project,
|
||||||
getBriefDescription(),
|
getBriefDescription(null),
|
||||||
this,
|
this,
|
||||||
new MassRowChange(newRows)
|
new MassRowChange(newRows)
|
||||||
);
|
);
|
||||||
|
@ -41,13 +41,13 @@ public class ReconDiscardJudgmentsOperation extends EngineDependentMassCellOpera
|
|||||||
|
|
||||||
writer.object();
|
writer.object();
|
||||||
writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass()));
|
writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass()));
|
||||||
writer.key("description"); writer.value(getBriefDescription());
|
writer.key("description"); writer.value(getBriefDescription(null));
|
||||||
writer.key("engineConfig"); writer.value(getEngineConfig());
|
writer.key("engineConfig"); writer.value(getEngineConfig());
|
||||||
writer.key("columnName"); writer.value(_columnName);
|
writer.key("columnName"); writer.value(_columnName);
|
||||||
writer.endObject();
|
writer.endObject();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getBriefDescription() {
|
protected String getBriefDescription(Project project) {
|
||||||
return "Discard recon judgments for cells in column " + _columnName;
|
return "Discard recon judgments for cells in column " + _columnName;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -89,7 +89,7 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper
|
|||||||
|
|
||||||
writer.object();
|
writer.object();
|
||||||
writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass()));
|
writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass()));
|
||||||
writer.key("description"); writer.value(getBriefDescription());
|
writer.key("description"); writer.value(getBriefDescription(null));
|
||||||
writer.key("engineConfig"); writer.value(getEngineConfig());
|
writer.key("engineConfig"); writer.value(getEngineConfig());
|
||||||
writer.key("columnName"); writer.value(_columnName);
|
writer.key("columnName"); writer.value(_columnName);
|
||||||
writer.key("similarValue"); writer.value(_similarValue);
|
writer.key("similarValue"); writer.value(_similarValue);
|
||||||
@ -102,7 +102,7 @@ public class ReconJudgeSimilarCellsOperation extends EngineDependentMassCellOper
|
|||||||
writer.endObject();
|
writer.endObject();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getBriefDescription() {
|
protected String getBriefDescription(Project project) {
|
||||||
if (_judgment == Judgment.None) {
|
if (_judgment == Judgment.None) {
|
||||||
return "Discard recon judgments for cells containing \"" +
|
return "Discard recon judgments for cells containing \"" +
|
||||||
_similarValue + "\" in column " + _columnName;
|
_similarValue + "\" in column " + _columnName;
|
||||||
|
@ -46,14 +46,14 @@ public class ReconMarkNewTopicsOperation extends EngineDependentMassCellOperatio
|
|||||||
|
|
||||||
writer.object();
|
writer.object();
|
||||||
writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass()));
|
writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass()));
|
||||||
writer.key("description"); writer.value(getBriefDescription());
|
writer.key("description"); writer.value(getBriefDescription(null));
|
||||||
writer.key("engineConfig"); writer.value(getEngineConfig());
|
writer.key("engineConfig"); writer.value(getEngineConfig());
|
||||||
writer.key("columnName"); writer.value(_columnName);
|
writer.key("columnName"); writer.value(_columnName);
|
||||||
writer.key("shareNewTopics"); writer.value(_shareNewTopics);
|
writer.key("shareNewTopics"); writer.value(_shareNewTopics);
|
||||||
writer.endObject();
|
writer.endObject();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getBriefDescription() {
|
protected String getBriefDescription(Project project) {
|
||||||
return "Mark to create new topics for cells in column " + _columnName +
|
return "Mark to create new topics for cells in column " + _columnName +
|
||||||
(_shareNewTopics ?
|
(_shareNewTopics ?
|
||||||
", one topic for each group of similar cells" :
|
", one topic for each group of similar cells" :
|
||||||
|
@ -41,13 +41,13 @@ public class ReconMatchBestCandidatesOperation extends EngineDependentMassCellOp
|
|||||||
|
|
||||||
writer.object();
|
writer.object();
|
||||||
writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass()));
|
writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass()));
|
||||||
writer.key("description"); writer.value(getBriefDescription());
|
writer.key("description"); writer.value(getBriefDescription(null));
|
||||||
writer.key("engineConfig"); writer.value(getEngineConfig());
|
writer.key("engineConfig"); writer.value(getEngineConfig());
|
||||||
writer.key("columnName"); writer.value(_columnName);
|
writer.key("columnName"); writer.value(_columnName);
|
||||||
writer.endObject();
|
writer.endObject();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getBriefDescription() {
|
protected String getBriefDescription(Project project) {
|
||||||
return "Match each cell to its best recon candidate in column " + _columnName;
|
return "Match each cell to its best recon candidate in column " + _columnName;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -60,7 +60,7 @@ public class ReconMatchSpecificTopicOperation extends EngineDependentMassCellOpe
|
|||||||
|
|
||||||
writer.object();
|
writer.object();
|
||||||
writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass()));
|
writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass()));
|
||||||
writer.key("description"); writer.value(getBriefDescription());
|
writer.key("description"); writer.value(getBriefDescription(null));
|
||||||
writer.key("engineConfig"); writer.value(getEngineConfig());
|
writer.key("engineConfig"); writer.value(getEngineConfig());
|
||||||
writer.key("columnName"); writer.value(_columnName);
|
writer.key("columnName"); writer.value(_columnName);
|
||||||
writer.key("match");
|
writer.key("match");
|
||||||
@ -78,7 +78,7 @@ public class ReconMatchSpecificTopicOperation extends EngineDependentMassCellOpe
|
|||||||
writer.endObject();
|
writer.endObject();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getBriefDescription() {
|
protected String getBriefDescription(Project project) {
|
||||||
return "Match specific topic " +
|
return "Match specific topic " +
|
||||||
match.topicName + " (" +
|
match.topicName + " (" +
|
||||||
match.topicID + ") to cells in column " + _columnName;
|
match.topicID + ") to cells in column " + _columnName;
|
||||||
|
@ -1,20 +1,11 @@
|
|||||||
package com.metaweb.gridworks.operations;
|
package com.metaweb.gridworks.operations;
|
||||||
|
|
||||||
import java.io.InputStream;
|
|
||||||
import java.io.StringWriter;
|
|
||||||
import java.net.URL;
|
|
||||||
import java.net.URLConnection;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.commons.lang.StringUtils;
|
|
||||||
import org.json.JSONArray;
|
|
||||||
import org.json.JSONException;
|
import org.json.JSONException;
|
||||||
import org.json.JSONObject;
|
import org.json.JSONObject;
|
||||||
import org.json.JSONWriter;
|
import org.json.JSONWriter;
|
||||||
@ -30,24 +21,19 @@ import com.metaweb.gridworks.model.Cell;
|
|||||||
import com.metaweb.gridworks.model.Column;
|
import com.metaweb.gridworks.model.Column;
|
||||||
import com.metaweb.gridworks.model.Project;
|
import com.metaweb.gridworks.model.Project;
|
||||||
import com.metaweb.gridworks.model.Recon;
|
import com.metaweb.gridworks.model.Recon;
|
||||||
import com.metaweb.gridworks.model.ReconCandidate;
|
|
||||||
import com.metaweb.gridworks.model.ReconConfig;
|
|
||||||
import com.metaweb.gridworks.model.Row;
|
import com.metaweb.gridworks.model.Row;
|
||||||
import com.metaweb.gridworks.model.Recon.Judgment;
|
|
||||||
import com.metaweb.gridworks.model.changes.CellChange;
|
import com.metaweb.gridworks.model.changes.CellChange;
|
||||||
import com.metaweb.gridworks.model.changes.ReconChange;
|
import com.metaweb.gridworks.model.changes.ReconChange;
|
||||||
|
import com.metaweb.gridworks.model.recon.ReconConfig;
|
||||||
|
import com.metaweb.gridworks.model.recon.ReconJob;
|
||||||
import com.metaweb.gridworks.process.LongRunningProcess;
|
import com.metaweb.gridworks.process.LongRunningProcess;
|
||||||
import com.metaweb.gridworks.process.Process;
|
import com.metaweb.gridworks.process.Process;
|
||||||
import com.metaweb.gridworks.util.ParsingUtilities;
|
|
||||||
|
|
||||||
public class ReconOperation extends EngineDependentOperation {
|
public class ReconOperation extends EngineDependentOperation {
|
||||||
private static final long serialVersionUID = 838795186905314865L;
|
private static final long serialVersionUID = 838795186905314865L;
|
||||||
|
|
||||||
final protected String _columnName;
|
final protected String _columnName;
|
||||||
final protected String _typeID;
|
final protected ReconConfig _reconConfig;
|
||||||
final protected String _typeName;
|
|
||||||
final protected boolean _autoMatch;
|
|
||||||
final protected double _minScore;
|
|
||||||
|
|
||||||
static public AbstractOperation reconstruct(Project project, JSONObject obj) throws Exception {
|
static public AbstractOperation reconstruct(Project project, JSONObject obj) throws Exception {
|
||||||
JSONObject engineConfig = obj.getJSONObject("engineConfig");
|
JSONObject engineConfig = obj.getJSONObject("engineConfig");
|
||||||
@ -55,39 +41,30 @@ public class ReconOperation extends EngineDependentOperation {
|
|||||||
return new ReconOperation(
|
return new ReconOperation(
|
||||||
engineConfig,
|
engineConfig,
|
||||||
obj.getString("columnName"),
|
obj.getString("columnName"),
|
||||||
obj.getString("typeID"),
|
ReconConfig.reconstruct(obj.getJSONObject("config"))
|
||||||
obj.getString("typeName"),
|
|
||||||
obj.getBoolean("autoMatch"),
|
|
||||||
obj.getDouble("minScore")
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
public ReconOperation(
|
public ReconOperation(
|
||||||
JSONObject engineConfig,
|
JSONObject engineConfig,
|
||||||
String columnName,
|
String columnName,
|
||||||
String typeID,
|
ReconConfig reconConfig
|
||||||
String typeName,
|
|
||||||
boolean autoMatch,
|
|
||||||
double minScore
|
|
||||||
) {
|
) {
|
||||||
super(engineConfig);
|
super(engineConfig);
|
||||||
_columnName = columnName;
|
_columnName = columnName;
|
||||||
_typeID = typeID;
|
_reconConfig = reconConfig;
|
||||||
_typeName = typeName;
|
|
||||||
_autoMatch = autoMatch;
|
|
||||||
_minScore = minScore;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Process createProcess(Project project, Properties options) throws Exception {
|
public Process createProcess(Project project, Properties options) throws Exception {
|
||||||
return new ReconProcess(
|
return new ReconProcess(
|
||||||
project,
|
project,
|
||||||
getEngineConfig(),
|
getEngineConfig(),
|
||||||
getBriefDescription()
|
getBriefDescription(null)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getBriefDescription() {
|
protected String getBriefDescription(Project project) {
|
||||||
return "Reconcile cells in column " + _columnName + " to type " + _typeID;
|
return _reconConfig.getBriefDescription(project, _columnName);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void write(JSONWriter writer, Properties options)
|
public void write(JSONWriter writer, Properties options)
|
||||||
@ -95,12 +72,9 @@ public class ReconOperation extends EngineDependentOperation {
|
|||||||
|
|
||||||
writer.object();
|
writer.object();
|
||||||
writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass()));
|
writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass()));
|
||||||
writer.key("description"); writer.value(getBriefDescription());
|
writer.key("description"); writer.value(getBriefDescription(null));
|
||||||
writer.key("columnName"); writer.value(_columnName);
|
writer.key("columnName"); writer.value(_columnName);
|
||||||
writer.key("typeID"); writer.value(_typeID);
|
writer.key("config"); _reconConfig.write(writer, options);
|
||||||
writer.key("typeName"); writer.value(_typeName);
|
|
||||||
writer.key("autoMatch"); writer.value(_autoMatch);
|
|
||||||
writer.key("minScore"); writer.value(_minScore);
|
|
||||||
writer.key("engineConfig"); writer.value(getEngineConfig());
|
writer.key("engineConfig"); writer.value(getEngineConfig());
|
||||||
writer.endObject();
|
writer.endObject();
|
||||||
}
|
}
|
||||||
@ -114,6 +88,14 @@ public class ReconOperation extends EngineDependentOperation {
|
|||||||
this.cell = cell;
|
this.cell = cell;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
static protected class JobGroup {
|
||||||
|
final public ReconJob job;
|
||||||
|
final public List<ReconEntry> entries = new ArrayList<ReconEntry>();
|
||||||
|
|
||||||
|
public JobGroup(ReconJob job) {
|
||||||
|
this.job = job;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public class ReconProcess extends LongRunningProcess implements Runnable {
|
public class ReconProcess extends LongRunningProcess implements Runnable {
|
||||||
final protected Project _project;
|
final protected Project _project;
|
||||||
@ -169,31 +151,60 @@ public class ReconOperation extends EngineDependentOperation {
|
|||||||
e2.printStackTrace();
|
e2.printStackTrace();
|
||||||
}
|
}
|
||||||
|
|
||||||
Map<String, List<ReconEntry>> valueToEntries = new HashMap<String, List<ReconEntry>>();
|
Map<Integer, JobGroup> jobKeyToGroup = new HashMap<Integer, JobGroup>();
|
||||||
|
|
||||||
for (ReconEntry entry : _entries) {
|
for (ReconEntry entry : _entries) {
|
||||||
Object value = entry.cell.value;
|
ReconJob job = _reconConfig.createJob(
|
||||||
if (value != null && value instanceof String) {
|
_project,
|
||||||
List<ReconEntry> entries2;
|
entry.rowIndex,
|
||||||
if (valueToEntries.containsKey(value)) {
|
_project.rows.get(entry.rowIndex),
|
||||||
entries2 = valueToEntries.get(value);
|
_columnName,
|
||||||
} else {
|
entry.cell
|
||||||
entries2 = new LinkedList<ReconEntry>();
|
);
|
||||||
valueToEntries.put((String) value, entries2);
|
|
||||||
}
|
int key = job.getKey();
|
||||||
entries2.add(entry);
|
JobGroup group = jobKeyToGroup.get(key);
|
||||||
|
if (group == null) {
|
||||||
|
group = new JobGroup(job);
|
||||||
|
jobKeyToGroup.put(key, group);
|
||||||
}
|
}
|
||||||
|
group.entries.add(entry);
|
||||||
}
|
}
|
||||||
|
|
||||||
List<CellChange> cellChanges = new ArrayList<CellChange>(_entries.size());
|
List<CellChange> cellChanges = new ArrayList<CellChange>(_entries.size());
|
||||||
List<String> values = new ArrayList<String>(valueToEntries.keySet());
|
List<JobGroup> groups = new ArrayList<JobGroup>(jobKeyToGroup.values());
|
||||||
|
|
||||||
final int batchSize = 10;
|
int batchSize = _reconConfig.getBatchSize();
|
||||||
for (int i = 0; i < values.size(); i += batchSize) {
|
for (int i = 0; i < groups.size(); i += batchSize) {
|
||||||
recon(valueToEntries, values, i, Math.min(i + batchSize, values.size()), cellChanges);
|
int to = Math.min(i + batchSize, groups.size());
|
||||||
|
|
||||||
_progress = i * 100 / values.size();
|
List<ReconJob> jobs = new ArrayList<ReconJob>(to - i);
|
||||||
|
for (int j = i; j < to; j++) {
|
||||||
|
jobs.add(groups.get(j).job);
|
||||||
|
}
|
||||||
|
|
||||||
|
List<Recon> recons = _reconConfig.batchRecon(jobs);
|
||||||
|
for (int j = i; j < to; j++) {
|
||||||
|
Recon recon = recons.get(j - i);
|
||||||
|
if (recon == null) {
|
||||||
|
recon = new Recon();
|
||||||
|
}
|
||||||
|
|
||||||
|
for (ReconEntry entry : groups.get(j).entries) {
|
||||||
|
Cell oldCell = entry.cell;
|
||||||
|
Cell newCell = new Cell(oldCell.value, recon);
|
||||||
|
|
||||||
|
CellChange cellChange = new CellChange(
|
||||||
|
entry.rowIndex,
|
||||||
|
_cellIndex,
|
||||||
|
oldCell,
|
||||||
|
newCell
|
||||||
|
);
|
||||||
|
cellChanges.add(cellChange);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
_progress = i * 100 / groups.size();
|
||||||
try {
|
try {
|
||||||
Thread.sleep(50);
|
Thread.sleep(50);
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
@ -206,7 +217,7 @@ public class ReconOperation extends EngineDependentOperation {
|
|||||||
Change reconChange = new ReconChange(
|
Change reconChange = new ReconChange(
|
||||||
cellChanges,
|
cellChanges,
|
||||||
_columnName,
|
_columnName,
|
||||||
new ReconConfig(_typeID, _typeName),
|
_reconConfig,
|
||||||
null
|
null
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -220,204 +231,5 @@ public class ReconOperation extends EngineDependentOperation {
|
|||||||
_project.history.addEntry(historyEntry);
|
_project.history.addEntry(historyEntry);
|
||||||
_project.processManager.onDoneProcess(this);
|
_project.processManager.onDoneProcess(this);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void recon(
|
|
||||||
Map<String, List<ReconEntry>> valueToEntries,
|
|
||||||
List<String> values,
|
|
||||||
int from,
|
|
||||||
int to,
|
|
||||||
List<CellChange> cellChanges
|
|
||||||
) {
|
|
||||||
try {
|
|
||||||
StringWriter stringWriter = new StringWriter();
|
|
||||||
JSONWriter jsonWriter = new JSONWriter(stringWriter);
|
|
||||||
|
|
||||||
jsonWriter.object();
|
|
||||||
for (int i = 0; from + i < to; i++) {
|
|
||||||
jsonWriter.key("q" + i + ":search");
|
|
||||||
|
|
||||||
jsonWriter.object();
|
|
||||||
|
|
||||||
jsonWriter.key("query"); jsonWriter.value(values.get(from + i));
|
|
||||||
jsonWriter.key("limit"); jsonWriter.value(3);
|
|
||||||
jsonWriter.key("type"); jsonWriter.value(_typeID);
|
|
||||||
jsonWriter.key("type_strict"); jsonWriter.value("should");
|
|
||||||
//jsonWriter.key("indent"); jsonWriter.value(1);
|
|
||||||
jsonWriter.key("type_exclude"); jsonWriter.value("/common/image");
|
|
||||||
jsonWriter.key("domain_exclude"); jsonWriter.value("/freebase");
|
|
||||||
|
|
||||||
jsonWriter.endObject();
|
|
||||||
}
|
|
||||||
jsonWriter.endObject();
|
|
||||||
|
|
||||||
StringBuffer sb = new StringBuffer();
|
|
||||||
sb.append("http://api.freebase.com/api/service/search?indent=1&queries=");
|
|
||||||
sb.append(ParsingUtilities.encode(stringWriter.toString()));
|
|
||||||
|
|
||||||
URL url = new URL(sb.toString());
|
|
||||||
URLConnection connection = url.openConnection();
|
|
||||||
connection.setConnectTimeout(5000);
|
|
||||||
connection.connect();
|
|
||||||
|
|
||||||
InputStream is = connection.getInputStream();
|
|
||||||
try {
|
|
||||||
String s = ParsingUtilities.inputStreamToString(is);
|
|
||||||
JSONObject o = ParsingUtilities.evaluateJsonStringToObject(s);
|
|
||||||
|
|
||||||
for (int i = 0; from + i < to; i++) {
|
|
||||||
String value = values.get(from + i);
|
|
||||||
String key = "q" + i + ":search";
|
|
||||||
if (!o.has(key)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
Recon recon;
|
|
||||||
|
|
||||||
JSONObject o2 = o.getJSONObject(key);
|
|
||||||
if (o2.has("result")) {
|
|
||||||
JSONArray results = o2.getJSONArray("result");
|
|
||||||
|
|
||||||
recon = createRecon(value, results);
|
|
||||||
} else {
|
|
||||||
recon = new Recon();
|
|
||||||
}
|
|
||||||
|
|
||||||
for (ReconEntry entry : valueToEntries.get(value)) {
|
|
||||||
Cell oldCell = entry.cell;
|
|
||||||
|
|
||||||
Cell newCell = new Cell(oldCell.value, recon);
|
|
||||||
|
|
||||||
CellChange cellChange = new CellChange(
|
|
||||||
entry.rowIndex,
|
|
||||||
_cellIndex,
|
|
||||||
oldCell,
|
|
||||||
newCell
|
|
||||||
);
|
|
||||||
cellChanges.add(cellChange);
|
|
||||||
}
|
|
||||||
|
|
||||||
valueToEntries.remove(value);
|
|
||||||
}
|
|
||||||
} finally {
|
|
||||||
is.close();
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
|
|
||||||
for (List<ReconEntry> entries : valueToEntries.values()) {
|
|
||||||
Recon recon = new Recon();
|
|
||||||
|
|
||||||
for (ReconEntry entry : entries) {
|
|
||||||
Cell oldCell = entry.cell;
|
|
||||||
Cell newCell = new Cell(oldCell.value, recon);
|
|
||||||
|
|
||||||
CellChange cellChange = new CellChange(
|
|
||||||
entry.rowIndex,
|
|
||||||
_cellIndex,
|
|
||||||
oldCell,
|
|
||||||
newCell
|
|
||||||
);
|
|
||||||
cellChanges.add(cellChange);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
System.gc();
|
|
||||||
}
|
|
||||||
|
|
||||||
protected Recon createRecon(String text, JSONArray results) {
|
|
||||||
Recon recon = new Recon();
|
|
||||||
try {
|
|
||||||
int length = results.length();
|
|
||||||
int count = 0;
|
|
||||||
for (int i = 0; i < length && count < 3; i++) {
|
|
||||||
JSONObject result = results.getJSONObject(i);
|
|
||||||
if (!result.has("name")) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
JSONArray types = result.getJSONArray("type");
|
|
||||||
String[] typeIDs = new String[types.length()];
|
|
||||||
for (int j = 0; j < typeIDs.length; j++) {
|
|
||||||
typeIDs[j] = types.getJSONObject(j).getString("id");
|
|
||||||
}
|
|
||||||
|
|
||||||
double score = result.getDouble("relevance:score");
|
|
||||||
ReconCandidate candidate = new ReconCandidate(
|
|
||||||
result.getString("id"),
|
|
||||||
result.getString("guid"),
|
|
||||||
result.getString("name"),
|
|
||||||
typeIDs,
|
|
||||||
score
|
|
||||||
);
|
|
||||||
|
|
||||||
// best match
|
|
||||||
if (i == 0) {
|
|
||||||
recon.setFeature(Recon.Feature_nameMatch, text.equalsIgnoreCase(candidate.topicName));
|
|
||||||
recon.setFeature(Recon.Feature_nameLevenshtein, StringUtils.getLevenshteinDistance(text, candidate.topicName));
|
|
||||||
recon.setFeature(Recon.Feature_nameWordDistance, wordDistance(text, candidate.topicName));
|
|
||||||
|
|
||||||
recon.setFeature(Recon.Feature_typeMatch, false);
|
|
||||||
for (String typeID : candidate.typeIDs) {
|
|
||||||
if (_typeID.equals(typeID)) {
|
|
||||||
recon.setFeature(Recon.Feature_typeMatch, true);
|
|
||||||
if (_autoMatch && score >= _minScore) {
|
|
||||||
recon.match = candidate;
|
|
||||||
recon.judgment = Judgment.Matched;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
recon.addCandidate(candidate);
|
|
||||||
count++;
|
|
||||||
}
|
|
||||||
} catch (JSONException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
return recon;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static protected double wordDistance(String s1, String s2) {
|
|
||||||
Set<String> words1 = breakWords(s1);
|
|
||||||
Set<String> words2 = breakWords(s2);
|
|
||||||
return words1.size() >= words2.size() ? wordDistance(words1, words2) : wordDistance(words2, words1);
|
|
||||||
}
|
|
||||||
|
|
||||||
static protected double wordDistance(Set<String> longWords, Set<String> shortWords) {
|
|
||||||
double common = 0;
|
|
||||||
for (String word : shortWords) {
|
|
||||||
if (longWords.contains(word)) {
|
|
||||||
common++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return common / longWords.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
static protected Set<String> s_stopWords;
|
|
||||||
static {
|
|
||||||
s_stopWords = new HashSet<String>();
|
|
||||||
s_stopWords.add("the");
|
|
||||||
s_stopWords.add("a");
|
|
||||||
s_stopWords.add("and");
|
|
||||||
s_stopWords.add("of");
|
|
||||||
s_stopWords.add("on");
|
|
||||||
s_stopWords.add("in");
|
|
||||||
s_stopWords.add("at");
|
|
||||||
s_stopWords.add("by");
|
|
||||||
}
|
|
||||||
|
|
||||||
static protected Set<String> breakWords(String s) {
|
|
||||||
String[] words = s.toLowerCase().split("\\s+");
|
|
||||||
|
|
||||||
Set<String> set = new HashSet<String>(words.length);
|
|
||||||
for (String word : words) {
|
|
||||||
if (!s_stopWords.contains(word)) {
|
|
||||||
set.add(word);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return set;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -44,13 +44,13 @@ public class RowStarOperation extends EngineDependentOperation {
|
|||||||
|
|
||||||
writer.object();
|
writer.object();
|
||||||
writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass()));
|
writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass()));
|
||||||
writer.key("description"); writer.value(getBriefDescription());
|
writer.key("description"); writer.value(getBriefDescription(null));
|
||||||
writer.key("engineConfig"); writer.value(getEngineConfig());
|
writer.key("engineConfig"); writer.value(getEngineConfig());
|
||||||
writer.key("starred"); writer.value(_starred);
|
writer.key("starred"); writer.value(_starred);
|
||||||
writer.endObject();
|
writer.endObject();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getBriefDescription() {
|
protected String getBriefDescription(Project project) {
|
||||||
return (_starred ? "Star rows" : "Unstar rows");
|
return (_starred ? "Star rows" : "Unstar rows");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -39,7 +39,7 @@ public class SaveProtographOperation extends AbstractOperation {
|
|||||||
writer.endObject();
|
writer.endObject();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getBriefDescription() {
|
protected String getBriefDescription(Project project) {
|
||||||
return "Save schema skeleton";
|
return "Save schema skeleton";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -43,14 +43,14 @@ public class TextTransformOperation extends EngineDependentMassCellOperation {
|
|||||||
|
|
||||||
writer.object();
|
writer.object();
|
||||||
writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass()));
|
writer.key("op"); writer.value(OperationRegistry.s_opClassToName.get(this.getClass()));
|
||||||
writer.key("description"); writer.value(getBriefDescription());
|
writer.key("description"); writer.value(getBriefDescription(null));
|
||||||
writer.key("engineConfig"); writer.value(getEngineConfig());
|
writer.key("engineConfig"); writer.value(getEngineConfig());
|
||||||
writer.key("columnName"); writer.value(_columnName);
|
writer.key("columnName"); writer.value(_columnName);
|
||||||
writer.key("expression"); writer.value(_expression);
|
writer.key("expression"); writer.value(_expression);
|
||||||
writer.endObject();
|
writer.endObject();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getBriefDescription() {
|
protected String getBriefDescription(Project project) {
|
||||||
return "Text transform on cells in column " + _columnName + " using expression " + _expression;
|
return "Text transform on cells in column " + _columnName + " using expression " + _expression;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -59,11 +59,17 @@ ReconDialog.prototype._createDialog = function() {
|
|||||||
'<div class="recon-dialog-heuristic-details-container" bind="heuristicDetailContainer"></div>' +
|
'<div class="recon-dialog-heuristic-details-container" bind="heuristicDetailContainer"></div>' +
|
||||||
'</td>' +
|
'</td>' +
|
||||||
'</tr>' +
|
'</tr>' +
|
||||||
|
'<tr>' +
|
||||||
|
'<td>' +
|
||||||
|
'<input type="checkbox" checked bind="heuristicAutomatchCheck" /> Auto-match correctly-typed candidates scoring' +
|
||||||
|
'</td>' +
|
||||||
|
'<td>' +
|
||||||
|
'Use ' +
|
||||||
|
'<input type="radio" name="recon-dialog-heuristic-service" value="recon" checked /> recon service ' +
|
||||||
|
'<input type="radio" name="recon-dialog-heuristic-service" value="relevance" /> relevance service ' +
|
||||||
|
'</td>' +
|
||||||
|
'</tr>' +
|
||||||
'</table>' +
|
'</table>' +
|
||||||
'<p>' +
|
|
||||||
'<input type="checkbox" checked bind="heuristicAutomatchCheck" /> Auto-match correctly-typed candidates scoring at least ' +
|
|
||||||
'<input size="3" value="100" bind="heuristicAutomatchScoreInput" />' +
|
|
||||||
'</p>' +
|
|
||||||
'</div>' +
|
'</div>' +
|
||||||
'<div id="recon-dialog-tabs-strict" style="display: none;">' +
|
'<div id="recon-dialog-tabs-strict" style="display: none;">' +
|
||||||
'<p>Each cell contains:</p>' +
|
'<p>Each cell contains:</p>' +
|
||||||
@ -203,19 +209,24 @@ ReconDialog.prototype._onDoHeuristic = function() {
|
|||||||
if (type == null) {
|
if (type == null) {
|
||||||
alert("Please specify a type.");
|
alert("Please specify a type.");
|
||||||
} else {
|
} else {
|
||||||
this._dismiss();
|
|
||||||
|
|
||||||
Gridworks.postProcess(
|
Gridworks.postProcess(
|
||||||
"reconcile",
|
"reconcile",
|
||||||
|
{},
|
||||||
{
|
{
|
||||||
columnName: this._column.headerLabel,
|
columnName: this._column.headerLabel,
|
||||||
typeID: type.id,
|
config: JSON.stringify({
|
||||||
typeName: type.name,
|
mode: "heuristic",
|
||||||
autoMatch: this._elmts.heuristicAutomatchCheck[0].checked,
|
service: $('input[name="recon-dialog-heuristic-service"]:checked')[0].value,
|
||||||
minScore: this._elmts.heuristicAutomatchScoreInput[0].value
|
type: {
|
||||||
},
|
id: type.id,
|
||||||
null,
|
name: type.name
|
||||||
|
},
|
||||||
|
autoMatch: this._elmts.heuristicAutomatchCheck[0].checked
|
||||||
|
})
|
||||||
|
},
|
||||||
{ cellsChanged: true, columnStatsChanged: true }
|
{ cellsChanged: true, columnStatsChanged: true }
|
||||||
);
|
);
|
||||||
|
|
||||||
|
this._dismiss();
|
||||||
}
|
}
|
||||||
};
|
};
|
Loading…
Reference in New Issue
Block a user