package com.metaweb.gridworks.model.recon; import java.io.InputStream; import java.io.Serializable; import java.io.StringWriter; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Properties; import java.util.Set; import org.apache.commons.lang.StringUtils; import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; import org.json.JSONWriter; import com.metaweb.gridworks.model.Cell; import com.metaweb.gridworks.model.Project; import com.metaweb.gridworks.model.Recon; import com.metaweb.gridworks.model.ReconCandidate; import com.metaweb.gridworks.model.Row; import com.metaweb.gridworks.model.Recon.Judgment; import com.metaweb.gridworks.protograph.FreebaseProperty; import com.metaweb.gridworks.util.ParsingUtilities; public class HeuristicReconConfig extends ReconConfig { private static final long serialVersionUID = 423145327938373362L; static public class ColumnDetail implements Serializable { private static final long serialVersionUID = -8996704822460155543L; final public String columnName; final public FreebaseProperty property; public ColumnDetail(String columnName, FreebaseProperty property) { this.columnName = columnName; this.property = property; } } static public ReconConfig reconstruct(JSONObject obj) throws Exception { List columnDetails = null; if (obj.has("columnDetails")) { JSONArray columnDetailsA = obj.getJSONArray("columnDetails"); int l = columnDetailsA.length(); columnDetails = new ArrayList(l); for (int i = 0; i < l; i++) { JSONObject o = columnDetailsA.getJSONObject(i); JSONObject p = o.getJSONObject("property"); columnDetails.add(new ColumnDetail( o.getString("column"), new FreebaseProperty( p.getString("id"), p.getString("name") ) )); } } else { columnDetails = new ArrayList(); } JSONObject t = obj.getJSONObject("type"); return new HeuristicReconConfig( obj.getString("service"), t.getString("id"), t.getString("name"), obj.getBoolean("autoMatch"), columnDetails ); } static protected class HeuristicReconJob extends ReconJob { String text; public int getKey() { return text.hashCode(); } } final public String service; // either "recon" or "relevance" final public String typeID; final public String typeName; final public boolean autoMatch; final public List columnDetails; public HeuristicReconConfig( String service, String typeID, String typeName, boolean autoMatch, List columnDetails ) { this.service = service; this.typeID = typeID; this.typeName = typeName; this.autoMatch = autoMatch; this.columnDetails = new ArrayList(); } public void write(JSONWriter writer, Properties options) throws JSONException { writer.object(); writer.key("mode"); writer.value("heuristic"); writer.key("service"); writer.value(service); writer.key("type"); writer.object(); writer.key("id"); writer.value(typeID); writer.key("name"); writer.value(typeName); writer.endObject(); writer.key("autoMatch"); writer.value(autoMatch); writer.key("columnDetails"); writer.array(); for (ColumnDetail c : columnDetails) { writer.object(); writer.key("column"); writer.value(c.columnName); writer.key("property"); c.property.write(writer, options); writer.endObject(); } writer.endArray(); writer.endObject(); } @Override public int getBatchSize() { return 10; } @Override public String getBriefDescription(Project project, String columnName) { return "Reconcile cells in column " + columnName + " to type " + typeID; } @Override public ReconJob createJob(Project project, int rowIndex, Row row, String columnName, Cell cell) { HeuristicReconJob job = new HeuristicReconJob(); job.text = cell.value.toString(); return job; } @Override public List batchRecon(List jobs) { if ("relevance".equals(service)) { return batchReconUsingRelevance(jobs); } else { return batchReconUsingReconService(jobs); } } protected List batchReconUsingRelevance(List jobs) { List recons = new ArrayList(jobs.size()); try { StringWriter stringWriter = new StringWriter(); JSONWriter jsonWriter = new JSONWriter(stringWriter); jsonWriter.object(); for (int i = 0; i < jobs.size(); i++) { HeuristicReconJob job = (HeuristicReconJob) jobs.get(i); jsonWriter.key("q" + i + ":search"); jsonWriter.object(); jsonWriter.key("query"); jsonWriter.value(job.text); jsonWriter.key("limit"); jsonWriter.value(3); jsonWriter.key("type"); jsonWriter.value(typeID); jsonWriter.key("type_strict"); jsonWriter.value("should"); jsonWriter.key("type_exclude"); jsonWriter.value("/common/image"); jsonWriter.key("domain_exclude"); jsonWriter.value("/freebase"); jsonWriter.endObject(); } jsonWriter.endObject(); StringBuffer sb = new StringBuffer(); sb.append("http://api.freebase.com/api/service/search?indent=1&queries="); sb.append(ParsingUtilities.encode(stringWriter.toString())); URL url = new URL(sb.toString()); URLConnection connection = url.openConnection(); connection.setConnectTimeout(5000); connection.connect(); InputStream is = connection.getInputStream(); try { String s = ParsingUtilities.inputStreamToString(is); JSONObject o = ParsingUtilities.evaluateJsonStringToObject(s); for (int i = 0; i < jobs.size(); i++) { HeuristicReconJob job = (HeuristicReconJob) jobs.get(i); String text = job.text; String key = "q" + i + ":search"; if (!o.has(key)) { continue; } Recon recon = null; JSONObject o2 = o.getJSONObject(key); if (o2.has("result")) { JSONArray results = o2.getJSONArray("result"); recon = createReconFromRelevanceResults(text, results); } else { recon = new Recon(); } recons.add(recon); } } finally { is.close(); } } catch (Exception e) { e.printStackTrace(); } System.gc(); return recons; } protected Recon createReconFromRelevanceResults(String text, JSONArray results) { Recon recon = new Recon(); try { int length = results.length(); int count = 0; for (int i = 0; i < length && count < 3; i++) { JSONObject result = results.getJSONObject(i); if (!result.has("name")) { continue; } JSONArray types = result.getJSONArray("type"); String[] typeIDs = new String[types.length()]; for (int j = 0; j < typeIDs.length; j++) { typeIDs[j] = types.getJSONObject(j).getString("id"); } double score = result.getDouble("relevance:score"); ReconCandidate candidate = new ReconCandidate( result.getString("id"), result.getString("guid"), result.getString("name"), typeIDs, score ); // best match if (i == 0) { recon.setFeature(Recon.Feature_nameMatch, text.equalsIgnoreCase(candidate.topicName)); recon.setFeature(Recon.Feature_nameLevenshtein, StringUtils.getLevenshteinDistance(text, candidate.topicName)); recon.setFeature(Recon.Feature_nameWordDistance, wordDistance(text, candidate.topicName)); recon.setFeature(Recon.Feature_typeMatch, false); for (String typeID : candidate.typeIDs) { if (typeID.equals(typeID)) { recon.setFeature(Recon.Feature_typeMatch, true); if (autoMatch && score >= 100) { recon.match = candidate; recon.judgment = Judgment.Matched; } break; } } } recon.addCandidate(candidate); count++; } } catch (JSONException e) { e.printStackTrace(); } return recon; } protected List batchReconUsingReconService(List jobs) { List recons = new ArrayList(jobs.size()); try { StringWriter stringWriter = new StringWriter(); JSONWriter jsonWriter = new JSONWriter(stringWriter); jsonWriter.object(); for (int i = 0; i < jobs.size(); i++) { HeuristicReconJob job = (HeuristicReconJob) jobs.get(i); jsonWriter.key("q" + i + ":search"); jsonWriter.object(); jsonWriter.key("query"); jsonWriter.value(job.text); jsonWriter.key("limit"); jsonWriter.value(3); jsonWriter.key("type"); jsonWriter.value(typeID); jsonWriter.key("type_strict"); jsonWriter.value("should"); jsonWriter.key("type_exclude"); jsonWriter.value("/common/image"); jsonWriter.key("domain_exclude"); jsonWriter.value("/freebase"); jsonWriter.endObject(); } jsonWriter.endObject(); StringBuffer sb = new StringBuffer(); sb.append("http://api.freebase.com/api/service/search?indent=1&queries="); sb.append(ParsingUtilities.encode(stringWriter.toString())); URL url = new URL(sb.toString()); URLConnection connection = url.openConnection(); connection.setConnectTimeout(5000); connection.connect(); InputStream is = connection.getInputStream(); try { String s = ParsingUtilities.inputStreamToString(is); JSONObject o = ParsingUtilities.evaluateJsonStringToObject(s); for (int i = 0; i < jobs.size(); i++) { HeuristicReconJob job = (HeuristicReconJob) jobs.get(i); String text = job.text; String key = "q" + i + ":search"; if (!o.has(key)) { continue; } Recon recon = null; JSONObject o2 = o.getJSONObject(key); if (o2.has("result")) { JSONArray results = o2.getJSONArray("result"); recon = createReconFromRelevanceResults(text, results); } else { recon = new Recon(); } recons.add(recon); } } finally { is.close(); } } catch (Exception e) { e.printStackTrace(); } System.gc(); return recons; } static protected double wordDistance(String s1, String s2) { Set words1 = breakWords(s1); Set words2 = breakWords(s2); return words1.size() >= words2.size() ? wordDistance(words1, words2) : wordDistance(words2, words1); } static protected double wordDistance(Set longWords, Set shortWords) { double common = 0; for (String word : shortWords) { if (longWords.contains(word)) { common++; } } return common / longWords.size(); } static protected Set s_stopWords; static { s_stopWords = new HashSet(); s_stopWords.add("the"); s_stopWords.add("a"); s_stopWords.add("and"); s_stopWords.add("of"); s_stopWords.add("on"); s_stopWords.add("in"); s_stopWords.add("at"); s_stopWords.add("by"); } static protected Set breakWords(String s) { String[] words = s.toLowerCase().split("\\s+"); Set set = new HashSet(words.length); for (String word : words) { if (!s_stopWords.contains(word)) { set.add(word); } } return set; } }