diff --git a/src/main/java/com/metaweb/gridworks/Gridworks.java b/src/main/java/com/metaweb/gridworks/Gridworks.java index ac5349703..60daa2607 100644 --- a/src/main/java/com/metaweb/gridworks/Gridworks.java +++ b/src/main/java/com/metaweb/gridworks/Gridworks.java @@ -69,6 +69,14 @@ public class Gridworks extends Server { public static void log(String message) { logger.info(message); } + + public static void error(String message, Throwable t) { + logger.error(message, t); + } + + public static void warn(String message) { + logger.warn(message); + } /* -------------- Gridworks HTTP server ----------------- */ diff --git a/src/main/java/com/metaweb/gridworks/GridworksServlet.java b/src/main/java/com/metaweb/gridworks/GridworksServlet.java index b1d0de670..ab9fdd653 100644 --- a/src/main/java/com/metaweb/gridworks/GridworksServlet.java +++ b/src/main/java/com/metaweb/gridworks/GridworksServlet.java @@ -15,15 +15,16 @@ import com.metaweb.gridworks.commands.edit.AnnotateOneRowCommand; import com.metaweb.gridworks.commands.edit.AnnotateRowsCommand; import com.metaweb.gridworks.commands.edit.ApplyOperationsCommand; import com.metaweb.gridworks.commands.edit.CreateProjectCommand; +import com.metaweb.gridworks.commands.edit.DeleteProjectCommand; import com.metaweb.gridworks.commands.edit.DoTextTransformCommand; import com.metaweb.gridworks.commands.edit.EditOneCellCommand; import com.metaweb.gridworks.commands.edit.FacetBasedEditCommand; import com.metaweb.gridworks.commands.edit.JoinMultiValueCellsCommand; import com.metaweb.gridworks.commands.edit.RemoveColumnCommand; -import com.metaweb.gridworks.commands.edit.DeleteProjectCommand; import com.metaweb.gridworks.commands.edit.SaveProtographCommand; import com.metaweb.gridworks.commands.edit.SplitMultiValueCellsCommand; import com.metaweb.gridworks.commands.edit.UndoRedoCommand; +import com.metaweb.gridworks.commands.info.ComputeClustersCommand; import com.metaweb.gridworks.commands.info.ComputeFacetsCommand; import com.metaweb.gridworks.commands.info.ExportRowsCommand; import com.metaweb.gridworks.commands.info.GetAllProjectMetadataCommand; @@ -74,6 +75,7 @@ public class GridworksServlet extends HttpServlet { _commands.put("cancel-processes", new CancelProcessesCommand()); _commands.put("compute-facets", new ComputeFacetsCommand()); + _commands.put("compute-clusters", new ComputeClustersCommand()); _commands.put("do-text-transform", new DoTextTransformCommand()); _commands.put("facet-based-edit", new FacetBasedEditCommand()); _commands.put("edit-one-cell", new EditOneCellCommand()); diff --git a/src/main/java/com/metaweb/gridworks/clustering/Clusterer.java b/src/main/java/com/metaweb/gridworks/clustering/Clusterer.java new file mode 100644 index 000000000..3ca49632d --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/clustering/Clusterer.java @@ -0,0 +1,29 @@ +package com.metaweb.gridworks.clustering; + +import org.json.JSONObject; + +import com.metaweb.gridworks.Jsonizable; +import com.metaweb.gridworks.browsing.Engine; +import com.metaweb.gridworks.model.Column; +import com.metaweb.gridworks.model.Project; + +public abstract class Clusterer implements Jsonizable { + + protected Project _project; + protected int _colindex; + protected JSONObject _config; + + public abstract void computeClusters(Engine engine); + + public void initializeFromJSON(Project project, JSONObject o) throws Exception { + _project = project; + _config = o; + + String colname = o.getString("column"); + for (Column column : project.columnModel.columns) { + if (column.getHeaderLabel().equals(colname)) { + _colindex = column.getCellIndex(); + } + } + } +} diff --git a/src/main/java/com/metaweb/gridworks/clustering/binning/BinningClusterer.java b/src/main/java/com/metaweb/gridworks/clustering/binning/BinningClusterer.java new file mode 100644 index 000000000..53efbd355 --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/clustering/binning/BinningClusterer.java @@ -0,0 +1,151 @@ +package com.metaweb.gridworks.clustering.binning; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.TreeMap; +import java.util.Map.Entry; + +import org.json.JSONException; +import org.json.JSONObject; +import org.json.JSONWriter; + +import com.metaweb.gridworks.browsing.Engine; +import com.metaweb.gridworks.browsing.FilteredRows; +import com.metaweb.gridworks.browsing.RowVisitor; +import com.metaweb.gridworks.clustering.Clusterer; +import com.metaweb.gridworks.model.Cell; +import com.metaweb.gridworks.model.Project; +import com.metaweb.gridworks.model.Row; + +public class BinningClusterer extends Clusterer { + + private Keyer _keyer; + + static protected Map _keyers = new HashMap(); + + List> _clusters; + + static { + _keyers.put("fingerprint", new FingerprintKeyer()); + _keyers.put("ngram-fingerprint", new NGramFingerprintKeyer()); + _keyers.put("metaphone", new MetaphoneKeyer()); + _keyers.put("double-metaphone", new DoubleMetaphoneKeyer()); + _keyers.put("soundex", new SoundexKeyer()); + } + + class BinningRowVisitor implements RowVisitor { + + Keyer _keyer; + Object[] _params; + JSONObject _config; + + Map> _map = new HashMap>(); + + public BinningRowVisitor(Keyer k, JSONObject o) { + _keyer = k; + _config = o; + if (k instanceof NGramFingerprintKeyer) { + try { + int size = _config.getJSONObject("params").getInt("ngram-size"); + _params = new Object[1]; + _params[0] = size; + } catch (JSONException e) { + //Gridworks.warn("no ngram size specified, using default"); + } + } + } + + public boolean visit(Project project, int rowIndex, Row row, boolean contextual) { + Cell cell = row.cells.get(_colindex); + if (cell != null && cell.value != null) { + Object v = cell.value; + String s = (v instanceof String) ? ((String) v) : v.toString(); + String key = _keyer.key(s,_params); + if (_map.containsKey(key)) { + Map m = _map.get(key); + if (m.containsKey(v)) { + m.put(v, m.get(v) + 1); + } else { + m.put(v,1); + } + } else { + Map m = new TreeMap(); + m.put(v,0); + _map.put(key, m); + } + } + return false; + } + + public Map> getMap() { + return _map; + } + } + + public class SizeComparator implements Comparator> { + public int compare(Map o1, Map o2) { + int s1 = o1.size(); + int s2 = o2.size(); + if (o1 == o2) { + int total1 = 0; + for (int i : o1.values()) { + total1 += i; + } + int total2 = 0; + for (int i : o2.values()) { + total2 += i; + } + return total2 - total1; + } else { + return s2 - s1; + } + } + } + + public class EntriesComparator implements Comparator> { + public int compare(Entry o1, Entry o2) { + return o2.getValue() - o1.getValue(); + } + } + + public void initializeFromJSON(Project project, JSONObject o) throws Exception { + super.initializeFromJSON(project, o); + _keyer = _keyers.get(o.getString("function").toLowerCase()); + } + + public void computeClusters(Engine engine) { + BinningRowVisitor visitor = new BinningRowVisitor(_keyer,_config); + FilteredRows filteredRows = engine.getAllFilteredRows(true); + filteredRows.accept(_project, visitor); + + Map> map = visitor.getMap(); + _clusters = new ArrayList>(map.values()); + Collections.sort(_clusters, new SizeComparator()); + } + + public void write(JSONWriter writer, Properties options) throws JSONException { + EntriesComparator c = new EntriesComparator(); + + writer.array(); + for (Map m : _clusters) { + if (m.size() > 1) { + writer.array(); + List> entries = new ArrayList>(m.entrySet()); + Collections.sort(entries,c); + for (Entry e : entries) { + writer.object(); + writer.key("v"); writer.value(e.getKey()); + writer.key("c"); writer.value(e.getValue()); + writer.endObject(); + } + writer.endArray(); + } + } + writer.endArray(); + } +} diff --git a/src/main/java/com/metaweb/gridworks/clustering/binning/DoubleMetaphoneKeyer.java b/src/main/java/com/metaweb/gridworks/clustering/binning/DoubleMetaphoneKeyer.java new file mode 100644 index 000000000..554f84e59 --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/clustering/binning/DoubleMetaphoneKeyer.java @@ -0,0 +1,13 @@ +package com.metaweb.gridworks.clustering.binning; + +import org.apache.commons.codec.language.DoubleMetaphone; + +public class DoubleMetaphoneKeyer extends Keyer { + + private DoubleMetaphone _metaphone2 = new DoubleMetaphone(); + + public String key(String s, Object... o) { + return _metaphone2.doubleMetaphone(s); + } + +} diff --git a/src/main/java/com/metaweb/gridworks/clustering/binning/FingerprintKeyer.java b/src/main/java/com/metaweb/gridworks/clustering/binning/FingerprintKeyer.java new file mode 100644 index 000000000..6a63fa25b --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/clustering/binning/FingerprintKeyer.java @@ -0,0 +1,31 @@ +package com.metaweb.gridworks.clustering.binning; + +import java.util.Iterator; +import java.util.TreeSet; +import java.util.regex.Pattern; + +import org.apache.commons.lang.StringUtils; + +public class FingerprintKeyer extends Keyer { + + static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}"); + + public String key(String s, Object... o) { + s = s.trim(); // first off, remove whitespace around the string + s = s.toLowerCase(); // then lowercase it + s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars + String[] frags = StringUtils.split(s); // split by whitespace + TreeSet set = new TreeSet(); + for (String ss : frags) { + set.add(ss); // order fragments and dedupe + } + StringBuffer b = new StringBuffer(); + Iterator i = set.iterator(); + while (i.hasNext()) { + b.append(i.next()); + b.append(' '); + } + return b.toString(); // join ordered fragments back together + } + +} diff --git a/src/main/java/com/metaweb/gridworks/clustering/binning/Keyer.java b/src/main/java/com/metaweb/gridworks/clustering/binning/Keyer.java new file mode 100644 index 000000000..2ad1200bb --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/clustering/binning/Keyer.java @@ -0,0 +1,12 @@ +package com.metaweb.gridworks.clustering.binning; + + +public abstract class Keyer { + + public String key(String s) { + return this.key(s, (Object[]) null); + } + + public abstract String key(String string, Object... params); + +} diff --git a/src/main/java/com/metaweb/gridworks/clustering/binning/MetaphoneKeyer.java b/src/main/java/com/metaweb/gridworks/clustering/binning/MetaphoneKeyer.java new file mode 100644 index 000000000..fe2ddf5a8 --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/clustering/binning/MetaphoneKeyer.java @@ -0,0 +1,13 @@ +package com.metaweb.gridworks.clustering.binning; + +import org.apache.commons.codec.language.Metaphone; + +public class MetaphoneKeyer extends Keyer { + + private Metaphone _metaphone = new Metaphone(); + + public String key(String s, Object... o) { + return _metaphone.metaphone(s); + } + +} diff --git a/src/main/java/com/metaweb/gridworks/clustering/binning/NGramFingerprintKeyer.java b/src/main/java/com/metaweb/gridworks/clustering/binning/NGramFingerprintKeyer.java new file mode 100644 index 000000000..4b3fe11f5 --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/clustering/binning/NGramFingerprintKeyer.java @@ -0,0 +1,35 @@ +package com.metaweb.gridworks.clustering.binning; + +import java.util.Iterator; +import java.util.TreeSet; +import java.util.regex.Pattern; + +public class NGramFingerprintKeyer extends Keyer { + + static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}|\\p{Space}"); + + public String key(String s, Object... o) { + int ngram_size = 1; + if (o != null && o.length > 0 && o[0] instanceof Number) { + ngram_size = (Integer) o[0]; + } + s = s.toLowerCase(); // then lowercase it + s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars + TreeSet set = ngram_split(s,ngram_size); + StringBuffer b = new StringBuffer(); + Iterator i = set.iterator(); + while (i.hasNext()) { + b.append(i.next()); + } + return b.toString(); // join ordered fragments back together + } + + protected TreeSet ngram_split(String s, int size) { + TreeSet set = new TreeSet(); + char[] chars = s.toCharArray(); + for (int i = 0; i + size <= chars.length; i++) { + set.add(new String(chars,i,size)); + } + return set; + } +} diff --git a/src/main/java/com/metaweb/gridworks/clustering/binning/SoundexKeyer.java b/src/main/java/com/metaweb/gridworks/clustering/binning/SoundexKeyer.java new file mode 100644 index 000000000..65c0801da --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/clustering/binning/SoundexKeyer.java @@ -0,0 +1,13 @@ +package com.metaweb.gridworks.clustering.binning; + +import org.apache.commons.codec.language.Soundex; + +public class SoundexKeyer extends Keyer { + + private Soundex _soundex = new Soundex(); + + public String key(String s, Object... o) { + return _soundex.soundex(s); + } + +} diff --git a/src/main/java/com/metaweb/gridworks/clustering/knn/kNNClusterer.java b/src/main/java/com/metaweb/gridworks/clustering/knn/kNNClusterer.java new file mode 100644 index 000000000..ad488f894 --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/clustering/knn/kNNClusterer.java @@ -0,0 +1,122 @@ +package com.metaweb.gridworks.clustering.knn; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; + +import org.json.JSONException; +import org.json.JSONObject; +import org.json.JSONWriter; + +import com.metaweb.gridworks.Gridworks; +import com.metaweb.gridworks.browsing.Engine; +import com.metaweb.gridworks.browsing.FilteredRows; +import com.metaweb.gridworks.browsing.RowVisitor; +import com.metaweb.gridworks.clustering.Clusterer; +import com.metaweb.gridworks.model.Cell; +import com.metaweb.gridworks.model.Project; +import com.metaweb.gridworks.model.Row; + +import edu.mit.simile.vicino.Distance; +import edu.mit.simile.vicino.distances.BZip2Distance; +import edu.mit.simile.vicino.distances.GZipDistance; +import edu.mit.simile.vicino.distances.JaccardDistance; +import edu.mit.simile.vicino.distances.JaroDistance; +import edu.mit.simile.vicino.distances.LevenshteinDistance; +import edu.mit.simile.vicino.distances.PPMDistance; +import edu.mit.simile.vicino.vptree.VPTreeBuilder; + +public class kNNClusterer extends Clusterer { + + private Distance _distance; + + static protected Map _distances = new HashMap(); + + List> _clusters; + + static { + _distances.put("levenshtein", new LevenshteinDistance()); + _distances.put("jaro", new JaroDistance()); + _distances.put("jaccard", new JaccardDistance()); + _distances.put("gzip", new GZipDistance()); + _distances.put("bzip2", new BZip2Distance()); + _distances.put("ppm", new PPMDistance()); + } + + class kNNClusteringRowVisitor implements RowVisitor { + + Distance _distance; + JSONObject _config; + VPTreeBuilder _treeBuilder; + float _radius; + + public kNNClusteringRowVisitor(Distance d, JSONObject o) { + _distance = d; + _config = o; + _treeBuilder = new VPTreeBuilder(_distance); + try { + _radius = (float) o.getDouble("radius"); + } catch (JSONException e) { + Gridworks.warn("No radius found, using default"); + _radius = 1.0f; + } + } + + public boolean visit(Project project, int rowIndex, Row row, boolean contextual) { + Cell cell = row.cells.get(_colindex); + if (cell != null && cell.value != null) { + Object v = cell.value; + String s = (v instanceof String) ? ((String) v) : v.toString(); + _treeBuilder.populate(s); + } + return false; + } + + public Map> getClusters() { + return _treeBuilder.getClusters(_radius); + } + } + + public class SizeComparator implements Comparator> { + public int compare(List o1, List o2) { + return o2.size() - o1.size(); + } + } + + public void initializeFromJSON(Project project, JSONObject o) throws Exception { + super.initializeFromJSON(project, o); + _distance = _distances.get(o.getString("function").toLowerCase()); + } + + public void computeClusters(Engine engine) { + kNNClusteringRowVisitor visitor = new kNNClusteringRowVisitor(_distance,_config); + FilteredRows filteredRows = engine.getAllFilteredRows(true); + filteredRows.accept(_project, visitor); + + Map> clusters = visitor.getClusters(); + _clusters = new ArrayList>(clusters.values()); + Collections.sort(_clusters, new SizeComparator()); + } + + public void write(JSONWriter writer, Properties options) throws JSONException { + writer.array(); + for (List m : _clusters) { + if (m.size() > 1) { + writer.array(); + for (Serializable s : m) { + writer.object(); + writer.key("v"); writer.value(s); + writer.key("c"); writer.value(1); + writer.endObject(); + } + writer.endArray(); + } + } + writer.endArray(); + } +} diff --git a/src/main/java/com/metaweb/gridworks/commands/info/ComputeClustersCommand.java b/src/main/java/com/metaweb/gridworks/commands/info/ComputeClustersCommand.java new file mode 100644 index 000000000..ff0513f5b --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/commands/info/ComputeClustersCommand.java @@ -0,0 +1,50 @@ +package com.metaweb.gridworks.commands.info; + +import java.io.IOException; + +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.json.JSONObject; + +import com.metaweb.gridworks.Gridworks; +import com.metaweb.gridworks.browsing.Engine; +import com.metaweb.gridworks.clustering.Clusterer; +import com.metaweb.gridworks.clustering.binning.BinningClusterer; +import com.metaweb.gridworks.clustering.knn.kNNClusterer; +import com.metaweb.gridworks.commands.Command; +import com.metaweb.gridworks.model.Project; + +public class ComputeClustersCommand extends Command { + + @Override + public void doPost(HttpServletRequest request, HttpServletResponse response) + throws ServletException, IOException { + + try { + long start = System.currentTimeMillis(); + Project project = getProject(request); + Engine engine = getEngine(request, project); + JSONObject clusterer_conf = getJsonParameter(request,"clusterer"); + + Clusterer clusterer = null; + String type = clusterer_conf.has("type") ? clusterer_conf.getString("type") : "binning"; + + if ("knn".equals(type)) { + clusterer = new kNNClusterer(); + } else { + clusterer = new BinningClusterer(); + } + + clusterer.initializeFromJSON(project, clusterer_conf); + + clusterer.computeClusters(engine); + + respondJSON(response, clusterer); + Gridworks.log("computed clusters [" + type + "," + clusterer_conf.getString("function") + "] in " + (System.currentTimeMillis() - start) + "ms"); + } catch (Exception e) { + respondException(response, e); + } + } +} diff --git a/src/main/java/com/metaweb/gridworks/expr/functions/strings/Fingerprint.java b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Fingerprint.java index a236ac089..c2f9fc505 100644 --- a/src/main/java/com/metaweb/gridworks/expr/functions/strings/Fingerprint.java +++ b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Fingerprint.java @@ -1,39 +1,23 @@ package com.metaweb.gridworks.expr.functions.strings; -import java.util.Iterator; import java.util.Properties; -import java.util.TreeSet; -import java.util.regex.Pattern; -import org.apache.commons.lang.StringUtils; import org.json.JSONException; import org.json.JSONWriter; +import com.metaweb.gridworks.clustering.binning.FingerprintKeyer; +import com.metaweb.gridworks.clustering.binning.Keyer; import com.metaweb.gridworks.gel.Function; public class Fingerprint implements Function { - static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}"); - + static Keyer fingerprint = new FingerprintKeyer(); + public Object call(Properties bindings, Object[] args) { if (args.length == 1 && args[0] != null) { Object o = args[0]; - String s = (o instanceof String) ? (String) o : o.toString(); - s = s.trim(); // first off, remove whitespace around the string - s = s.toLowerCase(); // then lowercase it - s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars - String[] frags = StringUtils.split(s); // split by whitespace - TreeSet set = new TreeSet(); - for (String ss : frags) { - set.add(ss); // order fragments and dedupe - } - StringBuffer b = new StringBuffer(); - Iterator i = set.iterator(); - while (i.hasNext()) { - b.append(i.next()); - b.append(' '); - } - return b.toString(); // join ordered fragments back together + String s = (o instanceof String) ? (String) o : o.toString(); + return fingerprint.key(s); } return null; } diff --git a/src/main/java/com/metaweb/gridworks/expr/functions/strings/NGramFingerprint.java b/src/main/java/com/metaweb/gridworks/expr/functions/strings/NGramFingerprint.java index de28573bd..e8c67101f 100644 --- a/src/main/java/com/metaweb/gridworks/expr/functions/strings/NGramFingerprint.java +++ b/src/main/java/com/metaweb/gridworks/expr/functions/strings/NGramFingerprint.java @@ -1,20 +1,20 @@ package com.metaweb.gridworks.expr.functions.strings; -import java.util.Iterator; import java.util.Properties; import java.util.TreeSet; -import java.util.regex.Pattern; import org.json.JSONException; import org.json.JSONWriter; +import com.metaweb.gridworks.clustering.binning.Keyer; +import com.metaweb.gridworks.clustering.binning.NGramFingerprintKeyer; import com.metaweb.gridworks.expr.EvalError; import com.metaweb.gridworks.gel.ControlFunctionRegistry; import com.metaweb.gridworks.gel.Function; public class NGramFingerprint implements Function { - static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}|\\p{Space}"); + static Keyer ngram_fingerprint = new NGramFingerprintKeyer(); public Object call(Properties bindings, Object[] args) { if (args.length == 1 || args.length == 2) { @@ -24,16 +24,8 @@ public class NGramFingerprint implements Function { ngram_size = (args[1] instanceof Number) ? ((Number) args[1]).intValue() : Integer.parseInt(args[1].toString()); } Object o = args[0]; - String s = (o instanceof String) ? (String) o : o.toString(); - s = s.toLowerCase(); // then lowercase it - s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars - TreeSet set = ngram_split(s,ngram_size); - StringBuffer b = new StringBuffer(); - Iterator i = set.iterator(); - while (i.hasNext()) { - b.append(i.next()); - } - return b.toString(); // join ordered fragments back together + String s = (o instanceof String) ? (String) o : o.toString(); + return ngram_fingerprint.key(s,ngram_size); } return null; } diff --git a/src/main/java/com/metaweb/gridworks/expr/functions/strings/Phonetic.java b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Phonetic.java index 12b3a8583..07f861922 100644 --- a/src/main/java/com/metaweb/gridworks/expr/functions/strings/Phonetic.java +++ b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Phonetic.java @@ -2,21 +2,21 @@ package com.metaweb.gridworks.expr.functions.strings; import java.util.Properties; -import org.apache.commons.codec.language.DoubleMetaphone; -import org.apache.commons.codec.language.Metaphone; -import org.apache.commons.codec.language.Soundex; import org.json.JSONException; import org.json.JSONWriter; +import com.metaweb.gridworks.clustering.binning.DoubleMetaphoneKeyer; +import com.metaweb.gridworks.clustering.binning.MetaphoneKeyer; +import com.metaweb.gridworks.clustering.binning.SoundexKeyer; import com.metaweb.gridworks.expr.EvalError; import com.metaweb.gridworks.gel.ControlFunctionRegistry; import com.metaweb.gridworks.gel.Function; public class Phonetic implements Function { - private DoubleMetaphone metaphone2 = new DoubleMetaphone(); - private Metaphone metaphone = new Metaphone(); - private Soundex soundex = new Soundex(); + static private DoubleMetaphoneKeyer metaphone2 = new DoubleMetaphoneKeyer(); + static private MetaphoneKeyer metaphone = new MetaphoneKeyer(); + static private SoundexKeyer soundex = new SoundexKeyer(); public Object call(Properties bindings, Object[] args) { if (args.length == 2) { @@ -26,11 +26,11 @@ public class Phonetic implements Function { String str = (o1 instanceof String) ? (String) o1 : o1.toString(); String encoding = ((String) o2).toLowerCase(); if ("doublemetaphone".equals(encoding)) { - return metaphone2.doubleMetaphone(str); + return metaphone2.key(str); } else if ("metaphone".equals(encoding)) { - return metaphone.metaphone(str); + return metaphone.key(str); } else if ("soundex".equals(encoding)) { - return soundex.soundex(str); + return soundex.key(str); } else { return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " doesn't know how to handle the '" + encoding + "' encoding."); } diff --git a/src/main/java/edu/mit/simile/vicino/distances/LevensteinDistance.java b/src/main/java/edu/mit/simile/vicino/distances/LevenshteinDistance.java similarity index 83% rename from src/main/java/edu/mit/simile/vicino/distances/LevensteinDistance.java rename to src/main/java/edu/mit/simile/vicino/distances/LevenshteinDistance.java index 5eaff95a4..7907b034e 100644 --- a/src/main/java/edu/mit/simile/vicino/distances/LevensteinDistance.java +++ b/src/main/java/edu/mit/simile/vicino/distances/LevenshteinDistance.java @@ -5,11 +5,11 @@ import com.wcohen.ss.api.StringDistance; import edu.mit.simile.vicino.Distance; -public class LevensteinDistance implements Distance { +public class LevenshteinDistance implements Distance { StringDistance distance; - public LevensteinDistance() { + public LevenshteinDistance() { this.distance = new Levenstein(); } diff --git a/src/main/java/edu/mit/simile/vicino/vptree/Node.java b/src/main/java/edu/mit/simile/vicino/vptree/Node.java index 4de3f2f34..129396980 100755 --- a/src/main/java/edu/mit/simile/vicino/vptree/Node.java +++ b/src/main/java/edu/mit/simile/vicino/vptree/Node.java @@ -39,4 +39,12 @@ public class Node implements Serializable { public String toString() { return obj.toString(); } + + public boolean equals(Object n) { + if (n instanceof Node) { + return ((Node) n).get().equals(this.obj); + } else { + return false; + } + } } diff --git a/src/main/java/edu/mit/simile/vicino/vptree/VPTreeBuilder.java b/src/main/java/edu/mit/simile/vicino/vptree/VPTreeBuilder.java index c7ea617e3..8529a5c63 100755 --- a/src/main/java/edu/mit/simile/vicino/vptree/VPTreeBuilder.java +++ b/src/main/java/edu/mit/simile/vicino/vptree/VPTreeBuilder.java @@ -2,8 +2,14 @@ package edu.mit.simile.vicino.vptree; import java.io.Serializable; import java.util.Collection; -import java.util.Iterator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; import java.util.Random; +import java.util.Set; + +import com.metaweb.gridworks.Gridworks; import edu.mit.simile.vicino.Distance; @@ -17,9 +23,10 @@ public class VPTreeBuilder { private Random generator = new Random(System.currentTimeMillis()); - private VPTree tree; private final Distance distance; + private Set nodes = new HashSet(); + /** * Defines a VPTree Builder for a specific distance. * @@ -29,20 +36,47 @@ public class VPTreeBuilder { this.distance = distance; } - public VPTree buildVPTree(Collection col) { - Node nodes[] = new Node[col.size()]; - Iterator i = col.iterator(); - int counter = 0; - while (i.hasNext()) { - Serializable s = (Serializable) i.next(); - nodes[counter++] = new Node(s); - } + public void populate(Serializable s) { + nodes.add(new Node(s)); + } - tree = new VPTree(); - tree.setRoot(addNode(nodes, 0, nodes.length - 1)); + public VPTree buildVPTree() { + Node[] nodes_array = this.nodes.toArray(new Node[this.nodes.size()]); + Gridworks.log("building tree with nodes: " + nodes_array.length); + VPTree tree = new VPTree(); + tree.setRoot(addNode(nodes_array, 0, nodes_array.length - 1)); + Gridworks.log("tree built"); return tree; } + public VPTree buildVPTree(Collection values) { + reset(); + for (Serializable s : values) { + populate(s); + } + return buildVPTree(); + } + + public void reset() { + this.nodes.clear(); + } + + public Map> getClusters(float radius) { + VPTree tree = buildVPTree(); + VPTreeSeeker seeker = new VPTreeSeeker(distance,tree); + + Map> map = new HashMap>(); + for (Node n : nodes) { + Serializable s = n.get(); + Gridworks.log(" find results for: " + s); + List results = seeker.range(s, radius); + Gridworks.log(" found: " + results.size()); + map.put(s, results); + } + + return map; + } + private TNode addNode(Node nodes[], int begin, int end) { int delta = end - begin; diff --git a/src/main/webapp/scripts/dialogs/facet-based-edit-dialog.js b/src/main/webapp/scripts/dialogs/facet-based-edit-dialog.js index 588f7587e..6db161c62 100644 --- a/src/main/webapp/scripts/dialogs/facet-based-edit-dialog.js +++ b/src/main/webapp/scripts/dialogs/facet-based-edit-dialog.js @@ -2,6 +2,9 @@ function FacetBasedEditDialog(columnName, expression, entries) { this._columnName = columnName; this._expression = expression; this._entries = entries; + this._method = "binning"; + this._function = "fingerprint"; + this._params = {}; this._createDialog(); this._cluster(); @@ -10,7 +13,7 @@ function FacetBasedEditDialog(columnName, expression, entries) { FacetBasedEditDialog.prototype._createDialog = function() { var self = this; var frame = DialogSystem.createDialog(); - frame.width("800px"); + frame.width("900px"); var header = $('
').addClass("dialog-header").text("Facet-based edit of column " + this._columnName).appendTo(frame); var body = $('
').addClass("dialog-body").appendTo(frame); @@ -18,17 +21,78 @@ FacetBasedEditDialog.prototype._createDialog = function() { var html = $( '
' + + '
' + + '' + + '' + + '' + + '
' + + 'Method: ' + + '' + + '
Keying Function:
' + + '' + + '
' + + '' + + '
' + '
' + - '
' + - ' ' + - ' ' + - '
' + '
' ).appendTo(body); this._elmts = DOM.bind(html); - this._elmts.clusterButton.click(function() { self._cluster(); }); - this._elmts.unclusterButton.click(function() { self._uncluster(); }); + + this._elmts.methodSelector.change(function() { + var selection = $(this).find("option:selected").text(); + if (selection == 'key collision') { + body.find("#binning-controls").show(); + body.find("#knn-controls").hide(); + self._method = "binning"; + self._elmts.keyingFunctionSelector.change(); + } else if (selection = 'nearest neightbor') { + body.find("#binning-controls").hide(); + body.find("#knn-controls").show(); + self._method = "knn"; + self._elmts.distanceFunctionSelector.change(); + } + }); + + var changer = function() { + self._function = $(this).find("option:selected").text(); + $(".function-params").hide(); + $("#" + self._function + "-params").show(); + self._cluster(); + }; + + this._elmts.keyingFunctionSelector.change(changer); + this._elmts.distanceFunctionSelector.change(changer); + + this._elmts.ngramSize.change(function() { + try { + self._params = { "ngram-size" : parseInt($(this).val()) }; + self._cluster(); + } catch (e) { + alert("ngram size must be a number"); + } + }); + + //this._elmts.clusterButton.click(function() { self._cluster(); }); + //this._elmts.unclusterButton.click(function() { self._uncluster(); }); $('').text("OK").click(function() { self._onOK(); }).appendTo(footer); $('').text("Cancel").click(function() { self._dismiss(); }).appendTo(footer); @@ -41,9 +105,9 @@ FacetBasedEditDialog.prototype._createDialog = function() { FacetBasedEditDialog.prototype._renderTable = function() { var self = this; - var container = this._elmts.tableContainer.empty(); + var container = this._elmts.tableContainer; - var table = $('
').addClass("facet-based-edit-dialog-entry-table").appendTo(container)[0]; + var table = $('
').addClass("facet-based-edit-dialog-entry-table")[0]; var trHead = table.insertRow(table.rows.length); trHead.className = "header"; @@ -60,7 +124,7 @@ FacetBasedEditDialog.prototype._renderTable = function() { for (var c = 0; c < choices.length; c++) { var choice = choices[c]; var li = $('
  • ').appendTo(ul); - $('').text(choice.v.l).appendTo(li); + $('').text(choice.v).appendTo(li); $('').text(" (" + choice.c + ")").appendTo(li); } @@ -73,7 +137,7 @@ FacetBasedEditDialog.prototype._renderTable = function() { editCheck.attr("checked", "true"); } - var input = $('') + var input = $('') .attr("value", cluster.value) .appendTo(tr.insertCell(2)) .keyup(function() { @@ -83,65 +147,43 @@ FacetBasedEditDialog.prototype._renderTable = function() { for (var i = 0; i < this._clusters.length; i++) { renderCluster(this._clusters[i]); } + + container.empty().append(table); }; FacetBasedEditDialog.prototype._cluster = function() { - var clusters = []; - var map = {}; - $.each(this._entries, function() { - var choice = { - v: this.v, - c: this.c - }; - - var s = this.v.l.toLowerCase().replace(/\W/g, ' ').replace(/\s+/g, ' ').split(" ").sort().join(" "); - if (s in map) { - map[s].choices.push(choice); - } else { - map[s] = { - edit: false, - choices: [ choice ] - }; - clusters.push(map[s]); - } - }); + var self = this; - $.each(clusters, function() { - if (this.choices.length > 1) { - this.choices.sort(function(a, b) { - var c = b.c - a.c; - return c != 0 ? c : a.v.l.localeCompare(b.v.l); - }); - this.edit = true; - } - this.value = this.choices[0].v.l; - }); - clusters.sort(function(a, b) { - var c = b.choices.length - a.choices.length; - return c != 0 ? c : a.value.localeCompare(b.value); - }); - - this._clusters = clusters; - this._renderTable(); -}; + var container = this._elmts.tableContainer.html( + '
    Loading...
    ' + ); -FacetBasedEditDialog.prototype._uncluster = function() { - var clusters = []; - $.each(this._entries, function() { - var cluster = { - edit: false, - choices: [{ - v: this.v, - c: this.c - }], - value: this.v.l - }; - clusters.push(cluster); - }); - - this._clusters = clusters; - this._renderTable(); -}; + $.post( + "/command/compute-clusters?" + $.param({ project: theProject.id }), + { + engine: JSON.stringify(ui.browsingEngine.getJSON()), + clusterer: JSON.stringify({ + 'type' : this._method, + 'function' : this._function, + 'column' : this._columnName, + 'params' : this._params + }) + }, + function(data) { + var clusters = []; + $.each(data, function() { + clusters.push({ + edit: true, + choices: this, + value: this[0].v + }); + }); + self._clusters = clusters; + self._renderTable(); + }, + "json" + ); +} FacetBasedEditDialog.prototype._onOK = function() { var edits = []; @@ -150,7 +192,7 @@ FacetBasedEditDialog.prototype._onOK = function() { if (cluster.edit) { var values = []; for (var j = 0; j < cluster.choices.length; j++) { - values.push(cluster.choices[j].v.v); + values.push(cluster.choices[j].v); } edits.push({ diff --git a/src/main/webapp/styles/common.css b/src/main/webapp/styles/common.css index 9cb68e986..7c26ea504 100644 --- a/src/main/webapp/styles/common.css +++ b/src/main/webapp/styles/common.css @@ -74,4 +74,8 @@ img { .fbs-pane, .fbs-flyout-pane { z-index: 2000; +} + +.hidden { + display: none; } \ No newline at end of file diff --git a/src/main/webapp/styles/dialogs/facet-based-edit-dialog.css b/src/main/webapp/styles/dialogs/facet-based-edit-dialog.css index 4ac1561c5..1361b689f 100644 --- a/src/main/webapp/styles/dialogs/facet-based-edit-dialog.css +++ b/src/main/webapp/styles/dialogs/facet-based-edit-dialog.css @@ -14,7 +14,7 @@ table.facet-based-edit-dialog-main-layout > tbody > tr:last-child > td { } .facet-based-edit-dialog-table-container { - height: 450px; + height: 500px; overflow: auto; border: 1px solid #aaa; } @@ -42,5 +42,14 @@ table.facet-based-edit-dialog-entry-table > tbody > tr.even > td { } table.facet-based-edit-dialog-entry-table input { - border: none; + border: 1px solid #ccc; + padding: 0 0.1em; +} + +.facet-based-edit-dialog-controls { + margin-bottom: 0.5em; +} + +.facet-based-edit-dialog-controls td { + padding-right: 0.5em; } \ No newline at end of file