much improved facet clustering dialog and functionality

NOTE: the kNN clustering code is operational but is not working as expected


git-svn-id: http://google-refine.googlecode.com/svn/trunk@219 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Stefano Mazzocchi 2010-03-06 10:17:58 +00:00
parent db824bffeb
commit 976c1da5c7
21 changed files with 679 additions and 127 deletions

View File

@ -69,6 +69,14 @@ public class Gridworks extends Server {
/**
 * Forwards a message to the server logger via {@code logger.info}.
 *
 * @param message the text to log
 */
public static void log(final String message) {
    logger.info(message);
}
public static void error(String message, Throwable t) {
logger.error(message, t);
}
/**
 * Forwards a message to the server logger via {@code logger.warn}.
 *
 * @param message the text to log
 */
public static void warn(final String message) {
    logger.warn(message);
}
/* -------------- Gridworks HTTP server ----------------- */

View File

@ -15,15 +15,16 @@ import com.metaweb.gridworks.commands.edit.AnnotateOneRowCommand;
import com.metaweb.gridworks.commands.edit.AnnotateRowsCommand;
import com.metaweb.gridworks.commands.edit.ApplyOperationsCommand;
import com.metaweb.gridworks.commands.edit.CreateProjectCommand;
import com.metaweb.gridworks.commands.edit.DeleteProjectCommand;
import com.metaweb.gridworks.commands.edit.DoTextTransformCommand;
import com.metaweb.gridworks.commands.edit.EditOneCellCommand;
import com.metaweb.gridworks.commands.edit.FacetBasedEditCommand;
import com.metaweb.gridworks.commands.edit.JoinMultiValueCellsCommand;
import com.metaweb.gridworks.commands.edit.RemoveColumnCommand;
import com.metaweb.gridworks.commands.edit.DeleteProjectCommand;
import com.metaweb.gridworks.commands.edit.SaveProtographCommand;
import com.metaweb.gridworks.commands.edit.SplitMultiValueCellsCommand;
import com.metaweb.gridworks.commands.edit.UndoRedoCommand;
import com.metaweb.gridworks.commands.info.ComputeClustersCommand;
import com.metaweb.gridworks.commands.info.ComputeFacetsCommand;
import com.metaweb.gridworks.commands.info.ExportRowsCommand;
import com.metaweb.gridworks.commands.info.GetAllProjectMetadataCommand;
@ -74,6 +75,7 @@ public class GridworksServlet extends HttpServlet {
_commands.put("cancel-processes", new CancelProcessesCommand());
_commands.put("compute-facets", new ComputeFacetsCommand());
_commands.put("compute-clusters", new ComputeClustersCommand());
_commands.put("do-text-transform", new DoTextTransformCommand());
_commands.put("facet-based-edit", new FacetBasedEditCommand());
_commands.put("edit-one-cell", new EditOneCellCommand());

View File

@ -0,0 +1,29 @@
package com.metaweb.gridworks.clustering;
import org.json.JSONObject;
import com.metaweb.gridworks.Jsonizable;
import com.metaweb.gridworks.browsing.Engine;
import com.metaweb.gridworks.model.Column;
import com.metaweb.gridworks.model.Project;
/**
 * Base class for clusterers that work on the cells of a single project column.
 *
 * Subclasses implement {@link #computeClusters(Engine)} and the JSON
 * serialization required by {@link Jsonizable}.
 */
public abstract class Clusterer implements Jsonizable {

    /** Project the clusterer was initialized with. */
    protected Project _project;

    /** Cell index of the column named by the "column" key of the configuration. */
    protected int _colindex;

    /** Raw JSON configuration the clusterer was initialized with. */
    protected JSONObject _config;

    /**
     * Computes the clusters, using the given engine to obtain the rows to visit.
     *
     * @param engine the browsing engine supplying the rows
     */
    public abstract void computeClusters(Engine engine);

    /**
     * Stores the project and configuration and resolves the configured
     * column name (JSON key "column") to its cell index.
     *
     * @param project the project whose column model is searched
     * @param o       the clusterer configuration
     * @throws Exception if "column" is missing from the configuration or if
     *                   no column of the project has that header label
     */
    public void initializeFromJSON(Project project, JSONObject o) throws Exception {
        _project = project;
        _config = o;

        String colname = o.getString("column");
        boolean found = false;
        for (Column column : project.columnModel.columns) {
            if (colname.equals(column.getHeaderLabel())) {
                _colindex = column.getCellIndex();
                found = true;
            }
        }

        // Without this check an unknown column name would silently leave
        // _colindex at its default of 0 and cluster the wrong column.
        if (!found) {
            throw new IllegalArgumentException("No column named '" + colname + "' found in the project");
        }
    }
}

View File

@ -0,0 +1,151 @@
package com.metaweb.gridworks.clustering.binning;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.TreeMap;
import java.util.Map.Entry;
import org.json.JSONException;
import org.json.JSONObject;
import org.json.JSONWriter;
import com.metaweb.gridworks.browsing.Engine;
import com.metaweb.gridworks.browsing.FilteredRows;
import com.metaweb.gridworks.browsing.RowVisitor;
import com.metaweb.gridworks.clustering.Clusterer;
import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.model.Row;
/**
 * Clusterer based on key collision: every non-null cell value of the
 * configured column is turned into a key by a {@link Keyer}, and values that
 * share a key end up in the same cluster.
 */
public class BinningClusterer extends Clusterer {

    /** Keyer selected by the "function" entry of the configuration. */
    private Keyer _keyer;

    /** Registry of the available keyers, indexed by lower-case function name. */
    static protected Map<String, Keyer> _keyers = new HashMap<String, Keyer>();

    /** Result of the last {@link #computeClusters(Engine)} call: value -> occurrence count, one map per key. */
    List<Map<Object,Integer>> _clusters;

    static {
        _keyers.put("fingerprint", new FingerprintKeyer());
        _keyers.put("ngram-fingerprint", new NGramFingerprintKeyer());
        _keyers.put("metaphone", new MetaphoneKeyer());
        _keyers.put("double-metaphone", new DoubleMetaphoneKeyer());
        _keyers.put("soundex", new SoundexKeyer());
    }

    /**
     * Row visitor that bins the values of the configured column by key and
     * counts how many times each distinct value occurs.
     */
    class BinningRowVisitor implements RowVisitor {

        Keyer _keyer;
        Object[] _params;
        JSONObject _config;

        /** key -> (cell value -> number of occurrences). */
        Map<String,Map<Object,Integer>> _map = new HashMap<String,Map<Object,Integer>>();

        /**
         * @param k the keyer used to compute the binning key
         * @param o the clusterer configuration; for the n-gram keyer,
         *          "params"/"ngram-size" is forwarded as the keyer parameter
         */
        public BinningRowVisitor(Keyer k, JSONObject o) {
            _keyer = k;
            _config = o;
            if (k instanceof NGramFingerprintKeyer) {
                try {
                    int size = _config.getJSONObject("params").getInt("ngram-size");
                    _params = new Object[] { size };
                } catch (JSONException ignored) {
                    // No usable "ngram-size" in the configuration: leave _params
                    // null so that the keyer falls back to its own default size.
                }
            }
        }

        /**
         * Records the value of the configured column for one row.
         *
         * @return always false
         */
        public boolean visit(Project project, int rowIndex, Row row, boolean contextual) {
            Cell cell = row.cells.get(_colindex);
            if (cell != null && cell.value != null) {
                Object v = cell.value;
                String s = (v instanceof String) ? ((String) v) : v.toString();
                String key = _keyer.key(s,_params);

                Map<Object,Integer> m = _map.get(key);
                if (m == null) {
                    m = new TreeMap<Object,Integer>();
                    _map.put(key, m);
                }

                // Every occurrence counts as one, including the very first value
                // seen for a key (which used to be recorded with a count of 0).
                Integer count = m.get(v);
                m.put(v, (count == null) ? 1 : count + 1);
            }
            return false;
        }

        public Map<String,Map<Object,Integer>> getMap() {
            return _map;
        }
    }

    /**
     * Orders clusters by number of distinct values, largest first; clusters
     * with the same number of distinct values are ordered by total number of
     * occurrences, largest first.
     */
    public class SizeComparator implements Comparator<Map<Object,Integer>> {
        public int compare(Map<Object,Integer> o1, Map<Object,Integer> o2) {
            int s1 = o1.size();
            int s2 = o2.size();
            if (s1 != s2) {
                return (s2 < s1) ? -1 : 1;
            }
            int total1 = total(o1);
            int total2 = total(o2);
            return (total2 < total1) ? -1 : ((total2 > total1) ? 1 : 0);
        }

        /** Sums the occurrence counts of all values in a cluster. */
        private int total(Map<Object,Integer> cluster) {
            int total = 0;
            for (int i : cluster.values()) {
                total += i;
            }
            return total;
        }
    }

    /** Orders the entries of a cluster by occurrence count, largest first. */
    public class EntriesComparator implements Comparator<Entry<Object,Integer>> {
        public int compare(Entry<Object,Integer> o1, Entry<Object,Integer> o2) {
            return o2.getValue().compareTo(o1.getValue());
        }
    }

    /**
     * Initializes the clusterer and selects the keyer named by the "function"
     * entry of the configuration (matched case-insensitively).
     *
     * @throws Exception if the configuration is incomplete, the column is
     *                   unknown or the keying function is not registered
     */
    public void initializeFromJSON(Project project, JSONObject o) throws Exception {
        super.initializeFromJSON(project, o);
        String function = o.getString("function");
        _keyer = _keyers.get(function.toLowerCase());
        if (_keyer == null) {
            // Fail here with a clear message rather than with a
            // NullPointerException while visiting the first row.
            throw new IllegalArgumentException("Unknown keying function '" + function + "'");
        }
    }

    /**
     * Visits the rows supplied by the engine, bins their values by key and
     * stores the resulting clusters sorted with {@link SizeComparator}.
     */
    public void computeClusters(Engine engine) {
        BinningRowVisitor visitor = new BinningRowVisitor(_keyer,_config);
        FilteredRows filteredRows = engine.getAllFilteredRows(true);
        filteredRows.accept(_project, visitor);

        Map<String,Map<Object,Integer>> map = visitor.getMap();
        _clusters = new ArrayList<Map<Object,Integer>>(map.values());
        Collections.sort(_clusters, new SizeComparator());
    }

    /**
     * Writes the clusters as a JSON array of arrays of {"v": value, "c": count}
     * objects. Clusters holding a single distinct value are omitted.
     */
    public void write(JSONWriter writer, Properties options) throws JSONException {
        EntriesComparator c = new EntriesComparator();

        writer.array();
        for (Map<Object,Integer> m : _clusters) {
            if (m.size() > 1) {
                writer.array();
                List<Entry<Object,Integer>> entries = new ArrayList<Entry<Object,Integer>>(m.entrySet());
                Collections.sort(entries,c);
                for (Entry<Object,Integer> e : entries) {
                    writer.object();
                    writer.key("v"); writer.value(e.getKey());
                    writer.key("c"); writer.value(e.getValue());
                    writer.endObject();
                }
                writer.endArray();
            }
        }
        writer.endArray();
    }
}

View File

@ -0,0 +1,13 @@
package com.metaweb.gridworks.clustering.binning;
import org.apache.commons.codec.language.DoubleMetaphone;
public class DoubleMetaphoneKeyer extends Keyer {
private DoubleMetaphone _metaphone2 = new DoubleMetaphone();
public String key(String s, Object... o) {
return _metaphone2.doubleMetaphone(s);
}
}

View File

@ -0,0 +1,31 @@
package com.metaweb.gridworks.clustering.binning;
import java.util.Iterator;
import java.util.TreeSet;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
public class FingerprintKeyer extends Keyer {
static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}");
public String key(String s, Object... o) {
s = s.trim(); // first off, remove whitespace around the string
s = s.toLowerCase(); // then lowercase it
s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
String[] frags = StringUtils.split(s); // split by whitespace
TreeSet<String> set = new TreeSet<String>();
for (String ss : frags) {
set.add(ss); // order fragments and dedupe
}
StringBuffer b = new StringBuffer();
Iterator<String> i = set.iterator();
while (i.hasNext()) {
b.append(i.next());
b.append(' ');
}
return b.toString(); // join ordered fragments back together
}
}

View File

@ -0,0 +1,12 @@
package com.metaweb.gridworks.clustering.binning;
/**
 * Turns a string into a key; strings that yield the same key are meant to be
 * treated as belonging together.
 */
public abstract class Keyer {

    /**
     * Computes the key for a string.
     *
     * @param string the string to key
     * @param params optional keyer-specific parameters; may be null
     * @return the key
     */
    public abstract String key(String string, Object... params);

    /**
     * Computes the key for a string without any parameters; the parameter
     * array handed to {@link #key(String, Object...)} is {@code null}.
     *
     * @param s the string to key
     * @return the key
     */
    public String key(String s) {
        final Object[] noParams = null;
        return key(s, noParams);
    }
}

View File

@ -0,0 +1,13 @@
package com.metaweb.gridworks.clustering.binning;
import org.apache.commons.codec.language.Metaphone;
public class MetaphoneKeyer extends Keyer {
private Metaphone _metaphone = new Metaphone();
public String key(String s, Object... o) {
return _metaphone.metaphone(s);
}
}

View File

@ -0,0 +1,35 @@
package com.metaweb.gridworks.clustering.binning;
import java.util.Iterator;
import java.util.TreeSet;
import java.util.regex.Pattern;
public class NGramFingerprintKeyer extends Keyer {
static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}|\\p{Space}");
public String key(String s, Object... o) {
int ngram_size = 1;
if (o != null && o.length > 0 && o[0] instanceof Number) {
ngram_size = (Integer) o[0];
}
s = s.toLowerCase(); // then lowercase it
s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
TreeSet<String> set = ngram_split(s,ngram_size);
StringBuffer b = new StringBuffer();
Iterator<String> i = set.iterator();
while (i.hasNext()) {
b.append(i.next());
}
return b.toString(); // join ordered fragments back together
}
protected TreeSet<String> ngram_split(String s, int size) {
TreeSet<String> set = new TreeSet<String>();
char[] chars = s.toCharArray();
for (int i = 0; i + size <= chars.length; i++) {
set.add(new String(chars,i,size));
}
return set;
}
}

View File

@ -0,0 +1,13 @@
package com.metaweb.gridworks.clustering.binning;
import org.apache.commons.codec.language.Soundex;
public class SoundexKeyer extends Keyer {
private Soundex _soundex = new Soundex();
public String key(String s, Object... o) {
return _soundex.soundex(s);
}
}

View File

@ -0,0 +1,122 @@
package com.metaweb.gridworks.clustering.knn;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import org.json.JSONException;
import org.json.JSONObject;
import org.json.JSONWriter;
import com.metaweb.gridworks.Gridworks;
import com.metaweb.gridworks.browsing.Engine;
import com.metaweb.gridworks.browsing.FilteredRows;
import com.metaweb.gridworks.browsing.RowVisitor;
import com.metaweb.gridworks.clustering.Clusterer;
import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.model.Row;
import edu.mit.simile.vicino.Distance;
import edu.mit.simile.vicino.distances.BZip2Distance;
import edu.mit.simile.vicino.distances.GZipDistance;
import edu.mit.simile.vicino.distances.JaccardDistance;
import edu.mit.simile.vicino.distances.JaroDistance;
import edu.mit.simile.vicino.distances.LevenshteinDistance;
import edu.mit.simile.vicino.distances.PPMDistance;
import edu.mit.simile.vicino.vptree.VPTreeBuilder;
/**
 * Clusterer that feeds the non-null values of the configured column into a
 * {@link VPTreeBuilder} and asks it for the clusters within a given radius,
 * using a configurable {@link Distance}.
 */
public class kNNClusterer extends Clusterer {

    /** Distance selected by the "function" entry of the configuration. */
    private Distance _distance;

    /** Registry of the available distances, indexed by lower-case function name. */
    static protected Map<String, Distance> _distances = new HashMap<String, Distance>();

    /** Result of the last {@link #computeClusters(Engine)} call. */
    List<List<? extends Serializable>> _clusters;

    static {
        _distances.put("levenshtein", new LevenshteinDistance());
        _distances.put("jaro", new JaroDistance());
        _distances.put("jaccard", new JaccardDistance());
        _distances.put("gzip", new GZipDistance());
        _distances.put("bzip2", new BZip2Distance());
        _distances.put("ppm", new PPMDistance());
    }

    /**
     * Row visitor that collects the values of the configured column into a
     * {@link VPTreeBuilder}.
     */
    class kNNClusteringRowVisitor implements RowVisitor {

        Distance _distance;
        JSONObject _config;
        VPTreeBuilder _treeBuilder;
        float _radius;

        /**
         * @param d the distance handed to the tree builder
         * @param o the clusterer configuration; its "radius" entry is used as
         *          the search radius, defaulting to 1.0 when it cannot be read
         */
        public kNNClusteringRowVisitor(Distance d, JSONObject o) {
            _distance = d;
            _config = o;
            _treeBuilder = new VPTreeBuilder(_distance);
            try {
                _radius = (float) o.getDouble("radius");
            } catch (JSONException e) {
                Gridworks.warn("No radius found, using default");
                _radius = 1.0f;
            }
        }

        /**
         * Adds the string form of the configured column's value to the tree builder.
         *
         * @return always false
         */
        public boolean visit(Project project, int rowIndex, Row row, boolean contextual) {
            Cell cell = row.cells.get(_colindex);
            if (cell != null && cell.value != null) {
                Object v = cell.value;
                String s = (v instanceof String) ? ((String) v) : v.toString();
                _treeBuilder.populate(s);
            }
            return false;
        }

        public Map<Serializable,List<? extends Serializable>> getClusters() {
            return _treeBuilder.getClusters(_radius);
        }
    }

    /** Orders clusters by number of members, largest first. */
    public class SizeComparator implements Comparator<List<? extends Serializable>> {
        public int compare(List<? extends Serializable> o1, List<? extends Serializable> o2) {
            // sizes are never negative, so the subtraction cannot overflow
            return o2.size() - o1.size();
        }
    }

    /**
     * Initializes the clusterer and selects the distance named by the
     * "function" entry of the configuration (matched case-insensitively).
     *
     * @throws Exception if the configuration is incomplete, the column is
     *                   unknown or the distance function is not registered
     */
    public void initializeFromJSON(Project project, JSONObject o) throws Exception {
        super.initializeFromJSON(project, o);
        String function = o.getString("function");
        _distance = _distances.get(function.toLowerCase());
        if (_distance == null) {
            // Fail here with a clear message instead of handing a null
            // distance to the tree builder.
            throw new IllegalArgumentException("Unknown distance function '" + function + "'");
        }
    }

    /**
     * Visits the rows supplied by the engine, collects their values and stores
     * the clusters returned by the tree builder, largest first.
     */
    public void computeClusters(Engine engine) {
        kNNClusteringRowVisitor visitor = new kNNClusteringRowVisitor(_distance,_config);
        FilteredRows filteredRows = engine.getAllFilteredRows(true);
        filteredRows.accept(_project, visitor);

        Map<Serializable,List<? extends Serializable>> clusters = visitor.getClusters();
        _clusters = new ArrayList<List<? extends Serializable>>(clusters.values());
        Collections.sort(_clusters, new SizeComparator());
    }

    /**
     * Writes the clusters as a JSON array of arrays of {"v": value, "c": 1}
     * objects. Clusters with a single member are omitted. Occurrence counts
     * are not tracked by this clusterer, so "c" is always 1.
     */
    public void write(JSONWriter writer, Properties options) throws JSONException {
        writer.array();
        for (List<? extends Serializable> m : _clusters) {
            if (m.size() > 1) {
                writer.array();
                for (Serializable s : m) {
                    writer.object();
                    writer.key("v"); writer.value(s);
                    writer.key("c"); writer.value(1);
                    writer.endObject();
                }
                writer.endArray();
            }
        }
        writer.endArray();
    }
}

View File

@ -0,0 +1,50 @@
package com.metaweb.gridworks.commands.info;
import java.io.IOException;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.json.JSONObject;
import com.metaweb.gridworks.Gridworks;
import com.metaweb.gridworks.browsing.Engine;
import com.metaweb.gridworks.clustering.Clusterer;
import com.metaweb.gridworks.clustering.binning.BinningClusterer;
import com.metaweb.gridworks.clustering.knn.kNNClusterer;
import com.metaweb.gridworks.commands.Command;
import com.metaweb.gridworks.model.Project;
/**
 * Command that builds a clusterer from the "clusterer" JSON request
 * parameter, runs it and responds with the clusters as JSON.
 */
public class ComputeClustersCommand extends Command {

    /**
     * Reads the clusterer configuration from the request, picks the kNN
     * clusterer when its "type" is "knn" and the binning clusterer otherwise
     * ("binning" is assumed when no type is given), then computes and returns
     * the clusters. Any exception is reported through {@code respondException}.
     */
    @Override
    public void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {

        try {
            final long start = System.currentTimeMillis();
            final Project project = getProject(request);
            final Engine engine = getEngine(request, project);
            final JSONObject clusterer_conf = getJsonParameter(request,"clusterer");

            String type = "binning";
            if (clusterer_conf.has("type")) {
                type = clusterer_conf.getString("type");
            }

            final Clusterer clusterer;
            if (!"knn".equals(type)) {
                clusterer = new BinningClusterer();
            } else {
                clusterer = new kNNClusterer();
            }

            clusterer.initializeFromJSON(project, clusterer_conf);
            clusterer.computeClusters(engine);
            respondJSON(response, clusterer);

            final String function = clusterer_conf.getString("function");
            final long elapsed = System.currentTimeMillis() - start;
            Gridworks.log("computed clusters [" + type + "," + function + "] in " + elapsed + "ms");
        } catch (Exception e) {
            respondException(response, e);
        }
    }
}

View File

@ -1,39 +1,23 @@
package com.metaweb.gridworks.expr.functions.strings;
import java.util.Iterator;
import java.util.Properties;
import java.util.TreeSet;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.json.JSONException;
import org.json.JSONWriter;
import com.metaweb.gridworks.clustering.binning.FingerprintKeyer;
import com.metaweb.gridworks.clustering.binning.Keyer;
import com.metaweb.gridworks.gel.Function;
public class Fingerprint implements Function {
static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}");
static Keyer fingerprint = new FingerprintKeyer();
public Object call(Properties bindings, Object[] args) {
if (args.length == 1 && args[0] != null) {
Object o = args[0];
String s = (o instanceof String) ? (String) o : o.toString();
s = s.trim(); // first off, remove whitespace around the string
s = s.toLowerCase(); // then lowercase it
s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
String[] frags = StringUtils.split(s); // split by whitespace
TreeSet<String> set = new TreeSet<String>();
for (String ss : frags) {
set.add(ss); // order fragments and dedupe
}
StringBuffer b = new StringBuffer();
Iterator<String> i = set.iterator();
while (i.hasNext()) {
b.append(i.next());
b.append(' ');
}
return b.toString(); // join ordered fragments back together
String s = (o instanceof String) ? (String) o : o.toString();
return fingerprint.key(s);
}
return null;
}

View File

@ -1,20 +1,20 @@
package com.metaweb.gridworks.expr.functions.strings;
import java.util.Iterator;
import java.util.Properties;
import java.util.TreeSet;
import java.util.regex.Pattern;
import org.json.JSONException;
import org.json.JSONWriter;
import com.metaweb.gridworks.clustering.binning.Keyer;
import com.metaweb.gridworks.clustering.binning.NGramFingerprintKeyer;
import com.metaweb.gridworks.expr.EvalError;
import com.metaweb.gridworks.gel.ControlFunctionRegistry;
import com.metaweb.gridworks.gel.Function;
public class NGramFingerprint implements Function {
static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}|\\p{Space}");
static Keyer ngram_fingerprint = new NGramFingerprintKeyer();
public Object call(Properties bindings, Object[] args) {
if (args.length == 1 || args.length == 2) {
@ -24,16 +24,8 @@ public class NGramFingerprint implements Function {
ngram_size = (args[1] instanceof Number) ? ((Number) args[1]).intValue() : Integer.parseInt(args[1].toString());
}
Object o = args[0];
String s = (o instanceof String) ? (String) o : o.toString();
s = s.toLowerCase(); // then lowercase it
s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
TreeSet<String> set = ngram_split(s,ngram_size);
StringBuffer b = new StringBuffer();
Iterator<String> i = set.iterator();
while (i.hasNext()) {
b.append(i.next());
}
return b.toString(); // join ordered fragments back together
String s = (o instanceof String) ? (String) o : o.toString();
return ngram_fingerprint.key(s,ngram_size);
}
return null;
}

View File

@ -2,21 +2,21 @@ package com.metaweb.gridworks.expr.functions.strings;
import java.util.Properties;
import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.commons.codec.language.Metaphone;
import org.apache.commons.codec.language.Soundex;
import org.json.JSONException;
import org.json.JSONWriter;
import com.metaweb.gridworks.clustering.binning.DoubleMetaphoneKeyer;
import com.metaweb.gridworks.clustering.binning.MetaphoneKeyer;
import com.metaweb.gridworks.clustering.binning.SoundexKeyer;
import com.metaweb.gridworks.expr.EvalError;
import com.metaweb.gridworks.gel.ControlFunctionRegistry;
import com.metaweb.gridworks.gel.Function;
public class Phonetic implements Function {
private DoubleMetaphone metaphone2 = new DoubleMetaphone();
private Metaphone metaphone = new Metaphone();
private Soundex soundex = new Soundex();
static private DoubleMetaphoneKeyer metaphone2 = new DoubleMetaphoneKeyer();
static private MetaphoneKeyer metaphone = new MetaphoneKeyer();
static private SoundexKeyer soundex = new SoundexKeyer();
public Object call(Properties bindings, Object[] args) {
if (args.length == 2) {
@ -26,11 +26,11 @@ public class Phonetic implements Function {
String str = (o1 instanceof String) ? (String) o1 : o1.toString();
String encoding = ((String) o2).toLowerCase();
if ("doublemetaphone".equals(encoding)) {
return metaphone2.doubleMetaphone(str);
return metaphone2.key(str);
} else if ("metaphone".equals(encoding)) {
return metaphone.metaphone(str);
return metaphone.key(str);
} else if ("soundex".equals(encoding)) {
return soundex.soundex(str);
return soundex.key(str);
} else {
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " doesn't know how to handle the '" + encoding + "' encoding.");
}

View File

@ -5,11 +5,11 @@ import com.wcohen.ss.api.StringDistance;
import edu.mit.simile.vicino.Distance;
public class LevensteinDistance implements Distance {
public class LevenshteinDistance implements Distance {
StringDistance distance;
public LevensteinDistance() {
public LevenshteinDistance() {
this.distance = new Levenstein();
}

View File

@ -39,4 +39,12 @@ public class Node implements Serializable {
public String toString() {
return obj.toString();
}
/**
 * Two nodes are equal when the objects they wrap are equal.
 *
 * @param n the object to compare with
 * @return true if {@code n} is a Node wrapping an equal object
 */
@Override
public boolean equals(Object n) {
    if (this == n) {
        return true;
    }
    if (!(n instanceof Node)) {
        return false;
    }
    Object other = ((Node) n).get();
    return (other == null) ? (this.obj == null) : other.equals(this.obj);
}

/**
 * Hash code consistent with {@link #equals(Object)}, so that nodes wrapping
 * equal objects collapse to one entry in hash-based collections.
 * NOTE(review): assumes get() returns obj - confirm against the rest of the class.
 *
 * @return the hash code of the wrapped object, or 0 when there is none
 */
@Override
public int hashCode() {
    return (this.obj == null) ? 0 : this.obj.hashCode();
}
}

View File

@ -2,8 +2,14 @@ package edu.mit.simile.vicino.vptree;
import java.io.Serializable;
import java.util.Collection;
import java.util.Iterator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import com.metaweb.gridworks.Gridworks;
import edu.mit.simile.vicino.Distance;
@ -17,9 +23,10 @@ public class VPTreeBuilder {
private Random generator = new Random(System.currentTimeMillis());
private VPTree tree;
private final Distance distance;
private Set<Node> nodes = new HashSet<Node>();
/**
* Defines a VPTree Builder for a specific distance.
*
@ -29,20 +36,47 @@ public class VPTreeBuilder {
this.distance = distance;
}
public VPTree buildVPTree(Collection<? extends Serializable> col) {
Node nodes[] = new Node[col.size()];
Iterator<? extends Serializable> i = col.iterator();
int counter = 0;
while (i.hasNext()) {
Serializable s = (Serializable) i.next();
nodes[counter++] = new Node(s);
}
/**
 * Wraps the given value in a {@link Node} and adds it to the set of nodes
 * from which trees are built.
 *
 * @param s the value to add
 */
public void populate(Serializable s) {
    Node wrapped = new Node(s);
    nodes.add(wrapped);
}
tree = new VPTree();
tree.setRoot(addNode(nodes, 0, nodes.length - 1));
/**
 * Builds a new tree from the nodes collected so far, logging before and
 * after the build.
 *
 * @return the newly built tree
 */
public VPTree buildVPTree() {
    int count = this.nodes.size();
    Node[] nodes_array = this.nodes.toArray(new Node[count]);
    Gridworks.log("building tree with nodes: " + nodes_array.length);

    VPTree tree = new VPTree();
    int last = nodes_array.length - 1;
    tree.setRoot(addNode(nodes_array, 0, last));

    Gridworks.log("tree built");
    return tree;
}
/**
 * Discards any previously collected nodes, collects the given values and
 * builds a tree from them.
 *
 * @param values the values to build the tree from
 * @return the newly built tree
 */
public VPTree buildVPTree(Collection<? extends Serializable> values) {
    reset();
    for (Serializable value : values) {
        populate(value);
    }
    VPTree built = buildVPTree();
    return built;
}
/**
 * Removes all nodes collected so far.
 */
public void reset() {
    nodes.clear();
}
/**
 * Builds a tree from the collected nodes and, for each node, runs a range
 * query of the given radius around its value.
 *
 * @param radius the radius passed to each range query
 * @return a map from each node's value to the results of its range query
 */
public Map<Serializable,List<? extends Serializable>> getClusters(float radius) {
    VPTreeSeeker seeker = new VPTreeSeeker(distance, buildVPTree());

    Map<Serializable,List<? extends Serializable>> resultsByValue =
        new HashMap<Serializable,List<? extends Serializable>>();

    for (Node node : nodes) {
        Serializable value = node.get();
        Gridworks.log(" find results for: " + value);
        List<? extends Serializable> neighbours = seeker.range(value, radius);
        Gridworks.log(" found: " + neighbours.size());
        resultsByValue.put(value, neighbours);
    }

    return resultsByValue;
}
private TNode addNode(Node nodes[], int begin, int end) {
int delta = end - begin;

View File

@ -2,6 +2,9 @@ function FacetBasedEditDialog(columnName, expression, entries) {
this._columnName = columnName;
this._expression = expression;
this._entries = entries;
this._method = "binning";
this._function = "fingerprint";
this._params = {};
this._createDialog();
this._cluster();
@ -10,7 +13,7 @@ function FacetBasedEditDialog(columnName, expression, entries) {
FacetBasedEditDialog.prototype._createDialog = function() {
var self = this;
var frame = DialogSystem.createDialog();
frame.width("800px");
frame.width("900px");
var header = $('<div></div>').addClass("dialog-header").text("Facet-based edit of column " + this._columnName).appendTo(frame);
var body = $('<div></div>').addClass("dialog-body").appendTo(frame);
@ -18,17 +21,78 @@ FacetBasedEditDialog.prototype._createDialog = function() {
var html = $(
'<div>' +
'<div class="facet-based-edit-dialog-controls"><table><tr>' +
'<td>' +
'Method: <select bind="methodSelector">' +
'<option selected="true">key collision</option>' +
'<option>nearest neightbor</option>' +
'</select>' +
'</td>' +
'<td>' +
'<div id="binning-controls">Keying Function: <select bind="keyingFunctionSelector">' +
'<option selected="true">fingerprint</option>' +
'<option>ngram-fingerprint</option>' +
'<option>double-metaphone</option>' +
'<option>metaphone</option>' +
'<option>soundex</option>' +
'</select></div>' +
'<div id="knn-controls" class="hidden">Distance Function: <select bind="distanceFunctionSelector">' +
'<option selected="true">levenshtein</option>' +
'<option>jaro</option>' +
'<option>jaccard</option>' +
'<option>gzip</option>' +
'<option>bzip2</option>' +
'<option>PPM</option>' +
'</select></div>' +
'</td>' +
'<td>' +
'<div id="ngram-fingerprint-params" class="function-params hidden">' +
'Ngram Size: <input type="text" value="1" bind="ngramSize">' +
'</div>' +
'</td>' +
'</tr></table></div>' +
'<div bind="tableContainer" class="facet-based-edit-dialog-table-container"></div>' +
'<div class="facet-based-edit-dialog-controls">' +
'<button bind="clusterButton">Cluster</button> ' +
'<button bind="unclusterButton">Un-cluster</button> ' +
'</div>' +
'</div>'
).appendTo(body);
this._elmts = DOM.bind(html);
this._elmts.clusterButton.click(function() { self._cluster(); });
this._elmts.unclusterButton.click(function() { self._uncluster(); });
// Switch the visible controls and the clustering method when the user picks
// a different method, then re-trigger the matching function selector so the
// clusters are recomputed.
this._elmts.methodSelector.change(function() {
    var selection = $(this).find("option:selected").text();
    if (selection == 'key collision') {
        body.find("#binning-controls").show();
        body.find("#knn-controls").hide();
        self._method = "binning";
        self._elmts.keyingFunctionSelector.change();
    } else if (selection == 'nearest neightbor') {
        // Compare (==), do not assign (=). The literal must match the option
        // text in the dialog markup character for character.
        body.find("#binning-controls").hide();
        body.find("#knn-controls").show();
        self._method = "knn";
        self._elmts.distanceFunctionSelector.change();
    }
});
// Shared handler for both function selectors: remember the chosen function,
// show only the parameter panel whose id is "<function>-params", and
// recompute the clusters.
var changer = function() {
    var chosen = $(this).find("option:selected").text();
    self._function = chosen;
    $(".function-params").hide();
    $("#" + chosen + "-params").show();
    self._cluster();
};
this._elmts.keyingFunctionSelector.change(changer);
this._elmts.distanceFunctionSelector.change(changer);
// Re-cluster when the n-gram size changes. parseInt never throws: it returns
// NaN for unparsable input, so the value is validated explicitly instead of
// relying on try/catch.
this._elmts.ngramSize.change(function() {
    var size = parseInt($(this).val(), 10);
    if (isNaN(size) || size < 1) {
        alert("ngram size must be a positive integer");
        return;
    }
    self._params = { "ngram-size" : size };
    self._cluster();
});
//this._elmts.clusterButton.click(function() { self._cluster(); });
//this._elmts.unclusterButton.click(function() { self._uncluster(); });
$('<button></button>').text("OK").click(function() { self._onOK(); }).appendTo(footer);
$('<button></button>').text("Cancel").click(function() { self._dismiss(); }).appendTo(footer);
@ -41,9 +105,9 @@ FacetBasedEditDialog.prototype._createDialog = function() {
FacetBasedEditDialog.prototype._renderTable = function() {
var self = this;
var container = this._elmts.tableContainer.empty();
var container = this._elmts.tableContainer;
var table = $('<table></table>').addClass("facet-based-edit-dialog-entry-table").appendTo(container)[0];
var table = $('<table></table>').addClass("facet-based-edit-dialog-entry-table")[0];
var trHead = table.insertRow(table.rows.length);
trHead.className = "header";
@ -60,7 +124,7 @@ FacetBasedEditDialog.prototype._renderTable = function() {
for (var c = 0; c < choices.length; c++) {
var choice = choices[c];
var li = $('<li>').appendTo(ul);
$('<span>').text(choice.v.l).appendTo(li);
$('<span>').text(choice.v).appendTo(li);
$('<span>').text(" (" + choice.c + ")").appendTo(li);
}
@ -73,7 +137,7 @@ FacetBasedEditDialog.prototype._renderTable = function() {
editCheck.attr("checked", "true");
}
var input = $('<input size="35" />')
var input = $('<input size="55" />')
.attr("value", cluster.value)
.appendTo(tr.insertCell(2))
.keyup(function() {
@ -83,65 +147,43 @@ FacetBasedEditDialog.prototype._renderTable = function() {
for (var i = 0; i < this._clusters.length; i++) {
renderCluster(this._clusters[i]);
}
container.empty().append(table);
};
FacetBasedEditDialog.prototype._cluster = function() {
var clusters = [];
var map = {};
$.each(this._entries, function() {
var choice = {
v: this.v,
c: this.c
};
var s = this.v.l.toLowerCase().replace(/\W/g, ' ').replace(/\s+/g, ' ').split(" ").sort().join(" ");
if (s in map) {
map[s].choices.push(choice);
} else {
map[s] = {
edit: false,
choices: [ choice ]
};
clusters.push(map[s]);
}
});
var self = this;
$.each(clusters, function() {
if (this.choices.length > 1) {
this.choices.sort(function(a, b) {
var c = b.c - a.c;
return c != 0 ? c : a.v.l.localeCompare(b.v.l);
});
this.edit = true;
}
this.value = this.choices[0].v.l;
});
clusters.sort(function(a, b) {
var c = b.choices.length - a.choices.length;
return c != 0 ? c : a.value.localeCompare(b.value);
});
this._clusters = clusters;
this._renderTable();
};
var container = this._elmts.tableContainer.html(
'<div style="margin: 1em; font-size: 130%; color: #888;">Loading... <img src="/images/small-spinner.gif"></div>'
);
FacetBasedEditDialog.prototype._uncluster = function() {
var clusters = [];
$.each(this._entries, function() {
var cluster = {
edit: false,
choices: [{
v: this.v,
c: this.c
}],
value: this.v.l
};
clusters.push(cluster);
});
this._clusters = clusters;
this._renderTable();
};
$.post(
"/command/compute-clusters?" + $.param({ project: theProject.id }),
{
engine: JSON.stringify(ui.browsingEngine.getJSON()),
clusterer: JSON.stringify({
'type' : this._method,
'function' : this._function,
'column' : this._columnName,
'params' : this._params
})
},
function(data) {
var clusters = [];
$.each(data, function() {
clusters.push({
edit: true,
choices: this,
value: this[0].v
});
});
self._clusters = clusters;
self._renderTable();
},
"json"
);
}
FacetBasedEditDialog.prototype._onOK = function() {
var edits = [];
@ -150,7 +192,7 @@ FacetBasedEditDialog.prototype._onOK = function() {
if (cluster.edit) {
var values = [];
for (var j = 0; j < cluster.choices.length; j++) {
values.push(cluster.choices[j].v.v);
values.push(cluster.choices[j].v);
}
edits.push({

View File

@ -74,4 +74,8 @@ img {
.fbs-pane, .fbs-flyout-pane {
z-index: 2000;
}
/* Utility class: elements carrying it are not rendered. */
.hidden {
display: none;
}

View File

@ -14,7 +14,7 @@ table.facet-based-edit-dialog-main-layout > tbody > tr:last-child > td {
}
.facet-based-edit-dialog-table-container {
height: 450px;
height: 500px;
overflow: auto;
border: 1px solid #aaa;
}
@ -42,5 +42,14 @@ table.facet-based-edit-dialog-entry-table > tbody > tr.even > td {
}
table.facet-based-edit-dialog-entry-table input {
border: none;
border: 1px solid #ccc;
padding: 0 0.1em;
}
/* Adds space below each controls block of the facet-based edit dialog. */
.facet-based-edit-dialog-controls {
margin-bottom: 0.5em;
}
/* Adds space to the right of each table cell inside a controls block. */
.facet-based-edit-dialog-controls td {
padding-right: 0.5em;
}