much improved facet clustering dialog and functionality
NOTE: kNN clustering code operational but is not working as expected git-svn-id: http://google-refine.googlecode.com/svn/trunk@219 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
db824bffeb
commit
976c1da5c7
@ -70,6 +70,14 @@ public class Gridworks extends Server {
|
||||
logger.info(message);
|
||||
}
|
||||
|
||||
/**
 * Forwards an error message and its associated throwable to the shared logger.
 *
 * @param message text handed to {@code logger.error}
 * @param t throwable handed to {@code logger.error} together with the message
 */
public static void error(String message, Throwable t) {
|
||||
logger.error(message, t);
|
||||
}
|
||||
|
||||
/**
 * Forwards a warning message to the shared logger.
 *
 * @param message text handed to {@code logger.warn}
 */
public static void warn(String message) {
|
||||
logger.warn(message);
|
||||
}
|
||||
|
||||
/* -------------- Gridworks HTTP server ----------------- */
|
||||
|
||||
private ThreadPoolExecutor threadPool;
|
||||
|
@ -15,15 +15,16 @@ import com.metaweb.gridworks.commands.edit.AnnotateOneRowCommand;
|
||||
import com.metaweb.gridworks.commands.edit.AnnotateRowsCommand;
|
||||
import com.metaweb.gridworks.commands.edit.ApplyOperationsCommand;
|
||||
import com.metaweb.gridworks.commands.edit.CreateProjectCommand;
|
||||
import com.metaweb.gridworks.commands.edit.DeleteProjectCommand;
|
||||
import com.metaweb.gridworks.commands.edit.DoTextTransformCommand;
|
||||
import com.metaweb.gridworks.commands.edit.EditOneCellCommand;
|
||||
import com.metaweb.gridworks.commands.edit.FacetBasedEditCommand;
|
||||
import com.metaweb.gridworks.commands.edit.JoinMultiValueCellsCommand;
|
||||
import com.metaweb.gridworks.commands.edit.RemoveColumnCommand;
|
||||
import com.metaweb.gridworks.commands.edit.DeleteProjectCommand;
|
||||
import com.metaweb.gridworks.commands.edit.SaveProtographCommand;
|
||||
import com.metaweb.gridworks.commands.edit.SplitMultiValueCellsCommand;
|
||||
import com.metaweb.gridworks.commands.edit.UndoRedoCommand;
|
||||
import com.metaweb.gridworks.commands.info.ComputeClustersCommand;
|
||||
import com.metaweb.gridworks.commands.info.ComputeFacetsCommand;
|
||||
import com.metaweb.gridworks.commands.info.ExportRowsCommand;
|
||||
import com.metaweb.gridworks.commands.info.GetAllProjectMetadataCommand;
|
||||
@ -74,6 +75,7 @@ public class GridworksServlet extends HttpServlet {
|
||||
_commands.put("cancel-processes", new CancelProcessesCommand());
|
||||
|
||||
_commands.put("compute-facets", new ComputeFacetsCommand());
|
||||
_commands.put("compute-clusters", new ComputeClustersCommand());
|
||||
_commands.put("do-text-transform", new DoTextTransformCommand());
|
||||
_commands.put("facet-based-edit", new FacetBasedEditCommand());
|
||||
_commands.put("edit-one-cell", new EditOneCellCommand());
|
||||
|
@ -0,0 +1,29 @@
|
||||
package com.metaweb.gridworks.clustering;
|
||||
|
||||
import org.json.JSONObject;
|
||||
|
||||
import com.metaweb.gridworks.Jsonizable;
|
||||
import com.metaweb.gridworks.browsing.Engine;
|
||||
import com.metaweb.gridworks.model.Column;
|
||||
import com.metaweb.gridworks.model.Project;
|
||||
|
||||
public abstract class Clusterer implements Jsonizable {
|
||||
|
||||
protected Project _project;
|
||||
protected int _colindex;
|
||||
protected JSONObject _config;
|
||||
|
||||
public abstract void computeClusters(Engine engine);
|
||||
|
||||
public void initializeFromJSON(Project project, JSONObject o) throws Exception {
|
||||
_project = project;
|
||||
_config = o;
|
||||
|
||||
String colname = o.getString("column");
|
||||
for (Column column : project.columnModel.columns) {
|
||||
if (column.getHeaderLabel().equals(colname)) {
|
||||
_colindex = column.getCellIndex();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,151 @@
|
||||
package com.metaweb.gridworks.clustering.binning;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
import java.util.TreeMap;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
import org.json.JSONException;
|
||||
import org.json.JSONObject;
|
||||
import org.json.JSONWriter;
|
||||
|
||||
import com.metaweb.gridworks.browsing.Engine;
|
||||
import com.metaweb.gridworks.browsing.FilteredRows;
|
||||
import com.metaweb.gridworks.browsing.RowVisitor;
|
||||
import com.metaweb.gridworks.clustering.Clusterer;
|
||||
import com.metaweb.gridworks.model.Cell;
|
||||
import com.metaweb.gridworks.model.Project;
|
||||
import com.metaweb.gridworks.model.Row;
|
||||
|
||||
public class BinningClusterer extends Clusterer {
|
||||
|
||||
private Keyer _keyer;
|
||||
|
||||
static protected Map<String, Keyer> _keyers = new HashMap<String, Keyer>();
|
||||
|
||||
List<Map<Object,Integer>> _clusters;
|
||||
|
||||
static {
|
||||
_keyers.put("fingerprint", new FingerprintKeyer());
|
||||
_keyers.put("ngram-fingerprint", new NGramFingerprintKeyer());
|
||||
_keyers.put("metaphone", new MetaphoneKeyer());
|
||||
_keyers.put("double-metaphone", new DoubleMetaphoneKeyer());
|
||||
_keyers.put("soundex", new SoundexKeyer());
|
||||
}
|
||||
|
||||
class BinningRowVisitor implements RowVisitor {
|
||||
|
||||
Keyer _keyer;
|
||||
Object[] _params;
|
||||
JSONObject _config;
|
||||
|
||||
Map<String,Map<Object,Integer>> _map = new HashMap<String,Map<Object,Integer>>();
|
||||
|
||||
public BinningRowVisitor(Keyer k, JSONObject o) {
|
||||
_keyer = k;
|
||||
_config = o;
|
||||
if (k instanceof NGramFingerprintKeyer) {
|
||||
try {
|
||||
int size = _config.getJSONObject("params").getInt("ngram-size");
|
||||
_params = new Object[1];
|
||||
_params[0] = size;
|
||||
} catch (JSONException e) {
|
||||
//Gridworks.warn("no ngram size specified, using default");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public boolean visit(Project project, int rowIndex, Row row, boolean contextual) {
|
||||
Cell cell = row.cells.get(_colindex);
|
||||
if (cell != null && cell.value != null) {
|
||||
Object v = cell.value;
|
||||
String s = (v instanceof String) ? ((String) v) : v.toString();
|
||||
String key = _keyer.key(s,_params);
|
||||
if (_map.containsKey(key)) {
|
||||
Map<Object,Integer> m = _map.get(key);
|
||||
if (m.containsKey(v)) {
|
||||
m.put(v, m.get(v) + 1);
|
||||
} else {
|
||||
m.put(v,1);
|
||||
}
|
||||
} else {
|
||||
Map<Object,Integer> m = new TreeMap<Object,Integer>();
|
||||
m.put(v,0);
|
||||
_map.put(key, m);
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public Map<String,Map<Object,Integer>> getMap() {
|
||||
return _map;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Orders clusters from largest to smallest. The primary criterion is the
 * number of distinct values in the cluster; clusters with the same number of
 * distinct values are ordered by the sum of their occurrence counts.
 */
public class SizeComparator implements Comparator<Map<Object,Integer>> {

    /**
     * @return a negative number when {@code o1} should come before {@code o2},
     *         a positive number when it should come after, 0 on a full tie
     */
    public int compare(Map<Object,Integer> o1, Map<Object,Integer> o2) {
        int s1 = o1.size();
        int s2 = o2.size();
        // Compare the sizes, not the map references: with "o1 == o2" the
        // tie-break below only ever ran when a map was compared with itself.
        if (s1 == s2) {
            return total(o2) - total(o1);
        } else {
            // Sizes are non-negative, so the subtraction cannot overflow.
            return s2 - s1;
        }
    }

    /** Sums the occurrence counts of all values in one cluster. */
    private int total(Map<Object,Integer> cluster) {
        int sum = 0;
        for (int count : cluster.values()) {
            sum += count;
        }
        return sum;
    }
}
|
||||
|
||||
/**
 * Orders the (value, count) entries of a cluster by descending count.
 */
public class EntriesComparator implements Comparator<Entry<Object,Integer>> {

    /** @return the count of {@code o2} minus the count of {@code o1} */
    public int compare(Entry<Object,Integer> o1, Entry<Object,Integer> o2) {
        int firstCount = o1.getValue();
        int secondCount = o2.getValue();
        return secondCount - firstCount;
    }
}
|
||||
|
||||
public void initializeFromJSON(Project project, JSONObject o) throws Exception {
|
||||
super.initializeFromJSON(project, o);
|
||||
_keyer = _keyers.get(o.getString("function").toLowerCase());
|
||||
}
|
||||
|
||||
public void computeClusters(Engine engine) {
|
||||
BinningRowVisitor visitor = new BinningRowVisitor(_keyer,_config);
|
||||
FilteredRows filteredRows = engine.getAllFilteredRows(true);
|
||||
filteredRows.accept(_project, visitor);
|
||||
|
||||
Map<String,Map<Object,Integer>> map = visitor.getMap();
|
||||
_clusters = new ArrayList<Map<Object,Integer>>(map.values());
|
||||
Collections.sort(_clusters, new SizeComparator());
|
||||
}
|
||||
|
||||
public void write(JSONWriter writer, Properties options) throws JSONException {
|
||||
EntriesComparator c = new EntriesComparator();
|
||||
|
||||
writer.array();
|
||||
for (Map<Object,Integer> m : _clusters) {
|
||||
if (m.size() > 1) {
|
||||
writer.array();
|
||||
List<Entry<Object,Integer>> entries = new ArrayList<Entry<Object,Integer>>(m.entrySet());
|
||||
Collections.sort(entries,c);
|
||||
for (Entry<Object,Integer> e : entries) {
|
||||
writer.object();
|
||||
writer.key("v"); writer.value(e.getKey());
|
||||
writer.key("c"); writer.value(e.getValue());
|
||||
writer.endObject();
|
||||
}
|
||||
writer.endArray();
|
||||
}
|
||||
}
|
||||
writer.endArray();
|
||||
}
|
||||
}
|
@ -0,0 +1,13 @@
|
||||
package com.metaweb.gridworks.clustering.binning;
|
||||
|
||||
import org.apache.commons.codec.language.DoubleMetaphone;
|
||||
|
||||
public class DoubleMetaphoneKeyer extends Keyer {
|
||||
|
||||
private DoubleMetaphone _metaphone2 = new DoubleMetaphone();
|
||||
|
||||
public String key(String s, Object... o) {
|
||||
return _metaphone2.doubleMetaphone(s);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,31 @@
|
||||
package com.metaweb.gridworks.clustering.binning;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.TreeSet;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
public class FingerprintKeyer extends Keyer {
|
||||
|
||||
static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}");
|
||||
|
||||
public String key(String s, Object... o) {
|
||||
s = s.trim(); // first off, remove whitespace around the string
|
||||
s = s.toLowerCase(); // then lowercase it
|
||||
s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
|
||||
String[] frags = StringUtils.split(s); // split by whitespace
|
||||
TreeSet<String> set = new TreeSet<String>();
|
||||
for (String ss : frags) {
|
||||
set.add(ss); // order fragments and dedupe
|
||||
}
|
||||
StringBuffer b = new StringBuffer();
|
||||
Iterator<String> i = set.iterator();
|
||||
while (i.hasNext()) {
|
||||
b.append(i.next());
|
||||
b.append(' ');
|
||||
}
|
||||
return b.toString(); // join ordered fragments back together
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,12 @@
|
||||
package com.metaweb.gridworks.clustering.binning;
|
||||
|
||||
|
||||
/**
 * A keying function: maps a string to the key under which it is binned.
 */
public abstract class Keyer {

    /**
     * Keys a string without extra parameters; delegates to
     * {@link #key(String, Object...)} with a null parameter array.
     *
     * @param s string to key
     * @return the key computed by the concrete keyer
     */
    public String key(String s) {
        Object[] noParams = null;
        return key(s, noParams);
    }

    /**
     * Keys a string, optionally tuned by keyer-specific parameters.
     *
     * @param string string to key
     * @param params keyer-specific parameters; may be null
     * @return the key for the string
     */
    public abstract String key(String string, Object... params);
}
|
@ -0,0 +1,13 @@
|
||||
package com.metaweb.gridworks.clustering.binning;
|
||||
|
||||
import org.apache.commons.codec.language.Metaphone;
|
||||
|
||||
public class MetaphoneKeyer extends Keyer {
|
||||
|
||||
private Metaphone _metaphone = new Metaphone();
|
||||
|
||||
public String key(String s, Object... o) {
|
||||
return _metaphone.metaphone(s);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,35 @@
|
||||
package com.metaweb.gridworks.clustering.binning;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.TreeSet;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class NGramFingerprintKeyer extends Keyer {
|
||||
|
||||
static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}|\\p{Space}");
|
||||
|
||||
public String key(String s, Object... o) {
|
||||
int ngram_size = 1;
|
||||
if (o != null && o.length > 0 && o[0] instanceof Number) {
|
||||
ngram_size = (Integer) o[0];
|
||||
}
|
||||
s = s.toLowerCase(); // then lowercase it
|
||||
s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
|
||||
TreeSet<String> set = ngram_split(s,ngram_size);
|
||||
StringBuffer b = new StringBuffer();
|
||||
Iterator<String> i = set.iterator();
|
||||
while (i.hasNext()) {
|
||||
b.append(i.next());
|
||||
}
|
||||
return b.toString(); // join ordered fragments back together
|
||||
}
|
||||
|
||||
protected TreeSet<String> ngram_split(String s, int size) {
|
||||
TreeSet<String> set = new TreeSet<String>();
|
||||
char[] chars = s.toCharArray();
|
||||
for (int i = 0; i + size <= chars.length; i++) {
|
||||
set.add(new String(chars,i,size));
|
||||
}
|
||||
return set;
|
||||
}
|
||||
}
|
@ -0,0 +1,13 @@
|
||||
package com.metaweb.gridworks.clustering.binning;
|
||||
|
||||
import org.apache.commons.codec.language.Soundex;
|
||||
|
||||
public class SoundexKeyer extends Keyer {
|
||||
|
||||
private Soundex _soundex = new Soundex();
|
||||
|
||||
public String key(String s, Object... o) {
|
||||
return _soundex.soundex(s);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,122 @@
|
||||
package com.metaweb.gridworks.clustering.knn;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
|
||||
import org.json.JSONException;
|
||||
import org.json.JSONObject;
|
||||
import org.json.JSONWriter;
|
||||
|
||||
import com.metaweb.gridworks.Gridworks;
|
||||
import com.metaweb.gridworks.browsing.Engine;
|
||||
import com.metaweb.gridworks.browsing.FilteredRows;
|
||||
import com.metaweb.gridworks.browsing.RowVisitor;
|
||||
import com.metaweb.gridworks.clustering.Clusterer;
|
||||
import com.metaweb.gridworks.model.Cell;
|
||||
import com.metaweb.gridworks.model.Project;
|
||||
import com.metaweb.gridworks.model.Row;
|
||||
|
||||
import edu.mit.simile.vicino.Distance;
|
||||
import edu.mit.simile.vicino.distances.BZip2Distance;
|
||||
import edu.mit.simile.vicino.distances.GZipDistance;
|
||||
import edu.mit.simile.vicino.distances.JaccardDistance;
|
||||
import edu.mit.simile.vicino.distances.JaroDistance;
|
||||
import edu.mit.simile.vicino.distances.LevenshteinDistance;
|
||||
import edu.mit.simile.vicino.distances.PPMDistance;
|
||||
import edu.mit.simile.vicino.vptree.VPTreeBuilder;
|
||||
|
||||
public class kNNClusterer extends Clusterer {
|
||||
|
||||
private Distance _distance;
|
||||
|
||||
static protected Map<String, Distance> _distances = new HashMap<String, Distance>();
|
||||
|
||||
List<List<? extends Serializable>> _clusters;
|
||||
|
||||
static {
|
||||
_distances.put("levenshtein", new LevenshteinDistance());
|
||||
_distances.put("jaro", new JaroDistance());
|
||||
_distances.put("jaccard", new JaccardDistance());
|
||||
_distances.put("gzip", new GZipDistance());
|
||||
_distances.put("bzip2", new BZip2Distance());
|
||||
_distances.put("ppm", new PPMDistance());
|
||||
}
|
||||
|
||||
class kNNClusteringRowVisitor implements RowVisitor {
|
||||
|
||||
Distance _distance;
|
||||
JSONObject _config;
|
||||
VPTreeBuilder _treeBuilder;
|
||||
float _radius;
|
||||
|
||||
public kNNClusteringRowVisitor(Distance d, JSONObject o) {
|
||||
_distance = d;
|
||||
_config = o;
|
||||
_treeBuilder = new VPTreeBuilder(_distance);
|
||||
try {
|
||||
_radius = (float) o.getDouble("radius");
|
||||
} catch (JSONException e) {
|
||||
Gridworks.warn("No radius found, using default");
|
||||
_radius = 1.0f;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean visit(Project project, int rowIndex, Row row, boolean contextual) {
|
||||
Cell cell = row.cells.get(_colindex);
|
||||
if (cell != null && cell.value != null) {
|
||||
Object v = cell.value;
|
||||
String s = (v instanceof String) ? ((String) v) : v.toString();
|
||||
_treeBuilder.populate(s);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public Map<Serializable,List<? extends Serializable>> getClusters() {
|
||||
return _treeBuilder.getClusters(_radius);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Orders clusters by descending number of members.
 */
public class SizeComparator implements Comparator<List<? extends Serializable>> {

    /** @return the size of {@code o2} minus the size of {@code o1} */
    public int compare(List<? extends Serializable> o1, List<? extends Serializable> o2) {
        int firstSize = o1.size();
        int secondSize = o2.size();
        return secondSize - firstSize;
    }
}
|
||||
|
||||
public void initializeFromJSON(Project project, JSONObject o) throws Exception {
|
||||
super.initializeFromJSON(project, o);
|
||||
_distance = _distances.get(o.getString("function").toLowerCase());
|
||||
}
|
||||
|
||||
public void computeClusters(Engine engine) {
|
||||
kNNClusteringRowVisitor visitor = new kNNClusteringRowVisitor(_distance,_config);
|
||||
FilteredRows filteredRows = engine.getAllFilteredRows(true);
|
||||
filteredRows.accept(_project, visitor);
|
||||
|
||||
Map<Serializable,List<? extends Serializable>> clusters = visitor.getClusters();
|
||||
_clusters = new ArrayList<List<? extends Serializable>>(clusters.values());
|
||||
Collections.sort(_clusters, new SizeComparator());
|
||||
}
|
||||
|
||||
public void write(JSONWriter writer, Properties options) throws JSONException {
|
||||
writer.array();
|
||||
for (List<? extends Serializable> m : _clusters) {
|
||||
if (m.size() > 1) {
|
||||
writer.array();
|
||||
for (Serializable s : m) {
|
||||
writer.object();
|
||||
writer.key("v"); writer.value(s);
|
||||
writer.key("c"); writer.value(1);
|
||||
writer.endObject();
|
||||
}
|
||||
writer.endArray();
|
||||
}
|
||||
}
|
||||
writer.endArray();
|
||||
}
|
||||
}
|
@ -0,0 +1,50 @@
|
||||
package com.metaweb.gridworks.commands.info;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import javax.servlet.ServletException;
|
||||
import javax.servlet.http.HttpServletRequest;
|
||||
import javax.servlet.http.HttpServletResponse;
|
||||
|
||||
import org.json.JSONObject;
|
||||
|
||||
import com.metaweb.gridworks.Gridworks;
|
||||
import com.metaweb.gridworks.browsing.Engine;
|
||||
import com.metaweb.gridworks.clustering.Clusterer;
|
||||
import com.metaweb.gridworks.clustering.binning.BinningClusterer;
|
||||
import com.metaweb.gridworks.clustering.knn.kNNClusterer;
|
||||
import com.metaweb.gridworks.commands.Command;
|
||||
import com.metaweb.gridworks.model.Project;
|
||||
|
||||
public class ComputeClustersCommand extends Command {
|
||||
|
||||
@Override
|
||||
public void doPost(HttpServletRequest request, HttpServletResponse response)
|
||||
throws ServletException, IOException {
|
||||
|
||||
try {
|
||||
long start = System.currentTimeMillis();
|
||||
Project project = getProject(request);
|
||||
Engine engine = getEngine(request, project);
|
||||
JSONObject clusterer_conf = getJsonParameter(request,"clusterer");
|
||||
|
||||
Clusterer clusterer = null;
|
||||
String type = clusterer_conf.has("type") ? clusterer_conf.getString("type") : "binning";
|
||||
|
||||
if ("knn".equals(type)) {
|
||||
clusterer = new kNNClusterer();
|
||||
} else {
|
||||
clusterer = new BinningClusterer();
|
||||
}
|
||||
|
||||
clusterer.initializeFromJSON(project, clusterer_conf);
|
||||
|
||||
clusterer.computeClusters(engine);
|
||||
|
||||
respondJSON(response, clusterer);
|
||||
Gridworks.log("computed clusters [" + type + "," + clusterer_conf.getString("function") + "] in " + (System.currentTimeMillis() - start) + "ms");
|
||||
} catch (Exception e) {
|
||||
respondException(response, e);
|
||||
}
|
||||
}
|
||||
}
|
@ -1,39 +1,23 @@
|
||||
package com.metaweb.gridworks.expr.functions.strings;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.Properties;
|
||||
import java.util.TreeSet;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.json.JSONException;
|
||||
import org.json.JSONWriter;
|
||||
|
||||
import com.metaweb.gridworks.clustering.binning.FingerprintKeyer;
|
||||
import com.metaweb.gridworks.clustering.binning.Keyer;
|
||||
import com.metaweb.gridworks.gel.Function;
|
||||
|
||||
public class Fingerprint implements Function {
|
||||
|
||||
static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}");
|
||||
static Keyer fingerprint = new FingerprintKeyer();
|
||||
|
||||
public Object call(Properties bindings, Object[] args) {
|
||||
if (args.length == 1 && args[0] != null) {
|
||||
Object o = args[0];
|
||||
String s = (o instanceof String) ? (String) o : o.toString();
|
||||
s = s.trim(); // first off, remove whitespace around the string
|
||||
s = s.toLowerCase(); // then lowercase it
|
||||
s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
|
||||
String[] frags = StringUtils.split(s); // split by whitespace
|
||||
TreeSet<String> set = new TreeSet<String>();
|
||||
for (String ss : frags) {
|
||||
set.add(ss); // order fragments and dedupe
|
||||
}
|
||||
StringBuffer b = new StringBuffer();
|
||||
Iterator<String> i = set.iterator();
|
||||
while (i.hasNext()) {
|
||||
b.append(i.next());
|
||||
b.append(' ');
|
||||
}
|
||||
return b.toString(); // join ordered fragments back together
|
||||
return fingerprint.key(s);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
@ -1,20 +1,20 @@
|
||||
package com.metaweb.gridworks.expr.functions.strings;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.Properties;
|
||||
import java.util.TreeSet;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.json.JSONException;
|
||||
import org.json.JSONWriter;
|
||||
|
||||
import com.metaweb.gridworks.clustering.binning.Keyer;
|
||||
import com.metaweb.gridworks.clustering.binning.NGramFingerprintKeyer;
|
||||
import com.metaweb.gridworks.expr.EvalError;
|
||||
import com.metaweb.gridworks.gel.ControlFunctionRegistry;
|
||||
import com.metaweb.gridworks.gel.Function;
|
||||
|
||||
public class NGramFingerprint implements Function {
|
||||
|
||||
static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}|\\p{Space}");
|
||||
static Keyer ngram_fingerprint = new NGramFingerprintKeyer();
|
||||
|
||||
public Object call(Properties bindings, Object[] args) {
|
||||
if (args.length == 1 || args.length == 2) {
|
||||
@ -25,15 +25,7 @@ public class NGramFingerprint implements Function {
|
||||
}
|
||||
Object o = args[0];
|
||||
String s = (o instanceof String) ? (String) o : o.toString();
|
||||
s = s.toLowerCase(); // then lowercase it
|
||||
s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
|
||||
TreeSet<String> set = ngram_split(s,ngram_size);
|
||||
StringBuffer b = new StringBuffer();
|
||||
Iterator<String> i = set.iterator();
|
||||
while (i.hasNext()) {
|
||||
b.append(i.next());
|
||||
}
|
||||
return b.toString(); // join ordered fragments back together
|
||||
return ngram_fingerprint.key(s,ngram_size);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
@ -2,21 +2,21 @@ package com.metaweb.gridworks.expr.functions.strings;
|
||||
|
||||
import java.util.Properties;
|
||||
|
||||
import org.apache.commons.codec.language.DoubleMetaphone;
|
||||
import org.apache.commons.codec.language.Metaphone;
|
||||
import org.apache.commons.codec.language.Soundex;
|
||||
import org.json.JSONException;
|
||||
import org.json.JSONWriter;
|
||||
|
||||
import com.metaweb.gridworks.clustering.binning.DoubleMetaphoneKeyer;
|
||||
import com.metaweb.gridworks.clustering.binning.MetaphoneKeyer;
|
||||
import com.metaweb.gridworks.clustering.binning.SoundexKeyer;
|
||||
import com.metaweb.gridworks.expr.EvalError;
|
||||
import com.metaweb.gridworks.gel.ControlFunctionRegistry;
|
||||
import com.metaweb.gridworks.gel.Function;
|
||||
|
||||
public class Phonetic implements Function {
|
||||
|
||||
private DoubleMetaphone metaphone2 = new DoubleMetaphone();
|
||||
private Metaphone metaphone = new Metaphone();
|
||||
private Soundex soundex = new Soundex();
|
||||
static private DoubleMetaphoneKeyer metaphone2 = new DoubleMetaphoneKeyer();
|
||||
static private MetaphoneKeyer metaphone = new MetaphoneKeyer();
|
||||
static private SoundexKeyer soundex = new SoundexKeyer();
|
||||
|
||||
public Object call(Properties bindings, Object[] args) {
|
||||
if (args.length == 2) {
|
||||
@ -26,11 +26,11 @@ public class Phonetic implements Function {
|
||||
String str = (o1 instanceof String) ? (String) o1 : o1.toString();
|
||||
String encoding = ((String) o2).toLowerCase();
|
||||
if ("doublemetaphone".equals(encoding)) {
|
||||
return metaphone2.doubleMetaphone(str);
|
||||
return metaphone2.key(str);
|
||||
} else if ("metaphone".equals(encoding)) {
|
||||
return metaphone.metaphone(str);
|
||||
return metaphone.key(str);
|
||||
} else if ("soundex".equals(encoding)) {
|
||||
return soundex.soundex(str);
|
||||
return soundex.key(str);
|
||||
} else {
|
||||
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " doesn't know how to handle the '" + encoding + "' encoding.");
|
||||
}
|
||||
|
@ -5,11 +5,11 @@ import com.wcohen.ss.api.StringDistance;
|
||||
|
||||
import edu.mit.simile.vicino.Distance;
|
||||
|
||||
public class LevensteinDistance implements Distance {
|
||||
public class LevenshteinDistance implements Distance {
|
||||
|
||||
StringDistance distance;
|
||||
|
||||
public LevensteinDistance() {
|
||||
public LevenshteinDistance() {
|
||||
this.distance = new Levenstein();
|
||||
}
|
||||
|
@ -39,4 +39,12 @@ public class Node implements Serializable {
|
||||
public String toString() {
|
||||
return obj.toString();
|
||||
}
|
||||
|
||||
/**
 * Compares this node with another object. Another Node is equal when the
 * value returned by its get() equals this node's obj; any other argument
 * (including null) is unequal.
 *
 * NOTE(review): VPTreeBuilder stores nodes in a HashSet, which needs a
 * hashCode() consistent with this method - confirm Node overrides hashCode()
 * (the rest of the class is not visible here).
 */
public boolean equals(Object n) {
|
||||
if (n instanceof Node) {
|
||||
return ((Node) n).get().equals(this.obj);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2,8 +2,14 @@ package edu.mit.simile.vicino.vptree;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
import java.util.Set;
|
||||
|
||||
import com.metaweb.gridworks.Gridworks;
|
||||
|
||||
import edu.mit.simile.vicino.Distance;
|
||||
|
||||
@ -17,9 +23,10 @@ public class VPTreeBuilder {
|
||||
|
||||
private Random generator = new Random(System.currentTimeMillis());
|
||||
|
||||
private VPTree tree;
|
||||
private final Distance distance;
|
||||
|
||||
private Set<Node> nodes = new HashSet<Node>();
|
||||
|
||||
/**
|
||||
* Defines a VPTree Builder for a specific distance.
|
||||
*
|
||||
@ -29,20 +36,47 @@ public class VPTreeBuilder {
|
||||
this.distance = distance;
|
||||
}
|
||||
|
||||
public VPTree buildVPTree(Collection<? extends Serializable> col) {
|
||||
Node nodes[] = new Node[col.size()];
|
||||
Iterator<? extends Serializable> i = col.iterator();
|
||||
int counter = 0;
|
||||
while (i.hasNext()) {
|
||||
Serializable s = (Serializable) i.next();
|
||||
nodes[counter++] = new Node(s);
|
||||
public void populate(Serializable s) {
|
||||
nodes.add(new Node(s));
|
||||
}
|
||||
|
||||
tree = new VPTree();
|
||||
tree.setRoot(addNode(nodes, 0, nodes.length - 1));
|
||||
public VPTree buildVPTree() {
|
||||
Node[] nodes_array = this.nodes.toArray(new Node[this.nodes.size()]);
|
||||
Gridworks.log("building tree with nodes: " + nodes_array.length);
|
||||
VPTree tree = new VPTree();
|
||||
tree.setRoot(addNode(nodes_array, 0, nodes_array.length - 1));
|
||||
Gridworks.log("tree built");
|
||||
return tree;
|
||||
}
|
||||
|
||||
public VPTree buildVPTree(Collection<? extends Serializable> values) {
|
||||
reset();
|
||||
for (Serializable s : values) {
|
||||
populate(s);
|
||||
}
|
||||
return buildVPTree();
|
||||
}
|
||||
|
||||
/**
 * Removes all nodes accumulated in this builder.
 */
public void reset() {
|
||||
this.nodes.clear();
|
||||
}
|
||||
|
||||
public Map<Serializable,List<? extends Serializable>> getClusters(float radius) {
|
||||
VPTree tree = buildVPTree();
|
||||
VPTreeSeeker seeker = new VPTreeSeeker(distance,tree);
|
||||
|
||||
Map<Serializable,List<? extends Serializable>> map = new HashMap<Serializable,List<? extends Serializable>>();
|
||||
for (Node n : nodes) {
|
||||
Serializable s = n.get();
|
||||
Gridworks.log(" find results for: " + s);
|
||||
List<? extends Serializable> results = seeker.range(s, radius);
|
||||
Gridworks.log(" found: " + results.size());
|
||||
map.put(s, results);
|
||||
}
|
||||
|
||||
return map;
|
||||
}
|
||||
|
||||
private TNode addNode(Node nodes[], int begin, int end) {
|
||||
|
||||
int delta = end - begin;
|
||||
|
@ -2,6 +2,9 @@ function FacetBasedEditDialog(columnName, expression, entries) {
|
||||
this._columnName = columnName;
|
||||
this._expression = expression;
|
||||
this._entries = entries;
|
||||
this._method = "binning";
|
||||
this._function = "fingerprint";
|
||||
this._params = {};
|
||||
|
||||
this._createDialog();
|
||||
this._cluster();
|
||||
@ -10,7 +13,7 @@ function FacetBasedEditDialog(columnName, expression, entries) {
|
||||
FacetBasedEditDialog.prototype._createDialog = function() {
|
||||
var self = this;
|
||||
var frame = DialogSystem.createDialog();
|
||||
frame.width("800px");
|
||||
frame.width("900px");
|
||||
|
||||
var header = $('<div></div>').addClass("dialog-header").text("Facet-based edit of column " + this._columnName).appendTo(frame);
|
||||
var body = $('<div></div>').addClass("dialog-body").appendTo(frame);
|
||||
@ -18,17 +21,78 @@ FacetBasedEditDialog.prototype._createDialog = function() {
|
||||
|
||||
var html = $(
|
||||
'<div>' +
|
||||
'<div bind="tableContainer" class="facet-based-edit-dialog-table-container"></div>' +
|
||||
'<div class="facet-based-edit-dialog-controls">' +
|
||||
'<button bind="clusterButton">Cluster</button> ' +
|
||||
'<button bind="unclusterButton">Un-cluster</button> ' +
|
||||
'<div class="facet-based-edit-dialog-controls"><table><tr>' +
|
||||
'<td>' +
|
||||
'Method: <select bind="methodSelector">' +
|
||||
'<option selected="true">key collision</option>' +
|
||||
'<option>nearest neightbor</option>' +
|
||||
'</select>' +
|
||||
'</td>' +
|
||||
'<td>' +
|
||||
'<div id="binning-controls">Keying Function: <select bind="keyingFunctionSelector">' +
|
||||
'<option selected="true">fingerprint</option>' +
|
||||
'<option>ngram-fingerprint</option>' +
|
||||
'<option>double-metaphone</option>' +
|
||||
'<option>metaphone</option>' +
|
||||
'<option>soundex</option>' +
|
||||
'</select></div>' +
|
||||
'<div id="knn-controls" class="hidden">Distance Function: <select bind="distanceFunctionSelector">' +
|
||||
'<option selected="true">levenshtein</option>' +
|
||||
'<option>jaro</option>' +
|
||||
'<option>jaccard</option>' +
|
||||
'<option>gzip</option>' +
|
||||
'<option>bzip2</option>' +
|
||||
'<option>PPM</option>' +
|
||||
'</select></div>' +
|
||||
'</td>' +
|
||||
'<td>' +
|
||||
'<div id="ngram-fingerprint-params" class="function-params hidden">' +
|
||||
'Ngram Size: <input type="text" value="1" bind="ngramSize">' +
|
||||
'</div>' +
|
||||
'</td>' +
|
||||
'</tr></table></div>' +
|
||||
'<div bind="tableContainer" class="facet-based-edit-dialog-table-container"></div>' +
|
||||
'</div>'
|
||||
).appendTo(body);
|
||||
|
||||
this._elmts = DOM.bind(html);
|
||||
this._elmts.clusterButton.click(function() { self._cluster(); });
|
||||
this._elmts.unclusterButton.click(function() { self._uncluster(); });
|
||||
|
||||
// Switch between the two clustering methods: show the controls of the chosen
// method, record it, and re-fire the matching function selector so the
// function name is refreshed and the clusters are recomputed.
this._elmts.methodSelector.change(function() {
    var selection = $(this).find("option:selected").text();
    if (selection == 'key collision') {
        body.find("#binning-controls").show();
        body.find("#knn-controls").hide();
        self._method = "binning";
        self._elmts.keyingFunctionSelector.change();
    } else if (selection == 'nearest neightbor') {
        // Comparison, not assignment: "=" made this branch run for any
        // selection. The literal must match the <option> text verbatim.
        body.find("#binning-controls").hide();
        body.find("#knn-controls").show();
        self._method = "knn";
        self._elmts.distanceFunctionSelector.change();
    }
});
|
||||
|
||||
// Shared change handler of both function selectors: records the selected
// function, shows only the parameter panel whose id is
// "<function>-params", and recomputes the clusters.
var changer = function() {
    var chosen = $(this).find("option:selected").text();
    self._function = chosen;

    $(".function-params").hide();
    $("#" + self._function + "-params").show();

    self._cluster();
};
|
||||
|
||||
this._elmts.keyingFunctionSelector.change(changer);
|
||||
this._elmts.distanceFunctionSelector.change(changer);
|
||||
|
||||
// Re-cluster whenever the n-gram size field changes. parseInt never throws -
// it returns NaN for non-numeric input - so the result is checked explicitly
// instead of relying on try/catch, and the radix is pinned to 10.
this._elmts.ngramSize.change(function() {
    var size = parseInt($(this).val(), 10);
    if (isNaN(size)) {
        alert("ngram size must be a number");
        return;
    }
    self._params = { "ngram-size" : size };
    self._cluster();
});
|
||||
|
||||
//this._elmts.clusterButton.click(function() { self._cluster(); });
|
||||
//this._elmts.unclusterButton.click(function() { self._uncluster(); });
|
||||
|
||||
$('<button></button>').text("OK").click(function() { self._onOK(); }).appendTo(footer);
|
||||
$('<button></button>').text("Cancel").click(function() { self._dismiss(); }).appendTo(footer);
|
||||
@ -41,9 +105,9 @@ FacetBasedEditDialog.prototype._createDialog = function() {
|
||||
|
||||
FacetBasedEditDialog.prototype._renderTable = function() {
|
||||
var self = this;
|
||||
var container = this._elmts.tableContainer.empty();
|
||||
var container = this._elmts.tableContainer;
|
||||
|
||||
var table = $('<table></table>').addClass("facet-based-edit-dialog-entry-table").appendTo(container)[0];
|
||||
var table = $('<table></table>').addClass("facet-based-edit-dialog-entry-table")[0];
|
||||
|
||||
var trHead = table.insertRow(table.rows.length);
|
||||
trHead.className = "header";
|
||||
@ -60,7 +124,7 @@ FacetBasedEditDialog.prototype._renderTable = function() {
|
||||
for (var c = 0; c < choices.length; c++) {
|
||||
var choice = choices[c];
|
||||
var li = $('<li>').appendTo(ul);
|
||||
$('<span>').text(choice.v.l).appendTo(li);
|
||||
$('<span>').text(choice.v).appendTo(li);
|
||||
$('<span>').text(" (" + choice.c + ")").appendTo(li);
|
||||
}
|
||||
|
||||
@ -73,7 +137,7 @@ FacetBasedEditDialog.prototype._renderTable = function() {
|
||||
editCheck.attr("checked", "true");
|
||||
}
|
||||
|
||||
var input = $('<input size="35" />')
|
||||
var input = $('<input size="55" />')
|
||||
.attr("value", cluster.value)
|
||||
.appendTo(tr.insertCell(2))
|
||||
.keyup(function() {
|
||||
@ -83,65 +147,43 @@ FacetBasedEditDialog.prototype._renderTable = function() {
|
||||
for (var i = 0; i < this._clusters.length; i++) {
|
||||
renderCluster(this._clusters[i]);
|
||||
}
|
||||
|
||||
container.empty().append(table);
|
||||
};
|
||||
|
||||
FacetBasedEditDialog.prototype._cluster = function() {
|
||||
var self = this;
|
||||
|
||||
var container = this._elmts.tableContainer.html(
|
||||
'<div style="margin: 1em; font-size: 130%; color: #888;">Loading... <img src="/images/small-spinner.gif"></div>'
|
||||
);
|
||||
|
||||
$.post(
|
||||
"/command/compute-clusters?" + $.param({ project: theProject.id }),
|
||||
{
|
||||
engine: JSON.stringify(ui.browsingEngine.getJSON()),
|
||||
clusterer: JSON.stringify({
|
||||
'type' : this._method,
|
||||
'function' : this._function,
|
||||
'column' : this._columnName,
|
||||
'params' : this._params
|
||||
})
|
||||
},
|
||||
function(data) {
|
||||
var clusters = [];
|
||||
var map = {};
|
||||
$.each(this._entries, function() {
|
||||
var choice = {
|
||||
v: this.v,
|
||||
c: this.c
|
||||
};
|
||||
|
||||
var s = this.v.l.toLowerCase().replace(/\W/g, ' ').replace(/\s+/g, ' ').split(" ").sort().join(" ");
|
||||
if (s in map) {
|
||||
map[s].choices.push(choice);
|
||||
} else {
|
||||
map[s] = {
|
||||
edit: false,
|
||||
choices: [ choice ]
|
||||
};
|
||||
clusters.push(map[s]);
|
||||
$.each(data, function() {
|
||||
clusters.push({
|
||||
edit: true,
|
||||
choices: this,
|
||||
value: this[0].v
|
||||
});
|
||||
});
|
||||
self._clusters = clusters;
|
||||
self._renderTable();
|
||||
},
|
||||
"json"
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
$.each(clusters, function() {
|
||||
if (this.choices.length > 1) {
|
||||
this.choices.sort(function(a, b) {
|
||||
var c = b.c - a.c;
|
||||
return c != 0 ? c : a.v.l.localeCompare(b.v.l);
|
||||
});
|
||||
this.edit = true;
|
||||
}
|
||||
this.value = this.choices[0].v.l;
|
||||
});
|
||||
clusters.sort(function(a, b) {
|
||||
var c = b.choices.length - a.choices.length;
|
||||
return c != 0 ? c : a.value.localeCompare(b.value);
|
||||
});
|
||||
|
||||
this._clusters = clusters;
|
||||
this._renderTable();
|
||||
};
|
||||
|
||||
// Replaces this._clusters with one cluster per entry in this._entries — each
// marked edit: false, holding that entry's v and c as its only choice and
// using the entry's v.l as its value — and then re-renders the table.
FacetBasedEditDialog.prototype._uncluster = function() {
    var singles = [];

    $.each(this._entries, function(index, entry) {
        var onlyChoice = { v: entry.v, c: entry.c };
        singles.push({
            edit: false,
            choices: [ onlyChoice ],
            value: entry.v.l
        });
    });

    this._clusters = singles;
    this._renderTable();
};
|
||||
|
||||
FacetBasedEditDialog.prototype._onOK = function() {
|
||||
var edits = [];
|
||||
@ -150,7 +192,7 @@ FacetBasedEditDialog.prototype._onOK = function() {
|
||||
if (cluster.edit) {
|
||||
var values = [];
|
||||
for (var j = 0; j < cluster.choices.length; j++) {
|
||||
values.push(cluster.choices[j].v.v);
|
||||
values.push(cluster.choices[j].v);
|
||||
}
|
||||
|
||||
edits.push({
|
||||
|
@ -75,3 +75,7 @@ img {
|
||||
.fbs-pane, .fbs-flyout-pane {
|
||||
z-index: 2000;
|
||||
}
|
||||
|
||||
.hidden {
|
||||
display: none;
|
||||
}
|
@ -14,7 +14,7 @@ table.facet-based-edit-dialog-main-layout > tbody > tr:last-child > td {
|
||||
}
|
||||
|
||||
.facet-based-edit-dialog-table-container {
|
||||
height: 450px;
|
||||
height: 500px;
|
||||
overflow: auto;
|
||||
border: 1px solid #aaa;
|
||||
}
|
||||
@ -42,5 +42,14 @@ table.facet-based-edit-dialog-entry-table > tbody > tr.even > td {
|
||||
}
|
||||
|
||||
table.facet-based-edit-dialog-entry-table input {
|
||||
border: none;
|
||||
border: 1px solid #ccc;
|
||||
padding: 0 0.1em;
|
||||
}
|
||||
|
||||
.facet-based-edit-dialog-controls {
|
||||
margin-bottom: 0.5em;
|
||||
}
|
||||
|
||||
.facet-based-edit-dialog-controls td {
|
||||
padding-right: 0.5em;
|
||||
}
|
Loading…
Reference in New Issue
Block a user