getting closer to the desired functionality... still way too slow though

git-svn-id: http://google-refine.googlecode.com/svn/trunk@256 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Stefano Mazzocchi 2010-03-09 17:28:50 +00:00
parent 50e58fb863
commit 8ce21461cb
15 changed files with 80 additions and 54 deletions

View File

@ -28,7 +28,7 @@ public class BinningClusterer extends Clusterer {
static protected Map<String, Keyer> _keyers = new HashMap<String, Keyer>(); static protected Map<String, Keyer> _keyers = new HashMap<String, Keyer>();
List<Map<Object,Integer>> _clusters; List<Map<String,Integer>> _clusters;
static { static {
_keyers.put("fingerprint", new FingerprintKeyer()); _keyers.put("fingerprint", new FingerprintKeyer());
@ -44,7 +44,7 @@ public class BinningClusterer extends Clusterer {
Object[] _params; Object[] _params;
JSONObject _config; JSONObject _config;
Map<String,Map<Object,Integer>> _map = new HashMap<String,Map<Object,Integer>>(); Map<String,Map<String,Integer>> _map = new HashMap<String,Map<String,Integer>>();
public BinningRowVisitor(Keyer k, JSONObject o) { public BinningRowVisitor(Keyer k, JSONObject o) {
_keyer = k; _keyer = k;
@ -63,18 +63,18 @@ public class BinningClusterer extends Clusterer {
public boolean visit(Project project, int rowIndex, Row row, boolean contextual) { public boolean visit(Project project, int rowIndex, Row row, boolean contextual) {
Cell cell = row.cells.get(_colindex); Cell cell = row.cells.get(_colindex);
if (cell != null && cell.value != null) { if (cell != null && cell.value != null) {
Object v = cell.value; String v = cell.value.toString();
String s = (v instanceof String) ? ((String) v) : v.toString(); String s = (v instanceof String) ? ((String) v) : v.toString();
String key = _keyer.key(s,_params); String key = _keyer.key(s,_params);
if (_map.containsKey(key)) { if (_map.containsKey(key)) {
Map<Object,Integer> m = _map.get(key); Map<String,Integer> m = _map.get(key);
if (m.containsKey(v)) { if (m.containsKey(v)) {
m.put(v, m.get(v) + 1); m.put(v, m.get(v) + 1);
} else { } else {
m.put(v,1); m.put(v,1);
} }
} else { } else {
Map<Object,Integer> m = new TreeMap<Object,Integer>(); Map<String,Integer> m = new TreeMap<String,Integer>();
m.put(v,0); m.put(v,0);
_map.put(key, m); _map.put(key, m);
} }
@ -82,13 +82,13 @@ public class BinningClusterer extends Clusterer {
return false; return false;
} }
public Map<String,Map<Object,Integer>> getMap() { public Map<String,Map<String,Integer>> getMap() {
return _map; return _map;
} }
} }
public class SizeComparator implements Comparator<Map<Object,Integer>> { public class SizeComparator implements Comparator<Map<String,Integer>> {
public int compare(Map<Object,Integer> o1, Map<Object,Integer> o2) { public int compare(Map<String,Integer> o1, Map<String,Integer> o2) {
int s1 = o1.size(); int s1 = o1.size();
int s2 = o2.size(); int s2 = o2.size();
if (o1 == o2) { if (o1 == o2) {
@ -107,8 +107,8 @@ public class BinningClusterer extends Clusterer {
} }
} }
public class EntriesComparator implements Comparator<Entry<Object,Integer>> { public class EntriesComparator implements Comparator<Entry<String,Integer>> {
public int compare(Entry<Object,Integer> o1, Entry<Object,Integer> o2) { public int compare(Entry<String,Integer> o1, Entry<String,Integer> o2) {
return o2.getValue() - o1.getValue(); return o2.getValue() - o1.getValue();
} }
} }
@ -123,8 +123,8 @@ public class BinningClusterer extends Clusterer {
FilteredRows filteredRows = engine.getAllFilteredRows(true); FilteredRows filteredRows = engine.getAllFilteredRows(true);
filteredRows.accept(_project, visitor); filteredRows.accept(_project, visitor);
Map<String,Map<Object,Integer>> map = visitor.getMap(); Map<String,Map<String,Integer>> map = visitor.getMap();
_clusters = new ArrayList<Map<Object,Integer>>(map.values()); _clusters = new ArrayList<Map<String,Integer>>(map.values());
Collections.sort(_clusters, new SizeComparator()); Collections.sort(_clusters, new SizeComparator());
} }
@ -132,12 +132,12 @@ public class BinningClusterer extends Clusterer {
EntriesComparator c = new EntriesComparator(); EntriesComparator c = new EntriesComparator();
writer.array(); writer.array();
for (Map<Object,Integer> m : _clusters) { for (Map<String,Integer> m : _clusters) {
if (m.size() > 1) { if (m.size() > 1) {
writer.array(); writer.array();
List<Entry<Object,Integer>> entries = new ArrayList<Entry<Object,Integer>>(m.entrySet()); List<Entry<String,Integer>> entries = new ArrayList<Entry<String,Integer>>(m.entrySet());
Collections.sort(entries,c); Collections.sort(entries,c);
for (Entry<Object,Integer> e : entries) { for (Entry<String,Integer> e : entries) {
writer.object(); writer.object();
writer.key("v"); writer.value(e.getKey()); writer.key("v"); writer.value(e.getKey());
writer.key("c"); writer.value(e.getValue()); writer.key("c"); writer.value(e.getValue());

View File

@ -28,8 +28,8 @@ import com.wcohen.ss.api.Token;
import com.wcohen.ss.tokens.NGramTokenizer; import com.wcohen.ss.tokens.NGramTokenizer;
import com.wcohen.ss.tokens.SimpleTokenizer; import com.wcohen.ss.tokens.SimpleTokenizer;
import edu.mit.simile.vicino.Distance;
import edu.mit.simile.vicino.distances.BZip2Distance; import edu.mit.simile.vicino.distances.BZip2Distance;
import edu.mit.simile.vicino.distances.Distance;
import edu.mit.simile.vicino.distances.GZipDistance; import edu.mit.simile.vicino.distances.GZipDistance;
import edu.mit.simile.vicino.distances.JaccardDistance; import edu.mit.simile.vicino.distances.JaccardDistance;
import edu.mit.simile.vicino.distances.JaroDistance; import edu.mit.simile.vicino.distances.JaroDistance;
@ -63,17 +63,17 @@ public class kNNClusterer extends Clusterer {
Distance _distance; Distance _distance;
JSONObject _config; JSONObject _config;
VPTreeBuilder _treeBuilder; VPTreeBuilder _treeBuilder;
float _radius; double _radius = 1.0f;
public VPTreeClusteringRowVisitor(Distance d, JSONObject o) { public VPTreeClusteringRowVisitor(Distance d, JSONObject o) {
_distance = d; _distance = d;
_config = o; _config = o;
_treeBuilder = new VPTreeBuilder(_distance); _treeBuilder = new VPTreeBuilder(_distance);
try { try {
_radius = (float) o.getJSONObject("params").getDouble("radius"); JSONObject params = o.getJSONObject("params");
_radius = params.getDouble("radius");
} catch (JSONException e) { } catch (JSONException e) {
Gridworks.warn("No radius found, using default"); Gridworks.warn("No parameters found, using defaults");
_radius = 0.1f;
} }
} }
@ -87,7 +87,7 @@ public class kNNClusterer extends Clusterer {
return false; return false;
} }
public Map<Serializable,List<? extends Serializable>> getClusters() { public Map<Serializable,List<Serializable>> getClusters() {
return _treeBuilder.getClusters(_radius); return _treeBuilder.getClusters(_radius);
} }
} }
@ -96,7 +96,8 @@ public class kNNClusterer extends Clusterer {
Distance _distance; Distance _distance;
JSONObject _config; JSONObject _config;
float _radius; double _radius = 1.0d;
int _blockingNgramSize = 6;
HashSet<String> _data; HashSet<String> _data;
public BlockingClusteringRowVisitor(Distance d, JSONObject o) { public BlockingClusteringRowVisitor(Distance d, JSONObject o) {
@ -104,10 +105,11 @@ public class kNNClusterer extends Clusterer {
_config = o; _config = o;
_data = new HashSet<String>(); _data = new HashSet<String>();
try { try {
_radius = (float) o.getJSONObject("params").getDouble("radius"); JSONObject params = o.getJSONObject("params");
_radius = params.getDouble("radius");
_blockingNgramSize = params.getInt("blocking-ngram-size");
} catch (JSONException e) { } catch (JSONException e) {
Gridworks.warn("No radius found, using default"); Gridworks.warn("No parameters found, using defaults");
_radius = 0.1f;
} }
} }
@ -122,7 +124,7 @@ public class kNNClusterer extends Clusterer {
} }
public Map<Serializable,Set<Serializable>> getClusters() { public Map<Serializable,Set<Serializable>> getClusters() {
NGramTokenizer tokenizer = new NGramTokenizer(4,4,false,SimpleTokenizer.DEFAULT_TOKENIZER); NGramTokenizer tokenizer = new NGramTokenizer(_blockingNgramSize,_blockingNgramSize,false,SimpleTokenizer.DEFAULT_TOKENIZER);
Map<String,List<String>> blocks = new HashMap<String,List<String>>(); Map<String,List<String>> blocks = new HashMap<String,List<String>>();
@ -148,9 +150,10 @@ public class kNNClusterer extends Clusterer {
for (String a : list) { for (String a : list) {
for (String b : list) { for (String b : list) {
if (a == b) continue; if (a == b) continue;
if (clusters.containsKey(a) && clusters.get(a).contains(b)) continue;
if (clusters.containsKey(b) && clusters.get(b).contains(a)) continue;
double d = _distance.d(a,b); double d = _distance.d(a,b);
if (d <= _radius) { if (d <= _radius || _radius < 0) {
System.out.println(a + " | " + b + ": " + d);
Set<Serializable> l = null; Set<Serializable> l = null;
if (!clusters.containsKey(a)) { if (!clusters.containsKey(a)) {
l = new TreeSet<Serializable>(); l = new TreeSet<Serializable>();
@ -165,6 +168,7 @@ public class kNNClusterer extends Clusterer {
} }
} }
Gridworks.log("Calculated " + _distance.getCount() + " distances");
return clusters; return clusters;
} }
} }

View File

@ -1,7 +0,0 @@
package edu.mit.simile.vicino;
/**
 * Contract for a function that maps a pair of strings to a numeric value.
 */
public interface Distance {
/**
 * @param x the first string
 * @param y the second string
 * @return the value computed for the pair (presumably their distance,
 *         going by the interface name; semantics are up to the implementation)
 */
public double d(String x, String y);
}

View File

@ -2,7 +2,7 @@ package edu.mit.simile.vicino;
import java.util.List; import java.util.List;
import edu.mit.simile.vicino.Distance; import edu.mit.simile.vicino.distances.Distance;
public class Distributor extends Operator { public class Distributor extends Operator {

View File

@ -1,5 +1,7 @@
package edu.mit.simile.vicino; package edu.mit.simile.vicino;
import edu.mit.simile.vicino.distances.Distance;
public class Meter extends Operator { public class Meter extends Operator {
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {

View File

@ -7,6 +7,8 @@ import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import edu.mit.simile.vicino.distances.Distance;
public class Operator { public class Operator {
static void log(String msg) { static void log(String msg) {

View File

@ -6,6 +6,7 @@ import java.io.Serializable;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import edu.mit.simile.vicino.distances.Distance;
import edu.mit.simile.vicino.vptree.VPTree; import edu.mit.simile.vicino.vptree.VPTree;
import edu.mit.simile.vicino.vptree.VPTreeBuilder; import edu.mit.simile.vicino.vptree.VPTreeBuilder;
import edu.mit.simile.vicino.vptree.VPTreeSeeker; import edu.mit.simile.vicino.vptree.VPTreeSeeker;

View File

@ -2,6 +2,8 @@ package edu.mit.simile.vicino;
import java.util.List; import java.util.List;
import edu.mit.simile.vicino.distances.Distance;
public class Tester extends Operator { public class Tester extends Operator {
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {

View File

@ -0,0 +1,13 @@
package edu.mit.simile.vicino.distances;
/**
 * Base class for string distance functions that carries a tally readable
 * through {@link #getCount()}.
 */
public abstract class Distance {

    /**
     * Tally reported by {@link #getCount()}. This class only initializes and
     * reports it; it is package-visible so that subclasses in the same
     * package can update it.
     */
    int counter = 0;

    /**
     * Computes the value of this distance function for a pair of strings.
     *
     * @param x the first string
     * @param y the second string
     * @return the computed value
     */
    public abstract double d(String x, String y);

    /**
     * @return the current value of {@code counter}
     */
    public int getCount() {
        return this.counter;
    }
}

View File

@ -3,9 +3,7 @@ package edu.mit.simile.vicino.distances;
import com.wcohen.ss.Levenstein; import com.wcohen.ss.Levenstein;
import com.wcohen.ss.api.StringDistance; import com.wcohen.ss.api.StringDistance;
import edu.mit.simile.vicino.Distance; public class LevenshteinDistance extends MetricDistance {
public class LevenshteinDistance implements Distance {
StringDistance distance; StringDistance distance;
@ -13,7 +11,7 @@ public class LevenshteinDistance implements Distance {
this.distance = new Levenstein(); this.distance = new Levenstein();
} }
public double d(String x, String y) { public double d2(String x, String y) {
return Math.abs(this.distance.score(x, y)); return Math.abs(this.distance.score(x, y));
} }

View File

@ -1,8 +1,7 @@
package edu.mit.simile.vicino.distances; package edu.mit.simile.vicino.distances;
import edu.mit.simile.vicino.Distance;
public abstract class MetricDistance implements Distance { public abstract class MetricDistance extends Distance {
/* /*
* public float d(String x,String y) { * public float d(String x,String y) {
@ -15,9 +14,11 @@ public abstract class MetricDistance implements Distance {
*/ */
public double d(String x, String y) { public double d(String x, String y) {
return d2(x, y); double result = d2(x, y);
counter += 1;
return result;
} }
abstract double d2(String x, String y); abstract double d2(String x, String y);
} }

View File

@ -1,14 +1,14 @@
package edu.mit.simile.vicino.distances; package edu.mit.simile.vicino.distances;
import edu.mit.simile.vicino.Distance;
public abstract class PseudoMetricDistance implements Distance { public abstract class PseudoMetricDistance extends Distance {
public double d(String x, String y) { public double d(String x, String y) {
double cxx = d2(x, x); double cxx = d2(x, x);
double cyy = d2(y, y); double cyy = d2(y, y);
double cxy = d2(x, y); double cxy = d2(x, y);
double cyx = d2(y, x); double cyx = d2(y, x);
counter += 4;
return (cxy + cyx) / (cxx + cyy) - 1.0d; return (cxy + cyx) / (cxx + cyy) - 1.0d;
} }

View File

@ -11,7 +11,7 @@ import java.util.Set;
import com.metaweb.gridworks.Gridworks; import com.metaweb.gridworks.Gridworks;
import edu.mit.simile.vicino.Distance; import edu.mit.simile.vicino.distances.Distance;
/** /**
* @author Paolo Ciccarese * @author Paolo Ciccarese
@ -60,14 +60,14 @@ public class VPTreeBuilder {
this.nodes.clear(); this.nodes.clear();
} }
public Map<Serializable,List<? extends Serializable>> getClusters(float radius) { public Map<Serializable,List<Serializable>> getClusters(double radius) {
VPTree tree = buildVPTree(); VPTree tree = buildVPTree();
VPTreeSeeker seeker = new VPTreeSeeker(distance,tree); VPTreeSeeker seeker = new VPTreeSeeker(distance,tree);
Map<Serializable,List<? extends Serializable>> map = new HashMap<Serializable,List<? extends Serializable>>(); Map<Serializable,List<Serializable>> map = new HashMap<Serializable,List<Serializable>>();
for (Node n : nodes) { for (Node n : nodes) {
Serializable s = n.get(); Serializable s = n.get();
List<? extends Serializable> results = seeker.range(s, radius); List<Serializable> results = seeker.range(s, radius);
map.put(s, results); map.put(s, results);
} }

View File

@ -4,7 +4,7 @@ import java.io.Serializable;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import edu.mit.simile.vicino.Distance; import edu.mit.simile.vicino.distances.Distance;
/** /**
* @author Paolo Ciccarese * @author Paolo Ciccarese
@ -19,11 +19,11 @@ public class VPTreeSeeker {
this.tree = tree; this.tree = tree;
} }
public List<? extends Serializable> range(Serializable query, float range) { public List<Serializable> range(Serializable query, double range) {
return rangeTraversal(query, range, tree.getRoot(), new ArrayList<Serializable>()); return rangeTraversal(query, range, tree.getRoot(), new ArrayList<Serializable>());
} }
private List<Serializable> rangeTraversal(Serializable query, float range, TNode tNode, List<Serializable> results) { private List<Serializable> rangeTraversal(Serializable query, double range, TNode tNode, List<Serializable> results) {
if (tNode != null) { if (tNode != null) {
double distance = this.distance.d(query.toString(), tNode.get().toString()); double distance = this.distance.d(query.toString(), tNode.get().toString());

View File

@ -54,7 +54,8 @@ FacetBasedEditDialog.prototype._createDialog = function() {
'Ngram Size: <input type="text" value="1" bind="ngramSize" size="3">' + 'Ngram Size: <input type="text" value="1" bind="ngramSize" size="3">' +
'</div>' + '</div>' +
'<div class="knn-controls hidden">' + '<div class="knn-controls hidden">' +
'Radius: <input type="text" value="0.1" bind="radius" size="3">' + '<span style="margin-right: 1em">Radius: <input type="text" value="1.0" bind="radius" size="3"></span>' +
'<span>Block Chars: <input type="text" value="6" bind="ngramBlock" size="3"></span>' +
'</div>' + '</div>' +
'</td>' + '</td>' +
'<td bind="resultSummary" align="right">' + '<td bind="resultSummary" align="right">' +
@ -114,6 +115,15 @@ FacetBasedEditDialog.prototype._createDialog = function() {
alert("radius must be a number"); alert("radius must be a number");
} }
}); });
// Re-run clustering whenever the "Block Chars" field changes.
this._elmts.ngramBlock.change(function() {
    // parseInt() never throws: it returns NaN for non-numeric input, so the
    // value is validated explicitly instead of relying on a try/catch.
    // The radix is given so the text is always read as decimal.
    var blockSize = parseInt($(this).val(), 10);
    if (isNaN(blockSize)) {
        alert("block chars must be a number");
        return;
    }
    // Merge into the existing parameters rather than replacing them, so a
    // previously chosen "radius" is not dropped.
    // NOTE(review): presumably _params is what the server receives as
    // "params"; the blocking visitor reads "radius" before
    // "blocking-ngram-size" inside one try block, so a missing radius would
    // make it fall back to defaults for both values - confirm.
    self._params = $.extend({}, self._params, { "blocking-ngram-size" : blockSize });
    self._cluster();
});
//this._elmts.clusterButton.click(function() { self._cluster(); }); //this._elmts.clusterButton.click(function() { self._cluster(); });
//this._elmts.unclusterButton.click(function() { self._uncluster(); }); //this._elmts.unclusterButton.click(function() { self._uncluster(); });