getting closer to the desired functionality... still way too slow though

git-svn-id: http://google-refine.googlecode.com/svn/trunk@256 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Stefano Mazzocchi 2010-03-09 17:28:50 +00:00
parent 50e58fb863
commit 8ce21461cb
15 changed files with 80 additions and 54 deletions

View File

@ -28,7 +28,7 @@ public class BinningClusterer extends Clusterer {
static protected Map<String, Keyer> _keyers = new HashMap<String, Keyer>();
List<Map<Object,Integer>> _clusters;
List<Map<String,Integer>> _clusters;
static {
_keyers.put("fingerprint", new FingerprintKeyer());
@ -44,7 +44,7 @@ public class BinningClusterer extends Clusterer {
Object[] _params;
JSONObject _config;
Map<String,Map<Object,Integer>> _map = new HashMap<String,Map<Object,Integer>>();
Map<String,Map<String,Integer>> _map = new HashMap<String,Map<String,Integer>>();
public BinningRowVisitor(Keyer k, JSONObject o) {
_keyer = k;
@ -63,18 +63,18 @@ public class BinningClusterer extends Clusterer {
public boolean visit(Project project, int rowIndex, Row row, boolean contextual) {
Cell cell = row.cells.get(_colindex);
if (cell != null && cell.value != null) {
Object v = cell.value;
String v = cell.value.toString();
String s = (v instanceof String) ? ((String) v) : v.toString();
String key = _keyer.key(s,_params);
if (_map.containsKey(key)) {
Map<Object,Integer> m = _map.get(key);
Map<String,Integer> m = _map.get(key);
if (m.containsKey(v)) {
m.put(v, m.get(v) + 1);
} else {
m.put(v,1);
}
} else {
Map<Object,Integer> m = new TreeMap<Object,Integer>();
Map<String,Integer> m = new TreeMap<String,Integer>();
m.put(v,0);
_map.put(key, m);
}
@ -82,13 +82,13 @@ public class BinningClusterer extends Clusterer {
return false;
}
public Map<String,Map<Object,Integer>> getMap() {
public Map<String,Map<String,Integer>> getMap() {
return _map;
}
}
public class SizeComparator implements Comparator<Map<Object,Integer>> {
public int compare(Map<Object,Integer> o1, Map<Object,Integer> o2) {
public class SizeComparator implements Comparator<Map<String,Integer>> {
public int compare(Map<String,Integer> o1, Map<String,Integer> o2) {
int s1 = o1.size();
int s2 = o2.size();
if (o1 == o2) {
@ -107,8 +107,8 @@ public class BinningClusterer extends Clusterer {
}
}
public class EntriesComparator implements Comparator<Entry<Object,Integer>> {
public int compare(Entry<Object,Integer> o1, Entry<Object,Integer> o2) {
/**
 * Orders map entries by their Integer value, largest value first.
 * Entries with equal values compare as 0.
 */
public class EntriesComparator implements Comparator<Entry<String,Integer>> {
    /**
     * @param o1 first entry
     * @param o2 second entry
     * @return a negative number when o1's value is larger than o2's,
     *         a positive number when it is smaller, 0 when they are equal
     */
    public int compare(Entry<String,Integer> o1, Entry<String,Integer> o2) {
        // compareTo rather than "o2 - o1": integer subtraction can overflow
        // and flip the sign of the result for values far apart.
        return o2.getValue().compareTo(o1.getValue());
    }
}
@ -123,8 +123,8 @@ public class BinningClusterer extends Clusterer {
FilteredRows filteredRows = engine.getAllFilteredRows(true);
filteredRows.accept(_project, visitor);
Map<String,Map<Object,Integer>> map = visitor.getMap();
_clusters = new ArrayList<Map<Object,Integer>>(map.values());
Map<String,Map<String,Integer>> map = visitor.getMap();
_clusters = new ArrayList<Map<String,Integer>>(map.values());
Collections.sort(_clusters, new SizeComparator());
}
@ -132,12 +132,12 @@ public class BinningClusterer extends Clusterer {
EntriesComparator c = new EntriesComparator();
writer.array();
for (Map<Object,Integer> m : _clusters) {
for (Map<String,Integer> m : _clusters) {
if (m.size() > 1) {
writer.array();
List<Entry<Object,Integer>> entries = new ArrayList<Entry<Object,Integer>>(m.entrySet());
List<Entry<String,Integer>> entries = new ArrayList<Entry<String,Integer>>(m.entrySet());
Collections.sort(entries,c);
for (Entry<Object,Integer> e : entries) {
for (Entry<String,Integer> e : entries) {
writer.object();
writer.key("v"); writer.value(e.getKey());
writer.key("c"); writer.value(e.getValue());

View File

@ -28,8 +28,8 @@ import com.wcohen.ss.api.Token;
import com.wcohen.ss.tokens.NGramTokenizer;
import com.wcohen.ss.tokens.SimpleTokenizer;
import edu.mit.simile.vicino.Distance;
import edu.mit.simile.vicino.distances.BZip2Distance;
import edu.mit.simile.vicino.distances.Distance;
import edu.mit.simile.vicino.distances.GZipDistance;
import edu.mit.simile.vicino.distances.JaccardDistance;
import edu.mit.simile.vicino.distances.JaroDistance;
@ -63,17 +63,17 @@ public class kNNClusterer extends Clusterer {
Distance _distance;
JSONObject _config;
VPTreeBuilder _treeBuilder;
float _radius;
double _radius = 1.0f;
public VPTreeClusteringRowVisitor(Distance d, JSONObject o) {
_distance = d;
_config = o;
_treeBuilder = new VPTreeBuilder(_distance);
try {
_radius = (float) o.getJSONObject("params").getDouble("radius");
JSONObject params = o.getJSONObject("params");
_radius = params.getDouble("radius");
} catch (JSONException e) {
Gridworks.warn("No radius found, using default");
_radius = 0.1f;
Gridworks.warn("No parameters found, using defaults");
}
}
@ -87,7 +87,7 @@ public class kNNClusterer extends Clusterer {
return false;
}
public Map<Serializable,List<? extends Serializable>> getClusters() {
public Map<Serializable,List<Serializable>> getClusters() {
return _treeBuilder.getClusters(_radius);
}
}
@ -96,7 +96,8 @@ public class kNNClusterer extends Clusterer {
Distance _distance;
JSONObject _config;
float _radius;
double _radius = 1.0d;
int _blockingNgramSize = 6;
HashSet<String> _data;
public BlockingClusteringRowVisitor(Distance d, JSONObject o) {
@ -104,10 +105,11 @@ public class kNNClusterer extends Clusterer {
_config = o;
_data = new HashSet<String>();
try {
_radius = (float) o.getJSONObject("params").getDouble("radius");
JSONObject params = o.getJSONObject("params");
_radius = params.getDouble("radius");
_blockingNgramSize = params.getInt("blocking-ngram-size");
} catch (JSONException e) {
Gridworks.warn("No radius found, using default");
_radius = 0.1f;
Gridworks.warn("No parameters found, using defaults");
}
}
@ -122,7 +124,7 @@ public class kNNClusterer extends Clusterer {
}
public Map<Serializable,Set<Serializable>> getClusters() {
NGramTokenizer tokenizer = new NGramTokenizer(4,4,false,SimpleTokenizer.DEFAULT_TOKENIZER);
NGramTokenizer tokenizer = new NGramTokenizer(_blockingNgramSize,_blockingNgramSize,false,SimpleTokenizer.DEFAULT_TOKENIZER);
Map<String,List<String>> blocks = new HashMap<String,List<String>>();
@ -148,9 +150,10 @@ public class kNNClusterer extends Clusterer {
for (String a : list) {
for (String b : list) {
if (a == b) continue;
if (clusters.containsKey(a) && clusters.get(a).contains(b)) continue;
if (clusters.containsKey(b) && clusters.get(b).contains(a)) continue;
double d = _distance.d(a,b);
if (d <= _radius) {
System.out.println(a + " | " + b + ": " + d);
if (d <= _radius || _radius < 0) {
Set<Serializable> l = null;
if (!clusters.containsKey(a)) {
l = new TreeSet<Serializable>();
@ -165,6 +168,7 @@ public class kNNClusterer extends Clusterer {
}
}
Gridworks.log("Calculated " + _distance.getCount() + " distances");
return clusters;
}
}

View File

@ -1,7 +0,0 @@
package edu.mit.simile.vicino;
public interface Distance {
public double d(String x, String y);
}

View File

@ -2,7 +2,7 @@ package edu.mit.simile.vicino;
import java.util.List;
import edu.mit.simile.vicino.Distance;
import edu.mit.simile.vicino.distances.Distance;
public class Distributor extends Operator {

View File

@ -1,5 +1,7 @@
package edu.mit.simile.vicino;
import edu.mit.simile.vicino.distances.Distance;
public class Meter extends Operator {
public static void main(String[] args) throws Exception {

View File

@ -7,6 +7,8 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import edu.mit.simile.vicino.distances.Distance;
public class Operator {
static void log(String msg) {

View File

@ -6,6 +6,7 @@ import java.io.Serializable;
import java.util.Iterator;
import java.util.List;
import edu.mit.simile.vicino.distances.Distance;
import edu.mit.simile.vicino.vptree.VPTree;
import edu.mit.simile.vicino.vptree.VPTreeBuilder;
import edu.mit.simile.vicino.vptree.VPTreeSeeker;

View File

@ -2,6 +2,8 @@ package edu.mit.simile.vicino;
import java.util.List;
import edu.mit.simile.vicino.distances.Distance;
public class Tester extends Operator {
public static void main(String[] args) throws Exception {

View File

@ -0,0 +1,13 @@
package edu.mit.simile.vicino.distances;
/**
 * Base type for string distance functions. Besides the distance itself,
 * it carries a counter that concrete distances update as they work and
 * that callers can read back through {@link #getCount()}.
 */
public abstract class Distance {

    // Running tally maintained by subclasses in this package; starts at zero.
    int counter = 0;

    /**
     * Computes the distance between two strings.
     *
     * @param x first string
     * @param y second string
     * @return the distance between x and y
     */
    public abstract double d(String x, String y);

    /**
     * @return the current value of the counter
     */
    public int getCount() {
        return this.counter;
    }
}

View File

@ -3,9 +3,7 @@ package edu.mit.simile.vicino.distances;
import com.wcohen.ss.Levenstein;
import com.wcohen.ss.api.StringDistance;
import edu.mit.simile.vicino.Distance;
public class LevenshteinDistance implements Distance {
public class LevenshteinDistance extends MetricDistance {
StringDistance distance;
@ -13,7 +11,7 @@ public class LevenshteinDistance implements Distance {
this.distance = new Levenstein();
}
public double d(String x, String y) {
public double d2(String x, String y) {
return Math.abs(this.distance.score(x, y));
}

View File

@ -1,8 +1,7 @@
package edu.mit.simile.vicino.distances;
import edu.mit.simile.vicino.Distance;
public abstract class MetricDistance implements Distance {
public abstract class MetricDistance extends Distance {
/*
* public float d(String x,String y) {
@ -15,7 +14,9 @@ public abstract class MetricDistance implements Distance {
*/
public double d(String x, String y) {
return d2(x, y);
double result = d2(x, y);
counter += 1;
return result;
}
abstract double d2(String x, String y);

View File

@ -1,14 +1,14 @@
package edu.mit.simile.vicino.distances;
import edu.mit.simile.vicino.Distance;
public abstract class PseudoMetricDistance implements Distance {
public abstract class PseudoMetricDistance extends Distance {
/**
 * Derives a symmetric distance from the raw d2 scores:
 * (d2(x,y) + d2(y,x)) / (d2(x,x) + d2(y,y)) - 1.
 * The counter is advanced by 4, one tick per d2 evaluation.
 *
 * @param x first string
 * @param y second string
 * @return the normalized distance between x and y
 */
public double d(String x, String y) {
    // Self scores first, then the two cross scores, as four separate d2 calls.
    final double selfX = d2(x, x);
    final double selfY = d2(y, y);
    final double forward = d2(x, y);
    final double backward = d2(y, x);
    counter += 4;
    final double crossTotal = forward + backward;
    final double selfTotal = selfX + selfY;
    return crossTotal / selfTotal - 1.0d;
}

View File

@ -11,7 +11,7 @@ import java.util.Set;
import com.metaweb.gridworks.Gridworks;
import edu.mit.simile.vicino.Distance;
import edu.mit.simile.vicino.distances.Distance;
/**
* @author Paolo Ciccarese
@ -60,14 +60,14 @@ public class VPTreeBuilder {
this.nodes.clear();
}
public Map<Serializable,List<? extends Serializable>> getClusters(float radius) {
public Map<Serializable,List<Serializable>> getClusters(double radius) {
VPTree tree = buildVPTree();
VPTreeSeeker seeker = new VPTreeSeeker(distance,tree);
Map<Serializable,List<? extends Serializable>> map = new HashMap<Serializable,List<? extends Serializable>>();
Map<Serializable,List<Serializable>> map = new HashMap<Serializable,List<Serializable>>();
for (Node n : nodes) {
Serializable s = n.get();
List<? extends Serializable> results = seeker.range(s, radius);
List<Serializable> results = seeker.range(s, radius);
map.put(s, results);
}

View File

@ -4,7 +4,7 @@ import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import edu.mit.simile.vicino.Distance;
import edu.mit.simile.vicino.distances.Distance;
/**
* @author Paolo Ciccarese
@ -19,11 +19,11 @@ public class VPTreeSeeker {
this.tree = tree;
}
public List<? extends Serializable> range(Serializable query, float range) {
public List<Serializable> range(Serializable query, double range) {
return rangeTraversal(query, range, tree.getRoot(), new ArrayList<Serializable>());
}
private List<Serializable> rangeTraversal(Serializable query, float range, TNode tNode, List<Serializable> results) {
private List<Serializable> rangeTraversal(Serializable query, double range, TNode tNode, List<Serializable> results) {
if (tNode != null) {
double distance = this.distance.d(query.toString(), tNode.get().toString());

View File

@ -54,7 +54,8 @@ FacetBasedEditDialog.prototype._createDialog = function() {
'Ngram Size: <input type="text" value="1" bind="ngramSize" size="3">' +
'</div>' +
'<div class="knn-controls hidden">' +
'Radius: <input type="text" value="0.1" bind="radius" size="3">' +
'<span style="margin-right: 1em">Radius: <input type="text" value="1.0" bind="radius" size="3"></span>' +
'<span>Block Chars: <input type="text" value="6" bind="ngramBlock" size="3"></span>' +
'</div>' +
'</td>' +
'<td bind="resultSummary" align="right">' +
@ -115,6 +116,15 @@ FacetBasedEditDialog.prototype._createDialog = function() {
}
});
// Re-run clustering when the "Block Chars" field changes, passing its value
// as the "blocking-ngram-size" parameter.
this._elmts.ngramBlock.change(function() {
    try {
        // Explicit radix so input such as "08" is always read as decimal.
        var size = parseInt($(this).val(), 10);
        // parseInt returns NaN instead of throwing, so reject it explicitly;
        // otherwise a non-numeric value would be sent on as the parameter.
        if (isNaN(size)) {
            throw new Error("not a number");
        }
        self._params = { "blocking-ngram-size" : size };
        self._cluster();
    } catch (e) {
        alert("block chars must be a number");
    }
});
//this._elmts.clusterButton.click(function() { self._cluster(); });
//this._elmts.unclusterButton.click(function() { self._uncluster(); });