getting closer to the desired functionality... still way too slow though
git-svn-id: http://google-refine.googlecode.com/svn/trunk@256 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
50e58fb863
commit
8ce21461cb
@ -28,7 +28,7 @@ public class BinningClusterer extends Clusterer {
|
||||
|
||||
static protected Map<String, Keyer> _keyers = new HashMap<String, Keyer>();
|
||||
|
||||
List<Map<Object,Integer>> _clusters;
|
||||
List<Map<String,Integer>> _clusters;
|
||||
|
||||
static {
|
||||
_keyers.put("fingerprint", new FingerprintKeyer());
|
||||
@ -44,7 +44,7 @@ public class BinningClusterer extends Clusterer {
|
||||
Object[] _params;
|
||||
JSONObject _config;
|
||||
|
||||
Map<String,Map<Object,Integer>> _map = new HashMap<String,Map<Object,Integer>>();
|
||||
Map<String,Map<String,Integer>> _map = new HashMap<String,Map<String,Integer>>();
|
||||
|
||||
public BinningRowVisitor(Keyer k, JSONObject o) {
|
||||
_keyer = k;
|
||||
@ -63,18 +63,18 @@ public class BinningClusterer extends Clusterer {
|
||||
public boolean visit(Project project, int rowIndex, Row row, boolean contextual) {
|
||||
Cell cell = row.cells.get(_colindex);
|
||||
if (cell != null && cell.value != null) {
|
||||
Object v = cell.value;
|
||||
String v = cell.value.toString();
|
||||
String s = (v instanceof String) ? ((String) v) : v.toString();
|
||||
String key = _keyer.key(s,_params);
|
||||
if (_map.containsKey(key)) {
|
||||
Map<Object,Integer> m = _map.get(key);
|
||||
Map<String,Integer> m = _map.get(key);
|
||||
if (m.containsKey(v)) {
|
||||
m.put(v, m.get(v) + 1);
|
||||
} else {
|
||||
m.put(v,1);
|
||||
}
|
||||
} else {
|
||||
Map<Object,Integer> m = new TreeMap<Object,Integer>();
|
||||
Map<String,Integer> m = new TreeMap<String,Integer>();
|
||||
m.put(v,0);
|
||||
_map.put(key, m);
|
||||
}
|
||||
@ -82,13 +82,13 @@ public class BinningClusterer extends Clusterer {
|
||||
return false;
|
||||
}
|
||||
|
||||
public Map<String,Map<Object,Integer>> getMap() {
|
||||
public Map<String,Map<String,Integer>> getMap() {
|
||||
return _map;
|
||||
}
|
||||
}
|
||||
|
||||
public class SizeComparator implements Comparator<Map<Object,Integer>> {
|
||||
public int compare(Map<Object,Integer> o1, Map<Object,Integer> o2) {
|
||||
public class SizeComparator implements Comparator<Map<String,Integer>> {
|
||||
public int compare(Map<String,Integer> o1, Map<String,Integer> o2) {
|
||||
int s1 = o1.size();
|
||||
int s2 = o2.size();
|
||||
if (o1 == o2) {
|
||||
@ -107,8 +107,8 @@ public class BinningClusterer extends Clusterer {
|
||||
}
|
||||
}
|
||||
|
||||
public class EntriesComparator implements Comparator<Entry<Object,Integer>> {
|
||||
public int compare(Entry<Object,Integer> o1, Entry<Object,Integer> o2) {
|
||||
public class EntriesComparator implements Comparator<Entry<String,Integer>> {
|
||||
public int compare(Entry<String,Integer> o1, Entry<String,Integer> o2) {
|
||||
return o2.getValue() - o1.getValue();
|
||||
}
|
||||
}
|
||||
@ -123,8 +123,8 @@ public class BinningClusterer extends Clusterer {
|
||||
FilteredRows filteredRows = engine.getAllFilteredRows(true);
|
||||
filteredRows.accept(_project, visitor);
|
||||
|
||||
Map<String,Map<Object,Integer>> map = visitor.getMap();
|
||||
_clusters = new ArrayList<Map<Object,Integer>>(map.values());
|
||||
Map<String,Map<String,Integer>> map = visitor.getMap();
|
||||
_clusters = new ArrayList<Map<String,Integer>>(map.values());
|
||||
Collections.sort(_clusters, new SizeComparator());
|
||||
}
|
||||
|
||||
@ -132,12 +132,12 @@ public class BinningClusterer extends Clusterer {
|
||||
EntriesComparator c = new EntriesComparator();
|
||||
|
||||
writer.array();
|
||||
for (Map<Object,Integer> m : _clusters) {
|
||||
for (Map<String,Integer> m : _clusters) {
|
||||
if (m.size() > 1) {
|
||||
writer.array();
|
||||
List<Entry<Object,Integer>> entries = new ArrayList<Entry<Object,Integer>>(m.entrySet());
|
||||
List<Entry<String,Integer>> entries = new ArrayList<Entry<String,Integer>>(m.entrySet());
|
||||
Collections.sort(entries,c);
|
||||
for (Entry<Object,Integer> e : entries) {
|
||||
for (Entry<String,Integer> e : entries) {
|
||||
writer.object();
|
||||
writer.key("v"); writer.value(e.getKey());
|
||||
writer.key("c"); writer.value(e.getValue());
|
||||
|
@ -28,8 +28,8 @@ import com.wcohen.ss.api.Token;
|
||||
import com.wcohen.ss.tokens.NGramTokenizer;
|
||||
import com.wcohen.ss.tokens.SimpleTokenizer;
|
||||
|
||||
import edu.mit.simile.vicino.Distance;
|
||||
import edu.mit.simile.vicino.distances.BZip2Distance;
|
||||
import edu.mit.simile.vicino.distances.Distance;
|
||||
import edu.mit.simile.vicino.distances.GZipDistance;
|
||||
import edu.mit.simile.vicino.distances.JaccardDistance;
|
||||
import edu.mit.simile.vicino.distances.JaroDistance;
|
||||
@ -63,17 +63,17 @@ public class kNNClusterer extends Clusterer {
|
||||
Distance _distance;
|
||||
JSONObject _config;
|
||||
VPTreeBuilder _treeBuilder;
|
||||
float _radius;
|
||||
double _radius = 1.0f;
|
||||
|
||||
public VPTreeClusteringRowVisitor(Distance d, JSONObject o) {
|
||||
_distance = d;
|
||||
_config = o;
|
||||
_treeBuilder = new VPTreeBuilder(_distance);
|
||||
try {
|
||||
_radius = (float) o.getJSONObject("params").getDouble("radius");
|
||||
JSONObject params = o.getJSONObject("params");
|
||||
_radius = params.getDouble("radius");
|
||||
} catch (JSONException e) {
|
||||
Gridworks.warn("No radius found, using default");
|
||||
_radius = 0.1f;
|
||||
Gridworks.warn("No parameters found, using defaults");
|
||||
}
|
||||
}
|
||||
|
||||
@ -87,7 +87,7 @@ public class kNNClusterer extends Clusterer {
|
||||
return false;
|
||||
}
|
||||
|
||||
public Map<Serializable,List<? extends Serializable>> getClusters() {
|
||||
public Map<Serializable,List<Serializable>> getClusters() {
|
||||
return _treeBuilder.getClusters(_radius);
|
||||
}
|
||||
}
|
||||
@ -96,7 +96,8 @@ public class kNNClusterer extends Clusterer {
|
||||
|
||||
Distance _distance;
|
||||
JSONObject _config;
|
||||
float _radius;
|
||||
double _radius = 1.0d;
|
||||
int _blockingNgramSize = 6;
|
||||
HashSet<String> _data;
|
||||
|
||||
public BlockingClusteringRowVisitor(Distance d, JSONObject o) {
|
||||
@ -104,10 +105,11 @@ public class kNNClusterer extends Clusterer {
|
||||
_config = o;
|
||||
_data = new HashSet<String>();
|
||||
try {
|
||||
_radius = (float) o.getJSONObject("params").getDouble("radius");
|
||||
JSONObject params = o.getJSONObject("params");
|
||||
_radius = params.getDouble("radius");
|
||||
_blockingNgramSize = params.getInt("blocking-ngram-size");
|
||||
} catch (JSONException e) {
|
||||
Gridworks.warn("No radius found, using default");
|
||||
_radius = 0.1f;
|
||||
Gridworks.warn("No parameters found, using defaults");
|
||||
}
|
||||
}
|
||||
|
||||
@ -122,7 +124,7 @@ public class kNNClusterer extends Clusterer {
|
||||
}
|
||||
|
||||
public Map<Serializable,Set<Serializable>> getClusters() {
|
||||
NGramTokenizer tokenizer = new NGramTokenizer(4,4,false,SimpleTokenizer.DEFAULT_TOKENIZER);
|
||||
NGramTokenizer tokenizer = new NGramTokenizer(_blockingNgramSize,_blockingNgramSize,false,SimpleTokenizer.DEFAULT_TOKENIZER);
|
||||
|
||||
Map<String,List<String>> blocks = new HashMap<String,List<String>>();
|
||||
|
||||
@ -148,9 +150,10 @@ public class kNNClusterer extends Clusterer {
|
||||
for (String a : list) {
|
||||
for (String b : list) {
|
||||
if (a == b) continue;
|
||||
if (clusters.containsKey(a) && clusters.get(a).contains(b)) continue;
|
||||
if (clusters.containsKey(b) && clusters.get(b).contains(a)) continue;
|
||||
double d = _distance.d(a,b);
|
||||
if (d <= _radius) {
|
||||
System.out.println(a + " | " + b + ": " + d);
|
||||
if (d <= _radius || _radius < 0) {
|
||||
Set<Serializable> l = null;
|
||||
if (!clusters.containsKey(a)) {
|
||||
l = new TreeSet<Serializable>();
|
||||
@ -165,6 +168,7 @@ public class kNNClusterer extends Clusterer {
|
||||
}
|
||||
}
|
||||
|
||||
Gridworks.log("Calculated " + _distance.getCount() + " distances");
|
||||
return clusters;
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +0,0 @@
|
||||
package edu.mit.simile.vicino;
|
||||
|
||||
public interface Distance {
|
||||
|
||||
public double d(String x, String y);
|
||||
|
||||
}
|
@ -2,7 +2,7 @@ package edu.mit.simile.vicino;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import edu.mit.simile.vicino.Distance;
|
||||
import edu.mit.simile.vicino.distances.Distance;
|
||||
|
||||
public class Distributor extends Operator {
|
||||
|
||||
|
@ -1,5 +1,7 @@
|
||||
package edu.mit.simile.vicino;
|
||||
|
||||
import edu.mit.simile.vicino.distances.Distance;
|
||||
|
||||
public class Meter extends Operator {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
@ -7,6 +7,8 @@ import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import edu.mit.simile.vicino.distances.Distance;
|
||||
|
||||
public class Operator {
|
||||
|
||||
static void log(String msg) {
|
||||
|
@ -6,6 +6,7 @@ import java.io.Serializable;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import edu.mit.simile.vicino.distances.Distance;
|
||||
import edu.mit.simile.vicino.vptree.VPTree;
|
||||
import edu.mit.simile.vicino.vptree.VPTreeBuilder;
|
||||
import edu.mit.simile.vicino.vptree.VPTreeSeeker;
|
||||
|
@ -2,6 +2,8 @@ package edu.mit.simile.vicino;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import edu.mit.simile.vicino.distances.Distance;
|
||||
|
||||
public class Tester extends Operator {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
13
src/main/java/edu/mit/simile/vicino/distances/Distance.java
Normal file
13
src/main/java/edu/mit/simile/vicino/distances/Distance.java
Normal file
@ -0,0 +1,13 @@
|
||||
package edu.mit.simile.vicino.distances;
|
||||
|
||||
/**
 * Base class for string distance functions that also exposes a count of
 * distance evaluations.
 *
 * <p>This class never modifies {@code counter} itself; it only stores it and
 * reports it through {@link #getCount()}. Implementations of
 * {@link #d(String, String)} are expected to update the field.</p>
 */
public abstract class Distance {

    /** Evaluation tally; starts at zero. Package-visible so subclasses in this package can update it. */
    int counter = 0;

    /**
     * Returns the current value of the evaluation counter.
     *
     * @return the value currently held in {@code counter}
     */
    public int getCount() {
        return counter;
    }

    /**
     * Computes the distance between two strings.
     *
     * @param x the first string
     * @param y the second string
     * @return the distance between {@code x} and {@code y}
     */
    public abstract double d(String x, String y);

}
|
@ -3,9 +3,7 @@ package edu.mit.simile.vicino.distances;
|
||||
import com.wcohen.ss.Levenstein;
|
||||
import com.wcohen.ss.api.StringDistance;
|
||||
|
||||
import edu.mit.simile.vicino.Distance;
|
||||
|
||||
public class LevenshteinDistance implements Distance {
|
||||
public class LevenshteinDistance extends MetricDistance {
|
||||
|
||||
StringDistance distance;
|
||||
|
||||
@ -13,7 +11,7 @@ public class LevenshteinDistance implements Distance {
|
||||
this.distance = new Levenstein();
|
||||
}
|
||||
|
||||
public double d(String x, String y) {
|
||||
public double d2(String x, String y) {
|
||||
return Math.abs(this.distance.score(x, y));
|
||||
}
|
||||
|
||||
|
@ -1,8 +1,7 @@
|
||||
package edu.mit.simile.vicino.distances;
|
||||
|
||||
import edu.mit.simile.vicino.Distance;
|
||||
|
||||
public abstract class MetricDistance implements Distance {
|
||||
public abstract class MetricDistance extends Distance {
|
||||
|
||||
/*
|
||||
* public float d(String x,String y) {
|
||||
@ -15,9 +14,11 @@ public abstract class MetricDistance implements Distance {
|
||||
*/
|
||||
|
||||
public double d(String x, String y) {
|
||||
return d2(x, y);
|
||||
double result = d2(x, y);
|
||||
counter += 1;
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
abstract double d2(String x, String y);
|
||||
|
||||
}
|
||||
|
@ -1,14 +1,14 @@
|
||||
package edu.mit.simile.vicino.distances;
|
||||
|
||||
import edu.mit.simile.vicino.Distance;
|
||||
|
||||
public abstract class PseudoMetricDistance implements Distance {
|
||||
public abstract class PseudoMetricDistance extends Distance {
|
||||
|
||||
public double d(String x, String y) {
|
||||
double cxx = d2(x, x);
|
||||
double cyy = d2(y, y);
|
||||
double cxy = d2(x, y);
|
||||
double cyx = d2(y, x);
|
||||
counter += 4;
|
||||
return (cxy + cyx) / (cxx + cyy) - 1.0d;
|
||||
}
|
||||
|
||||
|
@ -11,7 +11,7 @@ import java.util.Set;
|
||||
|
||||
import com.metaweb.gridworks.Gridworks;
|
||||
|
||||
import edu.mit.simile.vicino.Distance;
|
||||
import edu.mit.simile.vicino.distances.Distance;
|
||||
|
||||
/**
|
||||
* @author Paolo Ciccarese
|
||||
@ -60,14 +60,14 @@ public class VPTreeBuilder {
|
||||
this.nodes.clear();
|
||||
}
|
||||
|
||||
public Map<Serializable,List<? extends Serializable>> getClusters(float radius) {
|
||||
public Map<Serializable,List<Serializable>> getClusters(double radius) {
|
||||
VPTree tree = buildVPTree();
|
||||
VPTreeSeeker seeker = new VPTreeSeeker(distance,tree);
|
||||
|
||||
Map<Serializable,List<? extends Serializable>> map = new HashMap<Serializable,List<? extends Serializable>>();
|
||||
Map<Serializable,List<Serializable>> map = new HashMap<Serializable,List<Serializable>>();
|
||||
for (Node n : nodes) {
|
||||
Serializable s = n.get();
|
||||
List<? extends Serializable> results = seeker.range(s, radius);
|
||||
List<Serializable> results = seeker.range(s, radius);
|
||||
map.put(s, results);
|
||||
}
|
||||
|
||||
|
@ -4,7 +4,7 @@ import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import edu.mit.simile.vicino.Distance;
|
||||
import edu.mit.simile.vicino.distances.Distance;
|
||||
|
||||
/**
|
||||
* @author Paolo Ciccarese
|
||||
@ -19,11 +19,11 @@ public class VPTreeSeeker {
|
||||
this.tree = tree;
|
||||
}
|
||||
|
||||
public List<? extends Serializable> range(Serializable query, float range) {
|
||||
public List<Serializable> range(Serializable query, double range) {
|
||||
return rangeTraversal(query, range, tree.getRoot(), new ArrayList<Serializable>());
|
||||
}
|
||||
|
||||
private List<Serializable> rangeTraversal(Serializable query, float range, TNode tNode, List<Serializable> results) {
|
||||
private List<Serializable> rangeTraversal(Serializable query, double range, TNode tNode, List<Serializable> results) {
|
||||
|
||||
if (tNode != null) {
|
||||
double distance = this.distance.d(query.toString(), tNode.get().toString());
|
||||
|
@ -54,7 +54,8 @@ FacetBasedEditDialog.prototype._createDialog = function() {
|
||||
'Ngram Size: <input type="text" value="1" bind="ngramSize" size="3">' +
|
||||
'</div>' +
|
||||
'<div class="knn-controls hidden">' +
|
||||
'Radius: <input type="text" value="0.1" bind="radius" size="3">' +
|
||||
'<span style="margin-right: 1em">Radius: <input type="text" value="1.0" bind="radius" size="3"></span>' +
|
||||
'<span>Block Chars: <input type="text" value="6" bind="ngramBlock" size="3"></span>' +
|
||||
'</div>' +
|
||||
'</td>' +
|
||||
'<td bind="resultSummary" align="right">' +
|
||||
@ -114,6 +115,15 @@ FacetBasedEditDialog.prototype._createDialog = function() {
|
||||
alert("radius must be a number");
|
||||
}
|
||||
});
|
||||
|
||||
// Re-run clustering whenever the "Block Chars" field changes.
this._elmts.ngramBlock.change(function() {
    // parseInt() signals bad input by returning NaN, not by throwing, so the
    // value must be checked explicitly. The radix is fixed at 10 so that input
    // such as "010" is not read as octal by older engines.
    var blockSize = parseInt($(this).val(), 10);
    if (isNaN(blockSize)) {
        alert("block chars must be a number");
        return;
    }

    // NOTE(review): this replaces the whole params object, so a radius typed
    // earlier is not part of the request - confirm that is intended.
    self._params = { "blocking-ngram-size" : blockSize };
    self._cluster();
});
|
||||
|
||||
//this._elmts.clusterButton.click(function() { self._cluster(); });
|
||||
//this._elmts.unclusterButton.click(function() { self._uncluster(); });
|
||||
|
Loading…
Reference in New Issue
Block a user