From 50e58fb863a05e5c09a4099e735588c3bcec3d44 Mon Sep 17 00:00:00 2001 From: Stefano Mazzocchi Date: Tue, 9 Mar 2010 09:05:20 +0000 Subject: [PATCH] ngram-blocking gives more expected results... but slow as hell, maybe bug in the vptree code? git-svn-id: http://google-refine.googlecode.com/svn/trunk@255 7d457c2a-affb-35e4-300a-418c747d4874 --- .../clustering/knn/kNNClusterer.java | 94 +++++++++++-------- .../vicino/distances/LevenshteinDistance.java | 2 +- 2 files changed, 58 insertions(+), 38 deletions(-) diff --git a/src/main/java/com/metaweb/gridworks/clustering/knn/kNNClusterer.java b/src/main/java/com/metaweb/gridworks/clustering/knn/kNNClusterer.java index 5583d5b52..2353a7dcd 100644 --- a/src/main/java/com/metaweb/gridworks/clustering/knn/kNNClusterer.java +++ b/src/main/java/com/metaweb/gridworks/clustering/knn/kNNClusterer.java @@ -9,6 +9,8 @@ import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Properties; +import java.util.Set; +import java.util.TreeSet; import org.json.JSONException; import org.json.JSONObject; @@ -22,9 +24,9 @@ import com.metaweb.gridworks.clustering.Clusterer; import com.metaweb.gridworks.model.Cell; import com.metaweb.gridworks.model.Project; import com.metaweb.gridworks.model.Row; -import com.wcohen.ss.expt.ClusterNGramBlocker; -import com.wcohen.ss.expt.MatchData; -import com.wcohen.ss.expt.Blocker.Pair; +import com.wcohen.ss.api.Token; +import com.wcohen.ss.tokens.NGramTokenizer; +import com.wcohen.ss.tokens.SimpleTokenizer; import edu.mit.simile.vicino.Distance; import edu.mit.simile.vicino.distances.BZip2Distance; @@ -43,7 +45,7 @@ public class kNNClusterer extends Clusterer { static protected Map _distances = new HashMap(); - List> _clusters; + ArrayList> _clusters; static { _distances.put("levenshtein", new LevenshteinDistance()); @@ -94,15 +96,13 @@ public class kNNClusterer extends Clusterer { Distance _distance; JSONObject _config; - MatchData _data; float _radius; - HashSet _set; + HashSet _data; public BlockingClusteringRowVisitor(Distance d, JSONObject o) { _distance = d; _config = o; - _data = new MatchData(); - _set = new HashSet(); + _data = new HashSet(); try { _radius = (float) o.getJSONObject("params").getDouble("radius"); } catch (JSONException e) { @@ -116,41 +116,61 @@ public class kNNClusterer extends Clusterer { if (cell != null && cell.value != null) { Object v = cell.value; String s = (v instanceof String) ? ((String) v) : v.toString().intern(); - if (!_set.contains(s)) { - _set.add(s); - _data.addInstance("", "", s); - } + _data.add(s); } return false; } - public Map> getClusters() { - Map> map = new HashMap>(); - ClusterNGramBlocker blocker = new ClusterNGramBlocker(); - blocker.block(_data); - for (int i = 0; i < blocker.numCorrectPairs(); i++) { - Pair p = blocker.getPair(i); - String a = p.getA().unwrap(); - String b = p.getB().unwrap(); - List l = null; - if (!map.containsKey(a)) { - l = new ArrayList(); - map.put(a, l); - } else { - l = map.get(a); - } - double d = _distance.d(a,b); - System.out.println(a + " | " + b + ": " + d); - if (d <= _radius) { - l.add(b); + public Map> getClusters() { + NGramTokenizer tokenizer = new NGramTokenizer(4,4,false,SimpleTokenizer.DEFAULT_TOKENIZER); + + Map> blocks = new HashMap>(); + + for (String s : _data) { + Token[] tokens = tokenizer.tokenize(s); + for (Token t : tokens) { + String ss = t.getValue(); + List l = null; + if (!blocks.containsKey(ss)) { + l = new ArrayList(); + blocks.put(ss, l); + } else { + l = blocks.get(ss); + } + l.add(s); } } - return map; + + Map> clusters = new HashMap>(); + + for (List list : blocks.values()) { + if (list.size() < 2) continue; + for (String a : list) { + for (String b : list) { + if (a == b) continue; + double d = _distance.d(a,b); + if (d <= _radius) { + System.out.println(a + " | " + b + ": " + d); + Set l = null; + if (!clusters.containsKey(a)) { + l = new TreeSet(); + l.add(a); + clusters.put(a, l); + } else { + l = clusters.get(a); + } + l.add(b); + } + } + } + } + + return clusters; } } - public class SizeComparator implements Comparator> { - public int compare(List o1, List o2) { + public class SizeComparator implements Comparator> { + public int compare(Set o1, Set o2) { return o2.size() - o1.size(); } } @@ -166,14 +186,14 @@ public class kNNClusterer extends Clusterer { FilteredRows filteredRows = engine.getAllFilteredRows(true); filteredRows.accept(_project, visitor); - Map> clusters = visitor.getClusters(); - _clusters = new ArrayList>(clusters.values()); + Map> clusters = visitor.getClusters(); + _clusters = new ArrayList>(clusters.values()); Collections.sort(_clusters, new SizeComparator()); } public void write(JSONWriter writer, Properties options) throws JSONException { writer.array(); - for (List m : _clusters) { + for (Set m : _clusters) { if (m.size() > 1) { writer.array(); for (Serializable s : m) { diff --git a/src/main/java/edu/mit/simile/vicino/distances/LevenshteinDistance.java b/src/main/java/edu/mit/simile/vicino/distances/LevenshteinDistance.java index 93720d072..0e578b48c 100644 --- a/src/main/java/edu/mit/simile/vicino/distances/LevenshteinDistance.java +++ b/src/main/java/edu/mit/simile/vicino/distances/LevenshteinDistance.java @@ -14,7 +14,7 @@ public class LevenshteinDistance implements Distance { } public double d(String x, String y) { - return this.distance.score(x, y); + return Math.abs(this.distance.score(x, y)); } }