N-gram blocking gives results closer to what is expected, but it is extremely slow; maybe there is a bug in the VP-tree code?

git-svn-id: http://google-refine.googlecode.com/svn/trunk@255 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Stefano Mazzocchi 2010-03-09 09:05:20 +00:00
parent 546f87a536
commit 50e58fb863
2 changed files with 58 additions and 38 deletions

View File

@ -9,6 +9,8 @@ import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Properties; import java.util.Properties;
import java.util.Set;
import java.util.TreeSet;
import org.json.JSONException; import org.json.JSONException;
import org.json.JSONObject; import org.json.JSONObject;
@ -22,9 +24,9 @@ import com.metaweb.gridworks.clustering.Clusterer;
import com.metaweb.gridworks.model.Cell; import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.Project; import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.model.Row; import com.metaweb.gridworks.model.Row;
import com.wcohen.ss.expt.ClusterNGramBlocker; import com.wcohen.ss.api.Token;
import com.wcohen.ss.expt.MatchData; import com.wcohen.ss.tokens.NGramTokenizer;
import com.wcohen.ss.expt.Blocker.Pair; import com.wcohen.ss.tokens.SimpleTokenizer;
import edu.mit.simile.vicino.Distance; import edu.mit.simile.vicino.Distance;
import edu.mit.simile.vicino.distances.BZip2Distance; import edu.mit.simile.vicino.distances.BZip2Distance;
@ -43,7 +45,7 @@ public class kNNClusterer extends Clusterer {
static protected Map<String, Distance> _distances = new HashMap<String, Distance>(); static protected Map<String, Distance> _distances = new HashMap<String, Distance>();
List<List<Serializable>> _clusters; ArrayList<Set<Serializable>> _clusters;
static { static {
_distances.put("levenshtein", new LevenshteinDistance()); _distances.put("levenshtein", new LevenshteinDistance());
@ -94,15 +96,13 @@ public class kNNClusterer extends Clusterer {
Distance _distance; Distance _distance;
JSONObject _config; JSONObject _config;
MatchData _data;
float _radius; float _radius;
HashSet<String> _set; HashSet<String> _data;
public BlockingClusteringRowVisitor(Distance d, JSONObject o) { public BlockingClusteringRowVisitor(Distance d, JSONObject o) {
_distance = d; _distance = d;
_config = o; _config = o;
_data = new MatchData(); _data = new HashSet<String>();
_set = new HashSet<String>();
try { try {
_radius = (float) o.getJSONObject("params").getDouble("radius"); _radius = (float) o.getJSONObject("params").getDouble("radius");
} catch (JSONException e) { } catch (JSONException e) {
@ -116,41 +116,61 @@ public class kNNClusterer extends Clusterer {
if (cell != null && cell.value != null) { if (cell != null && cell.value != null) {
Object v = cell.value; Object v = cell.value;
String s = (v instanceof String) ? ((String) v) : v.toString().intern(); String s = (v instanceof String) ? ((String) v) : v.toString().intern();
if (!_set.contains(s)) { _data.add(s);
_set.add(s);
_data.addInstance("", "", s);
}
} }
return false; return false;
} }
public Map<Serializable,List<Serializable>> getClusters() { public Map<Serializable,Set<Serializable>> getClusters() {
Map<Serializable,List<Serializable>> map = new HashMap<Serializable,List<Serializable>>(); NGramTokenizer tokenizer = new NGramTokenizer(4,4,false,SimpleTokenizer.DEFAULT_TOKENIZER);
ClusterNGramBlocker blocker = new ClusterNGramBlocker();
blocker.block(_data); Map<String,List<String>> blocks = new HashMap<String,List<String>>();
for (int i = 0; i < blocker.numCorrectPairs(); i++) {
Pair p = blocker.getPair(i); for (String s : _data) {
String a = p.getA().unwrap(); Token[] tokens = tokenizer.tokenize(s);
String b = p.getB().unwrap(); for (Token t : tokens) {
List<Serializable> l = null; String ss = t.getValue();
if (!map.containsKey(a)) { List<String> l = null;
l = new ArrayList<Serializable>(); if (!blocks.containsKey(ss)) {
map.put(a, l); l = new ArrayList<String>();
} else { blocks.put(ss, l);
l = map.get(a); } else {
} l = blocks.get(ss);
double d = _distance.d(a,b); }
System.out.println(a + " | " + b + ": " + d); l.add(s);
if (d <= _radius) {
l.add(b);
} }
} }
return map;
Map<Serializable,Set<Serializable>> clusters = new HashMap<Serializable,Set<Serializable>>();
for (List<String> list : blocks.values()) {
if (list.size() < 2) continue;
for (String a : list) {
for (String b : list) {
if (a == b) continue;
double d = _distance.d(a,b);
if (d <= _radius) {
System.out.println(a + " | " + b + ": " + d);
Set<Serializable> l = null;
if (!clusters.containsKey(a)) {
l = new TreeSet<Serializable>();
l.add(a);
clusters.put(a, l);
} else {
l = clusters.get(a);
}
l.add(b);
}
}
}
}
return clusters;
} }
} }
public class SizeComparator implements Comparator<List<Serializable>> { public class SizeComparator implements Comparator<Set<Serializable>> {
public int compare(List<Serializable> o1, List<Serializable> o2) { public int compare(Set<Serializable> o1, Set<Serializable> o2) {
return o2.size() - o1.size(); return o2.size() - o1.size();
} }
} }
@ -166,14 +186,14 @@ public class kNNClusterer extends Clusterer {
FilteredRows filteredRows = engine.getAllFilteredRows(true); FilteredRows filteredRows = engine.getAllFilteredRows(true);
filteredRows.accept(_project, visitor); filteredRows.accept(_project, visitor);
Map<Serializable,List<Serializable>> clusters = visitor.getClusters(); Map<Serializable,Set<Serializable>> clusters = visitor.getClusters();
_clusters = new ArrayList<List<Serializable>>(clusters.values()); _clusters = new ArrayList<Set<Serializable>>(clusters.values());
Collections.sort(_clusters, new SizeComparator()); Collections.sort(_clusters, new SizeComparator());
} }
public void write(JSONWriter writer, Properties options) throws JSONException { public void write(JSONWriter writer, Properties options) throws JSONException {
writer.array(); writer.array();
for (List<Serializable> m : _clusters) { for (Set<Serializable> m : _clusters) {
if (m.size() > 1) { if (m.size() > 1) {
writer.array(); writer.array();
for (Serializable s : m) { for (Serializable s : m) {

View File

@ -14,7 +14,7 @@ public class LevenshteinDistance implements Distance {
} }
public double d(String x, String y) { public double d(String x, String y) {
return this.distance.score(x, y); return Math.abs(this.distance.score(x, y));
} }
} }