make the kNN clustering report the right counts for the facet values (and order them in the clusters by counts)

git-svn-id: http://google-refine.googlecode.com/svn/trunk@286 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Stefano Mazzocchi 2010-03-12 19:10:22 +00:00
parent d72c07b715
commit 00a81c5fc4

View File

@ -11,6 +11,7 @@ import java.util.Map;
import java.util.Properties; import java.util.Properties;
import java.util.Set; import java.util.Set;
import java.util.TreeSet; import java.util.TreeSet;
import java.util.Map.Entry;
import org.json.JSONException; import org.json.JSONException;
import org.json.JSONObject; import org.json.JSONObject;
@ -44,8 +45,10 @@ public class kNNClusterer extends Clusterer {
static protected Map<String, Distance> _distances = new HashMap<String, Distance>(); static protected Map<String, Distance> _distances = new HashMap<String, Distance>();
ArrayList<Set<Serializable>> _clusters; List<Set<Serializable>> _clusters;
Map<Serializable, Integer> _counts = new HashMap<Serializable, Integer>();
static { static {
_distances.put("levenshtein", new LevenshteinDistance()); _distances.put("levenshtein", new LevenshteinDistance());
_distances.put("jaccard", new JaccardDistance()); _distances.put("jaccard", new JaccardDistance());
@ -82,6 +85,7 @@ public class kNNClusterer extends Clusterer {
Object v = cell.value; Object v = cell.value;
String s = (v instanceof String) ? ((String) v) : v.toString(); String s = (v instanceof String) ? ((String) v) : v.toString();
_treeBuilder.populate(s); _treeBuilder.populate(s);
count(s);
} }
return false; return false;
} }
@ -120,6 +124,7 @@ public class kNNClusterer extends Clusterer {
Object v = cell.value; Object v = cell.value;
String s = (v instanceof String) ? ((String) v) : v.toString().intern(); String s = (v instanceof String) ? ((String) v) : v.toString().intern();
_data.add(s); _data.add(s);
count(s);
} }
return false; return false;
} }
@ -183,6 +188,12 @@ public class kNNClusterer extends Clusterer {
return o2.size() - o1.size(); return o2.size() - o1.size();
} }
} }
/**
 * Orders map entries by their integer value in descending order, so the
 * entry with the largest value sorts first. Keys are not consulted; entries
 * with equal values compare as equal.
 */
public class ValuesComparator implements Comparator<Entry<Serializable,Integer>> {
    /**
     * @param o1 first entry to compare
     * @param o2 second entry to compare
     * @return a negative number if {@code o1} has the larger value, a positive
     *         number if {@code o2} has the larger value, zero if they are equal
     * @throws NullPointerException if either entry carries a null value
     */
    public int compare(Entry<Serializable,Integer> o1, Entry<Serializable,Integer> o2) {
        // Compare the boxed values directly instead of subtracting them:
        // subtraction wraps around on overflow and can report the wrong sign.
        return o2.getValue().compareTo(o1.getValue());
    }
}
public void initializeFromJSON(Project project, JSONObject o) throws Exception { public void initializeFromJSON(Project project, JSONObject o) throws Exception {
super.initializeFromJSON(project, o); super.initializeFromJSON(project, o);
@ -204,11 +215,17 @@ public class kNNClusterer extends Clusterer {
writer.array(); writer.array();
for (Set<Serializable> m : _clusters) { for (Set<Serializable> m : _clusters) {
if (m.size() > 1) { if (m.size() > 1) {
writer.array(); Map<Serializable,Integer> internal_counts = new HashMap<Serializable,Integer>();
for (Serializable s : m) { for (Serializable s : m) {
internal_counts.put(s,_counts.get(s));
}
List<Entry<Serializable,Integer>> values = new ArrayList<Entry<Serializable,Integer>>(internal_counts.entrySet());
Collections.sort(values, new ValuesComparator());
writer.array();
for (Entry<Serializable,Integer> e : values) {
writer.object(); writer.object();
writer.key("v"); writer.value(s); writer.key("v"); writer.value(e.getKey());
writer.key("c"); writer.value(1); writer.key("c"); writer.value(e.getValue());
writer.endObject(); writer.endObject();
} }
writer.endArray(); writer.endArray();
@ -216,4 +233,12 @@ public class kNNClusterer extends Clusterer {
} }
writer.endArray(); writer.endArray();
} }
/**
 * Records one more occurrence of the given value in the {@code _counts}
 * tally: a value not yet present starts at 1, otherwise its stored count
 * is incremented by one.
 *
 * @param s the value whose occurrence count should be bumped
 */
private void count(Serializable s) {
    final int occurrences = _counts.containsKey(s) ? _counts.get(s) + 1 : 1;
    _counts.put(s, occurrences);
}
} }