Make the kNN clusterer report the correct counts for the facet values, and order the values within each cluster by count.

git-svn-id: http://google-refine.googlecode.com/svn/trunk@286 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Stefano Mazzocchi 2010-03-12 19:10:22 +00:00
parent d72c07b715
commit 00a81c5fc4

View File

@ -11,6 +11,7 @@ import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TreeSet;
import java.util.Map.Entry;
import org.json.JSONException;
import org.json.JSONObject;
@ -44,8 +45,10 @@ public class kNNClusterer extends Clusterer {
static protected Map<String, Distance> _distances = new HashMap<String, Distance>();
ArrayList<Set<Serializable>> _clusters;
List<Set<Serializable>> _clusters;
Map<Serializable, Integer> _counts = new HashMap<Serializable, Integer>();
static {
_distances.put("levenshtein", new LevenshteinDistance());
_distances.put("jaccard", new JaccardDistance());
@ -82,6 +85,7 @@ public class kNNClusterer extends Clusterer {
Object v = cell.value;
String s = (v instanceof String) ? ((String) v) : v.toString();
_treeBuilder.populate(s);
count(s);
}
return false;
}
@ -120,6 +124,7 @@ public class kNNClusterer extends Clusterer {
Object v = cell.value;
String s = (v instanceof String) ? ((String) v) : v.toString().intern();
_data.add(s);
count(s);
}
return false;
}
@ -183,6 +188,12 @@ public class kNNClusterer extends Clusterer {
return o2.size() - o1.size();
}
}
/**
 * Orders map entries by their integer value, largest value first.
 *
 * Entries with equal values compare as equal (result 0), so a stable sort keeps
 * their incoming relative order. A null entry value causes a NullPointerException.
 */
public class ValuesComparator implements Comparator<Entry<Serializable,Integer>> {
    /**
     * @param o1 the first entry
     * @param o2 the second entry
     * @return a negative number when o1 holds the larger value, a positive number
     *         when o2 holds the larger value, and 0 when both values are equal
     */
    public int compare(Entry<Serializable,Integer> o1, Entry<Serializable,Integer> o2) {
        // Compare instead of subtracting: "o2 - o1" can overflow int and flip the sign.
        // Operands are reversed (o2 first) to get descending order.
        return o2.getValue().compareTo(o1.getValue());
    }
}
public void initializeFromJSON(Project project, JSONObject o) throws Exception {
super.initializeFromJSON(project, o);
@ -204,11 +215,17 @@ public class kNNClusterer extends Clusterer {
writer.array();
for (Set<Serializable> m : _clusters) {
if (m.size() > 1) {
writer.array();
Map<Serializable,Integer> internal_counts = new HashMap<Serializable,Integer>();
for (Serializable s : m) {
internal_counts.put(s,_counts.get(s));
}
List<Entry<Serializable,Integer>> values = new ArrayList<Entry<Serializable,Integer>>(internal_counts.entrySet());
Collections.sort(values, new ValuesComparator());
writer.array();
for (Entry<Serializable,Integer> e : values) {
writer.object();
writer.key("v"); writer.value(s);
writer.key("c"); writer.value(1);
writer.key("v"); writer.value(e.getKey());
writer.key("c"); writer.value(e.getValue());
writer.endObject();
}
writer.endArray();
@ -216,4 +233,12 @@ public class kNNClusterer extends Clusterer {
}
writer.endArray();
}
/**
 * Records one more occurrence of the given value in {@code _counts}.
 * A value not yet present is stored with a tally of 1; otherwise its
 * stored tally is increased by one.
 *
 * @param s the value whose occurrence is being recorded
 */
private void count(Serializable s) {
    final int tally = _counts.containsKey(s) ? _counts.get(s) + 1 : 1;
    _counts.put(s, tally);
}
}