make the kNN clustering report the right counts for the facet values (and order them in the clusters by counts)
git-svn-id: http://google-refine.googlecode.com/svn/trunk@286 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
d72c07b715
commit
00a81c5fc4
@ -11,6 +11,7 @@ import java.util.Map;
|
|||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
|
import java.util.Map.Entry;
|
||||||
|
|
||||||
import org.json.JSONException;
|
import org.json.JSONException;
|
||||||
import org.json.JSONObject;
|
import org.json.JSONObject;
|
||||||
@ -44,7 +45,9 @@ public class kNNClusterer extends Clusterer {
|
|||||||
|
|
||||||
static protected Map<String, Distance> _distances = new HashMap<String, Distance>();
|
static protected Map<String, Distance> _distances = new HashMap<String, Distance>();
|
||||||
|
|
||||||
ArrayList<Set<Serializable>> _clusters;
|
List<Set<Serializable>> _clusters;
|
||||||
|
|
||||||
|
Map<Serializable, Integer> _counts = new HashMap<Serializable, Integer>();
|
||||||
|
|
||||||
static {
|
static {
|
||||||
_distances.put("levenshtein", new LevenshteinDistance());
|
_distances.put("levenshtein", new LevenshteinDistance());
|
||||||
@ -82,6 +85,7 @@ public class kNNClusterer extends Clusterer {
|
|||||||
Object v = cell.value;
|
Object v = cell.value;
|
||||||
String s = (v instanceof String) ? ((String) v) : v.toString();
|
String s = (v instanceof String) ? ((String) v) : v.toString();
|
||||||
_treeBuilder.populate(s);
|
_treeBuilder.populate(s);
|
||||||
|
count(s);
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -120,6 +124,7 @@ public class kNNClusterer extends Clusterer {
|
|||||||
Object v = cell.value;
|
Object v = cell.value;
|
||||||
String s = (v instanceof String) ? ((String) v) : v.toString().intern();
|
String s = (v instanceof String) ? ((String) v) : v.toString().intern();
|
||||||
_data.add(s);
|
_data.add(s);
|
||||||
|
count(s);
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -184,6 +189,12 @@ public class kNNClusterer extends Clusterer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Orders map entries by their integer value, largest value first.
 * Only the values are compared; the keys are never consulted, so two
 * entries with the same value compare as equal.
 */
public class ValuesComparator implements Comparator<Entry<Serializable,Integer>> {
    /**
     * Compares two entries by value in descending order.
     *
     * @param o1 the first entry
     * @param o2 the second entry
     * @return a negative number if o1's value is greater than o2's,
     *         a positive number if it is smaller, and zero if both are equal
     * @throws NullPointerException if either entry carries a null value
     */
    public int compare(Entry<Serializable,Integer> o1, Entry<Serializable,Integer> o2) {
        // Use compareTo rather than "o2 - o1": the subtraction can overflow
        // when the two values are far apart and then reports the wrong sign.
        return o2.getValue().compareTo(o1.getValue());
    }
}
|
||||||
|
|
||||||
public void initializeFromJSON(Project project, JSONObject o) throws Exception {
|
public void initializeFromJSON(Project project, JSONObject o) throws Exception {
|
||||||
super.initializeFromJSON(project, o);
|
super.initializeFromJSON(project, o);
|
||||||
_distance = _distances.get(o.getString("function").toLowerCase());
|
_distance = _distances.get(o.getString("function").toLowerCase());
|
||||||
@ -204,11 +215,17 @@ public class kNNClusterer extends Clusterer {
|
|||||||
writer.array();
|
writer.array();
|
||||||
for (Set<Serializable> m : _clusters) {
|
for (Set<Serializable> m : _clusters) {
|
||||||
if (m.size() > 1) {
|
if (m.size() > 1) {
|
||||||
writer.array();
|
Map<Serializable,Integer> internal_counts = new HashMap<Serializable,Integer>();
|
||||||
for (Serializable s : m) {
|
for (Serializable s : m) {
|
||||||
|
internal_counts.put(s,_counts.get(s));
|
||||||
|
}
|
||||||
|
List<Entry<Serializable,Integer>> values = new ArrayList<Entry<Serializable,Integer>>(internal_counts.entrySet());
|
||||||
|
Collections.sort(values, new ValuesComparator());
|
||||||
|
writer.array();
|
||||||
|
for (Entry<Serializable,Integer> e : values) {
|
||||||
writer.object();
|
writer.object();
|
||||||
writer.key("v"); writer.value(s);
|
writer.key("v"); writer.value(e.getKey());
|
||||||
writer.key("c"); writer.value(1);
|
writer.key("c"); writer.value(e.getValue());
|
||||||
writer.endObject();
|
writer.endObject();
|
||||||
}
|
}
|
||||||
writer.endArray();
|
writer.endArray();
|
||||||
@ -216,4 +233,12 @@ public class kNNClusterer extends Clusterer {
|
|||||||
}
|
}
|
||||||
writer.endArray();
|
writer.endArray();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void count(Serializable s) {
|
||||||
|
if (_counts.containsKey(s)) {
|
||||||
|
_counts.put(s, _counts.get(s) + 1);
|
||||||
|
} else {
|
||||||
|
_counts.put(s, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user