diff --git a/src/main/java/edu/mit/simile/vicino/Cluster.java b/src/main/java/edu/mit/simile/vicino/Cluster.java index a80a76863..dd5b9c327 100644 --- a/src/main/java/edu/mit/simile/vicino/Cluster.java +++ b/src/main/java/edu/mit/simile/vicino/Cluster.java @@ -44,7 +44,7 @@ public class Cluster extends Operator { log("VPTree found " + vptree_clusters.size() + " in " + vptree_elapsed + " ms with " + vptree_distances + " distances\n"); log("NGram found " + ngram_clusters.size() + " in " + ngram_elapsed + " ms with " + ngram_distances + " distances\n"); - + if (vptree_clusters.size() > ngram_clusters.size()) { log("VPTree clusterer found these clusters the other method couldn't: "); diff(vptree_clusters,ngram_clusters); @@ -52,6 +52,8 @@ public class Cluster extends Operator { log("NGram clusterer found these clusters the other method couldn't: "); diff(ngram_clusters,vptree_clusters); } + + System.exit(0); } private void diff(List> more, List> base) { @@ -63,11 +65,15 @@ public class Cluster extends Operator { for (Set s : more) { if (!holder.contains(s)) { - for (Serializable ss : s) { - log(ss.toString()); - } - log(""); + printCluster(s); } } } + + private void printCluster(Set cluster) { + for (Serializable s : cluster) { + log(s.toString()); + } + log(""); + } } diff --git a/src/main/java/edu/mit/simile/vicino/clustering/NGramClusterer.java b/src/main/java/edu/mit/simile/vicino/clustering/NGramClusterer.java index 13ece9103..bbf3dacbf 100644 --- a/src/main/java/edu/mit/simile/vicino/clustering/NGramClusterer.java +++ b/src/main/java/edu/mit/simile/vicino/clustering/NGramClusterer.java @@ -4,11 +4,17 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeSet; import java.util.Map.Entry; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; import com.wcohen.ss.api.Token; @@ -42,7 +48,111 @@ public class NGramClusterer extends Clusterer { } } + public class BlockEvaluator implements Callable>> { + + int start; + int stop; + double radius; + + List> blocks; + Map> cluster_map; + + public BlockEvaluator(List> blocks, double radius, int start, int stop) { + this.blocks = blocks; + this.start = start; + this.stop = stop; + this.radius = radius; + } + + public Map> call() { + Map> cluster_map = new HashMap>(); + + for (int i = start; i < stop; i++) { + Set set = blocks.get(i); + if (set.size() < 2) continue; + for (String a : set) { + for (String b : set) { + if (a == b) continue; + if (cluster_map.containsKey(a) && cluster_map.get(a).contains(b)) continue; + if (cluster_map.containsKey(b) && cluster_map.get(b).contains(a)) continue; + double d = _distance.d(a,b); + if (d <= radius || radius < 0) { + Set l = null; + if (!cluster_map.containsKey(a)) { + l = new TreeSet(); + l.add(a); + cluster_map.put(a, l); + } else { + l = cluster_map.get(a); + } + l.add(b); + } + } + } + } + + return cluster_map; + } + } + + private static final ExecutorService executor = Executors.newCachedThreadPool(); + + private static final boolean MULTITHREADED = true; + public List> getClusters(double radius) { + if (MULTITHREADED) { + return getClustersMultiThread(radius); + } else { + return getClustersSingleThread(radius); + } + } + + public List> getClustersMultiThread(double radius) { + + int cores = Runtime.getRuntime().availableProcessors(); + int size = blocks.size(); + int range = size / cores + 1; + + List>> cluster_maps = new ArrayList>>(cores); + + List evaluators = new ArrayList(cores); + for (int i = 0; i < cores; i++) { + int range_start = range * i; + int range_end = range * (i + 1); + if (range_end > size) range_end = size; + evaluators.add(new BlockEvaluator(new ArrayList>(blocks.values()),radius,range_start,range_end)); + } + + try { + List>>> futures = executor.invokeAll(evaluators); + for (Future>> future : futures) { + cluster_maps.add(future.get()); + } + } catch (InterruptedException e1) { + e1.printStackTrace(); + } catch (ExecutionException e) { + e.printStackTrace(); + } + + Set> clusters = new HashSet>(); + + for (Map> cluster_map : cluster_maps) { + for (Entry> e : cluster_map.entrySet()) { + Set v = e.getValue(); + if (v.size() > 1) { + clusters.add(v); + } + } + } + + List> sorted_clusters = new ArrayList>(clusters); + + Collections.sort(sorted_clusters, new SizeComparator()); + + return sorted_clusters; + } + + public List> getClustersSingleThread(double radius) { Map> cluster_map = new HashMap>();