From 227b30c860c16a9afc9ce2ad188e8951e0506bf9 Mon Sep 17 00:00:00 2001 From: Stefano Mazzocchi Date: Mon, 15 Mar 2010 04:30:49 +0000 Subject: [PATCH] more optimizations for clustering git-svn-id: http://google-refine.googlecode.com/svn/trunk@296 7d457c2a-affb-35e4-300a-418c747d4874 --- .../java/edu/mit/simile/vicino/Cluster.java | 36 ++++-- .../edu/mit/simile/vicino/NGramTokenizer.java | 110 +++++++++++------ .../java/edu/mit/simile/vicino/Operator.java | 32 +++-- .../vicino/clustering/NGramClusterer.java | 3 +- .../vicino/clustering/VPTreeClusterer.java | 1 + .../simile/vicino/vptree/VPTreeBuilder.java | 112 ++++++++++++------ 6 files changed, 189 insertions(+), 105 deletions(-) diff --git a/src/main/java/edu/mit/simile/vicino/Cluster.java b/src/main/java/edu/mit/simile/vicino/Cluster.java index fc412bd07..a80a76863 100644 --- a/src/main/java/edu/mit/simile/vicino/Cluster.java +++ b/src/main/java/edu/mit/simile/vicino/Cluster.java @@ -1,6 +1,7 @@ package edu.mit.simile.vicino; import java.io.Serializable; +import java.util.HashSet; import java.util.List; import java.util.Set; @@ -42,20 +43,31 @@ public class Cluster extends Operator { distance.resetCounter(); log("VPTree found " + vptree_clusters.size() + " in " + vptree_elapsed + " ms with " + vptree_distances + " distances\n"); - for (Set s : vptree_clusters) { - for (Serializable ss : s) { - log(" " + ss); - } - log(""); - } - log("NGram found " + ngram_clusters.size() + " in " + ngram_elapsed + " ms with " + ngram_distances + " distances\n"); - for (Set s : ngram_clusters) { - for (Serializable ss : s) { - log(" " + ss); - } - log(""); + + if (vptree_clusters.size() > ngram_clusters.size()) { + log("VPTree clusterer found these clusters the other method couldn't: "); + diff(vptree_clusters,ngram_clusters); + } else if (ngram_clusters.size() > vptree_clusters.size()) { + log("NGram clusterer found these clusters the other method couldn't: "); + diff(ngram_clusters,vptree_clusters); + } + } + + private void diff(List> more, List> base) { + Set> holder = new HashSet>(base.size()); + + for (Set s : base) { + holder.add(s); } + for (Set s : more) { + if (!holder.contains(s)) { + for (Serializable ss : s) { + log(ss.toString()); + } + log(""); + } + } } } diff --git a/src/main/java/edu/mit/simile/vicino/NGramTokenizer.java b/src/main/java/edu/mit/simile/vicino/NGramTokenizer.java index 9e7cb1d7f..6d4891e82 100644 --- a/src/main/java/edu/mit/simile/vicino/NGramTokenizer.java +++ b/src/main/java/edu/mit/simile/vicino/NGramTokenizer.java @@ -3,58 +3,92 @@ package edu.mit.simile.vicino; import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import java.util.regex.Pattern; import com.wcohen.ss.api.Token; import com.wcohen.ss.api.Tokenizer; -import com.wcohen.ss.tokens.BasicToken; -import com.wcohen.ss.tokens.SimpleTokenizer; -/** - * Wraps another tokenizer, and adds all computes all ngrams of - * characters from a single token produced by the inner tokenizer. - */ public class NGramTokenizer implements Tokenizer { - private int minNGramSize; - private int maxNGramSize; - private boolean keepOldTokens; - private Tokenizer innerTokenizer; - - public static NGramTokenizer DEFAULT_TOKENIZER = new NGramTokenizer(3,5,true,SimpleTokenizer.DEFAULT_TOKENIZER); + private int ngram_size; - public NGramTokenizer(int minNGramSize,int maxNGramSize,boolean keepOldTokens,Tokenizer innerTokenizer) { - this.minNGramSize = minNGramSize; - this.maxNGramSize = maxNGramSize; - this.keepOldTokens = keepOldTokens; - this.innerTokenizer = innerTokenizer; + public NGramTokenizer(int ngram_size) { + this.ngram_size = ngram_size; } - public Token[] tokenize(String input) { - Token[] initialTokens = innerTokenizer.tokenize(input); + public Token[] tokenize(String str) { + str = normalize(str); List tokens = new ArrayList(); - for (int i = 0; i < initialTokens.length; i++) { - String str = initialTokens[i].getValue(); - if (keepOldTokens) tokens.add( intern(str) ); - for (int lo = 0; lo < str.length(); lo++) { - for (int len = minNGramSize; len <= maxNGramSize; len++) { - if (lo + len < str.length()) { - tokens.add(innerTokenizer.intern(str.substring(lo,lo+len))); - } - } + for (int i = 0; i < str.length(); i++) { + int index = i + ngram_size; + if (index <= str.length()) { + tokens.add(intern(str.substring(i,index))); } } return (Token[]) tokens.toArray(new BasicToken[tokens.size()]); } - - public Token intern(String s) { - return innerTokenizer.intern(s); - } - - public Iterator tokenIterator() { - return innerTokenizer.tokenIterator(); - } - public int maxTokenIndex() { - return innerTokenizer.maxTokenIndex(); + static final Pattern extra = Pattern.compile("\\p{Cntrl}|\\p{Punct}"); + static final Pattern whitespace = Pattern.compile("\\p{Space}+"); + + private String normalize(String s) { + s = s.trim(); + s = extra.matcher(s).replaceAll(""); + s = whitespace.matcher(s).replaceAll(" "); + s = s.toLowerCase(); + return s.intern(); + } + + private int nextId = 0; + private Map tokMap = new TreeMap(); + + public Token intern(String s) { + s = s.toLowerCase().intern(); + Token tok = tokMap.get(s); + if (tok == null) { + tok = new BasicToken(++nextId, s); + tokMap.put(s, tok); + } + return tok; + } + + public Iterator tokenIterator() { + return tokMap.values().iterator(); + } + + public int maxTokenIndex() { + return nextId; + } + + public class BasicToken implements Token, Comparable { + private final int index; + private final String value; + + BasicToken(int index, String value) { + this.index = index; + this.value = value; + } + + public String getValue() { + return value; + } + + public int getIndex() { + return index; + } + + public int compareTo(Token t) { + return index - t.getIndex(); + } + + public int hashCode() { + return value.hashCode(); + } + + public String toString() { + return "[token#" + getIndex() + ":" + getValue() + "]"; + } } } diff --git a/src/main/java/edu/mit/simile/vicino/Operator.java b/src/main/java/edu/mit/simile/vicino/Operator.java index c4cff31c3..b064860cf 100644 --- a/src/main/java/edu/mit/simile/vicino/Operator.java +++ b/src/main/java/edu/mit/simile/vicino/Operator.java @@ -2,8 +2,9 @@ package edu.mit.simile.vicino; import java.io.BufferedReader; import java.io.File; -import java.io.FileReader; +import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStreamReader; import java.util.ArrayList; import java.util.List; @@ -20,30 +21,27 @@ public class Operator { } static List getStrings(String fileName) throws IOException { - ArrayList strings = new ArrayList(); + List strings = new ArrayList(); File file = new File(fileName); if (file.isDirectory()) { File[] files = file.listFiles(); - for (int i = 0; i < files.length; i++) { - BufferedReader input = new BufferedReader(new FileReader(files[i])); - StringBuffer b = new StringBuffer(); - String line; - while ((line = input.readLine()) != null) { - b.append(line.trim()); - } - input.close(); - strings.add(b.toString()); + for (File f : files) { + getStrings(f, strings); } } else { - BufferedReader input = new BufferedReader(new FileReader(fileName)); - String line; - while ((line = input.readLine()) != null) { - strings.add(line.trim()); - } - input.close(); + getStrings(file, strings); } return strings; } + + static void getStrings(File file, List strings) throws IOException { + BufferedReader input = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8")); + String line; + while ((line = input.readLine()) != null) { + strings.add(line.trim().intern()); + } + input.close(); + } } diff --git a/src/main/java/edu/mit/simile/vicino/clustering/NGramClusterer.java b/src/main/java/edu/mit/simile/vicino/clustering/NGramClusterer.java index fa08b4382..13ece9103 100644 --- a/src/main/java/edu/mit/simile/vicino/clustering/NGramClusterer.java +++ b/src/main/java/edu/mit/simile/vicino/clustering/NGramClusterer.java @@ -11,7 +11,6 @@ import java.util.TreeSet; import java.util.Map.Entry; import com.wcohen.ss.api.Token; -import com.wcohen.ss.tokens.SimpleTokenizer; import edu.mit.simile.vicino.NGramTokenizer; import edu.mit.simile.vicino.distances.Distance; @@ -24,7 +23,7 @@ public class NGramClusterer extends Clusterer { Map> blocks = new HashMap>(); public NGramClusterer(Distance d, int blockSize) { - _tokenizer = new NGramTokenizer(blockSize,blockSize,false,SimpleTokenizer.DEFAULT_TOKENIZER); + _tokenizer = new NGramTokenizer(blockSize); _distance = d; } diff --git a/src/main/java/edu/mit/simile/vicino/clustering/VPTreeClusterer.java b/src/main/java/edu/mit/simile/vicino/clustering/VPTreeClusterer.java index 73059de85..a75a2022e 100644 --- a/src/main/java/edu/mit/simile/vicino/clustering/VPTreeClusterer.java +++ b/src/main/java/edu/mit/simile/vicino/clustering/VPTreeClusterer.java @@ -30,6 +30,7 @@ public class VPTreeClusterer extends Clusterer { public List> getClusters(double radius) { VPTree tree = _treeBuilder.buildVPTree(); + System.out.println("distances after the tree: " + _distance.getCount()); Set nodes = _treeBuilder.getNodes(); VPTreeSeeker seeker = new VPTreeSeeker(_distance,tree); diff --git a/src/main/java/edu/mit/simile/vicino/vptree/VPTreeBuilder.java b/src/main/java/edu/mit/simile/vicino/vptree/VPTreeBuilder.java index 4d02bb2cf..644c3983f 100755 --- a/src/main/java/edu/mit/simile/vicino/vptree/VPTreeBuilder.java +++ b/src/main/java/edu/mit/simile/vicino/vptree/VPTreeBuilder.java @@ -15,17 +15,20 @@ import edu.mit.simile.vicino.distances.Distance; public class VPTreeBuilder { private static final boolean DEBUG = false; + private static final boolean OPTIMIZED = false; + private static final int sample_size = 10; private Random generator = new Random(System.currentTimeMillis()); private final Distance distance; private Set nodes = new HashSet(); - + /** * Defines a VPTree Builder for a specific distance. * - * @param distance The class implementing the distance. + * @param distance + * The class implementing the distance. */ public VPTreeBuilder(Distance distance) { this.distance = distance; @@ -34,7 +37,7 @@ public class VPTreeBuilder { public Set getNodes() { return this.nodes; } - + public void populate(Serializable s) { nodes.add(new Node(s)); } @@ -49,7 +52,7 @@ public class VPTreeBuilder { Node[] nodes_array = this.nodes.toArray(new Node[this.nodes.size()]); VPTree tree = new VPTree(); if (nodes_array.length > 0) { - tree.setRoot(makeNode(nodes_array, 0, nodes_array.length-1)); + tree.setRoot(makeNode(nodes_array, 0, nodes_array.length - 1)); } return tree; } @@ -61,75 +64,116 @@ public class VPTreeBuilder { } return buildVPTree(); } - + public void reset() { this.nodes.clear(); } - + private TNode makeNode(Node nodes[], int begin, int end) { int delta = end - begin; if (DEBUG) System.out.println("\ndelta: " + delta); - + if (delta == 0) { TNode vpNode = new TNode(nodes[begin].get()); vpNode.setMedian(0); return vpNode; - } else if(delta < 0) { + } else if (delta < 0) { return null; } - - Node randomNode = nodes[begin + getRandomIndex(delta)]; + + Node randomNode = getVantagePoint(nodes, begin, end); TNode vpNode = new TNode(randomNode.get()); - + if (DEBUG) System.out.println("\nvp-node: " + vpNode.get().toString()); - - calculateDistances (vpNode , nodes, begin, end); - orderDistances (nodes, begin, end); - fixVantagPoint (randomNode , nodes, begin, end); - + + calculateDistances(vpNode, nodes, begin, end); + orderDistances(nodes, begin, end); + fixVantagPoint(randomNode, nodes, begin, end); + if (DEBUG) { for (int i = begin; i <= end; i++) { System.out.println(" +-- " + nodes[i].getDistance() + " --> " + nodes[i].get()); } } - + float median = (float) median(nodes, begin, end); vpNode.setMedian(median); - + int i = 0; for (i = begin + 1; i < end; i++) { if (nodes[i].getDistance() >= median) { - vpNode.setLeft(makeNode(nodes, begin+1, i-1)); + vpNode.setLeft(makeNode(nodes, begin + 1, i - 1)); break; } } vpNode.setRight(makeNode(nodes, i, end)); - + return vpNode; } + + private Node getVantagePoint(Node nodes[], int begin, int end) { + if (OPTIMIZED) { + Node buffer[] = new Node[sample_size]; + for (int i = 0; i < sample_size; i++) { + buffer[i] = getRandomNode(nodes,begin,end); + } - public double median(Node nodes[], int begin, int end) { - int middle = (end-begin) / 2; // subscript of middle element - - if ((end-begin) % 2 == 0) { - return nodes[begin+middle].getDistance(); + double bestSpread = 0; + Node bestNode = buffer[0]; + for (int i = 0; i < sample_size; i++) { + calculateDistances(new TNode(buffer[i]), buffer, 0, buffer.length - 1); + orderDistances(nodes, begin, end); + double median = (double) median(nodes, begin, end); + double spread = deviation(buffer, median); + System.out.println(" " + spread); + if (spread > bestSpread) { + bestSpread = spread; + bestNode = buffer[i]; + } + } + + System.out.println("best: " + bestSpread); + return bestNode; } else { - return (nodes[begin+middle].getDistance() + nodes[begin+middle+1].getDistance()) / 2.0d; + return getRandomNode(nodes,begin,end); } } + + private Node getRandomNode(Node nodes[], int begin, int end) { + return nodes[begin + generator.nextInt(end - begin)]; + } + private double deviation(Node buffer[], double median) { + double sum = 0; + for (int i = 0; i < buffer.length; i++) { + sum += Math.pow(buffer[i].getDistance() - median, 2); + } + return sum / buffer.length; + } + + public double median(Node nodes[], int begin, int end) { + int delta = end - begin; + int middle = delta / 2; + + if (delta % 2 == 0) { + return nodes[begin + middle].getDistance(); + } else { + return (nodes[begin + middle].getDistance() + nodes[begin + middle + 1].getDistance()) / 2.0d; + } + } + private void calculateDistances(TNode pivot, Node nodes[], int begin, int end) { + Serializable x = pivot.get(); for (int i = begin; i <= end; i++) { - Serializable x = pivot.get(); Serializable y = nodes[i].get(); - double d = (x == y) ? 0.0d : distance.d(x.toString(), y.toString()); + double d = (x == y || x.equals(y)) ? 0.0d : distance.d(x.toString(), y.toString()); nodes[i].setDistance(d); } } - - private void fixVantagPoint (Node pivot, Node nodes[], int begin, int end) { + + private void fixVantagPoint(Node pivot, Node nodes[], int begin, int end) { for (int i = begin; i < end; i++) { if (nodes[i] == pivot) { if (i > begin) { @@ -137,16 +181,12 @@ public class VPTreeBuilder { nodes[begin] = pivot; nodes[i] = tmp; break; - } + } } } } - + private void orderDistances(Node nodes[], int begin, int end) { NodeSorter.sort(nodes, begin, end); } - - private int getRandomIndex(int max) { - return generator.nextInt(max); - } }