diff --git a/src/main/java/com/metaweb/gridworks/clustering/binning/BinningClusterer.java b/src/main/java/com/metaweb/gridworks/clustering/binning/BinningClusterer.java index 46f717b9a..4a1aa8d9f 100644 --- a/src/main/java/com/metaweb/gridworks/clustering/binning/BinningClusterer.java +++ b/src/main/java/com/metaweb/gridworks/clustering/binning/BinningClusterer.java @@ -77,7 +77,7 @@ public class BinningClusterer extends Clusterer { } } else { Map m = new TreeMap(); - m.put(v,0); + m.put(v,1); _map.put(key, m); } } diff --git a/src/main/java/com/metaweb/gridworks/clustering/binning/DoubleMetaphoneKeyer.java b/src/main/java/com/metaweb/gridworks/clustering/binning/DoubleMetaphoneKeyer.java index 554f84e59..cbdb09c9b 100644 --- a/src/main/java/com/metaweb/gridworks/clustering/binning/DoubleMetaphoneKeyer.java +++ b/src/main/java/com/metaweb/gridworks/clustering/binning/DoubleMetaphoneKeyer.java @@ -4,7 +4,12 @@ import org.apache.commons.codec.language.DoubleMetaphone; public class DoubleMetaphoneKeyer extends Keyer { - private DoubleMetaphone _metaphone2 = new DoubleMetaphone(); + private DoubleMetaphone _metaphone2; + + public DoubleMetaphoneKeyer() { + _metaphone2 = new DoubleMetaphone(); + _metaphone2.setMaxCodeLen(2000); + } public String key(String s, Object... 
o) { return _metaphone2.doubleMetaphone(s); diff --git a/src/main/java/com/metaweb/gridworks/clustering/binning/FingerprintKeyer.java b/src/main/java/com/metaweb/gridworks/clustering/binning/FingerprintKeyer.java index 6a63fa25b..50e009009 100644 --- a/src/main/java/com/metaweb/gridworks/clustering/binning/FingerprintKeyer.java +++ b/src/main/java/com/metaweb/gridworks/clustering/binning/FingerprintKeyer.java @@ -21,11 +21,229 @@ public class FingerprintKeyer extends Keyer { } StringBuffer b = new StringBuffer(); Iterator i = set.iterator(); - while (i.hasNext()) { + while (i.hasNext()) { // join ordered fragments back together b.append(i.next()); b.append(' '); } - return b.toString(); // join ordered fragments back together + return asciify(b.toString()); // find ASCII equivalent to characters } + protected String asciify(String s) { + char[] c = s.toCharArray(); + StringBuffer b = new StringBuffer(); + for (int i = 0; i < c.length; i++) { + b.append(translate(c[i])); + } + return b.toString(); + } + + /** + * Translate the given unicode char in the closest ASCII representation + * NOTE: this function deals only with latin-1 supplement and latin-1 extended code charts + */ + private char translate(char c) { + switch(c) { + case '\u00C0': + case '\u00C1': + case '\u00C2': + case '\u00C3': + case '\u00C4': + case '\u00C5': + case '\u00E0': + case '\u00E1': + case '\u00E2': + case '\u00E3': + case '\u00E4': + case '\u00E5': + case '\u0100': + case '\u0101': + case '\u0102': + case '\u0103': + case '\u0104': + case '\u0105': + return 'a'; + case '\u00C7': + case '\u00E7': + case '\u0106': + case '\u0107': + case '\u0108': + case '\u0109': + case '\u010A': + case '\u010B': + case '\u010C': + case '\u010D': + return 'c'; + case '\u00D0': + case '\u00F0': + case '\u010E': + case '\u010F': + case '\u0110': + case '\u0111': + return 'd'; + case '\u00C8': + case '\u00C9': + case '\u00CA': + case '\u00CB': + case '\u00E8': + case '\u00E9': + case '\u00EA': + case '\u00EB': 
+ case '\u0112': + case '\u0113': + case '\u0114': + case '\u0115': + case '\u0116': + case '\u0117': + case '\u0118': + case '\u0119': + case '\u011A': + case '\u011B': + return 'e'; + case '\u011C': + case '\u011D': + case '\u011E': + case '\u011F': + case '\u0120': + case '\u0121': + case '\u0122': + case '\u0123': + return 'g'; + case '\u0124': + case '\u0125': + case '\u0126': + case '\u0127': + return 'h'; + case '\u00CC': + case '\u00CD': + case '\u00CE': + case '\u00CF': + case '\u00EC': + case '\u00ED': + case '\u00EE': + case '\u00EF': + case '\u0128': + case '\u0129': + case '\u012A': + case '\u012B': + case '\u012C': + case '\u012D': + case '\u012E': + case '\u012F': + case '\u0130': + case '\u0131': + return 'i'; + case '\u0134': + case '\u0135': + return 'j'; + case '\u0136': + case '\u0137': + case '\u0138': + return 'k'; + case '\u0139': + case '\u013A': + case '\u013B': + case '\u013C': + case '\u013D': + case '\u013E': + case '\u013F': + case '\u0140': + case '\u0141': + case '\u0142': + return 'l'; + case '\u00D1': + case '\u00F1': + case '\u0143': + case '\u0144': + case '\u0145': + case '\u0146': + case '\u0147': + case '\u0148': + case '\u0149': + case '\u014A': + case '\u014B': + return 'n'; + case '\u00D2': + case '\u00D3': + case '\u00D4': + case '\u00D5': + case '\u00D6': + case '\u00D8': + case '\u00F2': + case '\u00F3': + case '\u00F4': + case '\u00F5': + case '\u00F6': + case '\u00F8': + case '\u014C': + case '\u014D': + case '\u014E': + case '\u014F': + case '\u0150': + case '\u0151': + return 'o'; + case '\u0154': + case '\u0155': + case '\u0156': + case '\u0157': + case '\u0158': + case '\u0159': + return 'r'; + case '\u015A': + case '\u015B': + case '\u015C': + case '\u015D': + case '\u015E': + case '\u015F': + case '\u0160': + case '\u0161': + case '\u017F': + return 's'; + case '\u0162': + case '\u0163': + case '\u0164': + case '\u0165': + case '\u0166': + case '\u0167': + return 't'; + case '\u00D9': + case '\u00DA': + case 
'\u00DB': + case '\u00DC': + case '\u00F9': + case '\u00FA': + case '\u00FB': + case '\u00FC': + case '\u0168': + case '\u0169': + case '\u016A': + case '\u016B': + case '\u016C': + case '\u016D': + case '\u016E': + case '\u016F': + case '\u0170': + case '\u0171': + case '\u0172': + case '\u0173': + return 'u'; + case '\u0174': + case '\u0175': + return 'w'; + case '\u00DD': + case '\u00FD': + case '\u00FF': + case '\u0176': + case '\u0177': + case '\u0178': + return 'y'; + case '\u0179': + case '\u017A': + case '\u017B': + case '\u017C': + case '\u017D': + case '\u017E': + return 'z'; + } + return c; + } } diff --git a/src/main/java/com/metaweb/gridworks/clustering/binning/MetaphoneKeyer.java b/src/main/java/com/metaweb/gridworks/clustering/binning/MetaphoneKeyer.java index fe2ddf5a8..cd93a0841 100644 --- a/src/main/java/com/metaweb/gridworks/clustering/binning/MetaphoneKeyer.java +++ b/src/main/java/com/metaweb/gridworks/clustering/binning/MetaphoneKeyer.java @@ -4,7 +4,12 @@ import org.apache.commons.codec.language.Metaphone; public class MetaphoneKeyer extends Keyer { - private Metaphone _metaphone = new Metaphone(); + private Metaphone _metaphone; + + public MetaphoneKeyer() { + _metaphone = new Metaphone(); + _metaphone.setMaxCodeLen(2000); + } public String key(String s, Object... 
o) { return _metaphone.metaphone(s); diff --git a/src/main/java/com/metaweb/gridworks/clustering/binning/NGramFingerprintKeyer.java b/src/main/java/com/metaweb/gridworks/clustering/binning/NGramFingerprintKeyer.java index bfdc84a77..f0bee4f15 100644 --- a/src/main/java/com/metaweb/gridworks/clustering/binning/NGramFingerprintKeyer.java +++ b/src/main/java/com/metaweb/gridworks/clustering/binning/NGramFingerprintKeyer.java @@ -4,7 +4,7 @@ import java.util.Iterator; import java.util.TreeSet; import java.util.regex.Pattern; -public class NGramFingerprintKeyer extends Keyer { +public class NGramFingerprintKeyer extends FingerprintKeyer { static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}|\\p{Space}"); @@ -18,10 +18,10 @@ public class NGramFingerprintKeyer extends Keyer { TreeSet set = ngram_split(s,ngram_size); StringBuffer b = new StringBuffer(); Iterator i = set.iterator(); - while (i.hasNext()) { + while (i.hasNext()) { // join ordered fragments back together b.append(i.next()); } - return b.toString(); // join ordered fragments back together + return asciify(b.toString()); // find ASCII equivalent to characters } protected TreeSet ngram_split(String s, int size) { diff --git a/src/main/java/com/metaweb/gridworks/clustering/binning/SoundexKeyer.java b/src/main/java/com/metaweb/gridworks/clustering/binning/SoundexKeyer.java index 65c0801da..c81c7fc80 100644 --- a/src/main/java/com/metaweb/gridworks/clustering/binning/SoundexKeyer.java +++ b/src/main/java/com/metaweb/gridworks/clustering/binning/SoundexKeyer.java @@ -4,7 +4,11 @@ import org.apache.commons.codec.language.Soundex; public class SoundexKeyer extends Keyer { - private Soundex _soundex = new Soundex(); + private Soundex _soundex; + + public SoundexKeyer() { + _soundex = new Soundex(); + } public String key(String s, Object... 
o) { return _soundex.soundex(s); diff --git a/src/main/java/com/metaweb/gridworks/clustering/knn/NGramTokenizer.java b/src/main/java/com/metaweb/gridworks/clustering/knn/NGramTokenizer.java new file mode 100644 index 000000000..a9b6c731f --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/clustering/knn/NGramTokenizer.java @@ -0,0 +1,66 @@ +package com.metaweb.gridworks.clustering.knn; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import com.wcohen.ss.api.Token; +import com.wcohen.ss.api.Tokenizer; +import com.wcohen.ss.tokens.BasicToken; +import com.wcohen.ss.tokens.SimpleTokenizer; + +/** + * Wraps another tokenizer and, for every token it produces, additionally + * emits the character n-grams of that token. + */ +public class NGramTokenizer implements Tokenizer { + + private int minNGramSize; + private int maxNGramSize; + private boolean keepOldTokens; + private Tokenizer innerTokenizer; + + public static NGramTokenizer DEFAULT_TOKENIZER = new NGramTokenizer(3,5,true,SimpleTokenizer.DEFAULT_TOKENIZER); + + public NGramTokenizer(int minNGramSize,int maxNGramSize,boolean keepOldTokens,Tokenizer innerTokenizer) { + this.minNGramSize = minNGramSize; + this.maxNGramSize = maxNGramSize; + this.keepOldTokens = keepOldTokens; + this.innerTokenizer = innerTokenizer; + } + + /** + * Tokenizes the input with the inner tokenizer and, for each resulting token, + * emits its character n-grams of minNGramSize to maxNGramSize characters, + * preceded by the token itself when keepOldTokens is set. + */ + public Token[] tokenize(String input) { + Token[] initialTokens = innerTokenizer.tokenize(input); + List tokens = new ArrayList(); + for (int i = 0; i < initialTokens.length; i++) { + String str = initialTokens[i].getValue(); + if (keepOldTokens) tokens.add( intern(str) ); + for (int lo = 0; lo < str.length(); lo++) { + for (int len = minNGramSize; len <= maxNGramSize; len++) { + /* <= keeps the n-grams that end on the last character; the n-gram equal to the whole token is skipped when keepOldTokens already added it */ + if (lo + len <= str.length() && !(keepOldTokens && len == str.length())) { + tokens.add(innerTokenizer.intern(str.substring(lo,lo+len))); + } + } + } + } + return (Token[]) tokens.toArray(new BasicToken[tokens.size()]); + } + + public Token intern(String s) { + return innerTokenizer.intern(s); + } + + public 
Iterator tokenIterator() { + return innerTokenizer.tokenIterator(); + } + + public int maxTokenIndex() { + return innerTokenizer.maxTokenIndex(); + } +} diff --git a/src/main/java/com/metaweb/gridworks/clustering/knn/kNNClusterer.java b/src/main/java/com/metaweb/gridworks/clustering/knn/kNNClusterer.java index 3427db336..d89f49950 100644 --- a/src/main/java/com/metaweb/gridworks/clustering/knn/kNNClusterer.java +++ b/src/main/java/com/metaweb/gridworks/clustering/knn/kNNClusterer.java @@ -25,7 +25,6 @@ import com.metaweb.gridworks.model.Cell; import com.metaweb.gridworks.model.Project; import com.metaweb.gridworks.model.Row; import com.wcohen.ss.api.Token; -import com.wcohen.ss.tokens.NGramTokenizer; import com.wcohen.ss.tokens.SimpleTokenizer; import edu.mit.simile.vicino.distances.BZip2Distance; @@ -146,7 +145,7 @@ public class kNNClusterer extends Clusterer { } int block_count = 0; - + Map> clusters = new HashMap>(); for (List list : blocks.values()) { diff --git a/src/main/java/edu/mit/simile/vicino/distances/PseudoMetricDistance.java b/src/main/java/edu/mit/simile/vicino/distances/PseudoMetricDistance.java index 53be92d85..c1c9e4d62 100644 --- a/src/main/java/edu/mit/simile/vicino/distances/PseudoMetricDistance.java +++ b/src/main/java/edu/mit/simile/vicino/distances/PseudoMetricDistance.java @@ -9,7 +9,7 @@ public abstract class PseudoMetricDistance extends Distance { double cxy = d2(x, y); double cyx = d2(y, x); counter += 4; - return (cxy + cyx) / (cxx + cyy) - 1.0d; + return 10.0d * ((cxy + cyx) / (cxx + cyy) - 1.0d); } protected abstract double d2(String x, String y); diff --git a/src/main/webapp/scripts/dialogs/facet-based-edit-dialog.js b/src/main/webapp/scripts/dialogs/facet-based-edit-dialog.js index 9d8e8431f..ff6d405f1 100644 --- a/src/main/webapp/scripts/dialogs/facet-based-edit-dialog.js +++ b/src/main/webapp/scripts/dialogs/facet-based-edit-dialog.js @@ -16,7 +16,7 @@ FacetBasedEditDialog.prototype._createDialog = function() { var frame = 
DialogSystem.createDialog(); frame.width("900px"); - var header = $('
').addClass("dialog-header").text("Cluster & Edit column " + this._columnName).appendTo(frame); + var header = $('
').addClass("dialog-header").text('Cluster & Edit column "' + this._columnName + '"').appendTo(frame); var body = $('
').addClass("dialog-body").appendTo(frame); var footer = $( '' + '' + '' + @@ -152,7 +144,7 @@ FacetBasedEditDialog.prototype._renderTable = function(clusters) { $(trHead.insertCell(0)).text("Cluster Size"); $(trHead.insertCell(1)).text("Row Count"); $(trHead.insertCell(2)).text("Values in Cluster"); - $(trHead.insertCell(3)).text("Edit?"); + $(trHead.insertCell(3)).text("Merge?"); $(trHead.insertCell(4)).text("New Cell Value"); var renderCluster = function(cluster) { @@ -169,9 +161,13 @@ FacetBasedEditDialog.prototype._renderTable = function(clusters) { for (var c = 0; c < choices.length; c++) { var choice = choices[c]; var li = $('
  • ').appendTo(ul); - $('').text(choice.v).appendTo(li); + $('').text(choice.v).click(function() { + var parent = $(this).closest("tr"); + parent.find("input[type='text']").val($(this).text()); + parent.find("input:not(:checked)").attr('checked', true).change(); + return false; + }).appendTo(li); $('').text("(" + choice.c + " rows)").addClass("facet-based-edit-dialog-entry-count").appendTo(li); - rowCount += choice.c; } $(tr.insertCell(2)).append(ul); diff --git a/src/main/webapp/styles/dialogs/facet-based-edit-dialog.css b/src/main/webapp/styles/dialogs/facet-based-edit-dialog.css index 3af04195f..0d49473d8 100644 --- a/src/main/webapp/styles/dialogs/facet-based-edit-dialog.css +++ b/src/main/webapp/styles/dialogs/facet-based-edit-dialog.css @@ -46,6 +46,14 @@ table.facet-based-edit-dialog-entry-table input { padding: 0 0.1em; } +table.facet-based-edit-dialog-entry-table a { + text-decoration: none; +} + +table.facet-based-edit-dialog-entry-table a:hover { + text-decoration: underline; +} + .facet-based-edit-dialog-entry-count { color: #aaa; margin-left: 0.5em;