From 1695e2f8f15ad10f038ff996b621e1d20e6c5dc7 Mon Sep 17 00:00:00 2001
From: Stefano Mazzocchi
Date: Thu, 4 Mar 2010 02:37:25 +0000
Subject: [PATCH] add the ngramFingerprint function

git-svn-id: http://google-refine.googlecode.com/svn/trunk@191 7d457c2a-affb-35e4-300a-418c747d4874
---
 .../functions/strings/NGramFingerprint.java   | 103 ++++++++++++++++++
 .../gel/ControlFunctionRegistry.java          |   2 +
 2 files changed, 105 insertions(+)
 create mode 100644 src/main/java/com/metaweb/gridworks/expr/functions/strings/NGramFingerprint.java

diff --git a/src/main/java/com/metaweb/gridworks/expr/functions/strings/NGramFingerprint.java b/src/main/java/com/metaweb/gridworks/expr/functions/strings/NGramFingerprint.java
new file mode 100644
index 000000000..de28573bd
--- /dev/null
+++ b/src/main/java/com/metaweb/gridworks/expr/functions/strings/NGramFingerprint.java
@@ -0,0 +1,103 @@
+package com.metaweb.gridworks.expr.functions.strings;
+
+import java.util.Iterator;
+import java.util.Locale;
+import java.util.Properties;
+import java.util.TreeSet;
+import java.util.regex.Pattern;
+
+import org.json.JSONException;
+import org.json.JSONWriter;
+
+import com.metaweb.gridworks.expr.EvalError;
+import com.metaweb.gridworks.gel.ControlFunctionRegistry;
+import com.metaweb.gridworks.gel.Function;
+
+/**
+ * Expression function that computes the n-gram fingerprint of a value.
+ * <p>
+ * The value is converted to a string, lowercased, and stripped of punctuation,
+ * control and whitespace characters. The distinct n-grams of what remains are
+ * sorted and concatenated to form the fingerprint.
+ */
+public class NGramFingerprint implements Function {
+
+    /** Matches the characters that get stripped (punctuation, control, whitespace); despite its name, not the ones kept. */
+    static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}|\\p{Space}");
+
+    /**
+     * Computes the fingerprint of {@code args[0]}.
+     *
+     * @param bindings evaluation bindings; not read by this function
+     * @param args the value to fingerprint, optionally followed by the n-gram size: a number,
+     *             or a value whose string form parses as an integer; defaults to 1 when absent or null
+     * @return the fingerprint string, {@code null} when the value is {@code null}, or an
+     *         {@link EvalError} when the argument count or the n-gram size is invalid
+     */
+    public Object call(Properties bindings, Object[] args) {
+        if (args.length != 1 && args.length != 2) {
+            return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects at least a string");
+        }
+        if (args[0] == null) {
+            return null;
+        }
+
+        int ngram_size = 1;
+        if (args.length == 2 && args[1] != null) {
+            if (args[1] instanceof Number) {
+                ngram_size = ((Number) args[1]).intValue();
+            } else {
+                try {
+                    ngram_size = Integer.parseInt(args[1].toString().trim());
+                } catch (NumberFormatException e) {
+                    // report an unparseable size as an evaluation error instead of throwing
+                    return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects the n-gram size to be an integer");
+                }
+            }
+            if (ngram_size < 1) {
+                // size 0 would only ever produce "", and a negative size makes the String constructor throw
+                return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects the n-gram size to be at least 1");
+            }
+        }
+
+        String s = args[0].toString();
+        s = s.toLowerCase(Locale.ENGLISH); // fixed locale, so the result does not depend on the JVM default
+        s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation, control chars and whitespace
+
+        StringBuilder b = new StringBuilder();
+        for (String ngram : ngram_split(s, ngram_size)) {
+            b.append(ngram);
+        }
+        return b.toString(); // join ordered fragments back together
+    }
+
+    /**
+     * Collects the distinct substrings of length {@code size} found in {@code s}.
+     *
+     * @param s the text to split
+     * @param size the n-gram length; {@link #call} only passes values of at least 1
+     * @return the n-grams in the natural (sorted) order of a {@link TreeSet}; empty when
+     *         {@code s} is shorter than {@code size}
+     */
+    protected TreeSet<String> ngram_split(String s, int size) {
+        TreeSet<String> set = new TreeSet<String>();
+        char[] chars = s.toCharArray();
+        for (int i = 0; i + size <= chars.length; i++) {
+            set.add(new String(chars, i, size));
+        }
+        return set;
+    }
+
+    /**
+     * Writes this function's description, parameters and return type as a JSON object.
+     */
+    public void write(JSONWriter writer, Properties options)
+        throws JSONException {
+
+        writer.object();
+        writer.key("description"); writer.value("Returns the n-gram fingerprint of s");
+        writer.key("params"); writer.value("string s, number n");
+        writer.key("returns"); writer.value("string");
+        writer.endObject();
+    }
+}
diff --git a/src/main/java/com/metaweb/gridworks/gel/ControlFunctionRegistry.java b/src/main/java/com/metaweb/gridworks/gel/ControlFunctionRegistry.java
index 511c73d35..1efbc00c6 100644
--- a/src/main/java/com/metaweb/gridworks/gel/ControlFunctionRegistry.java
+++ b/src/main/java/com/metaweb/gridworks/gel/ControlFunctionRegistry.java
@@ -34,6 +34,7 @@ import com.metaweb.gridworks.expr.functions.strings.Fingerprint;
 import com.metaweb.gridworks.expr.functions.strings.IndexOf;
 import com.metaweb.gridworks.expr.functions.strings.LastIndexOf;
 import com.metaweb.gridworks.expr.functions.strings.MD5;
+import com.metaweb.gridworks.expr.functions.strings.NGramFingerprint;
 import com.metaweb.gridworks.expr.functions.strings.Partition;
 import com.metaweb.gridworks.expr.functions.strings.Phonetic;
 import com.metaweb.gridworks.expr.functions.strings.RPartition;
@@ -131,6 +132,7 @@ public class ControlFunctionRegistry {
         registerFunction("diff", new Diff());
         registerFunction("chomp", new Diff());
         registerFunction("fingerprint", new Fingerprint());
+        registerFunction("ngramFingerprint", new NGramFingerprint());
         registerFunction("phonetic", new Phonetic());
         registerFunction("reinterpret", new Reinterpret());
 