add the ngramFingerprint function
git-svn-id: http://google-refine.googlecode.com/svn/trunk@191 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
5c3ca7723a
commit
1695e2f8f1
@ -0,0 +1,61 @@
|
||||
package com.metaweb.gridworks.expr.functions.strings;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.Properties;
|
||||
import java.util.TreeSet;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.json.JSONException;
|
||||
import org.json.JSONWriter;
|
||||
|
||||
import com.metaweb.gridworks.expr.EvalError;
|
||||
import com.metaweb.gridworks.gel.ControlFunctionRegistry;
|
||||
import com.metaweb.gridworks.gel.Function;
|
||||
|
||||
public class NGramFingerprint implements Function {
|
||||
|
||||
static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}|\\p{Space}");
|
||||
|
||||
public Object call(Properties bindings, Object[] args) {
|
||||
if (args.length == 1 || args.length == 2) {
|
||||
if (args[0] != null) {
|
||||
int ngram_size = 1;
|
||||
if (args.length == 2 && args[1] != null) {
|
||||
ngram_size = (args[1] instanceof Number) ? ((Number) args[1]).intValue() : Integer.parseInt(args[1].toString());
|
||||
}
|
||||
Object o = args[0];
|
||||
String s = (o instanceof String) ? (String) o : o.toString();
|
||||
s = s.toLowerCase(); // then lowercase it
|
||||
s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
|
||||
TreeSet<String> set = ngram_split(s,ngram_size);
|
||||
StringBuffer b = new StringBuffer();
|
||||
Iterator<String> i = set.iterator();
|
||||
while (i.hasNext()) {
|
||||
b.append(i.next());
|
||||
}
|
||||
return b.toString(); // join ordered fragments back together
|
||||
}
|
||||
return null;
|
||||
}
|
||||
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects at least a string");
|
||||
}
|
||||
|
||||
protected TreeSet<String> ngram_split(String s, int size) {
|
||||
TreeSet<String> set = new TreeSet<String>();
|
||||
char[] chars = s.toCharArray();
|
||||
for (int i = 0; i + size <= chars.length; i++) {
|
||||
set.add(new String(chars,i,size));
|
||||
}
|
||||
return set;
|
||||
}
|
||||
|
||||
public void write(JSONWriter writer, Properties options)
|
||||
throws JSONException {
|
||||
|
||||
writer.object();
|
||||
writer.key("description"); writer.value("Returns the n-gram fingerprint of s");
|
||||
writer.key("params"); writer.value("string s, number n");
|
||||
writer.key("returns"); writer.value("string");
|
||||
writer.endObject();
|
||||
}
|
||||
}
|
@ -34,6 +34,7 @@ import com.metaweb.gridworks.expr.functions.strings.Fingerprint;
|
||||
import com.metaweb.gridworks.expr.functions.strings.IndexOf;
|
||||
import com.metaweb.gridworks.expr.functions.strings.LastIndexOf;
|
||||
import com.metaweb.gridworks.expr.functions.strings.MD5;
|
||||
import com.metaweb.gridworks.expr.functions.strings.NGramFingerprint;
|
||||
import com.metaweb.gridworks.expr.functions.strings.Partition;
|
||||
import com.metaweb.gridworks.expr.functions.strings.Phonetic;
|
||||
import com.metaweb.gridworks.expr.functions.strings.RPartition;
|
||||
@ -131,6 +132,7 @@ public class ControlFunctionRegistry {
|
||||
registerFunction("diff", new Diff());
|
||||
registerFunction("chomp", new Diff());
|
||||
registerFunction("fingerprint", new Fingerprint());
|
||||
registerFunction("ngramFingerprint", new NGramFingerprint());
|
||||
registerFunction("phonetic", new Phonetic());
|
||||
registerFunction("reinterpret", new Reinterpret());
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user