- diff can now operate between two dates (still to be fully tested)
- added string fingerprinting function (useful for clustering)
- fixed unicode() function, which wasn't returning correct values
- added a toString method to EvalError so that the error message can be seen
- fixed an NPE in TextTransformOperation

git-svn-id: http://google-refine.googlecode.com/svn/trunk@153 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
f1923758e7
commit
d9e67ac806
@ -34,6 +34,7 @@ import com.metaweb.gridworks.expr.functions.math.Round;
|
||||
import com.metaweb.gridworks.expr.functions.strings.Contains;
|
||||
import com.metaweb.gridworks.expr.functions.strings.Diff;
|
||||
import com.metaweb.gridworks.expr.functions.strings.EndsWith;
|
||||
import com.metaweb.gridworks.expr.functions.strings.Fingerprint;
|
||||
import com.metaweb.gridworks.expr.functions.strings.IndexOf;
|
||||
import com.metaweb.gridworks.expr.functions.strings.LastIndexOf;
|
||||
import com.metaweb.gridworks.expr.functions.strings.MD5;
|
||||
@ -125,6 +126,7 @@ public class ControlFunctionRegistry {
|
||||
registerFunction("unicode", new Unicode());
|
||||
registerFunction("diff", new Diff());
|
||||
registerFunction("chomp", new Diff());
|
||||
registerFunction("fingerprint", new Fingerprint());
|
||||
|
||||
registerFunction("indexOf", new IndexOf());
|
||||
registerFunction("lastIndexOf", new LastIndexOf());
|
||||
|
@ -14,6 +14,10 @@ public class EvalError implements Jsonizable {
|
||||
this.message = message;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return this.message;
|
||||
}
|
||||
|
||||
public void write(JSONWriter writer, Properties options)
|
||||
throws JSONException {
|
||||
|
||||
|
@ -1,21 +1,50 @@
|
||||
package com.metaweb.gridworks.expr.functions.strings;
|
||||
|
||||
import java.util.Calendar;
|
||||
import java.util.Properties;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.json.JSONException;
|
||||
import org.json.JSONWriter;
|
||||
|
||||
import com.metaweb.gridworks.expr.CalendarParser;
|
||||
import com.metaweb.gridworks.expr.CalendarParserException;
|
||||
import com.metaweb.gridworks.expr.Function;
|
||||
|
||||
public class Diff implements Function {
|
||||
|
||||
public Object call(Properties bindings, Object[] args) {
|
||||
if (args.length == 2) {
|
||||
if (args.length >= 2 && args.length <= 3) {
|
||||
Object o1 = args[0];
|
||||
Object o2 = args[1];
|
||||
if (o1 != null && o2 != null && o1 instanceof String && o2 instanceof String) {
|
||||
if (o1 != null && o2 != null) {
|
||||
if (o1 instanceof String && o2 instanceof String) {
|
||||
return StringUtils.difference((String) o1,(String) o2);
|
||||
} else if (o1 instanceof Calendar) {
|
||||
if (args.length == 3) {
|
||||
Object o3 = args[3];
|
||||
if (o3 != null && o3 instanceof String) {
|
||||
try {
|
||||
String unit = ((String) o3).toLowerCase();
|
||||
Calendar c1 = (Calendar) o1;
|
||||
Calendar c2 = (o2 instanceof Calendar) ? (Calendar) o2 : CalendarParser.parse((o2 instanceof String) ? (String) o2 : o2.toString());
|
||||
long delta = (c1.getTimeInMillis() - c2.getTimeInMillis()) / 1000;
|
||||
if ("seconds".equals(unit)) return delta;
|
||||
delta /= 60;
|
||||
if ("minutes".equals(unit)) return delta;
|
||||
delta /= 60;
|
||||
if ("hours".equals(unit)) return delta;
|
||||
long days = delta / 24;
|
||||
if ("days".equals(unit)) return days;
|
||||
if ("weeks".equals(unit)) return days / 7;
|
||||
if ("months".equals(unit)) return days / 30;
|
||||
if ("years".equals(unit)) return days / 365;
|
||||
} catch (CalendarParserException e) {
|
||||
// we should throw at this point because it's important to know that date parsing failed
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
@ -25,9 +54,9 @@ public class Diff implements Function {
|
||||
throws JSONException {
|
||||
|
||||
writer.object();
|
||||
writer.key("description"); writer.value("Compares two Strings, and returns the portion where they differ. (More precisely, return the remainder of the second String, starting from where it's different from the first.)");
|
||||
writer.key("params"); writer.value("string s, string v");
|
||||
writer.key("returns"); writer.value("string");
|
||||
writer.key("description"); writer.value("For strings, returns the portion where they differ. For dates, it returns the difference in given time units");
|
||||
writer.key("params"); writer.value("o1, o2, time unit (optional)");
|
||||
writer.key("returns"); writer.value("string for strings, number for dates");
|
||||
writer.endObject();
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,43 @@
|
||||
package com.metaweb.gridworks.expr.functions.strings;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Properties;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.json.JSONException;
|
||||
import org.json.JSONWriter;
|
||||
|
||||
import com.metaweb.gridworks.expr.ControlFunctionRegistry;
|
||||
import com.metaweb.gridworks.expr.EvalError;
|
||||
import com.metaweb.gridworks.expr.Function;
|
||||
|
||||
public class Fingerprint implements Function {
|
||||
|
||||
static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}");
|
||||
|
||||
public Object call(Properties bindings, Object[] args) {
|
||||
if (args.length == 1 && args[0] != null) {
|
||||
Object o = args[0];
|
||||
String s = (o instanceof String) ? (String) o : o.toString();
|
||||
s = s.trim(); // first off, remove whitespace around the string
|
||||
s = s.toLowerCase(); // then lowercase it
|
||||
s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
|
||||
String[] frags = StringUtils.split(s); // split by whitespace
|
||||
Arrays.sort(frags); // sort the fragments
|
||||
return StringUtils.join(frags," "); // rejoin them with a single space between them
|
||||
|
||||
}
|
||||
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects a string");
|
||||
}
|
||||
|
||||
public void write(JSONWriter writer, Properties options)
|
||||
throws JSONException {
|
||||
|
||||
writer.object();
|
||||
writer.key("description"); writer.value("Returns the fingerprint of s, a derived string that aims to be a more canonical form of it (this is mostly useful for finding clusters of strings related to the same information).");
|
||||
writer.key("params"); writer.value("string s");
|
||||
writer.key("returns"); writer.value("string");
|
||||
writer.endObject();
|
||||
}
|
||||
}
|
@ -15,9 +15,9 @@ public class Unicode implements Function {
|
||||
if (args.length == 1 && args[0] != null) {
|
||||
Object o = args[0];
|
||||
String s = (o instanceof String) ? (String) o : o.toString();
|
||||
int[] output = new int[s.length()];
|
||||
Integer[] output = new Integer[s.length()];
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
output[i] = Character.getNumericValue(s.codePointAt(i));
|
||||
output[i] = s.codePointAt(i);
|
||||
}
|
||||
return output;
|
||||
}
|
||||
|
@ -88,7 +88,7 @@ public class TextTransformOperation extends EngineDependentMassCellOperation {
|
||||
|
||||
Object v = eval.evaluate(bindings);
|
||||
if ((cell != null && cell.value != null) || v != null) {
|
||||
Cell newCell = new Cell(v, cell.recon);
|
||||
Cell newCell = new Cell(v, (cell != null) ? cell.recon : null);
|
||||
|
||||
CellChange cellChange = new CellChange(rowIndex, cellIndex, cell, newCell);
|
||||
cellChanges.add(cellChange);
|
||||
|
Loading…
Reference in New Issue
Block a user