- diff can now act between two dates (still to be fully tested)

- added string fingerprinting function (useful for clustering)
- fixed unicode() function which wasn't returning correct values
- added a toString method to EvalError to know what error that was
- fixed an NPE in TextTransformOperation


git-svn-id: http://google-refine.googlecode.com/svn/trunk@153 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Stefano Mazzocchi 2010-02-28 00:55:09 +00:00
parent f1923758e7
commit d9e67ac806
6 changed files with 87 additions and 9 deletions

View File

@ -34,6 +34,7 @@ import com.metaweb.gridworks.expr.functions.math.Round;
import com.metaweb.gridworks.expr.functions.strings.Contains;
import com.metaweb.gridworks.expr.functions.strings.Diff;
import com.metaweb.gridworks.expr.functions.strings.EndsWith;
import com.metaweb.gridworks.expr.functions.strings.Fingerprint;
import com.metaweb.gridworks.expr.functions.strings.IndexOf;
import com.metaweb.gridworks.expr.functions.strings.LastIndexOf;
import com.metaweb.gridworks.expr.functions.strings.MD5;
@ -125,6 +126,7 @@ public class ControlFunctionRegistry {
registerFunction("unicode", new Unicode());
registerFunction("diff", new Diff());
registerFunction("chomp", new Diff());
registerFunction("fingerprint", new Fingerprint());
registerFunction("indexOf", new IndexOf());
registerFunction("lastIndexOf", new LastIndexOf());

View File

@ -14,6 +14,10 @@ public class EvalError implements Jsonizable {
this.message = message;
}
/**
 * Returns the message carried by this error, so that printing or
 * concatenating the error shows what went wrong.
 *
 * @return the message stored in this error
 */
@Override
public String toString() {
    return this.message;
}
public void write(JSONWriter writer, Properties options)
throws JSONException {

View File

@ -1,21 +1,50 @@
package com.metaweb.gridworks.expr.functions.strings;
import java.util.Calendar;
import java.util.Properties;
import org.apache.commons.lang.StringUtils;
import org.json.JSONException;
import org.json.JSONWriter;
import com.metaweb.gridworks.expr.CalendarParser;
import com.metaweb.gridworks.expr.CalendarParserException;
import com.metaweb.gridworks.expr.Function;
public class Diff implements Function {
public Object call(Properties bindings, Object[] args) {
if (args.length == 2) {
if (args.length >= 2 && args.length <= 3) {
Object o1 = args[0];
Object o2 = args[1];
if (o1 != null && o2 != null && o1 instanceof String && o2 instanceof String) {
if (o1 != null && o2 != null) {
if (o1 instanceof String && o2 instanceof String) {
return StringUtils.difference((String) o1,(String) o2);
} else if (o1 instanceof Calendar) {
if (args.length == 3) {
    // The time unit is the third argument, i.e. index 2. Reading index 3 of a
    // three-element array always throws ArrayIndexOutOfBoundsException.
    Object o3 = args[2];
    if (o3 instanceof String) {
        try {
            // Lower-case with a fixed locale so that unit keywords such as
            // "MINUTES" match regardless of the JVM default locale.
            String unit = ((String) o3).toLowerCase(java.util.Locale.ENGLISH);
            Calendar c1 = (Calendar) o1;
            // The second operand may already be a Calendar; otherwise its
            // string form is handed to CalendarParser.
            Calendar c2 = (o2 instanceof Calendar)
                    ? (Calendar) o2
                    : CalendarParser.parse((o2 instanceof String) ? (String) o2 : o2.toString());

            // Signed difference (first minus second), truncated at every step
            // by integer division.
            long delta = (c1.getTimeInMillis() - c2.getTimeInMillis()) / 1000;
            if ("seconds".equals(unit)) return delta;
            delta /= 60;
            if ("minutes".equals(unit)) return delta;
            delta /= 60;
            if ("hours".equals(unit)) return delta;
            long days = delta / 24;
            if ("days".equals(unit)) return days;
            if ("weeks".equals(unit)) return days / 7;
            // Months and years are approximations: fixed 30-day months and
            // 365-day years.
            if ("months".equals(unit)) return days / 30;
            if ("years".equals(unit)) return days / 365;
            // An unrecognised unit falls through without returning a value here.
        } catch (CalendarParserException e) {
            // TODO: surface this failure to the caller; it is important to know
            // that date parsing failed, but for now it is silently ignored.
        }
    }
}
}
}
}
return null;
@ -25,9 +54,9 @@ public class Diff implements Function {
throws JSONException {
writer.object();
writer.key("description"); writer.value("Compares two Strings, and returns the portion where they differ. (More precisely, return the remainder of the second String, starting from where it's different from the first.)");
writer.key("params"); writer.value("string s, string v");
writer.key("returns"); writer.value("string");
writer.key("description"); writer.value("For strings, returns the portion where they differ. For dates, it returns the difference in given time units");
writer.key("params"); writer.value("o1, o2, time unit (optional)");
writer.key("returns"); writer.value("string for strings, number for dates");
writer.endObject();
}
}

View File

@ -0,0 +1,43 @@
package com.metaweb.gridworks.expr.functions.strings;
import java.util.Arrays;
import java.util.Properties;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.json.JSONException;
import org.json.JSONWriter;
import com.metaweb.gridworks.expr.ControlFunctionRegistry;
import com.metaweb.gridworks.expr.EvalError;
import com.metaweb.gridworks.expr.Function;
/**
 * Expression function that derives a "fingerprint" from a value: a normalized
 * string in which case, punctuation, surrounding whitespace and word order no
 * longer matter. Values that share a fingerprint are candidates for the same
 * cluster.
 */
public class Fingerprint implements Function {

    /**
     * Matches the characters that are stripped from the input: punctuation
     * and control characters (despite the field's name).
     */
    // NOTE(review): \p{Cntrl} also covers tab and newline, so words separated
    // only by those characters are merged before the whitespace split below --
    // confirm this is intended.
    static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}");

    /**
     * Computes the fingerprint of the single argument.
     *
     * @param bindings evaluation bindings (not used by this function)
     * @param args     exactly one non-null value; non-strings are converted
     *                 with {@code toString()}
     * @return the fingerprint string, or an {@link EvalError} when the
     *         argument count is not one or the argument is null
     */
    public Object call(Properties bindings, Object[] args) {
        if (args.length == 1 && args[0] != null) {
            Object o = args[0];
            String s = (o instanceof String) ? (String) o : o.toString();
            s = s.trim(); // first off, remove whitespace around the string
            // Lower-case with a fixed locale: the default-locale variant maps
            // 'I' to a dotless i under Turkish locales, which would make the
            // fingerprint depend on the machine it is computed on.
            s = s.toLowerCase(java.util.Locale.ENGLISH);
            s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
            String[] frags = StringUtils.split(s); // split by whitespace
            Arrays.sort(frags); // sort the fragments so word order is irrelevant
            return StringUtils.join(frags, " "); // rejoin them with a single space between them
        }
        return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects a string");
    }

    /**
     * Writes this function's description, parameters and return type as a
     * JSON object.
     *
     * @param writer  destination JSON writer
     * @param options serialization options (not used here)
     * @throws JSONException if the writer rejects the output
     */
    public void write(JSONWriter writer, Properties options)
        throws JSONException {

        writer.object();
        writer.key("description"); writer.value("Returns the fingerprint of s, a derived string that aims to be a more canonical form of it (this is mostly useful for finding clusters of strings related to the same information).");
        writer.key("params"); writer.value("string s");
        writer.key("returns"); writer.value("string");
        writer.endObject();
    }
}

View File

@ -15,9 +15,9 @@ public class Unicode implements Function {
if (args.length == 1 && args[0] != null) {
Object o = args[0];
String s = (o instanceof String) ? (String) o : o.toString();
int[] output = new int[s.length()];
Integer[] output = new Integer[s.length()];
for (int i = 0; i < s.length(); i++) {
output[i] = Character.getNumericValue(s.codePointAt(i));
output[i] = s.codePointAt(i);
}
return output;
}

View File

@ -88,7 +88,7 @@ public class TextTransformOperation extends EngineDependentMassCellOperation {
Object v = eval.evaluate(bindings);
if ((cell != null && cell.value != null) || v != null) {
Cell newCell = new Cell(v, cell.recon);
Cell newCell = new Cell(v, (cell != null) ? cell.recon : null);
CellChange cellChange = new CellChange(rowIndex, cellIndex, cell, newCell);
cellChanges.add(cellChange);