From d9e67ac806c1fab14e782d8f04436e4e0ee4b721 Mon Sep 17 00:00:00 2001
From: Stefano Mazzocchi
Date: Sun, 28 Feb 2010 00:55:09 +0000
Subject: [PATCH] - diff now can act between two dates (still to be fully tested)
 - added string fingerprinting function (useful for clustering)
 - fixed unicode() function which wasn't returning correct values
 - added a toString method to EvalError to know what error that was
 - fixed a NPE in TextTransformationOperation

git-svn-id: http://google-refine.googlecode.com/svn/trunk@153 7d457c2a-affb-35e4-300a-418c747d4874
---
Review note (this area is not part of the commit message): in Diff.call the
optional time unit is the third argument, so it is read from args[2]. The
originally submitted args[3] is out of bounds for the three-element array
that this branch requires (args.length == 3). The post-image blob id on the
Diff.java index line predates this one-line correction.

 .../expr/ControlFunctionRegistry.java         |  2 +
 .../com/metaweb/gridworks/expr/EvalError.java |  4 ++
 .../expr/functions/strings/Diff.java          | 41 +++++++++++++++---
 .../expr/functions/strings/Fingerprint.java   | 43 +++++++++++++++++++
 .../expr/functions/strings/Unicode.java       |  4 +-
 .../operations/TextTransformOperation.java    |  2 +-
 6 files changed, 87 insertions(+), 9 deletions(-)
 create mode 100644 src/main/java/com/metaweb/gridworks/expr/functions/strings/Fingerprint.java

diff --git a/src/main/java/com/metaweb/gridworks/expr/ControlFunctionRegistry.java b/src/main/java/com/metaweb/gridworks/expr/ControlFunctionRegistry.java
index 9dab18ecc..d68262db4 100644
--- a/src/main/java/com/metaweb/gridworks/expr/ControlFunctionRegistry.java
+++ b/src/main/java/com/metaweb/gridworks/expr/ControlFunctionRegistry.java
@@ -34,6 +34,7 @@ import com.metaweb.gridworks.expr.functions.math.Round;
 import com.metaweb.gridworks.expr.functions.strings.Contains;
 import com.metaweb.gridworks.expr.functions.strings.Diff;
 import com.metaweb.gridworks.expr.functions.strings.EndsWith;
+import com.metaweb.gridworks.expr.functions.strings.Fingerprint;
 import com.metaweb.gridworks.expr.functions.strings.IndexOf;
 import com.metaweb.gridworks.expr.functions.strings.LastIndexOf;
 import com.metaweb.gridworks.expr.functions.strings.MD5;
@@ -125,6 +126,7 @@ public class ControlFunctionRegistry {
         registerFunction("unicode", new Unicode());
         registerFunction("diff", new Diff());
         registerFunction("chomp", new Diff());
+        registerFunction("fingerprint", new Fingerprint());
 
         registerFunction("indexOf", new IndexOf());
         registerFunction("lastIndexOf", new LastIndexOf());
diff --git a/src/main/java/com/metaweb/gridworks/expr/EvalError.java b/src/main/java/com/metaweb/gridworks/expr/EvalError.java
index ffb4e559d..ee855a9fc 100644
--- a/src/main/java/com/metaweb/gridworks/expr/EvalError.java
+++ b/src/main/java/com/metaweb/gridworks/expr/EvalError.java
@@ -14,6 +14,10 @@ public class EvalError implements Jsonizable {
         this.message = message;
     }
 
+    public String toString() {
+        return this.message;
+    }
+
     public void write(JSONWriter writer, Properties options)
         throws JSONException {
 
diff --git a/src/main/java/com/metaweb/gridworks/expr/functions/strings/Diff.java b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Diff.java
index 7eb4bd653..b073d59eb 100644
--- a/src/main/java/com/metaweb/gridworks/expr/functions/strings/Diff.java
+++ b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Diff.java
@@ -1,21 +1,50 @@
 package com.metaweb.gridworks.expr.functions.strings;
 
+import java.util.Calendar;
 import java.util.Properties;
 
 import org.apache.commons.lang.StringUtils;
 import org.json.JSONException;
 import org.json.JSONWriter;
 
+import com.metaweb.gridworks.expr.CalendarParser;
+import com.metaweb.gridworks.expr.CalendarParserException;
 import com.metaweb.gridworks.expr.Function;
 
 public class Diff implements Function {
 
     public Object call(Properties bindings, Object[] args) {
-        if (args.length == 2) {
+        if (args.length >= 2 && args.length <= 3) {
             Object o1 = args[0];
             Object o2 = args[1];
-            if (o1 != null && o2 != null && o1 instanceof String && o2 instanceof String) {
-                return StringUtils.difference((String) o1,(String) o2);
+            if (o1 != null && o2 != null) {
+                if (o1 instanceof String && o2 instanceof String) {
+                    return StringUtils.difference((String) o1,(String) o2);
+                } else if (o1 instanceof Calendar) {
+                    if (args.length == 3) {
+                        Object o3 = args[2]; // the time unit is the third argument, i.e. index 2 (index 3 is out of bounds here)
+                        if (o3 != null && o3 instanceof String) {
+                            try {
+                                String unit = ((String) o3).toLowerCase();
+                                Calendar c1 = (Calendar) o1;
+                                Calendar c2 = (o2 instanceof Calendar) ? (Calendar) o2 : CalendarParser.parse((o2 instanceof String) ? (String) o2 : o2.toString());
+                                long delta = (c1.getTimeInMillis() - c2.getTimeInMillis()) / 1000;
+                                if ("seconds".equals(unit)) return delta;
+                                delta /= 60;
+                                if ("minutes".equals(unit)) return delta;
+                                delta /= 60;
+                                if ("hours".equals(unit)) return delta;
+                                long days = delta / 24;
+                                if ("days".equals(unit)) return days;
+                                if ("weeks".equals(unit)) return days / 7;
+                                if ("months".equals(unit)) return days / 30;
+                                if ("years".equals(unit)) return days / 365;
+                            } catch (CalendarParserException e) {
+                                // we should throw at this point because it's important to know that date parsing failed
+                            }
+                        }
+                    }
+                }
             }
         }
         return null;
@@ -25,9 +54,9 @@ public class Diff implements Function {
         throws JSONException {
 
         writer.object();
-        writer.key("description"); writer.value("Compares two Strings, and returns the portion where they differ. (More precisely, return the remainder of the second String, starting from where it's different from the first.)");
-        writer.key("params"); writer.value("string s, string v");
-        writer.key("returns"); writer.value("string");
+        writer.key("description"); writer.value("For strings, returns the portion where they differ. For dates, it returns the difference in given time units");
+        writer.key("params"); writer.value("o1, o2, time unit (optional)");
+        writer.key("returns"); writer.value("string for strings, number for dates");
         writer.endObject();
     }
 }
diff --git a/src/main/java/com/metaweb/gridworks/expr/functions/strings/Fingerprint.java b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Fingerprint.java
new file mode 100644
index 000000000..117a7067a
--- /dev/null
+++ b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Fingerprint.java
@@ -0,0 +1,43 @@
+package com.metaweb.gridworks.expr.functions.strings;
+
+import java.util.Arrays;
+import java.util.Properties;
+import java.util.regex.Pattern;
+
+import org.apache.commons.lang.StringUtils;
+import org.json.JSONException;
+import org.json.JSONWriter;
+
+import com.metaweb.gridworks.expr.ControlFunctionRegistry;
+import com.metaweb.gridworks.expr.EvalError;
+import com.metaweb.gridworks.expr.Function;
+
+public class Fingerprint implements Function {
+
+    static final Pattern alphanum = Pattern.compile("\\p{Punct}|\\p{Cntrl}");
+
+    public Object call(Properties bindings, Object[] args) {
+        if (args.length == 1 && args[0] != null) {
+            Object o = args[0];
+            String s = (o instanceof String) ? (String) o : o.toString();
+            s = s.trim(); // first off, remove whitespace around the string
+            s = s.toLowerCase(); // then lowercase it
+            s = alphanum.matcher(s).replaceAll(""); // then remove all punctuation and control chars
+            String[] frags = StringUtils.split(s); // split by whitespace
+            Arrays.sort(frags); // sort the fragments
+            return StringUtils.join(frags," "); // rejoin them with a single space between them
+
+        }
+        return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects a string");
+    }
+
+    public void write(JSONWriter writer, Properties options)
+        throws JSONException {
+
+        writer.object();
+        writer.key("description"); writer.value("Returns the fingerprint of s, a derived string that aims to be a more canonical form of it (this is mostly useful for finding clusters of strings related to the same information).");
+        writer.key("params"); writer.value("string s");
+        writer.key("returns"); writer.value("string");
+        writer.endObject();
+    }
+}
diff --git a/src/main/java/com/metaweb/gridworks/expr/functions/strings/Unicode.java b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Unicode.java
index 5047915db..60cbdf4ea 100644
--- a/src/main/java/com/metaweb/gridworks/expr/functions/strings/Unicode.java
+++ b/src/main/java/com/metaweb/gridworks/expr/functions/strings/Unicode.java
@@ -15,9 +15,9 @@ public class Unicode implements Function {
         if (args.length == 1 && args[0] != null) {
             Object o = args[0];
             String s = (o instanceof String) ? (String) o : o.toString();
-            int[] output = new int[s.length()];
+            Integer[] output = new Integer[s.length()];
             for (int i = 0; i < s.length(); i++) {
-                output[i] = Character.getNumericValue(s.codePointAt(i));
+                output[i] = s.codePointAt(i);
             }
             return output;
         }
diff --git a/src/main/java/com/metaweb/gridworks/operations/TextTransformOperation.java b/src/main/java/com/metaweb/gridworks/operations/TextTransformOperation.java
index 17568401b..238b50b4a 100644
--- a/src/main/java/com/metaweb/gridworks/operations/TextTransformOperation.java
+++ b/src/main/java/com/metaweb/gridworks/operations/TextTransformOperation.java
@@ -88,7 +88,7 @@ public class TextTransformOperation extends EngineDependentMassCellOperation {
 
                 Object v = eval.evaluate(bindings);
                 if ((cell != null && cell.value != null) || v != null) {
-                    Cell newCell = new Cell(v, cell.recon);
+                    Cell newCell = new Cell(v, (cell != null) ? cell.recon : null);
 
                     CellChange cellChange = new CellChange(rowIndex, cellIndex, cell, newCell);
                     cellChanges.add(cellChange);