From c5312a2e6acfbb0d5503dfa3742339cf34fb65a0 Mon Sep 17 00:00:00 2001 From: Tom Morris Date: Wed, 23 Feb 2011 19:40:35 +0000 Subject: [PATCH] Issue 338 - patch from Thad Guidry to provide function which calls JSoup ownText() method git-svn-id: http://google-refine.googlecode.com/svn/trunk@2025 7d457c2a-affb-35e4-300a-418c747d4874 --- .../refine/expr/functions/html/HtmlText.java | 4 +- .../refine/expr/functions/html/OwnText.java | 73 +++++++++++++++++++ .../refine/grel/ControlFunctionRegistry.java | 5 +- 3 files changed, 79 insertions(+), 3 deletions(-) create mode 100644 main/src/com/google/refine/expr/functions/html/OwnText.java diff --git a/main/src/com/google/refine/expr/functions/html/HtmlText.java b/main/src/com/google/refine/expr/functions/html/HtmlText.java index 66a2dcb85..b3dc2bcdc 100644 --- a/main/src/com/google/refine/expr/functions/html/HtmlText.java +++ b/main/src/com/google/refine/expr/functions/html/HtmlText.java @@ -1,6 +1,6 @@ /* -Copyright 2010, Google Inc. +Copyright 2010,2011 Google Inc. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -64,7 +64,7 @@ public class HtmlText implements Function { throws JSONException { writer.object(); - writer.key("description"); writer.value("Selects the text from within an element"); + writer.key("description"); writer.value("Selects the text from within an element (including all child elements)"); writer.key("params"); writer.value("Element e"); writer.key("returns"); writer.value("String text"); writer.endObject(); diff --git a/main/src/com/google/refine/expr/functions/html/OwnText.java b/main/src/com/google/refine/expr/functions/html/OwnText.java new file mode 100644 index 000000000..3fdd22e2c --- /dev/null +++ b/main/src/com/google/refine/expr/functions/html/OwnText.java @@ -0,0 +1,73 @@ +/* + +Copyright 2011, Google Inc. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +package com.google.refine.expr.functions.html; + +import java.util.Properties; + +import org.json.JSONException; +import org.json.JSONWriter; +import org.jsoup.nodes.Element; + +import com.google.refine.expr.EvalError; +import com.google.refine.grel.ControlFunctionRegistry; +import com.google.refine.grel.Function; +/** GREL Function implementation that returns the text owned directly by a Jsoup Element by delegating to Element.ownText(). */ +public class OwnText implements Function { + /** Returns the ownText() of args[0] when it is a Jsoup Element. If args[0] is anything else, null included, an EvalError value naming this function is returned instead. With an empty args array the result is null. The bindings parameter is not read here. */ + public Object call(Properties bindings, Object[] args) { + if (args.length >= 1) { + Object o1 = args[0]; + if (o1 != null && o1 instanceof Element) { + Element e1 = (Element)o1; + return e1.ownText(); + /* not an Element, or null: the misuse is reported as a returned EvalError rather than a thrown exception */ + }else{ + return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an HTML Element. Please first use parseHtml(string) and select(query) prior to using this function"); + } + } + return null; /* reached only when args is empty */ + } + + /** Emits the self-description of this function as one JSON object with the keys description, params and returns. The options parameter is not read here. Declared to throw JSONException, which the writer calls may raise. */ + public void write(JSONWriter writer, Properties options) + throws JSONException { + + writer.object(); + writer.key("description"); writer.value("Gets the text owned by this element only; does not get the combined text of all children."); + writer.key("params"); writer.value("Element e"); + writer.key("returns"); writer.value("String ownText"); + writer.endObject(); + } +} + diff --git a/main/src/com/google/refine/grel/ControlFunctionRegistry.java b/main/src/com/google/refine/grel/ControlFunctionRegistry.java index bccd8a436..26affe98a 100644 --- a/main/src/com/google/refine/grel/ControlFunctionRegistry.java +++ b/main/src/com/google/refine/grel/ControlFunctionRegistry.java @@ -1,6 +1,6 @@ /* -Copyright 2010, Google Inc. +Copyright 2010,2011 Google Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -101,6 +101,7 @@ import com.google.refine.expr.functions.strings.Fingerprint; import com.google.refine.expr.functions.html.HtmlAttr; import com.google.refine.expr.functions.html.HtmlText; import com.google.refine.expr.functions.html.InnerHtml; +import com.google.refine.expr.functions.html.OwnText; import com.google.refine.expr.functions.strings.IndexOf; import com.google.refine.expr.functions.strings.LastIndexOf; import com.google.refine.expr.functions.strings.MD5; @@ -225,11 +226,13 @@ public class ControlFunctionRegistry { registerFunction("ngram", new NGram()); registerFunction("match", new Match()); + // HTML functions from JSoup registerFunction("parseHtml", new ParseHtml()); registerFunction("select", new SelectHtml()); registerFunction("htmlAttr", new HtmlAttr()); registerFunction("htmlText", new HtmlText()); registerFunction("innerHtml", new InnerHtml()); + registerFunction("ownText", new OwnText()); registerFunction("indexOf", new IndexOf()); registerFunction("lastIndexOf", new LastIndexOf());