diff --git a/main/src/com/google/refine/expr/functions/xml/WholeText.java b/main/src/com/google/refine/expr/functions/xml/WholeText.java new file mode 100644 index 000000000..61d667023 --- /dev/null +++ b/main/src/com/google/refine/expr/functions/xml/WholeText.java @@ -0,0 +1,76 @@ +/* + +Copyright 2010,2011 Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +package com.google.refine.expr.functions.xml; + +import java.util.Properties; + +import org.jsoup.nodes.Element; + +import com.google.refine.expr.EvalError; +import com.google.refine.grel.ControlFunctionRegistry; +import com.google.refine.grel.Function; + +/** Expression function that returns the result of Element.wholeText() for a single jsoup Element argument, or an EvalError when the argument is missing or is not an Element. */ public class WholeText implements Function { + + /** Evaluates the function. The bindings parameter is not read by this method. When args holds exactly one org.jsoup.nodes.Element, the String produced by Element.wholeText() is returned; in every other case an EvalError is returned (not thrown) whose message starts with the name reported by ControlFunctionRegistry.getFunctionName(this). */ @Override + public Object call(Properties bindings, Object[] args) { + if (args.length == 1) { + Object o1 = args[0]; + /* NOTE(review): the explicit null test is redundant, since instanceof already evaluates to false for null. */ if (o1 != null && o1 instanceof Element) { + Element e1 = (Element)o1; + return e1.wholeText(); + + }else{ + /* Exactly one argument was supplied, but it is null or not an Element. */ return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an XML or HTML Element. Please first use parseXml() or parseHtml() and select(query) prior to using this function"); + } + } + /* Reached only when the argument count is not exactly one. */ return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects a single XML or HTML element as an argument"); + } + + + /** Returns the fixed description text for this function. */ @Override + public String getDescription() { + return "Selects the (unencoded) text of an element and its children, including any newlines and spaces, and returns unencoded, un-normalized text"; + } + + /** Returns the fixed parameter summary text for this function. */ @Override + public String getParams() { + return "Element e"; + } + + /** Returns the fixed return-value summary text for this function. */ @Override + public String getReturns() { + return "String text"; + } +} diff --git a/main/src/com/google/refine/grel/ControlFunctionRegistry.java b/main/src/com/google/refine/grel/ControlFunctionRegistry.java index 3a6502ba1..fb66b9fea 100644 --- a/main/src/com/google/refine/grel/ControlFunctionRegistry.java +++ b/main/src/com/google/refine/grel/ControlFunctionRegistry.java @@ -137,6 +137,7 @@ import com.google.refine.expr.functions.xml.InnerXml; import com.google.refine.expr.functions.xml.OwnText; import com.google.refine.expr.functions.xml.ParseXml; import com.google.refine.expr.functions.xml.SelectXml; +import com.google.refine.expr.functions.xml.WholeText; import com.google.refine.expr.functions.xml.XmlAttr; import com.google.refine.expr.functions.xml.XmlText; import 
com.google.refine.grel.controls.Filter; @@ -256,6 +257,7 @@ public class ControlFunctionRegistry { registerFunction("innerXml", new InnerXml()); registerFunction("innerHtml", new InnerHtml()); registerFunction("ownText", new OwnText()); + registerFunction("wholeText", new WholeText()); registerFunction("indexOf", new IndexOf()); registerFunction("lastIndexOf", new LastIndexOf()); diff --git a/main/tests/server/src/com/google/refine/expr/functions/html/ParseHtmlTests.java b/main/tests/server/src/com/google/refine/expr/functions/html/ParseHtmlTests.java index d81a78b39..7b403b08e 100644 --- a/main/tests/server/src/com/google/refine/expr/functions/html/ParseHtmlTests.java +++ b/main/tests/server/src/com/google/refine/expr/functions/html/ParseHtmlTests.java @@ -51,6 +51,13 @@ public class ParseHtmlTests extends RefineTest { "

para1 strong text

\n" + "

para2

\n" + " \n" + + "
\n" + + " Me : Make a 2nd game ?\n" + + "
Dev : Nah man , too much work.\n" + + "
Me : So what's it gonna be ?\n" + + "
Dev : REMASTER !!!!\n" + + "
" + + "
" + " \n" + ""; @@ -76,6 +83,8 @@ public class ParseHtmlTests extends RefineTest { Assert.assertEquals(invoke("htmlAttr",Jsoup.parse(h).select("div").first(),"class"),"class1"); Assert.assertEquals(invoke("htmlText",Jsoup.parse(h).select("div").first()),"para1 strong text para2"); Assert.assertEquals(invoke("ownText",Jsoup.parse(h).select("p").first()),"para1"); + Assert.assertTrue(invoke("wholeText",Jsoup.parse(h).select("div.commentthread_comment_text").first()) instanceof String); + Assert.assertEquals(invoke("wholeText",Jsoup.parse(h).select("div.commentthread_comment_text").first()),"\n Me : Make a 2nd game ?\n Dev : Nah man , too much work.\n Me : So what's it gonna be ?\n Dev : REMASTER !!!!\n "); } }