Adds new Jsoup wholeText() function and tests (#3181)
* Adds new Jsoup wholeText() function and tests - Ref: https://github.com/jhy/jsoup/blob/master/CHANGES#L275 - Ref: https://jsoup.org/apidocs/org/jsoup/nodes/Element.html#wholeText() * update the description of function * Update main/src/com/google/refine/expr/functions/xml/WholeText.java
This commit is contained in:
parent
7c0607b890
commit
3f6d1eabba
76
main/src/com/google/refine/expr/functions/xml/WholeText.java
Normal file
76
main/src/com/google/refine/expr/functions/xml/WholeText.java
Normal file
@ -0,0 +1,76 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
Copyright 2010,2011 Google Inc.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
* Redistributions in binary form must reproduce the above
|
||||||
|
copyright notice, this list of conditions and the following disclaimer
|
||||||
|
in the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
* Neither the name of Google Inc. nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
package com.google.refine.expr.functions.xml;
|
||||||
|
|
||||||
|
import java.util.Properties;
|
||||||
|
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
|
||||||
|
import com.google.refine.expr.EvalError;
|
||||||
|
import com.google.refine.grel.ControlFunctionRegistry;
|
||||||
|
import com.google.refine.grel.Function;
|
||||||
|
|
||||||
|
public class WholeText implements Function {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Object call(Properties bindings, Object[] args) {
|
||||||
|
if (args.length == 1) {
|
||||||
|
Object o1 = args[0];
|
||||||
|
if (o1 != null && o1 instanceof Element) {
|
||||||
|
Element e1 = (Element)o1;
|
||||||
|
return e1.wholeText();
|
||||||
|
|
||||||
|
}else{
|
||||||
|
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an XML or HTML Element. Please first use parseXml() or parseHtml() and select(query) prior to using this function");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects a single XML or HTML element as an argument");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getDescription() {
|
||||||
|
return "Selects the (unencoded) text of an element and its children, including any newlines and spaces, and returns unencoded, un-normalized text";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getParams() {
|
||||||
|
return "Element e";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getReturns() {
|
||||||
|
return "String text";
|
||||||
|
}
|
||||||
|
}
|
@ -137,6 +137,7 @@ import com.google.refine.expr.functions.xml.InnerXml;
|
|||||||
import com.google.refine.expr.functions.xml.OwnText;
|
import com.google.refine.expr.functions.xml.OwnText;
|
||||||
import com.google.refine.expr.functions.xml.ParseXml;
|
import com.google.refine.expr.functions.xml.ParseXml;
|
||||||
import com.google.refine.expr.functions.xml.SelectXml;
|
import com.google.refine.expr.functions.xml.SelectXml;
|
||||||
|
import com.google.refine.expr.functions.xml.WholeText;
|
||||||
import com.google.refine.expr.functions.xml.XmlAttr;
|
import com.google.refine.expr.functions.xml.XmlAttr;
|
||||||
import com.google.refine.expr.functions.xml.XmlText;
|
import com.google.refine.expr.functions.xml.XmlText;
|
||||||
import com.google.refine.grel.controls.Filter;
|
import com.google.refine.grel.controls.Filter;
|
||||||
@ -256,6 +257,7 @@ public class ControlFunctionRegistry {
|
|||||||
registerFunction("innerXml", new InnerXml());
|
registerFunction("innerXml", new InnerXml());
|
||||||
registerFunction("innerHtml", new InnerHtml());
|
registerFunction("innerHtml", new InnerHtml());
|
||||||
registerFunction("ownText", new OwnText());
|
registerFunction("ownText", new OwnText());
|
||||||
|
registerFunction("wholeText", new WholeText());
|
||||||
|
|
||||||
registerFunction("indexOf", new IndexOf());
|
registerFunction("indexOf", new IndexOf());
|
||||||
registerFunction("lastIndexOf", new LastIndexOf());
|
registerFunction("lastIndexOf", new LastIndexOf());
|
||||||
|
@ -51,6 +51,13 @@ public class ParseHtmlTests extends RefineTest {
|
|||||||
" <p>para1 <strong>strong text</strong></p>\n" +
|
" <p>para1 <strong>strong text</strong></p>\n" +
|
||||||
" <p>para2</p>\n" +
|
" <p>para2</p>\n" +
|
||||||
" </div>\n" +
|
" </div>\n" +
|
||||||
|
" <div class=\"commentthread_comment_text\" id=\"comment_content_257769\">\n" +
|
||||||
|
" Me : Make a 2nd game ?\n" +
|
||||||
|
" <br>Dev : Nah man , too much work.\n" +
|
||||||
|
" <br>Me : So what's it gonna be ?\n" +
|
||||||
|
" <br>Dev : REMASTER !!!!\n" +
|
||||||
|
" <br>" +
|
||||||
|
"</div>" +
|
||||||
" </body>\n" +
|
" </body>\n" +
|
||||||
"</html>";
|
"</html>";
|
||||||
|
|
||||||
@ -76,6 +83,8 @@ public class ParseHtmlTests extends RefineTest {
|
|||||||
Assert.assertEquals(invoke("htmlAttr",Jsoup.parse(h).select("div").first(),"class"),"class1");
|
Assert.assertEquals(invoke("htmlAttr",Jsoup.parse(h).select("div").first(),"class"),"class1");
|
||||||
Assert.assertEquals(invoke("htmlText",Jsoup.parse(h).select("div").first()),"para1 strong text para2");
|
Assert.assertEquals(invoke("htmlText",Jsoup.parse(h).select("div").first()),"para1 strong text para2");
|
||||||
Assert.assertEquals(invoke("ownText",Jsoup.parse(h).select("p").first()),"para1");
|
Assert.assertEquals(invoke("ownText",Jsoup.parse(h).select("p").first()),"para1");
|
||||||
|
Assert.assertTrue(invoke("wholeText",Jsoup.parse(h).select("div.commentthread_comment_text").first()) instanceof String);
|
||||||
|
Assert.assertEquals(invoke("wholeText",Jsoup.parse(h).select("div.commentthread_comment_text").first()),"\n Me : Make a 2nd game ?\n Dev : Nah man , too much work.\n Me : So what's it gonna be ?\n Dev : REMASTER !!!!\n ");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user