From b5355b3c1243bce0f2df345c2fdfcdb991cc8d30 Mon Sep 17 00:00:00 2001 From: Owen Stephens Date: Sun, 18 Nov 2018 13:12:41 +0000 Subject: [PATCH 01/10] Correct typo --- main/src/com/google/refine/expr/functions/html/SelectHtml.java | 2 +- .../refine/tests/expr/functions/html/SelectHtmlTests.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/main/src/com/google/refine/expr/functions/html/SelectHtml.java b/main/src/com/google/refine/expr/functions/html/SelectHtml.java index 04528852b..114e7bd74 100644 --- a/main/src/com/google/refine/expr/functions/html/SelectHtml.java +++ b/main/src/com/google/refine/expr/functions/html/SelectHtml.java @@ -68,7 +68,7 @@ public class SelectHtml implements Function { throws JSONException { writer.object(); - writer.key("description"); writer.value("Selects an element from an HTML elementn using selector syntax"); + writer.key("description"); writer.value("Selects an element from an HTML element using selector syntax"); writer.key("params"); writer.value("Element e, String s"); writer.key("returns"); writer.value("HTML Elements"); writer.endObject(); diff --git a/main/tests/server/src/com/google/refine/tests/expr/functions/html/SelectHtmlTests.java b/main/tests/server/src/com/google/refine/tests/expr/functions/html/SelectHtmlTests.java index 62e237c65..351dd91bc 100644 --- a/main/tests/server/src/com/google/refine/tests/expr/functions/html/SelectHtmlTests.java +++ b/main/tests/server/src/com/google/refine/tests/expr/functions/html/SelectHtmlTests.java @@ -8,7 +8,7 @@ import com.google.refine.tests.util.TestUtils; public class SelectHtmlTests { @Test public void serializeSelectHtml() { - String json = "{\"description\":\"Selects an element from an HTML elementn using selector syntax\",\"params\":\"Element e, String s\",\"returns\":\"HTML Elements\"}"; + String json = "{\"description\":\"Selects an element from an HTML element using selector syntax\",\"params\":\"Element e, String s\",\"returns\":\"HTML 
Elements\"}"; TestUtils.isSerializedTo(new SelectHtml(), json); } } From e38dcdf7a76b21812101828de6e1bdefb6d1d803 Mon Sep 17 00:00:00 2001 From: Owen Stephens Date: Tue, 20 Nov 2018 20:18:58 +0000 Subject: [PATCH 02/10] Add intial tests for XML parsing --- .../expr/functions/xml/InnerXmlTests.java | 15 ++++ .../expr/functions/xml/OwnTextTests.java | 15 ++++ .../expr/functions/xml/ParseXmlTests.java | 85 +++++++++++++++++++ .../expr/functions/xml/SelectXmlTests.java | 15 ++++ .../expr/functions/xml/xmlAttrTests.java | 15 ++++ .../expr/functions/xml/xmlTextTests.java | 15 ++++ 6 files changed, 160 insertions(+) create mode 100644 main/tests/server/src/com/google/refine/tests/expr/functions/xml/InnerXmlTests.java create mode 100644 main/tests/server/src/com/google/refine/tests/expr/functions/xml/OwnTextTests.java create mode 100644 main/tests/server/src/com/google/refine/tests/expr/functions/xml/ParseXmlTests.java create mode 100644 main/tests/server/src/com/google/refine/tests/expr/functions/xml/SelectXmlTests.java create mode 100644 main/tests/server/src/com/google/refine/tests/expr/functions/xml/xmlAttrTests.java create mode 100644 main/tests/server/src/com/google/refine/tests/expr/functions/xml/xmlTextTests.java diff --git a/main/tests/server/src/com/google/refine/tests/expr/functions/xml/InnerXmlTests.java b/main/tests/server/src/com/google/refine/tests/expr/functions/xml/InnerXmlTests.java new file mode 100644 index 000000000..af39cbd65 --- /dev/null +++ b/main/tests/server/src/com/google/refine/tests/expr/functions/xml/InnerXmlTests.java @@ -0,0 +1,15 @@ +package com.google.refine.tests.expr.functions.xml; + +import org.testng.annotations.Test; + +import com.google.refine.expr.functions.xml.InnerXml; +import com.google.refine.tests.util.TestUtils; + +public class InnerXmlTests { + @Test + public void serializeInnerXml() { + String json = "{\"description\":\"The innerXml/innerHtml of an XML/HTML element\",\"params\":\"Element e\",\"returns\":\"String 
innerXml/innerHtml\"}"; + TestUtils.isSerializedTo(new InnerXml(), json); + } +} + diff --git a/main/tests/server/src/com/google/refine/tests/expr/functions/xml/OwnTextTests.java b/main/tests/server/src/com/google/refine/tests/expr/functions/xml/OwnTextTests.java new file mode 100644 index 000000000..f1b7c688b --- /dev/null +++ b/main/tests/server/src/com/google/refine/tests/expr/functions/xml/OwnTextTests.java @@ -0,0 +1,15 @@ +package com.google.refine.tests.expr.functions.html; + +import org.testng.annotations.Test; + +import com.google.refine.expr.functions.html.OwnText; +import com.google.refine.tests.util.TestUtils; + +public class OwnTextTests { + @Test + public void serializeOwnText() { + String json = "{\"description\":\"Gets the text owned by this XML/HTML element only; does not get the combined text of all children.\",\"params\":\"Element e\",\"returns\":\"String ownText\"}"; + TestUtils.isSerializedTo(new OwnText(), json); + } +} + diff --git a/main/tests/server/src/com/google/refine/tests/expr/functions/xml/ParseXmlTests.java b/main/tests/server/src/com/google/refine/tests/expr/functions/xml/ParseXmlTests.java new file mode 100644 index 000000000..8da1d99fb --- /dev/null +++ b/main/tests/server/src/com/google/refine/tests/expr/functions/xml/ParseXmlTests.java @@ -0,0 +1,85 @@ +package com.google.refine.tests.expr.functions.xml; + +import org.testng.annotations.Test; + +import java.util.Properties; + +import org.slf4j.LoggerFactory; +import org.testng.Assert; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.BeforeTest; + +import com.google.refine.expr.EvalError; +import com.google.refine.expr.functions.xml.ParseXml; +import com.google.refine.grel.ControlFunctionRegistry; +import com.google.refine.grel.Function; +import com.google.refine.tests.RefineTest; +import com.google.refine.tests.util.TestUtils; + + +public class ParseXmlTests extends RefineTest { + + static Properties 
bindings; + static String x = "\n" + + "\n" + + " \n" + + " John Doe\n" + + " head1\n" + + " head2\n" + + " body1\n" + + " \n" + + " \n" + + " \n" + + " Héloïse Dupont\n" + + " head3\n" + + " body2\n" + + " \n" + + " \n" + + ""; + + @Override + @BeforeTest + public void init() { + logger = LoggerFactory.getLogger(this.getClass()); + } + + @BeforeMethod + public void SetUp() { + bindings = new Properties(); + } + + @AfterMethod + public void TearDown() { + bindings = null; + } + + /** + * Lookup a control function by name and invoke it with a variable number of args + */ + private static Object invoke(String name,Object... args) { + // registry uses static initializer, so no need to set it up + Function function = ControlFunctionRegistry.getFunction(name); + if (function == null) { + throw new IllegalArgumentException("Unknown function "+name); + } + if (args == null) { + return function.call(bindings,new Object[0]); + } else { + return function.call(bindings,args); + } + } + + @Test + public void serializeParseXml() { + String json = "{\"description\":\"Parses a string as XML\",\"params\":\"string s\",\"returns\":\"XML object\"}"; + TestUtils.isSerializedTo(new ParseXml(), json); + } + + @Test + public void testParseXml() { + Assert.assertTrue(invoke("parseXml") instanceof EvalError); + Assert.assertTrue(invoke("parseXml","x") instanceof org.jsoup.nodes.Document); + } +} + diff --git a/main/tests/server/src/com/google/refine/tests/expr/functions/xml/SelectXmlTests.java b/main/tests/server/src/com/google/refine/tests/expr/functions/xml/SelectXmlTests.java new file mode 100644 index 000000000..2bb6459de --- /dev/null +++ b/main/tests/server/src/com/google/refine/tests/expr/functions/xml/SelectXmlTests.java @@ -0,0 +1,15 @@ +package com.google.refine.tests.expr.functions.xml; + +import org.testng.annotations.Test; + +import com.google.refine.expr.functions.xml.SelectXml; +import com.google.refine.tests.util.TestUtils; + +public class SelectXmlTests { + @Test + public 
void serializeSelectXml() { + String json = "{\"description\":\"Selects an element from an XML or HTML element using selector syntax.\",\"params\":\"Element e, String s\",\"returns\":\"XML/HTML Elements\"}"; + TestUtils.isSerializedTo(new SelectXml(), json); + } +} + diff --git a/main/tests/server/src/com/google/refine/tests/expr/functions/xml/xmlAttrTests.java b/main/tests/server/src/com/google/refine/tests/expr/functions/xml/xmlAttrTests.java new file mode 100644 index 000000000..2486452c8 --- /dev/null +++ b/main/tests/server/src/com/google/refine/tests/expr/functions/xml/xmlAttrTests.java @@ -0,0 +1,15 @@ +package com.google.refine.tests.expr.functions.xml; + +import org.testng.annotations.Test; + +import com.google.refine.expr.functions.xml.XmlAttr; +import com.google.refine.tests.util.TestUtils; + +public class xmlAttrTests { + @Test + public void serializeXmlAttr() { + String json = "{\"description\":\"Selects a value from an attribute on an xml or html Element.\",\"params\":\"Element e, String s\",\"returns\":\"String attribute Value\"}"; + TestUtils.isSerializedTo(new XmlAttr(), json); + } +} + diff --git a/main/tests/server/src/com/google/refine/tests/expr/functions/xml/xmlTextTests.java b/main/tests/server/src/com/google/refine/tests/expr/functions/xml/xmlTextTests.java new file mode 100644 index 000000000..c13b96367 --- /dev/null +++ b/main/tests/server/src/com/google/refine/tests/expr/functions/xml/xmlTextTests.java @@ -0,0 +1,15 @@ +package com.google.refine.tests.expr.functions.xml; + +import org.testng.annotations.Test; + +import com.google.refine.expr.functions.xml.XmlText; +import com.google.refine.tests.util.TestUtils; + +public class xmlTextTests { + @Test + public void serializeXmlText() { + String json = "{\"description\":\"Selects the text from within an element (including all child elements)\",\"params\":\"Element e\",\"returns\":\"String text\"}"; + TestUtils.isSerializedTo(new XmlText(), json); + } +} + From 
c4f2ccd35371f848e97182f004d47500c36a9f4d Mon Sep 17 00:00:00 2001 From: Owen Stephens Date: Tue, 20 Nov 2018 20:19:46 +0000 Subject: [PATCH 03/10] Extend parseHtml tests --- .../expr/functions/html/ParseHtmlTests.java | 66 ++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/main/tests/server/src/com/google/refine/tests/expr/functions/html/ParseHtmlTests.java b/main/tests/server/src/com/google/refine/tests/expr/functions/html/ParseHtmlTests.java index 3182a9ee1..8edf4775c 100644 --- a/main/tests/server/src/com/google/refine/tests/expr/functions/html/ParseHtmlTests.java +++ b/main/tests/server/src/com/google/refine/tests/expr/functions/html/ParseHtmlTests.java @@ -2,14 +2,78 @@ package com.google.refine.tests.expr.functions.html; import org.testng.annotations.Test; +import java.util.Properties; + +import org.slf4j.LoggerFactory; +import org.testng.Assert; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.BeforeTest; + +import com.google.refine.expr.EvalError; import com.google.refine.expr.functions.html.ParseHtml; +import com.google.refine.grel.ControlFunctionRegistry; +import com.google.refine.grel.Function; +import com.google.refine.tests.RefineTest; import com.google.refine.tests.util.TestUtils; -public class ParseHtmlTests { +public class ParseHtmlTests extends RefineTest { + + static Properties bindings; + static String h = "\n" + + "\n" + + "\n" + + " \n" + + "

head1

\n" + + "
\n" + + "

para1

\n" + + "

para2

\n" + + "
\n" + + " \n" + + ""; + + @Override + @BeforeTest + public void init() { + logger = LoggerFactory.getLogger(this.getClass()); + } + + @BeforeMethod + public void SetUp() { + bindings = new Properties(); + } + + @AfterMethod + public void TearDown() { + bindings = null; + } + + /** + * Lookup a control function by name and invoke it with a variable number of args + */ + private static Object invoke(String name,Object... args) { + // registry uses static initializer, so no need to set it up + Function function = ControlFunctionRegistry.getFunction(name); + if (function == null) { + throw new IllegalArgumentException("Unknown function "+name); + } + if (args == null) { + return function.call(bindings,new Object[0]); + } else { + return function.call(bindings,args); + } + } + @Test public void serializeParseHtml() { String json = "{\"description\":\"Parses a string as HTML\",\"params\":\"string s\",\"returns\":\"HTML object\"}"; TestUtils.isSerializedTo(new ParseHtml(), json); } + + @Test + public void testParseHtml() { + Assert.assertTrue(invoke("parseHtml") instanceof EvalError); + Assert.assertTrue(invoke("parseHtml","h") instanceof org.jsoup.nodes.Document); + } } From 5678c44673470486c665aadeaa205c3efbcc5f70 Mon Sep 17 00:00:00 2001 From: Owen Stephens Date: Tue, 20 Nov 2018 20:20:04 +0000 Subject: [PATCH 04/10] Update JSOUP version --- main/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/pom.xml b/main/pom.xml index 720b0f64b..d6cec0813 100644 --- a/main/pom.xml +++ b/main/pom.xml @@ -345,7 +345,7 @@ org.jsoup jsoup - 1.4.1 + 1.11.3 net.sf.opencsv From bae3dbb81259b5c027876190c5627f1705aa092f Mon Sep 17 00:00:00 2001 From: Owen Stephens Date: Tue, 20 Nov 2018 20:20:51 +0000 Subject: [PATCH 05/10] Add XML parsing and update HTML parsing to use same classes --- .../refine/expr/functions/html/InnerHtml.java | 5 +- .../refine/expr/functions/html/ParseHtml.java | 9 +- .../refine/expr/functions/xml/InnerXml.java | 84 +++++++++++++++++++ 
.../refine/expr/functions/xml/OwnText.java | 75 +++++++++++++++++ .../refine/expr/functions/xml/ParseXml.java | 82 ++++++++++++++++++ .../refine/expr/functions/xml/SelectXml.java | 77 +++++++++++++++++ .../refine/expr/functions/xml/XmlAttr.java | 77 +++++++++++++++++ .../refine/expr/functions/xml/XmlText.java | 75 +++++++++++++++++ .../refine/grel/ControlFunctionRegistry.java | 23 +++-- 9 files changed, 494 insertions(+), 13 deletions(-) create mode 100644 main/src/com/google/refine/expr/functions/xml/InnerXml.java create mode 100644 main/src/com/google/refine/expr/functions/xml/OwnText.java create mode 100644 main/src/com/google/refine/expr/functions/xml/ParseXml.java create mode 100644 main/src/com/google/refine/expr/functions/xml/SelectXml.java create mode 100644 main/src/com/google/refine/expr/functions/xml/XmlAttr.java create mode 100644 main/src/com/google/refine/expr/functions/xml/XmlText.java diff --git a/main/src/com/google/refine/expr/functions/html/InnerHtml.java b/main/src/com/google/refine/expr/functions/html/InnerHtml.java index 2ac98e37d..5ccafdc6b 100644 --- a/main/src/com/google/refine/expr/functions/html/InnerHtml.java +++ b/main/src/com/google/refine/expr/functions/html/InnerHtml.java @@ -40,6 +40,7 @@ import org.json.JSONWriter; import org.jsoup.nodes.Element; import com.google.refine.expr.EvalError; +import com.google.refine.expr.functions.xml.InnerXml; import com.google.refine.grel.ControlFunctionRegistry; import com.google.refine.grel.Function; @@ -50,9 +51,7 @@ public class InnerHtml implements Function { if (args.length >= 1) { Object o1 = args[0]; if (o1 != null && o1 instanceof Element) { - Element e1 = (Element)o1; - return e1.html(); - + return new InnerXml().call(bindings, args, "html"); }else{ return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an HTML Element. 
Please first use parseHtml(string) and select(query) prior to using this function"); } diff --git a/main/src/com/google/refine/expr/functions/html/ParseHtml.java b/main/src/com/google/refine/expr/functions/html/ParseHtml.java index 503171bab..28bf387eb 100644 --- a/main/src/com/google/refine/expr/functions/html/ParseHtml.java +++ b/main/src/com/google/refine/expr/functions/html/ParseHtml.java @@ -39,19 +39,22 @@ import org.json.JSONException; import org.json.JSONWriter; import org.jsoup.Jsoup; +import com.google.refine.expr.EvalError; +import com.google.refine.expr.functions.xml.ParseXml; +import com.google.refine.grel.ControlFunctionRegistry; import com.google.refine.grel.Function; public class ParseHtml implements Function { @Override public Object call(Properties bindings, Object[] args) { - if (args.length >= 1) { + if (args.length == 1) { Object o1 = args[0]; if (o1 != null && o1 instanceof String) { - return Jsoup.parse(o1.toString()); + return new ParseXml().call(bindings,args,"html"); } } - return null; + return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects a single String as an argument"); } diff --git a/main/src/com/google/refine/expr/functions/xml/InnerXml.java b/main/src/com/google/refine/expr/functions/xml/InnerXml.java new file mode 100644 index 000000000..a37323c62 --- /dev/null +++ b/main/src/com/google/refine/expr/functions/xml/InnerXml.java @@ -0,0 +1,84 @@ +/* + +Copyright 2010, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. 
+ * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +package com.google.refine.expr.functions.xml; + +import java.util.Properties; + +import org.json.JSONException; +import org.json.JSONWriter; +import org.jsoup.nodes.Element; + +import com.google.refine.expr.EvalError; +import com.google.refine.grel.ControlFunctionRegistry; +import com.google.refine.grel.Function; +/** GREL function returning the inner markup of a jsoup Element; the two-argument call uses "xml" mode, and the three-argument overload lets a caller pick "html" mode instead. */ +public class InnerXml implements Function { + + @Override + public Object call(Properties bindings, Object[] args) { + return call(bindings,args,"xml"); + } + /** Returns e.children().toString() when mode is "xml" and e.html() when mode is "html"; any other mode, a non-Element argument, or an argument count other than one yields an EvalError. The mode is compared by value (equals), not by reference. */ + public Object call(Properties bindings, Object[] args, String mode) { + if (args.length == 1) { + Object o1 = args[0]; + if (o1 != null && o1 instanceof Element) { + Element e1 = (Element)o1; + if("xml".equals(mode)) { + return e1.children().toString(); + } else if ("html".equals(mode)) { + return e1.html(); + } else { + return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " unable to determine whether XML or HTML is being used."); + } + }else{ + return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter 
is not an XML or HTML Element. Please first use parseXml() or parseHtml() and select(query) prior to using this function"); + } + } + return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects a single XML or HTML element as an argument"); + } + + + @Override + public void write(JSONWriter writer, Properties options) + throws JSONException { + + writer.object(); + writer.key("description"); writer.value("The innerXml/innerHtml of an XML/HTML element"); + writer.key("params"); writer.value("Element e"); + writer.key("returns"); writer.value("String innerXml/innerHtml"); + writer.endObject(); + } +} + diff --git a/main/src/com/google/refine/expr/functions/xml/OwnText.java b/main/src/com/google/refine/expr/functions/xml/OwnText.java new file mode 100644 index 000000000..b4df9c6d1 --- /dev/null +++ b/main/src/com/google/refine/expr/functions/xml/OwnText.java @@ -0,0 +1,75 @@ +/* + +Copyright 2011, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +package com.google.refine.expr.functions.xml; + +import java.util.Properties; + +import org.json.JSONException; +import org.json.JSONWriter; +import org.jsoup.nodes.Element; + +import com.google.refine.expr.EvalError; +import com.google.refine.grel.ControlFunctionRegistry; +import com.google.refine.grel.Function; + +public class OwnText implements Function { + + @Override + public Object call(Properties bindings, Object[] args) { + if (args.length >= 1) { + Object o1 = args[0]; + if (o1 != null && o1 instanceof Element) { + Element e1 = (Element)o1; + return e1.ownText(); + + }else{ + return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an HTML Element. 
Please first use parseHtml(string) and select(query) prior to using this function"); + } + } + return null; + } + + + @Override + public void write(JSONWriter writer, Properties options) + throws JSONException { + + writer.object(); + writer.key("description"); writer.value("Gets the text owned by this XML/HTML element only; does not get the combined text of all children."); + writer.key("params"); writer.value("Element e"); + writer.key("returns"); writer.value("String ownText"); + writer.endObject(); + } +} + diff --git a/main/src/com/google/refine/expr/functions/xml/ParseXml.java b/main/src/com/google/refine/expr/functions/xml/ParseXml.java new file mode 100644 index 000000000..30f47aed8 --- /dev/null +++ b/main/src/com/google/refine/expr/functions/xml/ParseXml.java @@ -0,0 +1,82 @@ +/* + +Copyright 2010, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +package com.google.refine.expr.functions.xml; + +import java.util.Properties; + +import org.json.JSONException; +import org.json.JSONWriter; +import org.jsoup.Jsoup; +import org.jsoup.parser.Parser; + +import com.google.refine.expr.EvalError; +import com.google.refine.grel.ControlFunctionRegistry; +import com.google.refine.grel.Function; +/** GREL function that parses a string into a jsoup document; the two-argument call uses the XML parser, and the three-argument overload lets a caller choose the HTML parser instead. */ +public class ParseXml implements Function { + + @Override + public Object call(Properties bindings, Object[] args) { + return call(bindings,args,"xml"); + } + /** Parses args[0] with Jsoup.parse when mode is "html" or with Parser.xmlParser() when mode is "xml"; any other mode, a non-String argument, or an argument count other than one yields an EvalError. The mode is compared by value (equals), not by reference. */ + public Object call(Properties bindings, Object[] args, String mode) { + if (args.length == 1) { + Object o1 = args[0]; + if (o1 != null && o1 instanceof String) { + if ("html".equals(mode)) { + return Jsoup.parse(o1.toString()); + } else if ("xml".equals(mode)) { + return Jsoup.parse(o1.toString(), "",Parser.xmlParser()); + } else { + return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " unable to identify which parser to use"); + } + } + } + return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects a single String as an argument"); + } + + + @Override + public void write(JSONWriter writer, Properties options) + throws JSONException { + + writer.object(); + writer.key("description"); writer.value("Parses a string as XML"); + writer.key("params"); writer.value("string s"); + writer.key("returns"); writer.value("XML object"); + writer.endObject(); + } +} + diff --git 
a/main/src/com/google/refine/expr/functions/xml/SelectXml.java b/main/src/com/google/refine/expr/functions/xml/SelectXml.java new file mode 100644 index 000000000..8bb8d3a84 --- /dev/null +++ b/main/src/com/google/refine/expr/functions/xml/SelectXml.java @@ -0,0 +1,77 @@ +/* + +Copyright 2010, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +package com.google.refine.expr.functions.xml; + +import java.util.Properties; + +import org.json.JSONException; +import org.json.JSONWriter; +import org.jsoup.nodes.Element; + +import com.google.refine.expr.EvalError; +import com.google.refine.grel.ControlFunctionRegistry; +import com.google.refine.grel.Function; + +public class SelectXml implements Function { + + @Override + public Object call(Properties bindings, Object[] args) { + if (args.length == 2) { + Object o1 = args[0]; + Object o2 = args[1]; + if (o1 != null && o1 instanceof Element) { + Element e1 = (Element)o1; + if(o2 != null && o2 instanceof String){ + return e1.select(o2.toString()); + } + }else{ + return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an XML or HTML Element. Please first use parseXml() or parseHtml()"); + } + } + return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects two arguments"); + } + + + @Override + public void write(JSONWriter writer, Properties options) + throws JSONException { + + writer.object(); + writer.key("description"); writer.value("Selects an element from an XML or HTML element using selector syntax."); + writer.key("params"); writer.value("Element e, String s"); + writer.key("returns"); writer.value("XML/HTML Elements"); + writer.endObject(); + } +} + diff --git a/main/src/com/google/refine/expr/functions/xml/XmlAttr.java b/main/src/com/google/refine/expr/functions/xml/XmlAttr.java new file mode 100644 index 000000000..269cb6796 --- /dev/null +++ b/main/src/com/google/refine/expr/functions/xml/XmlAttr.java @@ -0,0 +1,77 @@ +/* + +Copyright 2010, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +package com.google.refine.expr.functions.xml; + +import java.util.Properties; + +import org.json.JSONException; +import org.json.JSONWriter; +import org.jsoup.nodes.Element; + +import com.google.refine.expr.EvalError; +import com.google.refine.grel.ControlFunctionRegistry; +import com.google.refine.grel.Function; + +public class XmlAttr implements Function { + + @Override + public Object call(Properties bindings, Object[] args) { + if (args.length >= 2) { + Object o1 = args[0]; + Object o2 = args[1]; + if (o1 != null && o1 instanceof Element) { + Element e1 = (Element)o1; + if(o2 != null && o2 instanceof String){ + return e1.attr(o2.toString()); + } + }else{ + return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an XML or HTML Element. 
Please first use parseXml() or parseHtml() and select() prior to using this function"); + } + } + return null; + } + + + @Override + public void write(JSONWriter writer, Properties options) + throws JSONException { + + writer.object(); + writer.key("description"); writer.value("Selects a value from an attribute on an xml or html Element."); + writer.key("params"); writer.value("Element e, String s"); + writer.key("returns"); writer.value("String attribute Value"); + writer.endObject(); + } +} + diff --git a/main/src/com/google/refine/expr/functions/xml/XmlText.java b/main/src/com/google/refine/expr/functions/xml/XmlText.java new file mode 100644 index 000000000..90dd0fa41 --- /dev/null +++ b/main/src/com/google/refine/expr/functions/xml/XmlText.java @@ -0,0 +1,75 @@ +/* + +Copyright 2010,2011 Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +package com.google.refine.expr.functions.xml; + +import java.util.Properties; + +import org.json.JSONException; +import org.json.JSONWriter; +import org.jsoup.nodes.Element; + +import com.google.refine.expr.EvalError; +import com.google.refine.grel.ControlFunctionRegistry; +import com.google.refine.grel.Function; + +public class XmlText implements Function { + + @Override + public Object call(Properties bindings, Object[] args) { + if (args.length >= 1) { + Object o1 = args[0]; + if (o1 != null && o1 instanceof Element) { + Element e1 = (Element)o1; + return e1.text(); + + }else{ + return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an XML or HTML Element. 
Please first use parseXml() or parseHtml() and select(query) prior to using this function"); + } + } + return null; + } + + + @Override + public void write(JSONWriter writer, Properties options) + throws JSONException { + + writer.object(); + writer.key("description"); writer.value("Selects the text from within an element (including all child elements)"); + writer.key("params"); writer.value("Element e"); + writer.key("returns"); writer.value("String text"); + writer.endObject(); + } +} + diff --git a/main/src/com/google/refine/grel/ControlFunctionRegistry.java b/main/src/com/google/refine/grel/ControlFunctionRegistry.java index edb57e2f6..e52247024 100644 --- a/main/src/com/google/refine/grel/ControlFunctionRegistry.java +++ b/main/src/com/google/refine/grel/ControlFunctionRegistry.java @@ -61,12 +61,14 @@ import com.google.refine.expr.functions.booleans.Xor; import com.google.refine.expr.functions.date.DatePart; import com.google.refine.expr.functions.date.Inc; import com.google.refine.expr.functions.date.Now; -import com.google.refine.expr.functions.html.HtmlAttr; -import com.google.refine.expr.functions.html.HtmlText; import com.google.refine.expr.functions.html.InnerHtml; -import com.google.refine.expr.functions.html.OwnText; import com.google.refine.expr.functions.html.ParseHtml; -import com.google.refine.expr.functions.html.SelectHtml; +import com.google.refine.expr.functions.xml.XmlAttr; +import com.google.refine.expr.functions.xml.XmlText; +import com.google.refine.expr.functions.xml.InnerXml; +import com.google.refine.expr.functions.xml.OwnText; +import com.google.refine.expr.functions.xml.ParseXml; +import com.google.refine.expr.functions.xml.SelectXml; import com.google.refine.expr.functions.math.ACos; import com.google.refine.expr.functions.math.ASin; import com.google.refine.expr.functions.math.ATan; @@ -236,11 +238,18 @@ public class ControlFunctionRegistry { // HTML functions from JSoup registerFunction("parseHtml", new ParseHtml()); - 
registerFunction("select", new SelectHtml()); - registerFunction("htmlAttr", new HtmlAttr()); - registerFunction("htmlText", new HtmlText()); + registerFunction("select", new SelectXml()); + registerFunction("htmlAttr", new XmlAttr()); + registerFunction("htmlText", new XmlText()); registerFunction("innerHtml", new InnerHtml()); registerFunction("ownText", new OwnText()); + + // XML functions from JSoup + registerFunction("parseXml", new ParseXml()); + registerFunction("selectx", new SelectXml()); + registerFunction("xmlAttr", new XmlAttr()); + registerFunction("xmlText", new XmlText()); + registerFunction("innerXml", new InnerXml()); registerFunction("indexOf", new IndexOf()); registerFunction("lastIndexOf", new LastIndexOf()); From 1f023b53af490d2a28e7cb135d8d8560a4254baa Mon Sep 17 00:00:00 2001 From: Owen Stephens Date: Tue, 20 Nov 2018 20:21:30 +0000 Subject: [PATCH 06/10] Remove deprecated HTML functions and tests --- .../refine/expr/functions/html/HtmlAttr.java | 77 ------------------- .../refine/expr/functions/html/HtmlText.java | 75 ------------------ .../refine/expr/functions/html/OwnText.java | 75 ------------------ .../expr/functions/html/SelectHtml.java | 77 ------------------- .../expr/functions/html/HtmlAttrTests.java | 15 ---- .../expr/functions/html/HtmlTextTests.java | 15 ---- .../expr/functions/html/OwnTextTests.java | 15 ---- .../expr/functions/html/SelectHtmlTests.java | 15 ---- 8 files changed, 364 deletions(-) delete mode 100644 main/src/com/google/refine/expr/functions/html/HtmlAttr.java delete mode 100644 main/src/com/google/refine/expr/functions/html/HtmlText.java delete mode 100644 main/src/com/google/refine/expr/functions/html/OwnText.java delete mode 100644 main/src/com/google/refine/expr/functions/html/SelectHtml.java delete mode 100644 main/tests/server/src/com/google/refine/tests/expr/functions/html/HtmlAttrTests.java delete mode 100644 main/tests/server/src/com/google/refine/tests/expr/functions/html/HtmlTextTests.java delete mode 
100644 main/tests/server/src/com/google/refine/tests/expr/functions/html/OwnTextTests.java delete mode 100644 main/tests/server/src/com/google/refine/tests/expr/functions/html/SelectHtmlTests.java diff --git a/main/src/com/google/refine/expr/functions/html/HtmlAttr.java b/main/src/com/google/refine/expr/functions/html/HtmlAttr.java deleted file mode 100644 index 139567ef9..000000000 --- a/main/src/com/google/refine/expr/functions/html/HtmlAttr.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - -Copyright 2010, Google Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -package com.google.refine.expr.functions.html; - -import java.util.Properties; - -import org.json.JSONException; -import org.json.JSONWriter; -import org.jsoup.nodes.Element; - -import com.google.refine.expr.EvalError; -import com.google.refine.grel.ControlFunctionRegistry; -import com.google.refine.grel.Function; - -public class HtmlAttr implements Function { - - @Override - public Object call(Properties bindings, Object[] args) { - if (args.length >= 2) { - Object o1 = args[0]; - Object o2 = args[1]; - if (o1 != null && o1 instanceof Element) { - Element e1 = (Element)o1; - if(o2 != null && o2 instanceof String){ - return e1.attr(o2.toString()); - } - }else{ - return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an HTML Element. Please first use parseHtml(string) and select(query) prior to using this function"); - } - } - return null; - } - - - @Override - public void write(JSONWriter writer, Properties options) - throws JSONException { - - writer.object(); - writer.key("description"); writer.value("Selects a value from an attribute on an Html Element"); - writer.key("params"); writer.value("Element e, String s"); - writer.key("returns"); writer.value("String attribute Value"); - writer.endObject(); - } -} - diff --git a/main/src/com/google/refine/expr/functions/html/HtmlText.java b/main/src/com/google/refine/expr/functions/html/HtmlText.java deleted file mode 100644 index 0f0d2b382..000000000 --- a/main/src/com/google/refine/expr/functions/html/HtmlText.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - -Copyright 2010,2011 Google Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. 
- * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -package com.google.refine.expr.functions.html; - -import java.util.Properties; - -import org.json.JSONException; -import org.json.JSONWriter; -import org.jsoup.nodes.Element; - -import com.google.refine.expr.EvalError; -import com.google.refine.grel.ControlFunctionRegistry; -import com.google.refine.grel.Function; - -public class HtmlText implements Function { - - @Override - public Object call(Properties bindings, Object[] args) { - if (args.length >= 1) { - Object o1 = args[0]; - if (o1 != null && o1 instanceof Element) { - Element e1 = (Element)o1; - return e1.text(); - - }else{ - return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an HTML Element. 
Please first use parseHtml(string) and select(query) prior to using this function"); - } - } - return null; - } - - - @Override - public void write(JSONWriter writer, Properties options) - throws JSONException { - - writer.object(); - writer.key("description"); writer.value("Selects the text from within an element (including all child elements)"); - writer.key("params"); writer.value("Element e"); - writer.key("returns"); writer.value("String text"); - writer.endObject(); - } -} - diff --git a/main/src/com/google/refine/expr/functions/html/OwnText.java b/main/src/com/google/refine/expr/functions/html/OwnText.java deleted file mode 100644 index eda388251..000000000 --- a/main/src/com/google/refine/expr/functions/html/OwnText.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - -Copyright 2011, Google Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -package com.google.refine.expr.functions.html; - -import java.util.Properties; - -import org.json.JSONException; -import org.json.JSONWriter; -import org.jsoup.nodes.Element; - -import com.google.refine.expr.EvalError; -import com.google.refine.grel.ControlFunctionRegistry; -import com.google.refine.grel.Function; - -public class OwnText implements Function { - - @Override - public Object call(Properties bindings, Object[] args) { - if (args.length >= 1) { - Object o1 = args[0]; - if (o1 != null && o1 instanceof Element) { - Element e1 = (Element)o1; - return e1.ownText(); - - }else{ - return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an HTML Element. 
Please first use parseHtml(string) and select(query) prior to using this function"); - } - } - return null; - } - - - @Override - public void write(JSONWriter writer, Properties options) - throws JSONException { - - writer.object(); - writer.key("description"); writer.value("Gets the text owned by this HTML element only; does not get the combined text of all children."); - writer.key("params"); writer.value("Element e"); - writer.key("returns"); writer.value("String ownText"); - writer.endObject(); - } -} - diff --git a/main/src/com/google/refine/expr/functions/html/SelectHtml.java b/main/src/com/google/refine/expr/functions/html/SelectHtml.java deleted file mode 100644 index 114e7bd74..000000000 --- a/main/src/com/google/refine/expr/functions/html/SelectHtml.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - -Copyright 2010, Google Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -package com.google.refine.expr.functions.html; - -import java.util.Properties; - -import org.json.JSONException; -import org.json.JSONWriter; -import org.jsoup.nodes.Element; - -import com.google.refine.expr.EvalError; -import com.google.refine.grel.ControlFunctionRegistry; -import com.google.refine.grel.Function; - -public class SelectHtml implements Function { - - @Override - public Object call(Properties bindings, Object[] args) { - if (args.length >= 2) { - Object o1 = args[0]; - Object o2 = args[1]; - if (o1 != null && o1 instanceof Element) { - Element e1 = (Element)o1; - if(o2 != null && o2 instanceof String){ - return e1.select(o2.toString()); - } - }else{ - return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an HTML Element. 
Please first use parseHtml(string)"); - } - } - return null; - } - - - @Override - public void write(JSONWriter writer, Properties options) - throws JSONException { - - writer.object(); - writer.key("description"); writer.value("Selects an element from an HTML element using selector syntax"); - writer.key("params"); writer.value("Element e, String s"); - writer.key("returns"); writer.value("HTML Elements"); - writer.endObject(); - } -} - diff --git a/main/tests/server/src/com/google/refine/tests/expr/functions/html/HtmlAttrTests.java b/main/tests/server/src/com/google/refine/tests/expr/functions/html/HtmlAttrTests.java deleted file mode 100644 index a73b22ebc..000000000 --- a/main/tests/server/src/com/google/refine/tests/expr/functions/html/HtmlAttrTests.java +++ /dev/null @@ -1,15 +0,0 @@ -package com.google.refine.tests.expr.functions.html; - -import org.testng.annotations.Test; - -import com.google.refine.expr.functions.html.HtmlAttr; -import com.google.refine.tests.util.TestUtils; - -public class HtmlAttrTests { - @Test - public void serializeHtmlAttr() { - String json = "{\"description\":\"Selects a value from an attribute on an Html Element\",\"params\":\"Element e, String s\",\"returns\":\"String attribute Value\"}"; - TestUtils.isSerializedTo(new HtmlAttr(), json); - } -} - diff --git a/main/tests/server/src/com/google/refine/tests/expr/functions/html/HtmlTextTests.java b/main/tests/server/src/com/google/refine/tests/expr/functions/html/HtmlTextTests.java deleted file mode 100644 index af0ea8678..000000000 --- a/main/tests/server/src/com/google/refine/tests/expr/functions/html/HtmlTextTests.java +++ /dev/null @@ -1,15 +0,0 @@ -package com.google.refine.tests.expr.functions.html; - -import org.testng.annotations.Test; - -import com.google.refine.expr.functions.html.HtmlText; -import com.google.refine.tests.util.TestUtils; - -public class HtmlTextTests { - @Test - public void serializeHtmlText() { - String json = "{\"description\":\"Selects the text from 
within an element (including all child elements)\",\"params\":\"Element e\",\"returns\":\"String text\"}"; - TestUtils.isSerializedTo(new HtmlText(), json); - } -} - diff --git a/main/tests/server/src/com/google/refine/tests/expr/functions/html/OwnTextTests.java b/main/tests/server/src/com/google/refine/tests/expr/functions/html/OwnTextTests.java deleted file mode 100644 index 8eb8bb01e..000000000 --- a/main/tests/server/src/com/google/refine/tests/expr/functions/html/OwnTextTests.java +++ /dev/null @@ -1,15 +0,0 @@ -package com.google.refine.tests.expr.functions.html; - -import org.testng.annotations.Test; - -import com.google.refine.expr.functions.html.OwnText; -import com.google.refine.tests.util.TestUtils; - -public class OwnTextTests { - @Test - public void serializeOwnText() { - String json = "{\"description\":\"Gets the text owned by this HTML element only; does not get the combined text of all children.\",\"params\":\"Element e\",\"returns\":\"String ownText\"}"; - TestUtils.isSerializedTo(new OwnText(), json); - } -} - diff --git a/main/tests/server/src/com/google/refine/tests/expr/functions/html/SelectHtmlTests.java b/main/tests/server/src/com/google/refine/tests/expr/functions/html/SelectHtmlTests.java deleted file mode 100644 index 351dd91bc..000000000 --- a/main/tests/server/src/com/google/refine/tests/expr/functions/html/SelectHtmlTests.java +++ /dev/null @@ -1,15 +0,0 @@ -package com.google.refine.tests.expr.functions.html; - -import org.testng.annotations.Test; - -import com.google.refine.expr.functions.html.SelectHtml; -import com.google.refine.tests.util.TestUtils; - -public class SelectHtmlTests { - @Test - public void serializeSelectHtml() { - String json = "{\"description\":\"Selects an element from an HTML element using selector syntax\",\"params\":\"Element e, String s\",\"returns\":\"HTML Elements\"}"; - TestUtils.isSerializedTo(new SelectHtml(), json); - } -} - From d144a5dccf0b386c233a0abf057c1398918e6a85 Mon Sep 17 00:00:00 2001 From: 
Owen Stephens Date: Wed, 21 Nov 2018 15:58:59 +0000 Subject: [PATCH 07/10] Stricter re number of arguments for parseXml/Html and clearer err msgs --- .../com/google/refine/expr/functions/html/InnerHtml.java | 4 ++-- .../com/google/refine/expr/functions/html/ParseHtml.java | 1 - main/src/com/google/refine/expr/functions/xml/OwnText.java | 6 +++--- main/src/com/google/refine/expr/functions/xml/XmlAttr.java | 4 ++-- main/src/com/google/refine/expr/functions/xml/XmlText.java | 4 ++-- 5 files changed, 9 insertions(+), 10 deletions(-) diff --git a/main/src/com/google/refine/expr/functions/html/InnerHtml.java b/main/src/com/google/refine/expr/functions/html/InnerHtml.java index 5ccafdc6b..5e9853d46 100644 --- a/main/src/com/google/refine/expr/functions/html/InnerHtml.java +++ b/main/src/com/google/refine/expr/functions/html/InnerHtml.java @@ -48,7 +48,7 @@ public class InnerHtml implements Function { @Override public Object call(Properties bindings, Object[] args) { - if (args.length >= 1) { + if (args.length == 1) { Object o1 = args[0]; if (o1 != null && o1 instanceof Element) { return new InnerXml().call(bindings, args, "html"); @@ -56,7 +56,7 @@ public class InnerHtml implements Function { return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an HTML Element. 
Please first use parseHtml(string) and select(query) prior to using this function"); } } - return null; + return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects a single String as an argument"); } diff --git a/main/src/com/google/refine/expr/functions/html/ParseHtml.java b/main/src/com/google/refine/expr/functions/html/ParseHtml.java index 28bf387eb..14ae4b3cd 100644 --- a/main/src/com/google/refine/expr/functions/html/ParseHtml.java +++ b/main/src/com/google/refine/expr/functions/html/ParseHtml.java @@ -37,7 +37,6 @@ import java.util.Properties; import org.json.JSONException; import org.json.JSONWriter; -import org.jsoup.Jsoup; import com.google.refine.expr.EvalError; import com.google.refine.expr.functions.xml.ParseXml; diff --git a/main/src/com/google/refine/expr/functions/xml/OwnText.java b/main/src/com/google/refine/expr/functions/xml/OwnText.java index b4df9c6d1..8066b9473 100644 --- a/main/src/com/google/refine/expr/functions/xml/OwnText.java +++ b/main/src/com/google/refine/expr/functions/xml/OwnText.java @@ -47,17 +47,17 @@ public class OwnText implements Function { @Override public Object call(Properties bindings, Object[] args) { - if (args.length >= 1) { + if (args.length == 1) { Object o1 = args[0]; if (o1 != null && o1 instanceof Element) { Element e1 = (Element)o1; return e1.ownText(); }else{ - return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an HTML Element. Please first use parseHtml(string) and select(query) prior to using this function"); + return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an XML or HTML Element. 
Please first use parseHtml(string) and select(query) prior to using this function"); } } - return null; + return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects a single XML or HTML element as an argument"); } diff --git a/main/src/com/google/refine/expr/functions/xml/XmlAttr.java b/main/src/com/google/refine/expr/functions/xml/XmlAttr.java index 269cb6796..c4746038d 100644 --- a/main/src/com/google/refine/expr/functions/xml/XmlAttr.java +++ b/main/src/com/google/refine/expr/functions/xml/XmlAttr.java @@ -47,7 +47,7 @@ public class XmlAttr implements Function { @Override public Object call(Properties bindings, Object[] args) { - if (args.length >= 2) { + if (args.length == 2) { Object o1 = args[0]; Object o2 = args[1]; if (o1 != null && o1 instanceof Element) { @@ -59,7 +59,7 @@ public class XmlAttr implements Function { return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an XML or HTML Element. Please first use parseXml() or parseHtml() and select() prior to using this function"); } } - return null; + return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects two arguments"); } diff --git a/main/src/com/google/refine/expr/functions/xml/XmlText.java b/main/src/com/google/refine/expr/functions/xml/XmlText.java index 90dd0fa41..014eb8016 100644 --- a/main/src/com/google/refine/expr/functions/xml/XmlText.java +++ b/main/src/com/google/refine/expr/functions/xml/XmlText.java @@ -47,7 +47,7 @@ public class XmlText implements Function { @Override public Object call(Properties bindings, Object[] args) { - if (args.length >= 1) { + if (args.length == 1) { Object o1 = args[0]; if (o1 != null && o1 instanceof Element) { Element e1 = (Element)o1; @@ -57,7 +57,7 @@ public class XmlText implements Function { return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an XML or HTML Element. 
Please first use parseXml() or parseHtml() and select(query) prior to using this function"); } } - return null; + return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects a single XML or HTML element as an argument"); } From 58352e4e561951abdf1746ee6a8b985b439e3667 Mon Sep 17 00:00:00 2001 From: Owen Stephens Date: Wed, 21 Nov 2018 15:59:15 +0000 Subject: [PATCH 08/10] Update tests for XML and HTML functions --- .../refine/tests/expr/functions/html/ParseHtmlTests.java | 8 +++++++- .../refine/tests/expr/functions/xml/OwnTextTests.java | 4 ++-- .../refine/tests/expr/functions/xml/ParseXmlTests.java | 6 ++++++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/main/tests/server/src/com/google/refine/tests/expr/functions/html/ParseHtmlTests.java b/main/tests/server/src/com/google/refine/tests/expr/functions/html/ParseHtmlTests.java index 8edf4775c..0442acb1b 100644 --- a/main/tests/server/src/com/google/refine/tests/expr/functions/html/ParseHtmlTests.java +++ b/main/tests/server/src/com/google/refine/tests/expr/functions/html/ParseHtmlTests.java @@ -1,5 +1,6 @@ package com.google.refine.tests.expr.functions.html; +import org.jsoup.Jsoup; import org.testng.annotations.Test; import java.util.Properties; @@ -26,7 +27,7 @@ public class ParseHtmlTests extends RefineTest { " \n" + "

head1

\n" + "
\n" + - "

para1

\n" + + "

para1 strong text

\n" + "

para2

\n" + "
\n" + " \n" + @@ -74,6 +75,11 @@ public class ParseHtmlTests extends RefineTest { public void testParseHtml() { Assert.assertTrue(invoke("parseHtml") instanceof EvalError); Assert.assertTrue(invoke("parseHtml","h") instanceof org.jsoup.nodes.Document); + Assert.assertTrue(invoke("select",Jsoup.parse(h),"p") instanceof org.jsoup.select.Elements); + Assert.assertTrue(invoke("innerHtml",Jsoup.parse(h).select("p").first()) instanceof String); + Assert.assertEquals(invoke("innerHtml",Jsoup.parse(h).select("p").first()),"para1 strong text"); + Assert.assertEquals(invoke("htmlAttr",Jsoup.parse(h).select("div").first(),"class"),"class1"); + Assert.assertEquals(invoke("ownText",Jsoup.parse(h).select("p").first()),"para1"); } } diff --git a/main/tests/server/src/com/google/refine/tests/expr/functions/xml/OwnTextTests.java b/main/tests/server/src/com/google/refine/tests/expr/functions/xml/OwnTextTests.java index f1b7c688b..e699ba829 100644 --- a/main/tests/server/src/com/google/refine/tests/expr/functions/xml/OwnTextTests.java +++ b/main/tests/server/src/com/google/refine/tests/expr/functions/xml/OwnTextTests.java @@ -1,8 +1,8 @@ -package com.google.refine.tests.expr.functions.html; +package com.google.refine.tests.expr.functions.xml; import org.testng.annotations.Test; -import com.google.refine.expr.functions.html.OwnText; +import com.google.refine.expr.functions.xml.OwnText; import com.google.refine.tests.util.TestUtils; public class OwnTextTests { diff --git a/main/tests/server/src/com/google/refine/tests/expr/functions/xml/ParseXmlTests.java b/main/tests/server/src/com/google/refine/tests/expr/functions/xml/ParseXmlTests.java index 8da1d99fb..c1babf345 100644 --- a/main/tests/server/src/com/google/refine/tests/expr/functions/xml/ParseXmlTests.java +++ b/main/tests/server/src/com/google/refine/tests/expr/functions/xml/ParseXmlTests.java @@ -1,5 +1,7 @@ package com.google.refine.tests.expr.functions.xml; +import org.jsoup.parser.Parser; +import org.jsoup.Jsoup; import 
org.testng.annotations.Test; import java.util.Properties; @@ -80,6 +82,10 @@ public class ParseXmlTests extends RefineTest { public void testParseXml() { Assert.assertTrue(invoke("parseXml") instanceof EvalError); Assert.assertTrue(invoke("parseXml","x") instanceof org.jsoup.nodes.Document); + Assert.assertTrue(invoke("select",Jsoup.parse(x,"",Parser.xmlParser()),"foaf|Person") instanceof org.jsoup.select.Elements); + Assert.assertEquals(invoke("innerXml",Jsoup.parse(x,"",Parser.xmlParser()).select("foaf|Person").first()),"\n John Doe\n\n\n head1\n\n\n head2\n\n\n body1\n\n"); + Assert.assertEquals(invoke("xmlAttr",Jsoup.parse(x,"",Parser.xmlParser()).select("foaf|homepage").first(),"rdf:resource"),"http://www.example.com"); + Assert.assertEquals(invoke("ownText",Jsoup.parse(x,"",Parser.xmlParser()).select("BODY").first()),"body1"); } } From 281d757f1c2ab0fe8674a2064b47519aefb6e6a9 Mon Sep 17 00:00:00 2001 From: Owen Stephens Date: Thu, 22 Nov 2018 00:37:43 +0000 Subject: [PATCH 09/10] Remove superfluous function --- .../google/refine/grel/ControlFunctionRegistry.java | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/main/src/com/google/refine/grel/ControlFunctionRegistry.java b/main/src/com/google/refine/grel/ControlFunctionRegistry.java index e52247024..397bb193d 100644 --- a/main/src/com/google/refine/grel/ControlFunctionRegistry.java +++ b/main/src/com/google/refine/grel/ControlFunctionRegistry.java @@ -236,20 +236,17 @@ public class ControlFunctionRegistry { registerFunction("match", new Match()); registerFunction("find", new Find()); - // HTML functions from JSoup + // XML and HTML functions from JSoup + registerFunction("parseXml", new ParseXml()); registerFunction("parseHtml", new ParseHtml()); registerFunction("select", new SelectXml()); + registerFunction("xmlAttr", new XmlAttr()); registerFunction("htmlAttr", new XmlAttr()); + registerFunction("xmlText", new XmlText()); registerFunction("htmlText", new XmlText()); + 
registerFunction("innerXml", new InnerXml()); registerFunction("innerHtml", new InnerHtml()); registerFunction("ownText", new OwnText()); - - // XML functions from JSoup - registerFunction("parseXml", new ParseXml()); - registerFunction("selectx", new SelectXml()); - registerFunction("xmlAttr", new XmlAttr()); - registerFunction("xmlText", new XmlText()); - registerFunction("innerXml", new InnerXml()); registerFunction("indexOf", new IndexOf()); registerFunction("lastIndexOf", new LastIndexOf()); From 78360f5e0c61f5d03367a0d9eccbd1c7baea06ef Mon Sep 17 00:00:00 2001 From: Owen Stephens Date: Thu, 22 Nov 2018 00:38:08 +0000 Subject: [PATCH 10/10] Add tests for htmlText and xmlText --- .../google/refine/tests/expr/functions/html/ParseHtmlTests.java | 1 + .../google/refine/tests/expr/functions/xml/ParseXmlTests.java | 1 + 2 files changed, 2 insertions(+) diff --git a/main/tests/server/src/com/google/refine/tests/expr/functions/html/ParseHtmlTests.java b/main/tests/server/src/com/google/refine/tests/expr/functions/html/ParseHtmlTests.java index 0442acb1b..b6838331b 100644 --- a/main/tests/server/src/com/google/refine/tests/expr/functions/html/ParseHtmlTests.java +++ b/main/tests/server/src/com/google/refine/tests/expr/functions/html/ParseHtmlTests.java @@ -79,6 +79,7 @@ public class ParseHtmlTests extends RefineTest { Assert.assertTrue(invoke("innerHtml",Jsoup.parse(h).select("p").first()) instanceof String); Assert.assertEquals(invoke("innerHtml",Jsoup.parse(h).select("p").first()),"para1 strong text"); Assert.assertEquals(invoke("htmlAttr",Jsoup.parse(h).select("div").first(),"class"),"class1"); + Assert.assertEquals(invoke("htmlText",Jsoup.parse(h).select("div").first()),"para1 strong text para2"); Assert.assertEquals(invoke("ownText",Jsoup.parse(h).select("p").first()),"para1"); } } diff --git a/main/tests/server/src/com/google/refine/tests/expr/functions/xml/ParseXmlTests.java b/main/tests/server/src/com/google/refine/tests/expr/functions/xml/ParseXmlTests.java 
index c1babf345..e58f6b101 100644 --- a/main/tests/server/src/com/google/refine/tests/expr/functions/xml/ParseXmlTests.java +++ b/main/tests/server/src/com/google/refine/tests/expr/functions/xml/ParseXmlTests.java @@ -86,6 +86,7 @@ public class ParseXmlTests extends RefineTest { Assert.assertEquals(invoke("innerXml",Jsoup.parse(x,"",Parser.xmlParser()).select("foaf|Person").first()),"\n John Doe\n\n\n head1\n\n\n head2\n\n\n body1\n\n"); Assert.assertEquals(invoke("xmlAttr",Jsoup.parse(x,"",Parser.xmlParser()).select("foaf|homepage").first(),"rdf:resource"),"http://www.example.com"); Assert.assertEquals(invoke("ownText",Jsoup.parse(x,"",Parser.xmlParser()).select("BODY").first()),"body1"); + Assert.assertEquals(invoke("xmlText",Jsoup.parse(x,"",Parser.xmlParser()).select("foaf|Person").first()),"John Doe head1 head2 body1"); } }