Add XML parsing and update HTML parsing to use same classes

This commit is contained in:
Owen Stephens 2018-11-20 20:20:51 +00:00
parent 5678c44673
commit bae3dbb812
9 changed files with 494 additions and 13 deletions

View File

@ -40,6 +40,7 @@ import org.json.JSONWriter;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import com.google.refine.expr.EvalError; import com.google.refine.expr.EvalError;
import com.google.refine.expr.functions.xml.InnerXml;
import com.google.refine.grel.ControlFunctionRegistry; import com.google.refine.grel.ControlFunctionRegistry;
import com.google.refine.grel.Function; import com.google.refine.grel.Function;
@ -50,9 +51,7 @@ public class InnerHtml implements Function {
if (args.length >= 1) { if (args.length >= 1) {
Object o1 = args[0]; Object o1 = args[0];
if (o1 != null && o1 instanceof Element) { if (o1 != null && o1 instanceof Element) {
Element e1 = (Element)o1; return new InnerXml().call(bindings, args, "html");
return e1.html();
}else{ }else{
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an HTML Element. Please first use parseHtml(string) and select(query) prior to using this function"); return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an HTML Element. Please first use parseHtml(string) and select(query) prior to using this function");
} }

View File

@ -39,19 +39,22 @@ import org.json.JSONException;
import org.json.JSONWriter; import org.json.JSONWriter;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import com.google.refine.expr.EvalError;
import com.google.refine.expr.functions.xml.ParseXml;
import com.google.refine.grel.ControlFunctionRegistry;
import com.google.refine.grel.Function; import com.google.refine.grel.Function;
public class ParseHtml implements Function { public class ParseHtml implements Function {
@Override @Override
public Object call(Properties bindings, Object[] args) { public Object call(Properties bindings, Object[] args) {
if (args.length >= 1) { if (args.length == 1) {
Object o1 = args[0]; Object o1 = args[0];
if (o1 != null && o1 instanceof String) { if (o1 != null && o1 instanceof String) {
return Jsoup.parse(o1.toString()); return new ParseXml().call(bindings,args,"html");
} }
} }
return null; return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects a single String as an argument");
} }

View File

@ -0,0 +1,84 @@
/*
Copyright 2010, Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package com.google.refine.expr.functions.xml;
import java.util.Properties;
import org.json.JSONException;
import org.json.JSONWriter;
import org.jsoup.nodes.Element;
import com.google.refine.expr.EvalError;
import com.google.refine.grel.ControlFunctionRegistry;
import com.google.refine.grel.Function;
/**
 * Expression function that returns the inner markup of a jsoup {@link Element}.
 *
 * <p>The same implementation serves both XML and HTML: the three-argument
 * {@code call} overload takes a {@code mode} string ("xml" or "html") that
 * selects how the inner content is serialised.
 */
public class InnerXml implements Function {

    /**
     * Evaluates the function in "xml" mode.
     *
     * @param bindings evaluation bindings, passed through unchanged
     * @param args     function arguments; exactly one {@link Element} is expected
     * @return the result of {@link #call(Properties, Object[], String)} with mode "xml"
     */
    @Override
    public Object call(Properties bindings, Object[] args) {
        return call(bindings, args, "xml");
    }

    /**
     * Evaluates the function in the requested mode.
     *
     * @param bindings evaluation bindings (not read by this method)
     * @param args     function arguments; exactly one {@link Element} is expected
     * @param mode     "xml" to return the string form of the element's child
     *                 elements, "html" to return {@code Element.html()}
     * @return the inner markup as a String, or an {@link EvalError} when the
     *         argument count is not one, the argument is not an {@link Element},
     *         or the mode is neither "xml" nor "html"
     */
    public Object call(Properties bindings, Object[] args, String mode) {
        if (args.length == 1) {
            Object o1 = args[0];
            // instanceof is false for null, so no separate null check is needed.
            if (o1 instanceof Element) {
                Element e1 = (Element) o1;
                // Compare by value: '==' on Strings only matches identical
                // references and would reject an equal, non-interned mode string.
                if ("xml".equals(mode)) {
                    return e1.children().toString();
                } else if ("html".equals(mode)) {
                    return e1.html();
                } else {
                    return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " unable to determine whether XML or HTML is being used.");
                }
            } else {
                return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an XML or HTML Element. Please first use parseXml() or parseHtml() and select(query) prior to using this function");
            }
        }
        return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects a single XML or HTML element as an argument");
    }

    /**
     * Writes this function's description, parameters and return type as a JSON object.
     *
     * @param writer  destination JSON writer
     * @param options serialisation options (not read by this method)
     * @throws JSONException if the writer rejects any of the calls
     */
    @Override
    public void write(JSONWriter writer, Properties options)
            throws JSONException {
        writer.object();
        writer.key("description"); writer.value("The innerXml/innerHtml of an XML/HTML element");
        writer.key("params"); writer.value("Element e");
        writer.key("returns"); writer.value("String innerXml/innerHtml");
        writer.endObject();
    }
}

View File

@ -0,0 +1,75 @@
/*
Copyright 2011, Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package com.google.refine.expr.functions.xml;
import java.util.Properties;
import org.json.JSONException;
import org.json.JSONWriter;
import org.jsoup.nodes.Element;
import com.google.refine.expr.EvalError;
import com.google.refine.grel.ControlFunctionRegistry;
import com.google.refine.grel.Function;
/**
 * Expression function that returns the text owned directly by a jsoup
 * {@link Element}, via {@code Element.ownText()}.
 */
public class OwnText implements Function {

    /**
     * Returns the own text of the element passed as the first argument.
     *
     * @param bindings evaluation bindings (not read by this method)
     * @param args     function arguments; the first must be an {@link Element},
     *                 any further arguments are ignored
     * @return the element's own text, or an {@link EvalError} when no argument
     *         is supplied or the first argument is not an {@link Element}
     */
    @Override
    public Object call(Properties bindings, Object[] args) {
        if (args.length >= 1) {
            Object o1 = args[0];
            // instanceof is false for null, so no separate null check is needed.
            if (o1 instanceof Element) {
                Element e1 = (Element) o1;
                return e1.ownText();
            } else {
                return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an HTML Element. Please first use parseHtml(string) and select(query) prior to using this function");
            }
        }
        // Report the missing argument explicitly instead of silently returning
        // null, in line with the other XML/HTML functions in this package.
        return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects a single XML or HTML element as an argument");
    }

    /**
     * Writes this function's description, parameters and return type as a JSON object.
     *
     * @param writer  destination JSON writer
     * @param options serialisation options (not read by this method)
     * @throws JSONException if the writer rejects any of the calls
     */
    @Override
    public void write(JSONWriter writer, Properties options)
            throws JSONException {
        writer.object();
        writer.key("description"); writer.value("Gets the text owned by this XML/HTML element only; does not get the combined text of all children.");
        writer.key("params"); writer.value("Element e");
        writer.key("returns"); writer.value("String ownText");
        writer.endObject();
    }
}

View File

@ -0,0 +1,82 @@
/*
Copyright 2010, Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package com.google.refine.expr.functions.xml;
import java.util.Properties;
import org.json.JSONException;
import org.json.JSONWriter;
import org.jsoup.Jsoup;
import org.jsoup.parser.Parser;
import com.google.refine.expr.EvalError;
import com.google.refine.grel.ControlFunctionRegistry;
import com.google.refine.grel.Function;
/**
 * Expression function that parses a String with jsoup.
 *
 * <p>The same implementation serves both XML and HTML: the three-argument
 * {@code call} overload takes a {@code mode} string ("xml" or "html") that
 * selects which jsoup parser is used.
 */
public class ParseXml implements Function {

    /**
     * Parses the argument in "xml" mode.
     *
     * @param bindings evaluation bindings, passed through unchanged
     * @param args     function arguments; exactly one String is expected
     * @return the result of {@link #call(Properties, Object[], String)} with mode "xml"
     */
    @Override
    public Object call(Properties bindings, Object[] args) {
        return call(bindings, args, "xml");
    }

    /**
     * Parses the single String argument with the parser selected by {@code mode}.
     *
     * @param bindings evaluation bindings (not read by this method)
     * @param args     function arguments; exactly one String is expected
     * @param mode     "html" to use {@code Jsoup.parse(String)}, "xml" to use
     *                 {@code Jsoup.parse} with an empty base URI and
     *                 {@code Parser.xmlParser()}
     * @return the object returned by {@code Jsoup.parse}, or an {@link EvalError}
     *         when the mode is unknown, the argument count is not one, or the
     *         argument is not a String
     */
    public Object call(Properties bindings, Object[] args, String mode) {
        if (args.length == 1) {
            Object o1 = args[0];
            // instanceof is false for null, so no separate null check is needed.
            if (o1 instanceof String) {
                // Compare by value: '==' on Strings only matches identical
                // references and would reject an equal, non-interned mode string.
                if ("html".equals(mode)) {
                    return Jsoup.parse(o1.toString());
                } else if ("xml".equals(mode)) {
                    return Jsoup.parse(o1.toString(), "", Parser.xmlParser());
                } else {
                    return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " unable to identify which parser to use");
                }
            }
        }
        return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects a single String as an argument");
    }

    /**
     * Writes this function's description, parameters and return type as a JSON object.
     *
     * @param writer  destination JSON writer
     * @param options serialisation options (not read by this method)
     * @throws JSONException if the writer rejects any of the calls
     */
    @Override
    public void write(JSONWriter writer, Properties options)
            throws JSONException {
        writer.object();
        writer.key("description"); writer.value("Parses a string as XML");
        writer.key("params"); writer.value("string s");
        writer.key("returns"); writer.value("XML object");
        writer.endObject();
    }
}

View File

@ -0,0 +1,77 @@
/*
Copyright 2010, Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package com.google.refine.expr.functions.xml;
import java.util.Properties;
import org.json.JSONException;
import org.json.JSONWriter;
import org.jsoup.nodes.Element;
import com.google.refine.expr.EvalError;
import com.google.refine.grel.ControlFunctionRegistry;
import com.google.refine.grel.Function;
/**
 * Expression function that runs a selector query against a jsoup
 * {@link Element} via {@code Element.select(String)}.
 */
public class SelectXml implements Function {

    /**
     * Applies the selector in the second argument to the element in the first.
     *
     * @param bindings evaluation bindings (not read by this method)
     * @param args     exactly two arguments: an {@link Element} and a String selector
     * @return the result of {@code Element.select}, or an {@link EvalError} when
     *         the argument count is not two, the first argument is not an
     *         {@link Element}, or the second argument is not a String
     */
    @Override
    public Object call(Properties bindings, Object[] args) {
        if (args.length != 2) {
            return wrongArguments();
        }

        Object target = args[0];
        Object selector = args[1];

        // A null target fails instanceof and is reported like any other non-Element.
        if (!(target instanceof Element)) {
            return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an XML or HTML Element. Please first use parseXml() or parseHtml()");
        }

        if (selector instanceof String) {
            return ((Element) target).select((String) selector);
        }

        // A non-String selector is reported with the same message as a wrong arity.
        return wrongArguments();
    }

    /** Builds the error returned for a wrong argument count or a non-String selector. */
    private EvalError wrongArguments() {
        return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects two arguments");
    }

    /**
     * Writes this function's description, parameters and return type as a JSON object.
     *
     * @param writer  destination JSON writer
     * @param options serialisation options (not read by this method)
     * @throws JSONException if the writer rejects any of the calls
     */
    @Override
    public void write(JSONWriter writer, Properties options)
            throws JSONException {
        writer.object()
                .key("description").value("Selects an element from an XML or HTML element using selector syntax.")
                .key("params").value("Element e, String s")
                .key("returns").value("XML/HTML Elements")
                .endObject();
    }
}

View File

@ -0,0 +1,77 @@
/*
Copyright 2010, Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package com.google.refine.expr.functions.xml;
import java.util.Properties;
import org.json.JSONException;
import org.json.JSONWriter;
import org.jsoup.nodes.Element;
import com.google.refine.expr.EvalError;
import com.google.refine.grel.ControlFunctionRegistry;
import com.google.refine.grel.Function;
/**
 * Expression function that reads an attribute value from a jsoup
 * {@link Element} via {@code Element.attr(String)}.
 */
public class XmlAttr implements Function {

    /**
     * Returns the value of the attribute named by the second argument on the
     * element passed as the first argument.
     *
     * @param bindings evaluation bindings (not read by this method)
     * @param args     at least two arguments: an {@link Element} and a String
     *                 attribute name; any further arguments are ignored
     * @return the result of {@code Element.attr}, or an {@link EvalError} when
     *         fewer than two arguments are supplied, the first argument is not
     *         an {@link Element}, or the second argument is not a String
     */
    @Override
    public Object call(Properties bindings, Object[] args) {
        if (args.length >= 2) {
            Object o1 = args[0];
            Object o2 = args[1];
            // instanceof is false for null, so no separate null checks are needed.
            if (o1 instanceof Element) {
                Element e1 = (Element) o1;
                if (o2 instanceof String) {
                    return e1.attr((String) o2);
                }
            } else {
                return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an XML or HTML Element. Please first use parseXml() or parseHtml() and select() prior to using this function");
            }
        }
        // Report bad usage explicitly instead of silently returning null, in
        // line with the other XML/HTML functions in this package.
        return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects an XML or HTML element and a String attribute name as arguments");
    }

    /**
     * Writes this function's description, parameters and return type as a JSON object.
     *
     * @param writer  destination JSON writer
     * @param options serialisation options (not read by this method)
     * @throws JSONException if the writer rejects any of the calls
     */
    @Override
    public void write(JSONWriter writer, Properties options)
            throws JSONException {
        writer.object();
        writer.key("description"); writer.value("Selects a value from an attribute on an xml or html Element.");
        writer.key("params"); writer.value("Element e, String s");
        writer.key("returns"); writer.value("String attribute Value");
        writer.endObject();
    }
}

View File

@ -0,0 +1,75 @@
/*
Copyright 2010,2011 Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package com.google.refine.expr.functions.xml;
import java.util.Properties;
import org.json.JSONException;
import org.json.JSONWriter;
import org.jsoup.nodes.Element;
import com.google.refine.expr.EvalError;
import com.google.refine.grel.ControlFunctionRegistry;
import com.google.refine.grel.Function;
/**
 * Expression function that returns the text of a jsoup {@link Element} via
 * {@code Element.text()}.
 */
public class XmlText implements Function {

    /**
     * Returns the text of the element passed as the first argument.
     *
     * @param bindings evaluation bindings (not read by this method)
     * @param args     function arguments; the first must be an {@link Element},
     *                 any further arguments are ignored
     * @return the element's text, or an {@link EvalError} when no argument is
     *         supplied or the first argument is not an {@link Element}
     */
    @Override
    public Object call(Properties bindings, Object[] args) {
        if (args.length >= 1) {
            Object o1 = args[0];
            // instanceof is false for null, so no separate null check is needed.
            if (o1 instanceof Element) {
                Element e1 = (Element) o1;
                return e1.text();
            } else {
                return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " failed as the first parameter is not an XML or HTML Element. Please first use parseXml() or parseHtml() and select(query) prior to using this function");
            }
        }
        // Report the missing argument explicitly instead of silently returning
        // null, in line with the other XML/HTML functions in this package.
        return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects a single XML or HTML element as an argument");
    }

    /**
     * Writes this function's description, parameters and return type as a JSON object.
     *
     * @param writer  destination JSON writer
     * @param options serialisation options (not read by this method)
     * @throws JSONException if the writer rejects any of the calls
     */
    @Override
    public void write(JSONWriter writer, Properties options)
            throws JSONException {
        writer.object();
        writer.key("description"); writer.value("Selects the text from within an element (including all child elements)");
        writer.key("params"); writer.value("Element e");
        writer.key("returns"); writer.value("String text");
        writer.endObject();
    }
}

View File

@ -61,12 +61,14 @@ import com.google.refine.expr.functions.booleans.Xor;
import com.google.refine.expr.functions.date.DatePart; import com.google.refine.expr.functions.date.DatePart;
import com.google.refine.expr.functions.date.Inc; import com.google.refine.expr.functions.date.Inc;
import com.google.refine.expr.functions.date.Now; import com.google.refine.expr.functions.date.Now;
import com.google.refine.expr.functions.html.HtmlAttr;
import com.google.refine.expr.functions.html.HtmlText;
import com.google.refine.expr.functions.html.InnerHtml; import com.google.refine.expr.functions.html.InnerHtml;
import com.google.refine.expr.functions.html.OwnText;
import com.google.refine.expr.functions.html.ParseHtml; import com.google.refine.expr.functions.html.ParseHtml;
import com.google.refine.expr.functions.html.SelectHtml; import com.google.refine.expr.functions.xml.XmlAttr;
import com.google.refine.expr.functions.xml.XmlText;
import com.google.refine.expr.functions.xml.InnerXml;
import com.google.refine.expr.functions.xml.OwnText;
import com.google.refine.expr.functions.xml.ParseXml;
import com.google.refine.expr.functions.xml.SelectXml;
import com.google.refine.expr.functions.math.ACos; import com.google.refine.expr.functions.math.ACos;
import com.google.refine.expr.functions.math.ASin; import com.google.refine.expr.functions.math.ASin;
import com.google.refine.expr.functions.math.ATan; import com.google.refine.expr.functions.math.ATan;
@ -236,12 +238,19 @@ public class ControlFunctionRegistry {
// HTML functions from JSoup // HTML functions from JSoup
registerFunction("parseHtml", new ParseHtml()); registerFunction("parseHtml", new ParseHtml());
registerFunction("select", new SelectHtml()); registerFunction("select", new SelectXml());
registerFunction("htmlAttr", new HtmlAttr()); registerFunction("htmlAttr", new XmlAttr());
registerFunction("htmlText", new HtmlText()); registerFunction("htmlText", new XmlText());
registerFunction("innerHtml", new InnerHtml()); registerFunction("innerHtml", new InnerHtml());
registerFunction("ownText", new OwnText()); registerFunction("ownText", new OwnText());
// XML functions from JSoup
registerFunction("parseXml", new ParseXml());
registerFunction("selectx", new SelectXml());
registerFunction("xmlAttr", new XmlAttr());
registerFunction("xmlText", new XmlText());
registerFunction("innerXml", new InnerXml());
registerFunction("indexOf", new IndexOf()); registerFunction("indexOf", new IndexOf());
registerFunction("lastIndexOf", new LastIndexOf()); registerFunction("lastIndexOf", new LastIndexOf());
registerFunction("startsWith", new StartsWith()); registerFunction("startsWith", new StartsWith());