The XmlImportUtilities.detectPathFromTag and XmlImportUtilities.detectRecordElement methods now use a generic TreeParser interface. A lightweight wrapper, XmlParser, wraps XMLStreamReader to provide parsing of XML data.

This is another small step towards a generic importer for tree-structured data. My plan is to refactor more of XmlImportUtilities' methods to use the TreeParser interface so that XMLStreamReader is no longer called directly from XmlImportUtilities.

git-svn-id: http://google-refine.googlecode.com/svn/trunk@1322 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Iain Sproat 2010-09-27 17:40:51 +00:00
parent 1bda46d40f
commit d71c563831
7 changed files with 144 additions and 50 deletions

View File

@ -16,7 +16,7 @@ import com.google.refine.model.Cell;
import com.google.refine.model.Column;
import com.google.refine.model.Project;
public class TreeImporter {
public abstract class TreeImporter {
final static Logger logger = LoggerFactory.getLogger("TreeImporter");
/**
@ -90,7 +90,7 @@ public class TreeImporter {
public List<List<Cell>> rows = new LinkedList<List<Cell>>();
}
static public void sortRecordElementCandidates(List<RecordElementCandidate> list) {
static protected void sortRecordElementCandidates(List<RecordElementCandidate> list) {
Collections.sort(list, new Comparator<RecordElementCandidate>() {
public int compare(RecordElementCandidate o1, RecordElementCandidate o2) {
return o2.count - o1.count;

View File

@ -8,6 +8,7 @@ import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import javax.servlet.ServletException;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
@ -16,6 +17,7 @@ import javax.xml.stream.XMLStreamReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.refine.importers.parsers.TreeParser;
import com.google.refine.model.Cell;
import com.google.refine.model.Project;
import com.google.refine.model.Row;
@ -23,13 +25,11 @@ import com.google.refine.model.Row;
public class XmlImportUtilities extends TreeImporter {
final static Logger logger = LoggerFactory.getLogger("XmlImporterUtilities");
static public String[] detectPathFromTag(InputStream inputStream, String tag) {
static public String[] detectPathFromTag(TreeParser parser, String tag) {
try {
XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(inputStream);
while (parser.hasNext()) {
int eventType = parser.next();
if (eventType == XMLStreamConstants.START_ELEMENT) {
if (eventType == XMLStreamConstants.START_ELEMENT) { //FIXME uses Xml
List<String> path = detectRecordElement(parser, tag);
if (path != null) {
String[] path2 = new String[path.size()];
@ -47,13 +47,13 @@ public class XmlImportUtilities extends TreeImporter {
return null;
}
/**
* Looks for an element with the given tag name in the Xml being parsed, returning the path hierarchy to reach it.
*
* @param parser
* @param tag
* The Xml element name (can be qualified) to search for
* The element name (can be qualified) to search for
* @return
* If the tag is found, an array of strings is returned.
* If the tag is at the top level, the tag will be the only item in the array.
@ -61,33 +61,43 @@ public class XmlImportUtilities extends TreeImporter {
* Null if the tag is not found.
* @throws XMLStreamException
*/
static protected List<String> detectRecordElement(XMLStreamReader parser, String tag) throws XMLStreamException {
if(parser.getEventType() == XMLStreamConstants.START_DOCUMENT)
parser.next();
String localName = parser.getLocalName();
String fullName = composeName(parser.getPrefix(), localName);
if (tag.equals(parser.getLocalName()) || tag.equals(fullName)) {
List<String> path = new LinkedList<String>();
path.add(localName);
static protected List<String> detectRecordElement(TreeParser parser, String tag) throws ServletException {
try{
if(parser.getEventType() == XMLStreamConstants.START_DOCUMENT) //FIXME uses Xml, and is not generic
parser.next();
String localName = parser.getLocalName();
String fullName = composeName(parser.getPrefix(), localName);
if (tag.equals(parser.getLocalName()) || tag.equals(fullName)) {
List<String> path = new LinkedList<String>();
path.add(localName);
return path;
}
return path;
}
while (parser.hasNext()) {
int eventType = parser.next();
if (eventType == XMLStreamConstants.END_ELEMENT) {
break;
} else if (eventType == XMLStreamConstants.START_ELEMENT) {
List<String> path = detectRecordElement(parser, tag);
if (path != null) {
path.add(0, localName);
return path;
while (parser.hasNext()) {
int eventType = parser.next();
if (eventType == XMLStreamConstants.END_ELEMENT) { //FIXME uses Xml, and is not generic
break;
} else if (eventType == XMLStreamConstants.START_ELEMENT) { //FIXME uses Xml, and is not generic
List<String> path = detectRecordElement(parser, tag);
if (path != null) {
path.add(0, localName);
return path;
}
}
}
}catch(ServletException e){
// silent
// e.printStackTrace();
}
return null;
}
/**
 * Builds the name used to match an element against a requested tag.
 *
 * @param prefix
 *            the element's namespace prefix; may be null or empty
 * @param localName
 *            the element's local (unqualified) name
 * @return
 *            {@code prefix + ":" + localName} when a non-empty prefix is given,
 *            otherwise {@code localName} unchanged
 */
static protected String composeName(String prefix, String localName) {
    // No usable prefix: the local name already is the full name.
    if (prefix == null || prefix.length() == 0) {
        return localName;
    }
    return prefix + ":" + localName;
}
/**
* Seeks recurring XML elements in an InputStream
* that are likely candidates for being data records
@ -334,10 +344,6 @@ public class XmlImportUtilities extends TreeImporter {
}
}
static protected String composeName(String prefix, String localName) {
return prefix != null && prefix.length() > 0 ? (prefix + ":" + localName) : localName;
}
/**
*
* @param project

View File

@ -11,6 +11,8 @@ import org.slf4j.LoggerFactory;
import com.google.refine.ProjectMetadata;
import com.google.refine.importers.TreeImporter.ImportColumnGroup;
import com.google.refine.importers.parsers.TreeParser;
import com.google.refine.importers.parsers.XmlParser;
import com.google.refine.model.Project;
public class XmlImporter implements StreamImporter {
@ -44,9 +46,19 @@ public class XmlImporter implements StreamImporter {
}
if (options.containsKey("importer-record-tag")) {
recordPath = XmlImportUtilities.detectPathFromTag(
new ByteArrayInputStream(buffer, 0, bytes_read),
InputStream iStream = new ByteArrayInputStream(buffer, 0, bytes_read);
TreeParser parser = null;
try{
parser = new XmlParser(iStream);
recordPath = XmlImportUtilities.detectPathFromTag(
parser,
options.getProperty("importer-record-tag"));
}catch(Exception e){
// silent
// e.printStackTrace();
}
} else {
recordPath = XmlImportUtilities.detectRecordElement(
new ByteArrayInputStream(buffer, 0, bytes_read));

View File

@ -0,0 +1,11 @@
package com.google.refine.importers.parsers;
import javax.servlet.ServletException;
/**
 * Pull-style cursor over tree-structured input, used by the importers so they
 * do not have to talk to a concrete parser directly.
 *
 * The only implementation visible alongside this interface, XmlParser,
 * forwards every call to a StAX XMLStreamReader.
 *
 * NOTE(review): callers currently compare the event codes returned here with
 * javax.xml.stream.XMLStreamConstants values, so the codes are not yet
 * format-neutral - confirm before adding a non-XML implementation.
 */
public interface TreeParser {
    /**
     * Moves the cursor to the next parsing event.
     *
     * @return the integer code of the event the cursor is now positioned on
     * @throws ServletException if the underlying input cannot be advanced
     */
    int next() throws ServletException;

    /**
     * @return the integer code of the event the cursor is currently positioned on
     */
    int getEventType();

    /**
     * @return true if there are more parsing events to read
     * @throws ServletException if the underlying input cannot be inspected
     */
    boolean hasNext() throws ServletException;

    /**
     * @return the local (unqualified) name of the current element
     */
    String getLocalName();

    /**
     * @return the namespace prefix of the current element; callers treat
     *         null and the empty string as "no prefix"
     */
    String getPrefix();
}

View File

@ -0,0 +1,54 @@
package com.google.refine.importers.parsers;
import java.io.InputStream;
import javax.servlet.ServletException;
import javax.xml.stream.FactoryConfigurationError;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
/**
 * {@link TreeParser} implementation that reads XML by delegating to a StAX
 * {@link XMLStreamReader}.
 *
 * The constructor never throws: if the reader cannot be created the failure is
 * remembered and reported on first use, so existing callers that construct a
 * parser outside a try block keep compiling and behaving as before, but get a
 * descriptive exception (with the root cause attached) instead of a bare
 * NullPointerException.
 */
public class XmlParser implements TreeParser{
    private static final String NOT_INITIALISED =
        "XML stream reader could not be created for the supplied input";

    /** Underlying StAX reader; null when construction failed. */
    XMLStreamReader parser = null;

    /** Why the reader could not be created, or null if creation succeeded. */
    private Throwable initFailure = null;

    /**
     * Creates a parser over the given XML input.
     *
     * @param inputStream
     *            the XML document to read
     */
    public XmlParser(InputStream inputStream){
        try {
            XMLInputFactory factory = XMLInputFactory.newInstance();
            // Imported documents are untrusted: do not resolve external
            // entities, which would allow XXE (local file / network reads).
            factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
            parser = factory.createXMLStreamReader(inputStream);
        } catch (XMLStreamException e) {
            // Deliberately not rethrown (constructor signature has no checked
            // exceptions); surfaced on first use instead.
            initFailure = e;
        } catch (FactoryConfigurationError e) {
            initFailure = e;
        }
    }

    /**
     * Advances to the next parsing event.
     *
     * @return the StAX event code of the new current event
     * @throws ServletException
     *             if the reader could not be created or the XML cannot be read;
     *             the underlying exception is attached as the cause
     */
    public int next() throws ServletException{
        if (parser == null) {
            throw new ServletException(NOT_INITIALISED, initFailure);
        }
        try {
            return parser.next();
        } catch (XMLStreamException e) {
            // Same message as before, but keep the cause for diagnostics.
            throw new ServletException(e.getMessage(), e);
        }
    }

    /**
     * @return the StAX event code of the current event
     * @throws IllegalStateException
     *             if the reader could not be created
     */
    public int getEventType(){
        return reader().getEventType();
    }

    /**
     * @return true if there are more parsing events
     * @throws ServletException
     *             if the reader could not be created or the XML cannot be read;
     *             the underlying exception is attached as the cause
     */
    public boolean hasNext() throws ServletException{
        if (parser == null) {
            throw new ServletException(NOT_INITIALISED, initFailure);
        }
        try {
            return parser.hasNext();
        } catch (XMLStreamException e) {
            throw new ServletException(e.getMessage(), e);
        }
    }

    /**
     * @return the local name of the current element
     * @throws IllegalStateException
     *             if the reader could not be created
     */
    public String getLocalName(){
        return reader().getLocalName();
    }

    /**
     * @return the namespace prefix of the current element
     * @throws IllegalStateException
     *             if the reader could not be created
     */
    public String getPrefix(){
        return reader().getPrefix();
    }

    /** Returns the underlying reader, failing descriptively if construction failed. */
    private XMLStreamReader reader() {
        if (parser == null) {
            IllegalStateException failure = new IllegalStateException(NOT_INITIALISED);
            if (initFailure != null) {
                failure.initCause(initFailure);
            }
            throw failure;
        }
        return parser;
    }
}

View File

@ -2,15 +2,17 @@ package com.google.refine.tests.importers;
import java.util.List;
import javax.servlet.ServletException;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import com.google.refine.importers.XmlImportUtilities;
import com.google.refine.importers.parsers.TreeParser;
import com.google.refine.model.Project;
public class XmlImportUtilitiesStub extends XmlImportUtilities {
public List<String> detectRecordElementWrapper(XMLStreamReader parser, String tag) throws XMLStreamException{
public List<String> detectRecordElementWrapper(TreeParser parser, String tag) throws ServletException{
return super.detectRecordElement(parser, tag);
}

View File

@ -6,6 +6,7 @@ import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import javax.servlet.ServletException;
import javax.xml.stream.FactoryConfigurationError;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
@ -21,6 +22,7 @@ import org.testng.annotations.Test;
import com.google.refine.importers.TreeImporter.ImportColumn;
import com.google.refine.importers.TreeImporter.ImportColumnGroup;
import com.google.refine.importers.TreeImporter.ImportRecord;
import com.google.refine.importers.parsers.XmlParser;
import com.google.refine.model.Project;
import com.google.refine.model.Row;
import com.google.refine.tests.RefineTest;
@ -67,8 +69,9 @@ public class XmlImportUtilitiesTests extends RefineTest {
public void detectPathFromTagTest(){
loadXml("<?xml version=\"1.0\"?><library><book id=\"1\"><author>author1</author><genre>genre1</genre></book></library>");
String tag = "library";
String[] response = XmlImportUtilitiesStub.detectPathFromTag(inputStream, tag);
//createParser();
XmlParser xmlParser = new XmlParser(inputStream);
String[] response = XmlImportUtilitiesStub.detectPathFromTag(xmlParser, tag);
Assert.assertNotNull(response);
Assert.assertEquals(response.length, 1);
Assert.assertEquals(response[0], "library");
@ -78,7 +81,10 @@ public class XmlImportUtilitiesTests extends RefineTest {
public void detectPathFromTagWithNestedElement(){
loadXml("<?xml version=\"1.0\"?><library><book id=\"1\"><author>author1</author><genre>genre1</genre></book></library>");
String tag = "book";
String[] response = XmlImportUtilitiesStub.detectPathFromTag(inputStream, tag);
//createParser();
XmlParser xmlParser = new XmlParser(inputStream);
String[] response = XmlImportUtilitiesStub.detectPathFromTag(xmlParser, tag);
Assert.assertNotNull(response);
Assert.assertEquals(response.length, 2);
Assert.assertEquals(response[0], "library");
@ -88,14 +94,15 @@ public class XmlImportUtilitiesTests extends RefineTest {
@Test
public void detectRecordElementTest(){
loadXml("<?xml version=\"1.0\"?><library><book id=\"1\"><author>author1</author><genre>genre1</genre></book></library>");
createParser();
//createParser();
XmlParser xmlParser = new XmlParser(inputStream);
String tag="library";
List<String> response = new ArrayList<String>();
try {
response = SUT.detectRecordElementWrapper(parser, tag);
} catch (XMLStreamException e) {
Assert.fail();
response = SUT.detectRecordElementWrapper(xmlParser, tag);
} catch (ServletException e) {
Assert.fail(e.getMessage());
}
Assert.assertNotNull(response);
Assert.assertEquals(response.size(), 1);
@ -105,14 +112,15 @@ public class XmlImportUtilitiesTests extends RefineTest {
@Test
public void detectRecordElementCanHandleWithNestedElements(){
loadXml("<?xml version=\"1.0\"?><library><book id=\"1\"><author>author1</author><genre>genre1</genre></book></library>");
createParser();
//createParser();
XmlParser xmlParser = new XmlParser(inputStream);
String tag="book";
List<String> response = new ArrayList<String>();
try {
response = SUT.detectRecordElementWrapper(parser, tag);
} catch (XMLStreamException e) {
Assert.fail();
response = SUT.detectRecordElementWrapper(xmlParser, tag);
} catch (ServletException e) {
Assert.fail(e.getMessage());
}
Assert.assertNotNull(response);
Assert.assertEquals(response.size(), 2);
@ -123,14 +131,15 @@ public class XmlImportUtilitiesTests extends RefineTest {
@Test
public void detectRecordElementIsNullForUnfoundTag(){
loadXml("<?xml version=\"1.0\"?><library><book id=\"1\"><author>author1</author><genre>genre1</genre></book></library>");
createParser();
//createParser();
XmlParser xmlParser = new XmlParser(inputStream);
String tag="";
List<String> response = new ArrayList<String>();
try {
response = SUT.detectRecordElementWrapper(parser, tag);
} catch (XMLStreamException e) {
Assert.fail();
response = SUT.detectRecordElementWrapper(xmlParser, tag);
} catch (ServletException e) {
Assert.fail(e.getMessage());
}
Assert.assertNull(response);
}