Two more XmlImport tests now work. Some documentation stubs were added to XmlImportUtilities.

git-svn-id: http://google-refine.googlecode.com/svn/trunk@967 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Iain Sproat 2010-06-15 16:22:53 +00:00
parent 8a87ddaf3f
commit a671551289
3 changed files with 154 additions and 31 deletions

View File

@ -27,11 +27,20 @@ import com.metaweb.gridworks.model.Row;
public class XmlImportUtilities { public class XmlImportUtilities {
final static Logger logger = LoggerFactory.getLogger("XmlImporterUtilities"); final static Logger logger = LoggerFactory.getLogger("XmlImporterUtilities");
/**
* An element which holds sub-elements we
* shall import as records
*/
static protected class RecordElementCandidate { static protected class RecordElementCandidate {
String[] path; String[] path;
int count; int count;
} }
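/**
* Common base class of ImportColumnGroup and ImportColumn.
* Holds a name and a non-blank count, and leaves tabulate() to subclasses.
*/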
static protected abstract class ImportVertical { static protected abstract class ImportVertical {
public String name = ""; public String name = "";
public int nonBlankCount; public int nonBlankCount;
@ -39,6 +48,9 @@ public class XmlImportUtilities {
abstract void tabulate(); abstract void tabulate();
} }
/**
* A column group describes a branch in tree structured data
*/
static public class ImportColumnGroup extends ImportVertical { static public class ImportColumnGroup extends ImportVertical {
public Map<String, ImportColumnGroup> subgroups = new HashMap<String, ImportColumnGroup>(); public Map<String, ImportColumnGroup> subgroups = new HashMap<String, ImportColumnGroup>();
public Map<String, ImportColumn> columns = new HashMap<String, ImportColumn>(); public Map<String, ImportColumn> columns = new HashMap<String, ImportColumn>();
@ -56,19 +68,32 @@ public class XmlImportUtilities {
} }
} }
/**
* A column is used to describe a branch-terminating element in a tree structure
*
*/
static public class ImportColumn extends ImportVertical { static public class ImportColumn extends ImportVertical {
public int cellIndex; public int cellIndex;
public boolean blankOnFirstRow; public boolean blankOnFirstRow;
public ImportColumn(){}
public ImportColumn(String name){ //required for testing
super.name = name;
}
@Override @Override
void tabulate() { void tabulate() {
// already done the tabulation elsewhere // already done the tabulation elsewhere
} }
} }
/**
* A record describes a data element in a tree-structure
*
*/
static public class ImportRecord { static public class ImportRecord {
List<List<Cell>> rows = new LinkedList<List<Cell>>(); public List<List<Cell>> rows = new LinkedList<List<Cell>>();
List<Integer> columnEmptyRowIndices = new ArrayList<Integer>(); public List<Integer> columnEmptyRowIndices = new ArrayList<Integer>();
} }
static public String[] detectPathFromTag(InputStream inputStream, String tag) { static public String[] detectPathFromTag(InputStream inputStream, String tag) {
@ -96,6 +121,19 @@ public class XmlImportUtilities {
return null; return null;
} }
/**
* Looks for an element with the given tag name in the Xml being parsed, returning the path hierarchy to reach it.
*
* @param parser
* @param tag
* The Xml element name (can be qualified) to search for
* @return
* If the tag is found, an array of strings is returned.
* If the tag is at the top level, the tag will be the only item in the array.
* If the tag is nested beneath the top level, the array is filled with the hierarchy with the tag name at the last index
* Null if the tag is not found.
* @throws XMLStreamException
*/
static protected List<String> detectRecordElement(XMLStreamReader parser, String tag) throws XMLStreamException { static protected List<String> detectRecordElement(XMLStreamReader parser, String tag) throws XMLStreamException {
if(parser.getEventType() == XMLStreamConstants.START_DOCUMENT) if(parser.getEventType() == XMLStreamConstants.START_DOCUMENT)
parser.next(); parser.next();
@ -123,6 +161,15 @@ public class XmlImportUtilities {
return null; return null;
} }
/**
* Seeks recurring XML elements in an InputStream
* which are likely candidates for being data records
* @param inputStream
* The XML data as a stream
* @return
* The path to the most numerous of the possible candidates.
* null if no candidates were found (less than 6 recurrences)
*/
static public String[] detectRecordElement(InputStream inputStream) { static public String[] detectRecordElement(InputStream inputStream) {
logger.trace("detectRecordElement(inputStream)"); logger.trace("detectRecordElement(inputStream)");
List<RecordElementCandidate> candidates = new ArrayList<RecordElementCandidate>(); List<RecordElementCandidate> candidates = new ArrayList<RecordElementCandidate>();
@ -320,6 +367,15 @@ public class XmlImportUtilities {
} }
} }
/**
*
* @param project
* @param parser
* @param recordPath
* @param pathIndex
* @param rootColumnGroup
* @throws XMLStreamException
*/
static protected void findRecord( static protected void findRecord(
Project project, Project project,
XMLStreamReader parser, XMLStreamReader parser,
@ -361,6 +417,14 @@ public class XmlImportUtilities {
} }
} }
/**
* processRecord parses XML for a single element and its sub-elements,
* adding the parsed data as a row to the project
* @param project
* @param parser
* @param rootColumnGroup
* @throws XMLStreamException
*/
static protected void processRecord( static protected void processRecord(
Project project, Project project,
XMLStreamReader parser, XMLStreamReader parser,
@ -390,6 +454,14 @@ public class XmlImportUtilities {
return prefix != null && prefix.length() > 0 ? (prefix + ":" + localName) : localName; return prefix != null && prefix.length() > 0 ? (prefix + ":" + localName) : localName;
} }
/**
*
* @param project
* @param parser
* @param columnGroup
* @param record
* @throws XMLStreamException
*/
static protected void processSubRecord( static protected void processSubRecord(
Project project, Project project,
XMLStreamReader parser, XMLStreamReader parser,
@ -495,7 +567,7 @@ public class XmlImportUtilities {
row.add(null); row.add(null);
} }
logger.trace("Adding cell with value : " + value + " to row : " + rowIndex + " at cell index : " + (cellIndex-1)); logger.trace("Adding cell with value : \"" + value + "\" to row : " + rowIndex + " at cell index : " + (cellIndex-1));
row.set(cellIndex-1, new Cell(value, null)); row.set(cellIndex-1, new Cell(value, null));

View File

@ -27,12 +27,12 @@ import com.metaweb.gridworks.tests.GridworksTest;
public class XmlImportUtilitiesTests extends GridworksTest { public class XmlImportUtilitiesTests extends GridworksTest {
@BeforeTest @BeforeTest
public void init() { public void init() {
logger = LoggerFactory.getLogger(this.getClass()); logger = LoggerFactory.getLogger(this.getClass());
} }
//dependencies //dependencies
Project project; Project project;
XMLStreamReader parser; XMLStreamReader parser;
@ -157,26 +157,51 @@ public class XmlImportUtilitiesTests extends GridworksTest {
assertProjectCreated(project, 0, 6); assertProjectCreated(project, 0, 6);
Assert.assertEquals(project.rows.get(0).cells.size(), 4); Assert.assertEquals(project.rows.get(0).cells.size(), 4);
//TODO
Assert.assertEquals(columnGroup.subgroups.size(), 1);
Assert.assertNotNull(columnGroup.subgroups.get("book"));
Assert.assertEquals(columnGroup.subgroups.get("book").subgroups.size(), 3);
Assert.assertNotNull(columnGroup.subgroups.get("book").subgroups.get("author"));
Assert.assertNotNull(columnGroup.subgroups.get("book").subgroups.get("title"));
Assert.assertNotNull(columnGroup.subgroups.get("book").subgroups.get("publish_date"));
} }
@Test @Test
public void createColumnsFromImportTest() { public void importXmlWithVaryingStructureTest(){
loadXml(XmlImporterTests.getSampleWithVaryingStructure());
String[] recordPath = new String[]{"library", "book"};
XmlImportUtilitiesStub.importXml(inputStream, project, recordPath, columnGroup);
log(project);
assertProjectCreated(project, 0, 6);
Assert.assertEquals(project.rows.get(0).cells.size(), 4);
Assert.assertEquals(project.rows.get(5).cells.size(), 5);
Assert.assertEquals(columnGroup.subgroups.size(), 1);
Assert.assertEquals(columnGroup.name, "");
ImportColumnGroup book = columnGroup.subgroups.get("book");
Assert.assertNotNull(book);
Assert.assertEquals(book.columns.size(), 1);
Assert.assertEquals(book.subgroups.size(), 4);
Assert.assertNotNull(book.subgroups.get("author"));
Assert.assertEquals(book.subgroups.get("author").columns.size(), 1);
Assert.assertNotNull(book.subgroups.get("title"));
Assert.assertNotNull(book.subgroups.get("publish_date"));
Assert.assertNotNull(book.subgroups.get("genre"));
}
@Test
public void createColumnsFromImportTest(){
ImportColumnGroup columnGroup = new ImportColumnGroup(); ImportColumnGroup columnGroup = new ImportColumnGroup();
ImportColumn ic1 = new ImportColumn();
ic1.name = "hello";
ImportColumn ic2 = new ImportColumn();
ic2.name = "world";
ImportColumnGroup subGroup = new ImportColumnGroup(); ImportColumnGroup subGroup = new ImportColumnGroup();
ImportColumn ic3 = new ImportColumn(); columnGroup.columns.put("a", new ImportColumn("hello"));
ic3.name = "foo"; columnGroup.columns.put("b", new ImportColumn("world"));
ImportColumn ic4 = new ImportColumn(); subGroup.columns.put("c", new ImportColumn("foo"));
ic4.name = "bar"; subGroup.columns.put("d", new ImportColumn("bar"));
subGroup.columns.put("c", ic3);
subGroup.columns.put("d", ic4);
columnGroup.columns.put("a", ic1);
columnGroup.columns.put("b", ic2);
columnGroup.subgroups.put("e", subGroup); columnGroup.subgroups.put("e", subGroup);
XmlImportUtilitiesStub.createColumnsFromImport(project, columnGroup); XmlImportUtilitiesStub.createColumnsFromImport(project, columnGroup);
log(project); log(project);
assertProjectCreated(project, 4, 0); assertProjectCreated(project, 4, 0);
@ -206,7 +231,7 @@ public class XmlImportUtilitiesTests extends GridworksTest {
log(project); log(project);
assertProjectCreated(project, 0, 6); assertProjectCreated(project, 0, 6);
Assert.assertEquals(project.rows.get(0).cells.size(), 4); Assert.assertEquals(project.rows.get(0).cells.size(), 4);
//TODO //TODO
} }
@ -279,7 +304,7 @@ public class XmlImportUtilitiesTests extends GridworksTest {
} }
@Test(groups={"broken"}) @Test
public void processSubRecordTest(){ public void processSubRecordTest(){
loadXml("<?xml version=\"1.0\"?><library><book id=\"1\"><author>author1</author><genre>genre1</genre></book></library>"); loadXml("<?xml version=\"1.0\"?><library><book id=\"1\"><author>author1</author><genre>genre1</genre></book></library>");
createParser(); createParser();
@ -291,20 +316,43 @@ public class XmlImportUtilitiesTests extends GridworksTest {
Assert.fail(); Assert.fail();
} }
log(project); log(project);
Assert.fail();
//TODO need to verify 'record' was set correctly which we can't do as ImportRecord is an internal class Assert.assertEquals(columnGroup.subgroups.size(), 1);
Assert.assertEquals(columnGroup.name, "");
Assert.assertNotNull(columnGroup.subgroups.get("library"));
Assert.assertEquals(columnGroup.subgroups.get("library").subgroups.size(), 1);
ImportColumnGroup book = columnGroup.subgroups.get("library").subgroups.get("book");
Assert.assertNotNull(book);
Assert.assertEquals(book.subgroups.size(), 2);
Assert.assertNotNull(book.subgroups.get("author"));
Assert.assertNotNull(book.subgroups.get("genre"));
//TODO check record
} }
@Test(groups={"broken"}) @Test
public void addCellTest(){ public void addCellTest(){
String columnLocalName = "author"; String columnLocalName = "author";
String text = "Author1, The"; String text = "Author1, The";
int commonStartingRowIndex = 0; int commonStartingRowIndex = 0;
project.rows.add(new Row(0));
SUT.addCellWrapper(project, columnGroup, record, columnLocalName, text, commonStartingRowIndex); SUT.addCellWrapper(project, columnGroup, record, columnLocalName, text, commonStartingRowIndex);
Assert.fail(); Assert.assertNotNull(record);
//TODO need to verify 'record' was set correctly which we can't do as ImportRecord is an internal class Assert.assertNotNull(record.rows);
Assert.assertNotNull(record.columnEmptyRowIndices);
Assert.assertEquals(record.rows.size(), 1);
Assert.assertEquals(record.columnEmptyRowIndices.size(), 2);
Assert.assertNotNull(record.rows.get(0));
Assert.assertNotNull(record.columnEmptyRowIndices.get(0));
Assert.assertNotNull(record.columnEmptyRowIndices.get(1));
Assert.assertEquals(record.rows.get(0).size(), 2);
Assert.assertNotNull(record.rows.get(0).get(0));
Assert.assertEquals(record.rows.get(0).get(0).value, "Author1, The");
Assert.assertEquals(record.columnEmptyRowIndices.get(0).intValue(),0);
Assert.assertEquals(record.columnEmptyRowIndices.get(1).intValue(),1);
} }
//----------------helpers------------- //----------------helpers-------------

View File

@ -95,24 +95,27 @@ public class XmlImporterTests extends GridworksTest {
Assert.assertEquals(row.getCell(1).value, "With line\n break"); Assert.assertEquals(row.getCell(1).value, "With line\n break");
} }
@Test(groups={"broken"}) @Test
public void testElementsWithVaryingStructure(){ public void testElementsWithVaryingStructure(){
RunTest(getSampleWithVaryingStructure()); RunTest(getSampleWithVaryingStructure());
log(project); log(project);
assertProjectCreated(project, 5, 6); assertProjectCreated(project, 5, 6);
Assert.assertEquals( project.columnModel.getColumnByCellIndex(5).getName(), "book - genre");
Row row0 = project.rows.get(0); Row row0 = project.rows.get(0);
Assert.assertNotNull(row0); Assert.assertNotNull(row0);
Assert.assertEquals(row0.cells.size(),6); Assert.assertEquals(row0.cells.size(),4);
Row row5 = project.rows.get(5); Row row5 = project.rows.get(5);
Assert.assertNotNull(row5); Assert.assertNotNull(row5);
Assert.assertEquals(row5.cells.size(),6); Assert.assertEquals(row5.cells.size(),5);
} }
@Test(groups={"broken"}) @Test
public void testElementWithNestedTree(){ public void testElementWithNestedTree(){
RunTest(getSampleWithTreeStructure());
log(project); log(project);
assertProjectCreated(project, 5, 6); assertProjectCreated(project, 5, 6);