Two more XmlImport tests now work. Some documentation stubs were added to XmlImporterUtilities.

git-svn-id: http://google-refine.googlecode.com/svn/trunk@967 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Iain Sproat 2010-06-15 16:22:53 +00:00
parent 8a87ddaf3f
commit a671551289
3 changed files with 154 additions and 31 deletions

View File

@ -27,11 +27,20 @@ import com.metaweb.gridworks.model.Row;
public class XmlImportUtilities {
final static Logger logger = LoggerFactory.getLogger("XmlImporterUtilities");
/**
* An element which holds sub-elements we
* shall import as records.
* Plain holder with two package-private fields and no behaviour of its own.
*/
static protected class RecordElementCandidate {
// presumably the chain of element names leading to the candidate element - TODO confirm against detectRecordElement
String[] path;
// presumably the number of times the candidate element recurs - TODO confirm against detectRecordElement
int count;
}
/**
*
*
*
*/
static protected abstract class ImportVertical {
public String name = "";
public int nonBlankCount;
@ -39,6 +48,9 @@ public class XmlImportUtilities {
abstract void tabulate();
}
/**
* A column group describes a branch in tree structured data
*/
static public class ImportColumnGroup extends ImportVertical {
public Map<String, ImportColumnGroup> subgroups = new HashMap<String, ImportColumnGroup>();
public Map<String, ImportColumn> columns = new HashMap<String, ImportColumn>();
@ -56,19 +68,32 @@ public class XmlImportUtilities {
}
}
/**
 * Describes a branch-terminating element in a tree structure.
 * <p>
 * Carries the cell index of the column and a flag recording whether the
 * column was blank on the first row, in addition to the members inherited
 * from ImportVertical.
 */
static public class ImportColumn extends ImportVertical {
    public int cellIndex;
    public boolean blankOnFirstRow;

    /**
     * Creates a column without assigning a name.
     */
    public ImportColumn() {
    }

    /**
     * Creates a column with the given name; required for testing.
     *
     * @param name
     *            value stored in the inherited name field
     */
    public ImportColumn(String name) {
        this.name = name;
    }

    /**
     * Intentionally does nothing for a column.
     */
    @Override
    void tabulate() {
        // already done the tabulation elsewhere
    }
}
/**
* A record describes a data element in a tree-structure.
* It holds the rows of cells built for that element plus a list of row indices.
*/
static public class ImportRecord {
// NOTE(review): the two package-private declarations directly below duplicate the
// public declarations that follow them. This looks like the before/after lines of
// the change shown together - confirm that only the public pair belongs in the class.
List<List<Cell>> rows = new LinkedList<List<Cell>>();
List<Integer> columnEmptyRowIndices = new ArrayList<Integer>();
// each inner list is one row of cells belonging to this record
public List<List<Cell>> rows = new LinkedList<List<Cell>>();
// presumably one entry per column giving the next empty row index - TODO confirm against addCell
public List<Integer> columnEmptyRowIndices = new ArrayList<Integer>();
}
static public String[] detectPathFromTag(InputStream inputStream, String tag) {
@ -96,6 +121,19 @@ public class XmlImportUtilities {
return null;
}
/**
* Looks for an element with the given tag name in the Xml being parsed, returning the path hierarchy to reach it.
*
* @param parser
* @param tag
* The Xml element name (can be qualified) to search for
* @return
* If the tag is found, a list of strings is returned.
* If the tag is at the top level, the tag will be the only item in the list.
* If the tag is nested beneath the top level, the list is filled with the hierarchy with the tag name at the last index.
* Null if the tag is not found.
* @throws XMLStreamException
*/
static protected List<String> detectRecordElement(XMLStreamReader parser, String tag) throws XMLStreamException {
if(parser.getEventType() == XMLStreamConstants.START_DOCUMENT)
parser.next();
@ -123,6 +161,15 @@ public class XmlImportUtilities {
return null;
}
/**
* Searches an InputStream for recurring XML elements
* which are likely candidates for being data records
* @param inputStream
* The XML data as a stream
* @return
* The path to the most numerous of the possible candidates.
* null if no candidates were found (less than 6 recurrences)
*/
static public String[] detectRecordElement(InputStream inputStream) {
logger.trace("detectRecordElement(inputStream)");
List<RecordElementCandidate> candidates = new ArrayList<RecordElementCandidate>();
@ -320,6 +367,15 @@ public class XmlImportUtilities {
}
}
/**
*
* @param project
* @param parser
* @param recordPath
* @param pathIndex
* @param rootColumnGroup
* @throws XMLStreamException
*/
static protected void findRecord(
Project project,
XMLStreamReader parser,
@ -361,6 +417,14 @@ public class XmlImportUtilities {
}
}
/**
* processRecord parses XML for a single element and its sub-elements,
* adding the parsed data as a row to the project
* @param project
* @param parser
* @param rootColumnGroup
* @throws XMLStreamException
*/
static protected void processRecord(
Project project,
XMLStreamReader parser,
@ -390,6 +454,14 @@ public class XmlImportUtilities {
return prefix != null && prefix.length() > 0 ? (prefix + ":" + localName) : localName;
}
/**
*
* @param project
* @param parser
* @param columnGroup
* @param record
* @throws XMLStreamException
*/
static protected void processSubRecord(
Project project,
XMLStreamReader parser,
@ -495,7 +567,7 @@ public class XmlImportUtilities {
row.add(null);
}
logger.trace("Adding cell with value : " + value + " to row : " + rowIndex + " at cell index : " + (cellIndex-1));
logger.trace("Adding cell with value : \"" + value + "\" to row : " + rowIndex + " at cell index : " + (cellIndex-1));
row.set(cellIndex-1, new Cell(value, null));

View File

@ -157,26 +157,51 @@ public class XmlImportUtilitiesTests extends GridworksTest {
assertProjectCreated(project, 0, 6);
Assert.assertEquals(project.rows.get(0).cells.size(), 4);
//TODO
Assert.assertEquals(columnGroup.subgroups.size(), 1);
Assert.assertNotNull(columnGroup.subgroups.get("book"));
Assert.assertEquals(columnGroup.subgroups.get("book").subgroups.size(), 3);
Assert.assertNotNull(columnGroup.subgroups.get("book").subgroups.get("author"));
Assert.assertNotNull(columnGroup.subgroups.get("book").subgroups.get("title"));
Assert.assertNotNull(columnGroup.subgroups.get("book").subgroups.get("publish_date"));
}
@Test
public void createColumnsFromImportTest() {
public void importXmlWithVaryingStructureTest(){
loadXml(XmlImporterTests.getSampleWithVaryingStructure());
String[] recordPath = new String[]{"library", "book"};
XmlImportUtilitiesStub.importXml(inputStream, project, recordPath, columnGroup);
log(project);
assertProjectCreated(project, 0, 6);
Assert.assertEquals(project.rows.get(0).cells.size(), 4);
Assert.assertEquals(project.rows.get(5).cells.size(), 5);
Assert.assertEquals(columnGroup.subgroups.size(), 1);
Assert.assertEquals(columnGroup.name, "");
ImportColumnGroup book = columnGroup.subgroups.get("book");
Assert.assertNotNull(book);
Assert.assertEquals(book.columns.size(), 1);
Assert.assertEquals(book.subgroups.size(), 4);
Assert.assertNotNull(book.subgroups.get("author"));
Assert.assertEquals(book.subgroups.get("author").columns.size(), 1);
Assert.assertNotNull(book.subgroups.get("title"));
Assert.assertNotNull(book.subgroups.get("publish_date"));
Assert.assertNotNull(book.subgroups.get("genre"));
}
@Test
public void createColumnsFromImportTest(){
ImportColumnGroup columnGroup = new ImportColumnGroup();
ImportColumn ic1 = new ImportColumn();
ic1.name = "hello";
ImportColumn ic2 = new ImportColumn();
ic2.name = "world";
ImportColumnGroup subGroup = new ImportColumnGroup();
ImportColumn ic3 = new ImportColumn();
ic3.name = "foo";
ImportColumn ic4 = new ImportColumn();
ic4.name = "bar";
subGroup.columns.put("c", ic3);
subGroup.columns.put("d", ic4);
columnGroup.columns.put("a", ic1);
columnGroup.columns.put("b", ic2);
columnGroup.columns.put("a", new ImportColumn("hello"));
columnGroup.columns.put("b", new ImportColumn("world"));
subGroup.columns.put("c", new ImportColumn("foo"));
subGroup.columns.put("d", new ImportColumn("bar"));
columnGroup.subgroups.put("e", subGroup);
XmlImportUtilitiesStub.createColumnsFromImport(project, columnGroup);
log(project);
assertProjectCreated(project, 4, 0);
@ -279,7 +304,7 @@ public class XmlImportUtilitiesTests extends GridworksTest {
}
@Test(groups={"broken"})
@Test
public void processSubRecordTest(){
loadXml("<?xml version=\"1.0\"?><library><book id=\"1\"><author>author1</author><genre>genre1</genre></book></library>");
createParser();
@ -291,20 +316,43 @@ public class XmlImportUtilitiesTests extends GridworksTest {
Assert.fail();
}
log(project);
Assert.fail();
//TODO need to verify 'record' was set correctly which we can't do as ImportRecord is an internal class
Assert.assertEquals(columnGroup.subgroups.size(), 1);
Assert.assertEquals(columnGroup.name, "");
Assert.assertNotNull(columnGroup.subgroups.get("library"));
Assert.assertEquals(columnGroup.subgroups.get("library").subgroups.size(), 1);
ImportColumnGroup book = columnGroup.subgroups.get("library").subgroups.get("book");
Assert.assertNotNull(book);
Assert.assertEquals(book.subgroups.size(), 2);
Assert.assertNotNull(book.subgroups.get("author"));
Assert.assertNotNull(book.subgroups.get("genre"));
//TODO check record
}
/**
* Adds a single "author" cell through SUT.addCellWrapper and then inspects
* the contents of 'record' (its rows and its columnEmptyRowIndices).
*/
// NOTE(review): both @Test variants appear below. This looks like the before/after
// lines of the change shown together - confirm that only the plain @Test is intended.
@Test(groups={"broken"})
@Test
public void addCellTest(){
String columnLocalName = "author";
String text = "Author1, The";
int commonStartingRowIndex = 0;
// the project gets one row before the cell is added
project.rows.add(new Row(0));
SUT.addCellWrapper(project, columnGroup, record, columnLocalName, text, commonStartingRowIndex);
// NOTE(review): this unconditional failure precedes the assertions below, so they can
// never run while it is present - presumably a removed line of the change; confirm.
Assert.fail();
//TODO need to verify 'record' was set correctly which we can't do as ImportRecord is an internal class
Assert.assertNotNull(record);
Assert.assertNotNull(record.rows);
Assert.assertNotNull(record.columnEmptyRowIndices);
// exactly one row is expected in the record, with two entries in columnEmptyRowIndices
Assert.assertEquals(record.rows.size(), 1);
Assert.assertEquals(record.columnEmptyRowIndices.size(), 2);
Assert.assertNotNull(record.rows.get(0));
Assert.assertNotNull(record.columnEmptyRowIndices.get(0));
Assert.assertNotNull(record.columnEmptyRowIndices.get(1));
// the row is expected to hold two cells, the first carrying the supplied text
Assert.assertEquals(record.rows.get(0).size(), 2);
Assert.assertNotNull(record.rows.get(0).get(0));
Assert.assertEquals(record.rows.get(0).get(0).value, "Author1, The");
// expected index values: 0 for the first entry, 1 for the second
Assert.assertEquals(record.columnEmptyRowIndices.get(0).intValue(),0);
Assert.assertEquals(record.columnEmptyRowIndices.get(1).intValue(),1);
}
//----------------helpers-------------

View File

@ -95,24 +95,27 @@ public class XmlImporterTests extends GridworksTest {
Assert.assertEquals(row.getCell(1).value, "With line\n break");
}
/**
* Runs the importer over the sample returned by getSampleWithVaryingStructure()
* and checks the created project: its column named "book - genre" and the
* number of cells in the first and the sixth row.
*/
// NOTE(review): both @Test variants appear below. This looks like the before/after
// lines of the change shown together - confirm that only the plain @Test is intended.
@Test(groups={"broken"})
@Test
public void testElementsWithVaryingStructure(){
RunTest(getSampleWithVaryingStructure());
log(project);
// presumably 5 columns and 6 rows - TODO confirm the argument order of assertProjectCreated
assertProjectCreated(project, 5, 6);
Assert.assertEquals( project.columnModel.getColumnByCellIndex(5).getName(), "book - genre");
Row row0 = project.rows.get(0);
Assert.assertNotNull(row0);
// NOTE(review): the next two assertions contradict each other (6 vs 4 cells);
// presumably the first is the removed line and the second its replacement - confirm.
Assert.assertEquals(row0.cells.size(),6);
Assert.assertEquals(row0.cells.size(),4);
Row row5 = project.rows.get(5);
Assert.assertNotNull(row5);
// NOTE(review): same contradiction here (6 vs 5 cells) - confirm which line is live.
Assert.assertEquals(row5.cells.size(),6);
Assert.assertEquals(row5.cells.size(),5);
}
@Test(groups={"broken"})
@Test
public void testElementWithNestedTree(){
RunTest(getSampleWithTreeStructure());
log(project);
assertProjectCreated(project, 5, 6);