XmlImporter is partially unit tested. One broken test for case where Record Elements contain varying numbers of nested elements. (This is for Issue 61 which is, at the time of this commit, an open issue)

XmlImportUtilities produces log for case when no RecordElementCandidate are found.  (too few similar Xml elements).

git-svn-id: http://google-refine.googlecode.com/svn/trunk@862 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Iain Sproat 2010-05-26 19:22:38 +00:00
parent 1c47ff476b
commit 34cb1c4d07
3 changed files with 332 additions and 110 deletions

View File

@ -16,28 +16,33 @@ import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.Column;
import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.model.Row;
public class XmlImportUtilities {
final static Logger logger = LoggerFactory.getLogger("XmlImporterUtilities");
static protected class RecordElementCandidate {
String[] path;
int count;
}
static protected abstract class ImportVertical {
public String name = "";
public int nonBlankCount;
abstract void tabulate();
}
static public class ImportColumnGroup extends ImportVertical {
public Map<String, ImportColumnGroup> subgroups = new HashMap<String, ImportColumnGroup>();
public Map<String, ImportColumn> columns = new HashMap<String, ImportColumn>();
@Override
void tabulate() {
for (ImportColumn c : columns.values()) {
@ -50,7 +55,7 @@ public class XmlImportUtilities {
}
}
}
static public class ImportColumn extends ImportVertical {
public int cellIndex;
public boolean blankOnFirstRow;
@ -60,27 +65,27 @@ public class XmlImportUtilities {
// already done the tabulation elsewhere
}
}
static public class ImportRecord {
List<List<Cell>> rows = new LinkedList<List<Cell>>();
List<Integer> columnEmptyRowIndices = new ArrayList<Integer>();
}
static public String[] detectPathFromTag(InputStream inputStream, String tag) {
//List<RecordElementCandidate> candidates = new ArrayList<RecordElementCandidate>();
try {
XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(inputStream);
while (parser.hasNext()) {
int eventType = parser.next();
if (eventType == XMLStreamConstants.START_ELEMENT) {
List<String> path = detectRecordElement(parser, tag);
if (path != null) {
String[] path2 = new String[path.size()];
path.toArray(path2);
return path2;
}
}
@ -89,20 +94,20 @@ public class XmlImportUtilities {
// silent
// e.printStackTrace();
}
return null;
}
static protected List<String> detectRecordElement(XMLStreamReader parser, String tag) throws XMLStreamException {
String localName = parser.getLocalName();
String fullName = composeName(parser.getPrefix(), localName);
if (tag.equals(parser.getLocalName()) || tag.equals(fullName)) {
List<String> path = new LinkedList<String>();
path.add(localName);
return path;
}
while (parser.hasNext()) {
int eventType = parser.next();
if (eventType == XMLStreamConstants.END_ELEMENT) {
@ -117,21 +122,22 @@ public class XmlImportUtilities {
}
return null;
}
static public String[] detectRecordElement(InputStream inputStream) {
logger.trace("detectRecordElement(inputStream)");
List<RecordElementCandidate> candidates = new ArrayList<RecordElementCandidate>();
try {
XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(inputStream);
while (parser.hasNext()) {
int eventType = parser.next();
if (eventType == XMLStreamConstants.START_ELEMENT) {
RecordElementCandidate candidate =
RecordElementCandidate candidate =
detectRecordElement(
parser,
parser,
new String[] { parser.getLocalName() });
if (candidate != null) {
candidates.add(candidate);
}
@ -141,22 +147,24 @@ public class XmlImportUtilities {
// silent
// e.printStackTrace();
}
if (candidates.size() > 0) {
sortRecordElementCandidates(candidates);
return candidates.get(0).path;
}
logger.info("No candidate elements were found in Xml - at least 6 similar elements are required");
return null;
}
static protected RecordElementCandidate detectRecordElement(XMLStreamReader parser, String[] path) {
logger.trace("detectRecordElement(XMLStreamReader, String[])");
List<RecordElementCandidate> descendantCandidates = new ArrayList<RecordElementCandidate>();
Map<String, Integer> immediateChildCandidateMap = new HashMap<String, Integer>();
int textNodeCount = 0;
int childElementNodeCount = 0;
try {
while (parser.hasNext()) {
int eventType = parser.next();
@ -168,18 +176,18 @@ public class XmlImportUtilities {
}
} else if (eventType == XMLStreamConstants.START_ELEMENT) {
childElementNodeCount++;
String tagName = parser.getLocalName();
immediateChildCandidateMap.put(
tagName,
immediateChildCandidateMap.containsKey(tagName) ?
tagName,
immediateChildCandidateMap.containsKey(tagName) ?
immediateChildCandidateMap.get(tagName) + 1 : 1);
String[] path2 = new String[path.length + 1];
System.arraycopy(path, 0, path2, 0, path.length);
path2[path.length] = tagName;
RecordElementCandidate c = detectRecordElement(parser, path2);
if (c != null) {
descendantCandidates.add(c);
@ -190,12 +198,12 @@ public class XmlImportUtilities {
// silent
// e.printStackTrace();
}
if (textNodeCount > 0 && childElementNodeCount > 0) {
// This is a mixed element
return null;
}
if (immediateChildCandidateMap.size() > 0) {
List<RecordElementCandidate> immediateChildCandidates = new ArrayList<RecordElementCandidate>(immediateChildCandidateMap.size());
for (Entry<String, Integer> entry : immediateChildCandidateMap.entrySet()) {
@ -204,38 +212,39 @@ public class XmlImportUtilities {
String[] path2 = new String[path.length + 1];
System.arraycopy(path, 0, path2, 0, path.length);
path2[path.length] = entry.getKey();
RecordElementCandidate candidate = new RecordElementCandidate();
candidate.path = path2;
candidate.count = count;
immediateChildCandidates.add(candidate);
}
}
if (immediateChildCandidates.size() > 0 && immediateChildCandidates.size() < 5) {
// There are some promising immediate child elements, but not many,
// There are some promising immediate child elements, but not many,
// that can serve as record elements.
sortRecordElementCandidates(immediateChildCandidates);
RecordElementCandidate ourCandidate = immediateChildCandidates.get(0);
logger.trace("ourCandidate.count : " + ourCandidate.count + "; immediateChildCandidates.size() : " + immediateChildCandidates.size());
if (ourCandidate.count / immediateChildCandidates.size() > 5) {
return ourCandidate;
}
descendantCandidates.add(ourCandidate);
}
}
if (descendantCandidates.size() > 0) {
sortRecordElementCandidates(descendantCandidates);
RecordElementCandidate candidate = descendantCandidates.get(0);
if (candidate.count / descendantCandidates.size() > 5) {
return candidate;
}
}
return null;
}
@ -246,16 +255,16 @@ public class XmlImportUtilities {
}
});
}
static public void importXml(
InputStream inputStream,
Project project,
InputStream inputStream,
Project project,
String[] recordPath,
ImportColumnGroup rootColumnGroup
) {
try {
XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(inputStream);
while (parser.hasNext()) {
int eventType = parser.next();
if (eventType == XMLStreamConstants.START_ELEMENT) {
@ -266,32 +275,32 @@ public class XmlImportUtilities {
// silent
}
}
static public void createColumnsFromImport(
Project project,
Project project,
ImportColumnGroup columnGroup
) {
int startColumnIndex = project.columnModel.columns.size();
List<ImportColumn> columns = new ArrayList<ImportColumn>(columnGroup.columns.values());
Collections.sort(columns, new Comparator<ImportColumn>() {
public int compare(ImportColumn o1, ImportColumn o2) {
if (o1.blankOnFirstRow != o2.blankOnFirstRow) {
return o1.blankOnFirstRow ? 1 : -1;
}
int c = o2.nonBlankCount - o1.nonBlankCount;
return c != 0 ? c : (o1.name.length() - o2.name.length());
}
});
for (int i = 0; i < columns.size(); i++) {
ImportColumn c = columns.get(i);
Column column = new com.metaweb.gridworks.model.Column(c.cellIndex, c.name);
project.columnModel.columns.add(column);
}
List<ImportColumnGroup> subgroups = new ArrayList<ImportColumnGroup>(columnGroup.subgroups.values());
Collections.sort(subgroups, new Comparator<ImportColumnGroup>() {
public int compare(ImportColumnGroup o1, ImportColumnGroup o2) {
@ -299,20 +308,20 @@ public class XmlImportUtilities {
return c != 0 ? c : (o1.name.length() - o2.name.length());
}
});
for (ImportColumnGroup g : subgroups) {
createColumnsFromImport(project, g);
}
int endColumnIndex = project.columnModel.columns.size();
int span = endColumnIndex - startColumnIndex;
if (span > 1 && span < project.columnModel.columns.size()) {
project.columnModel.addColumnGroup(startColumnIndex, span, startColumnIndex);
}
}
static protected void findRecord(
Project project,
Project project,
XMLStreamReader parser,
String[] recordPath,
int pathIndex,
@ -336,7 +345,7 @@ public class XmlImportUtilities {
skip(parser);
}
}
static protected void skip(XMLStreamReader parser) throws XMLStreamException {
while (parser.hasNext()) {
int eventType = parser.next();
@ -347,71 +356,71 @@ public class XmlImportUtilities {
}
}
}
static protected void processRecord(
Project project,
Project project,
XMLStreamReader parser,
ImportColumnGroup rootColumnGroup
) throws XMLStreamException {
ImportRecord record = new ImportRecord();
processSubRecord(project, parser, rootColumnGroup, record);
if (record.rows.size() > 0) {
for (List<Cell> row : record.rows) {
Row realRow = new Row(row.size());
for (int c = 0; c < row.size(); c++) {
Cell cell = row.get(c);
if (cell != null) {
realRow.setCell(c, cell);
}
}
project.rows.add(realRow);
}
}
}
static protected String composeName(String prefix, String localName) {
return prefix != null && prefix.length() > 0 ? (prefix + ":" + localName) : localName;
}
static protected void processSubRecord(
Project project,
Project project,
XMLStreamReader parser,
ImportColumnGroup columnGroup,
ImportRecord record
) throws XMLStreamException {
ImportColumnGroup thisColumnGroup = getColumnGroup(
project,
columnGroup,
project,
columnGroup,
composeName(parser.getPrefix(), parser.getLocalName()));
int commonStartingRowIndex = 0;
for (ImportColumn column : thisColumnGroup.columns.values()) {
if (column.cellIndex < record.columnEmptyRowIndices.size()) {
commonStartingRowIndex = Math.max(
commonStartingRowIndex,
commonStartingRowIndex,
record.columnEmptyRowIndices.get(column.cellIndex));
}
}
int attributeCount = parser.getAttributeCount();
for (int i = 0; i < attributeCount; i++) {
String text = parser.getAttributeValue(i).trim();
if (text.length() > 0) {
addCell(
project,
thisColumnGroup,
record,
project,
thisColumnGroup,
record,
composeName(parser.getAttributePrefix(i), parser.getAttributeLocalName(i)),
text,
commonStartingRowIndex
);
}
}
while (parser.hasNext()) {
int eventType = parser.next();
if (eventType == XMLStreamConstants.START_ELEMENT) {
@ -421,14 +430,14 @@ public class XmlImportUtilities {
thisColumnGroup,
record
);
} else if (//eventType == XMLStreamConstants.CDATA ||
} else if (//eventType == XMLStreamConstants.CDATA ||
eventType == XMLStreamConstants.CHARACTERS) {
String text = parser.getText().trim();
if (text.length() > 0) {
addCell(
project,
thisColumnGroup,
record,
project,
thisColumnGroup,
record,
null,
parser.getText(),
commonStartingRowIndex
@ -438,10 +447,10 @@ public class XmlImportUtilities {
break;
}
}
if (commonStartingRowIndex < record.rows.size()) {
List<Cell> startingRow = record.rows.get(commonStartingRowIndex);
for (ImportColumn c : thisColumnGroup.columns.values()) {
int cellIndex = c.cellIndex;
if (cellIndex >= startingRow.size() || startingRow.get(cellIndex) == null) {
@ -450,7 +459,7 @@ public class XmlImportUtilities {
}
}
}
static protected void addCell(
Project project,
ImportColumnGroup columnGroup,
@ -462,33 +471,33 @@ public class XmlImportUtilities {
if (text == null || ((String) text).isEmpty()) {
return;
}
Serializable value = ImporterUtilities.parseCellValue(text);
ImportColumn column = getColumn(project, columnGroup, columnLocalName);
int cellIndex = column.cellIndex;
while (cellIndex >= record.columnEmptyRowIndices.size()) {
record.columnEmptyRowIndices.add(commonStaringRowIndex);
}
int rowIndex = record.columnEmptyRowIndices.get(cellIndex);
while (rowIndex >= record.rows.size()) {
record.rows.add(new ArrayList<Cell>());
}
List<Cell> row = record.rows.get(rowIndex);
while (cellIndex >= row.size()) {
row.add(null);
}
row.set(cellIndex, new Cell(value, null));
record.columnEmptyRowIndices.set(cellIndex, rowIndex + 1);
column.nonBlankCount++;
}
static protected ImportColumn getColumn(
Project project,
ImportColumnGroup columnGroup,
@ -497,27 +506,27 @@ public class XmlImportUtilities {
if (columnGroup.columns.containsKey(localName)) {
return columnGroup.columns.get(localName);
}
ImportColumn column = createColumn(project, columnGroup, localName);
columnGroup.columns.put(localName, column);
return column;
}
static protected ImportColumn createColumn(
Project project,
ImportColumnGroup columnGroup,
String localName
) {
ImportColumn newColumn = new ImportColumn();
newColumn.name =
columnGroup.name.length() == 0 ?
(localName == null ? "Text" : localName) :
newColumn.name =
columnGroup.name.length() == 0 ?
(localName == null ? "Text" : localName) :
(localName == null ? columnGroup.name : (columnGroup.name + " - " + localName));
newColumn.cellIndex = project.columnModel.allocateNewCellIndex();
return newColumn;
}
@ -529,25 +538,25 @@ public class XmlImportUtilities {
if (columnGroup.subgroups.containsKey(localName)) {
return columnGroup.subgroups.get(localName);
}
ImportColumnGroup subgroup = createColumnGroup(project, columnGroup, localName);
columnGroup.subgroups.put(localName, subgroup);
return subgroup;
}
static protected ImportColumnGroup createColumnGroup(
Project project,
ImportColumnGroup columnGroup,
String localName
) {
ImportColumnGroup newGroup = new ImportColumnGroup();
newGroup.name =
columnGroup.name.length() == 0 ?
(localName == null ? "Text" : localName) :
newGroup.name =
columnGroup.name.length() == 0 ?
(localName == null ? "Text" : localName) :
(localName == null ? columnGroup.name : (columnGroup.name + " - " + localName));
return newGroup;
}
}

View File

@ -6,11 +6,16 @@ import java.io.PushbackInputStream;
import java.io.Reader;
import java.util.Properties;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.metaweb.gridworks.importers.XmlImportUtilities.ImportColumnGroup;
import com.metaweb.gridworks.model.Project;
public class XmlImporter implements Importer {
final static Logger logger = LoggerFactory.getLogger("XmlImporter");
public static final int BUFFER_SIZE = 64 * 1024;
public boolean takesReader() {
@ -28,6 +33,7 @@ public class XmlImporter implements Importer {
Project project,
Properties options
) throws Exception {
logger.trace("XmlImporter.read");
PushbackInputStream pis = new PushbackInputStream(inputStream,BUFFER_SIZE);
String[] recordPath = null;

View File

@ -0,0 +1,207 @@
package com.metaweb.gridworks.tests.importers;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Properties;
import static org.mockito.Mockito.mock;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testng.Assert;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
import com.metaweb.gridworks.importers.XmlImporter;
import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.Column;
import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.model.Row;
public class XmlImporterTests {
final static Logger logger = LoggerFactory.getLogger("XmlImporterTests");
//dependencies
Project project = null;
Properties options = null;
ByteArrayInputStream inputStream = null;
//System Under Test
XmlImporter SUT = null;
@BeforeMethod
public void SetUp(){
SUT = new XmlImporter();
project = new Project();
options = mock(Properties.class);
}
@AfterMethod
public void TearDown(){
SUT = null;
project = null;
options = null;
}
@Test
public void canParseSample(){
RunTest(getSample());
AssertGridCreate(project, 4, 6);
PrintProject(project);
Row row = project.rows.get(0);
Assert.assertNotNull(row);
Assert.assertNotNull(row.cells);
Assert.assertNotNull(row.cells.get(2));
Assert.assertEquals(row.cells.get(2).value, "Author 1, The");
}
@Test
public void testCanParseLineBreak(){
RunTest(getSampleWithLineBreak());
AssertGridCreate(project, 4, 6);
PrintProject(project);
Row row = project.rows.get(3);
Assert.assertNotNull(row);
Assert.assertNotNull(row.cells);
Assert.assertNotNull(row.cells.get(2));
Assert.assertEquals(row.cells.get(2).value, "With line\n break");
}
@Test(groups={"broken"})
public void testElementsWithVaryingStructure(){
RunTest(getSampleWithVaryingStructure());
AssertGridCreate(project, 5, 6);
PrintProject(project);
Row row0 = project.rows.get(0);
Assert.assertNotNull(row0);
Assert.assertNotNull(row0.cells);
Assert.assertEquals(row0.cells.size(),6);
Row row5 = project.rows.get(5);
Assert.assertNotNull(row5);
Assert.assertNotNull(row5.cells);
Assert.assertEquals(row5.cells.size(),6);
}
//------------helper methods---------------
protected String getTypicalElement(int id){
return "<book id=\"" + id + "\">" +
"<author>Author " + id + ", The</author>" +
"<title>Book title " + id + "</title>" +
"<publish_date>2010-05-26</publish_date>" +
"</book>";
}
protected String getSample(){
StringBuilder sb = new StringBuilder();
sb.append("<?xml version=\"1.0\"?><library>");
for(int i = 1; i < 7; i++){
sb.append(getTypicalElement(i));
}
sb.append("</library>");
return sb.toString();
}
protected String getSampleWithLineBreak(){
StringBuilder sb = new StringBuilder();
sb.append("<?xml version=\"1.0\"?><library>");
for(int i = 1; i < 4; i++){
sb.append(getTypicalElement(i));
}
sb.append("<book id=\"4\">" +
"<author>With line\n break</author>" +
"<title>Book title 4</title>" +
"<publish_date>2010-05-26</publish_date>" +
"</book>");
sb.append(getTypicalElement(5));
sb.append(getTypicalElement(6));
sb.append("</library>");
return sb.toString();
}
protected String getSampleWithVaryingStructure(){
StringBuilder sb = new StringBuilder();
sb.append("<?xml version=\"1.0\"?><library>");
for(int i = 1; i < 6; i++){
sb.append(getTypicalElement(i));
}
sb.append("<book id=\"6\">" +
"<author>With line\n break</author>" +
"<title>Book title 6</title>" +
"<genre>New element not seen in other records</genre>" +
"<publish_date>2010-05-26</publish_date>" +
"</book>");
sb.append("</library>");
return sb.toString();
}
private void RunTest(String testString){
try {
inputStream = new ByteArrayInputStream( testString.getBytes( "UTF-8" ) );
} catch (UnsupportedEncodingException e1) {
Assert.fail();
}
try {
SUT.read(inputStream, project, options);
} catch (Exception e) {
Assert.fail();
}
try {
inputStream.close();
} catch (IOException e) {
Assert.fail();
}
}
private void AssertGridCreate(Project project, int numCols, int numRows){
Assert.assertNotNull(project);
Assert.assertNotNull(project.columnModel);
Assert.assertNotNull(project.columnModel.columns);
Assert.assertEquals(project.columnModel.columns.size(), numCols);
Assert.assertNotNull(project.rows);
Assert.assertEquals(project.rows.size(), numRows);
}
private void PrintProject(Project project){
//some quick and dirty debugging
StringBuilder sb = new StringBuilder();
for(Column c : project.columnModel.columns){
sb.append(c.getName());
sb.append("; ");
}
logger.info(sb.toString());
for(Row r : project.rows){
sb = new StringBuilder();
for(Cell c : r.cells){
if(c != null){
sb.append(c.value);
sb.append("; ");
}else{
sb.append("null; ");
}
}
logger.info(sb.toString());
}
}
}