From 34cb1c4d076f2dc63aa0c51e6ac3216803d68680 Mon Sep 17 00:00:00 2001 From: Iain Sproat Date: Wed, 26 May 2010 19:22:38 +0000 Subject: [PATCH] XmlImporter is partially unit tested. One broken test for case where Record Elements contain varying numbers of nested elements. (This is for Issue 61 which is, at the time of this commit, an open issue) XmlImportUtilities produces log for case when no RecordElementCandidate are found. (too few similar Xml elements). git-svn-id: http://google-refine.googlecode.com/svn/trunk@862 7d457c2a-affb-35e4-300a-418c747d4874 --- .../importers/XmlImportUtilities.java | 229 +++++++++--------- .../gridworks/importers/XmlImporter.java | 6 + .../tests/importers/XmlImporterTests.java | 207 ++++++++++++++++ 3 files changed, 332 insertions(+), 110 deletions(-) create mode 100644 tests/java/src/com/metaweb/gridworks/tests/importers/XmlImporterTests.java diff --git a/src/main/java/com/metaweb/gridworks/importers/XmlImportUtilities.java b/src/main/java/com/metaweb/gridworks/importers/XmlImportUtilities.java index 4f706db41..89de297a2 100644 --- a/src/main/java/com/metaweb/gridworks/importers/XmlImportUtilities.java +++ b/src/main/java/com/metaweb/gridworks/importers/XmlImportUtilities.java @@ -16,28 +16,33 @@ import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import com.metaweb.gridworks.model.Cell; import com.metaweb.gridworks.model.Column; import com.metaweb.gridworks.model.Project; import com.metaweb.gridworks.model.Row; public class XmlImportUtilities { + final static Logger logger = LoggerFactory.getLogger("XmlImporterUtilities"); + static protected class RecordElementCandidate { String[] path; int count; } - + static protected abstract class ImportVertical { public String name = ""; public int nonBlankCount; - + abstract void tabulate(); } - + static public class ImportColumnGroup extends ImportVertical { public Map subgroups = new HashMap(); public Map columns = new HashMap(); - + @Override void tabulate() { for (ImportColumn c : columns.values()) { @@ -50,7 +55,7 @@ public class XmlImportUtilities { } } } - + static public class ImportColumn extends ImportVertical { public int cellIndex; public boolean blankOnFirstRow; @@ -60,27 +65,27 @@ public class XmlImportUtilities { // already done the tabulation elsewhere } } - + static public class ImportRecord { List> rows = new LinkedList>(); List columnEmptyRowIndices = new ArrayList(); } - + static public String[] detectPathFromTag(InputStream inputStream, String tag) { //List candidates = new ArrayList(); - + try { XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(inputStream); - + while (parser.hasNext()) { int eventType = parser.next(); if (eventType == XMLStreamConstants.START_ELEMENT) { List path = detectRecordElement(parser, tag); if (path != null) { String[] path2 = new String[path.size()]; - + path.toArray(path2); - + return path2; } } @@ -89,20 +94,20 @@ public class XmlImportUtilities { // silent // e.printStackTrace(); } - + return null; } - + static protected List detectRecordElement(XMLStreamReader parser, String tag) throws XMLStreamException { String localName = parser.getLocalName(); String fullName = composeName(parser.getPrefix(), localName); if (tag.equals(parser.getLocalName()) || tag.equals(fullName)) { List path = new LinkedList(); path.add(localName); - + return path; } - + while (parser.hasNext()) { int eventType = parser.next(); if (eventType == XMLStreamConstants.END_ELEMENT) { @@ -117,21 +122,22 @@ public class XmlImportUtilities { } return null; } - + static public String[] detectRecordElement(InputStream inputStream) { + logger.trace("detectRecordElement(inputStream)"); List candidates = new ArrayList(); - + try { XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(inputStream); - + while (parser.hasNext()) { int eventType = parser.next(); if (eventType == XMLStreamConstants.START_ELEMENT) { - RecordElementCandidate candidate = + RecordElementCandidate candidate = detectRecordElement( - parser, + parser, new String[] { parser.getLocalName() }); - + if (candidate != null) { candidates.add(candidate); } @@ -141,22 +147,24 @@ public class XmlImportUtilities { // silent // e.printStackTrace(); } - + if (candidates.size() > 0) { sortRecordElementCandidates(candidates); - + return candidates.get(0).path; } + logger.info("No candidate elements were found in Xml - at least 6 similar elements are required"); return null; } - + static protected RecordElementCandidate detectRecordElement(XMLStreamReader parser, String[] path) { + logger.trace("detectRecordElement(XMLStreamReader, String[])"); List descendantCandidates = new ArrayList(); - + Map immediateChildCandidateMap = new HashMap(); int textNodeCount = 0; int childElementNodeCount = 0; - + try { while (parser.hasNext()) { int eventType = parser.next(); @@ -168,18 +176,18 @@ public class XmlImportUtilities { } } else if (eventType == XMLStreamConstants.START_ELEMENT) { childElementNodeCount++; - + String tagName = parser.getLocalName(); - + immediateChildCandidateMap.put( - tagName, - immediateChildCandidateMap.containsKey(tagName) ? + tagName, + immediateChildCandidateMap.containsKey(tagName) ? immediateChildCandidateMap.get(tagName) + 1 : 1); - + String[] path2 = new String[path.length + 1]; System.arraycopy(path, 0, path2, 0, path.length); path2[path.length] = tagName; - + RecordElementCandidate c = detectRecordElement(parser, path2); if (c != null) { descendantCandidates.add(c); @@ -190,12 +198,12 @@ public class XmlImportUtilities { // silent // e.printStackTrace(); } - + if (textNodeCount > 0 && childElementNodeCount > 0) { // This is a mixed element return null; } - + if (immediateChildCandidateMap.size() > 0) { List immediateChildCandidates = new ArrayList(immediateChildCandidateMap.size()); for (Entry entry : immediateChildCandidateMap.entrySet()) { @@ -204,38 +212,39 @@ public class XmlImportUtilities { String[] path2 = new String[path.length + 1]; System.arraycopy(path, 0, path2, 0, path.length); path2[path.length] = entry.getKey(); - + RecordElementCandidate candidate = new RecordElementCandidate(); candidate.path = path2; candidate.count = count; immediateChildCandidates.add(candidate); } } - + if (immediateChildCandidates.size() > 0 && immediateChildCandidates.size() < 5) { - // There are some promising immediate child elements, but not many, + // There are some promising immediate child elements, but not many, // that can serve as record elements. - + sortRecordElementCandidates(immediateChildCandidates); - + RecordElementCandidate ourCandidate = immediateChildCandidates.get(0); + logger.trace("ourCandidate.count : " + ourCandidate.count + "; immediateChildCandidates.size() : " + immediateChildCandidates.size()); if (ourCandidate.count / immediateChildCandidates.size() > 5) { return ourCandidate; } - + descendantCandidates.add(ourCandidate); } } - + if (descendantCandidates.size() > 0) { sortRecordElementCandidates(descendantCandidates); - + RecordElementCandidate candidate = descendantCandidates.get(0); if (candidate.count / descendantCandidates.size() > 5) { return candidate; } } - + return null; } @@ -246,16 +255,16 @@ public class XmlImportUtilities { } }); } - + static public void importXml( - InputStream inputStream, - Project project, + InputStream inputStream, + Project project, String[] recordPath, ImportColumnGroup rootColumnGroup ) { try { XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(inputStream); - + while (parser.hasNext()) { int eventType = parser.next(); if (eventType == XMLStreamConstants.START_ELEMENT) { @@ -266,32 +275,32 @@ public class XmlImportUtilities { // silent } } - + static public void createColumnsFromImport( - Project project, + Project project, ImportColumnGroup columnGroup ) { int startColumnIndex = project.columnModel.columns.size(); - + List columns = new ArrayList(columnGroup.columns.values()); Collections.sort(columns, new Comparator() { public int compare(ImportColumn o1, ImportColumn o2) { if (o1.blankOnFirstRow != o2.blankOnFirstRow) { return o1.blankOnFirstRow ? 1 : -1; } - + int c = o2.nonBlankCount - o1.nonBlankCount; return c != 0 ? c : (o1.name.length() - o2.name.length()); } }); - + for (int i = 0; i < columns.size(); i++) { ImportColumn c = columns.get(i); - + Column column = new com.metaweb.gridworks.model.Column(c.cellIndex, c.name); project.columnModel.columns.add(column); } - + List subgroups = new ArrayList(columnGroup.subgroups.values()); Collections.sort(subgroups, new Comparator() { public int compare(ImportColumnGroup o1, ImportColumnGroup o2) { @@ -299,20 +308,20 @@ public class XmlImportUtilities { return c != 0 ? c : (o1.name.length() - o2.name.length()); } }); - + for (ImportColumnGroup g : subgroups) { createColumnsFromImport(project, g); } - + int endColumnIndex = project.columnModel.columns.size(); int span = endColumnIndex - startColumnIndex; if (span > 1 && span < project.columnModel.columns.size()) { project.columnModel.addColumnGroup(startColumnIndex, span, startColumnIndex); } } - + static protected void findRecord( - Project project, + Project project, XMLStreamReader parser, String[] recordPath, int pathIndex, @@ -336,7 +345,7 @@ public class XmlImportUtilities { skip(parser); } } - + static protected void skip(XMLStreamReader parser) throws XMLStreamException { while (parser.hasNext()) { int eventType = parser.next(); @@ -347,71 +356,71 @@ public class XmlImportUtilities { } } } - + static protected void processRecord( - Project project, + Project project, XMLStreamReader parser, ImportColumnGroup rootColumnGroup ) throws XMLStreamException { ImportRecord record = new ImportRecord(); - + processSubRecord(project, parser, rootColumnGroup, record); - + if (record.rows.size() > 0) { for (List row : record.rows) { Row realRow = new Row(row.size()); - + for (int c = 0; c < row.size(); c++) { Cell cell = row.get(c); if (cell != null) { realRow.setCell(c, cell); } } - + project.rows.add(realRow); } } } - + static protected String composeName(String prefix, String localName) { return prefix != null && prefix.length() > 0 ? (prefix + ":" + localName) : localName; } - + static protected void processSubRecord( - Project project, + Project project, XMLStreamReader parser, ImportColumnGroup columnGroup, ImportRecord record ) throws XMLStreamException { ImportColumnGroup thisColumnGroup = getColumnGroup( - project, - columnGroup, + project, + columnGroup, composeName(parser.getPrefix(), parser.getLocalName())); - + int commonStartingRowIndex = 0; for (ImportColumn column : thisColumnGroup.columns.values()) { if (column.cellIndex < record.columnEmptyRowIndices.size()) { commonStartingRowIndex = Math.max( - commonStartingRowIndex, + commonStartingRowIndex, record.columnEmptyRowIndices.get(column.cellIndex)); } } - + int attributeCount = parser.getAttributeCount(); for (int i = 0; i < attributeCount; i++) { String text = parser.getAttributeValue(i).trim(); if (text.length() > 0) { addCell( - project, - thisColumnGroup, - record, + project, + thisColumnGroup, + record, composeName(parser.getAttributePrefix(i), parser.getAttributeLocalName(i)), text, commonStartingRowIndex ); } } - + while (parser.hasNext()) { int eventType = parser.next(); if (eventType == XMLStreamConstants.START_ELEMENT) { @@ -421,14 +430,14 @@ public class XmlImportUtilities { thisColumnGroup, record ); - } else if (//eventType == XMLStreamConstants.CDATA || + } else if (//eventType == XMLStreamConstants.CDATA || eventType == XMLStreamConstants.CHARACTERS) { String text = parser.getText().trim(); if (text.length() > 0) { addCell( - project, - thisColumnGroup, - record, + project, + thisColumnGroup, + record, null, parser.getText(), commonStartingRowIndex @@ -438,10 +447,10 @@ public class XmlImportUtilities { break; } } - + if (commonStartingRowIndex < record.rows.size()) { List startingRow = record.rows.get(commonStartingRowIndex); - + for (ImportColumn c : thisColumnGroup.columns.values()) { int cellIndex = c.cellIndex; if (cellIndex >= startingRow.size() || startingRow.get(cellIndex) == null) { @@ -450,7 +459,7 @@ public class XmlImportUtilities { } } } - + static protected void addCell( Project project, ImportColumnGroup columnGroup, @@ -462,33 +471,33 @@ public class XmlImportUtilities { if (text == null || ((String) text).isEmpty()) { return; } - + Serializable value = ImporterUtilities.parseCellValue(text); - + ImportColumn column = getColumn(project, columnGroup, columnLocalName); int cellIndex = column.cellIndex; - + while (cellIndex >= record.columnEmptyRowIndices.size()) { record.columnEmptyRowIndices.add(commonStaringRowIndex); } int rowIndex = record.columnEmptyRowIndices.get(cellIndex); - + while (rowIndex >= record.rows.size()) { record.rows.add(new ArrayList()); } List row = record.rows.get(rowIndex); - + while (cellIndex >= row.size()) { row.add(null); } - + row.set(cellIndex, new Cell(value, null)); - + record.columnEmptyRowIndices.set(cellIndex, rowIndex + 1); - + column.nonBlankCount++; } - + static protected ImportColumn getColumn( Project project, ImportColumnGroup columnGroup, @@ -497,27 +506,27 @@ public class XmlImportUtilities { if (columnGroup.columns.containsKey(localName)) { return columnGroup.columns.get(localName); } - + ImportColumn column = createColumn(project, columnGroup, localName); columnGroup.columns.put(localName, column); - + return column; } - + static protected ImportColumn createColumn( Project project, ImportColumnGroup columnGroup, String localName ) { ImportColumn newColumn = new ImportColumn(); - - newColumn.name = - columnGroup.name.length() == 0 ? - (localName == null ? "Text" : localName) : + + newColumn.name = + columnGroup.name.length() == 0 ? + (localName == null ? "Text" : localName) : (localName == null ? columnGroup.name : (columnGroup.name + " - " + localName)); - + newColumn.cellIndex = project.columnModel.allocateNewCellIndex(); - + return newColumn; } @@ -529,25 +538,25 @@ public class XmlImportUtilities { if (columnGroup.subgroups.containsKey(localName)) { return columnGroup.subgroups.get(localName); } - + ImportColumnGroup subgroup = createColumnGroup(project, columnGroup, localName); columnGroup.subgroups.put(localName, subgroup); - + return subgroup; } - + static protected ImportColumnGroup createColumnGroup( Project project, ImportColumnGroup columnGroup, String localName ) { ImportColumnGroup newGroup = new ImportColumnGroup(); - - newGroup.name = - columnGroup.name.length() == 0 ? - (localName == null ? "Text" : localName) : + + newGroup.name = + columnGroup.name.length() == 0 ? + (localName == null ? "Text" : localName) : (localName == null ? columnGroup.name : (columnGroup.name + " - " + localName)); - + return newGroup; } } diff --git a/src/main/java/com/metaweb/gridworks/importers/XmlImporter.java b/src/main/java/com/metaweb/gridworks/importers/XmlImporter.java index c3c13ddf6..23c9a6369 100644 --- a/src/main/java/com/metaweb/gridworks/importers/XmlImporter.java +++ b/src/main/java/com/metaweb/gridworks/importers/XmlImporter.java @@ -6,11 +6,16 @@ import java.io.PushbackInputStream; import java.io.Reader; import java.util.Properties; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import com.metaweb.gridworks.importers.XmlImportUtilities.ImportColumnGroup; import com.metaweb.gridworks.model.Project; public class XmlImporter implements Importer { + final static Logger logger = LoggerFactory.getLogger("XmlImporter"); + public static final int BUFFER_SIZE = 64 * 1024; public boolean takesReader() { @@ -28,6 +33,7 @@ public class XmlImporter implements Importer { Project project, Properties options ) throws Exception { + logger.trace("XmlImporter.read"); PushbackInputStream pis = new PushbackInputStream(inputStream,BUFFER_SIZE); String[] recordPath = null; diff --git a/tests/java/src/com/metaweb/gridworks/tests/importers/XmlImporterTests.java b/tests/java/src/com/metaweb/gridworks/tests/importers/XmlImporterTests.java new file mode 100644 index 000000000..e4272b6bf --- /dev/null +++ b/tests/java/src/com/metaweb/gridworks/tests/importers/XmlImporterTests.java @@ -0,0 +1,207 @@ +package com.metaweb.gridworks.tests.importers; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.util.Properties; + +import static org.mockito.Mockito.mock; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.testng.Assert; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import com.metaweb.gridworks.importers.XmlImporter; +import com.metaweb.gridworks.model.Cell; +import com.metaweb.gridworks.model.Column; +import com.metaweb.gridworks.model.Project; +import com.metaweb.gridworks.model.Row; + + +public class XmlImporterTests { + final static Logger logger = LoggerFactory.getLogger("XmlImporterTests"); + + //dependencies + Project project = null; + Properties options = null; + ByteArrayInputStream inputStream = null; + + //System Under Test + XmlImporter SUT = null; + + + @BeforeMethod + public void SetUp(){ + SUT = new XmlImporter(); + project = new Project(); + options = mock(Properties.class); + } + + @AfterMethod + public void TearDown(){ + SUT = null; + project = null; + options = null; + } + + @Test + public void canParseSample(){ + + RunTest(getSample()); + + AssertGridCreate(project, 4, 6); + PrintProject(project); + + Row row = project.rows.get(0); + Assert.assertNotNull(row); + Assert.assertNotNull(row.cells); + Assert.assertNotNull(row.cells.get(2)); + Assert.assertEquals(row.cells.get(2).value, "Author 1, The"); + + + } + + @Test + public void testCanParseLineBreak(){ + + RunTest(getSampleWithLineBreak()); + + AssertGridCreate(project, 4, 6); + PrintProject(project); + + Row row = project.rows.get(3); + Assert.assertNotNull(row); + Assert.assertNotNull(row.cells); + Assert.assertNotNull(row.cells.get(2)); + Assert.assertEquals(row.cells.get(2).value, "With line\n break"); + } + + @Test(groups={"broken"}) + public void testElementsWithVaryingStructure(){ + + + RunTest(getSampleWithVaryingStructure()); + + AssertGridCreate(project, 5, 6); + PrintProject(project); + + Row row0 = project.rows.get(0); + Assert.assertNotNull(row0); + Assert.assertNotNull(row0.cells); + Assert.assertEquals(row0.cells.size(),6); + + Row row5 = project.rows.get(5); + Assert.assertNotNull(row5); + Assert.assertNotNull(row5.cells); + Assert.assertEquals(row5.cells.size(),6); + + + } + + //------------helper methods--------------- + + protected String getTypicalElement(int id){ + return "" + + "Author " + id + ", The" + + "Book title " + id + "" + + "2010-05-26" + + ""; + } + + protected String getSample(){ + StringBuilder sb = new StringBuilder(); + sb.append(""); + for(int i = 1; i < 7; i++){ + sb.append(getTypicalElement(i)); + } + sb.append(""); + return sb.toString(); + } + + protected String getSampleWithLineBreak(){ + StringBuilder sb = new StringBuilder(); + sb.append(""); + for(int i = 1; i < 4; i++){ + sb.append(getTypicalElement(i)); + } + sb.append("" + + "With line\n break" + + "Book title 4" + + "2010-05-26" + + ""); + sb.append(getTypicalElement(5)); + sb.append(getTypicalElement(6)); + sb.append(""); + return sb.toString(); + } + + protected String getSampleWithVaryingStructure(){ + StringBuilder sb = new StringBuilder(); + sb.append(""); + for(int i = 1; i < 6; i++){ + sb.append(getTypicalElement(i)); + } + sb.append("" + + "With line\n break" + + "Book title 6" + + "New element not seen in other records" + + "2010-05-26" + + ""); + sb.append(""); + return sb.toString(); + } + + private void RunTest(String testString){ + try { + inputStream = new ByteArrayInputStream( testString.getBytes( "UTF-8" ) ); + } catch (UnsupportedEncodingException e1) { + Assert.fail(); + } + + try { + SUT.read(inputStream, project, options); + } catch (Exception e) { + Assert.fail(); + } + + try { + inputStream.close(); + } catch (IOException e) { + Assert.fail(); + } + } + + private void AssertGridCreate(Project project, int numCols, int numRows){ + Assert.assertNotNull(project); + Assert.assertNotNull(project.columnModel); + Assert.assertNotNull(project.columnModel.columns); + Assert.assertEquals(project.columnModel.columns.size(), numCols); + Assert.assertNotNull(project.rows); + Assert.assertEquals(project.rows.size(), numRows); + } + + private void PrintProject(Project project){ + //some quick and dirty debugging + StringBuilder sb = new StringBuilder(); + for(Column c : project.columnModel.columns){ + sb.append(c.getName()); + sb.append("; "); + } + logger.info(sb.toString()); + for(Row r : project.rows){ + sb = new StringBuilder(); + for(Cell c : r.cells){ + if(c != null){ + sb.append(c.value); + sb.append("; "); + }else{ + sb.append("null; "); + } + } + logger.info(sb.toString()); + } + } +}