diff --git a/src/main/java/com/metaweb/gridworks/importers/XmlImportUtilities.java b/src/main/java/com/metaweb/gridworks/importers/XmlImportUtilities.java index 4f706db41..89de297a2 100644 --- a/src/main/java/com/metaweb/gridworks/importers/XmlImportUtilities.java +++ b/src/main/java/com/metaweb/gridworks/importers/XmlImportUtilities.java @@ -16,28 +16,33 @@ import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import com.metaweb.gridworks.model.Cell; import com.metaweb.gridworks.model.Column; import com.metaweb.gridworks.model.Project; import com.metaweb.gridworks.model.Row; public class XmlImportUtilities { + final static Logger logger = LoggerFactory.getLogger("XmlImporterUtilities"); + static protected class RecordElementCandidate { String[] path; int count; } - + static protected abstract class ImportVertical { public String name = ""; public int nonBlankCount; - + abstract void tabulate(); } - + static public class ImportColumnGroup extends ImportVertical { public Map subgroups = new HashMap(); public Map columns = new HashMap(); - + @Override void tabulate() { for (ImportColumn c : columns.values()) { @@ -50,7 +55,7 @@ public class XmlImportUtilities { } } } - + static public class ImportColumn extends ImportVertical { public int cellIndex; public boolean blankOnFirstRow; @@ -60,27 +65,27 @@ public class XmlImportUtilities { // already done the tabulation elsewhere } } - + static public class ImportRecord { List> rows = new LinkedList>(); List columnEmptyRowIndices = new ArrayList(); } - + static public String[] detectPathFromTag(InputStream inputStream, String tag) { //List candidates = new ArrayList(); - + try { XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(inputStream); - + while (parser.hasNext()) { int eventType = parser.next(); if (eventType == XMLStreamConstants.START_ELEMENT) { List path = detectRecordElement(parser, tag); if (path != null) { String[] path2 = new String[path.size()]; - + path.toArray(path2); - + return path2; } } @@ -89,20 +94,20 @@ public class XmlImportUtilities { // silent // e.printStackTrace(); } - + return null; } - + static protected List detectRecordElement(XMLStreamReader parser, String tag) throws XMLStreamException { String localName = parser.getLocalName(); String fullName = composeName(parser.getPrefix(), localName); if (tag.equals(parser.getLocalName()) || tag.equals(fullName)) { List path = new LinkedList(); path.add(localName); - + return path; } - + while (parser.hasNext()) { int eventType = parser.next(); if (eventType == XMLStreamConstants.END_ELEMENT) { @@ -117,21 +122,22 @@ public class XmlImportUtilities { } return null; } - + static public String[] detectRecordElement(InputStream inputStream) { + logger.trace("detectRecordElement(inputStream)"); List candidates = new ArrayList(); - + try { XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(inputStream); - + while (parser.hasNext()) { int eventType = parser.next(); if (eventType == XMLStreamConstants.START_ELEMENT) { - RecordElementCandidate candidate = + RecordElementCandidate candidate = detectRecordElement( - parser, + parser, new String[] { parser.getLocalName() }); - + if (candidate != null) { candidates.add(candidate); } @@ -141,22 +147,24 @@ public class XmlImportUtilities { // silent // e.printStackTrace(); } - + if (candidates.size() > 0) { sortRecordElementCandidates(candidates); - + return candidates.get(0).path; } + logger.info("No candidate elements were found in Xml - at least 6 similar elements are required"); return null; } - + static protected RecordElementCandidate detectRecordElement(XMLStreamReader parser, String[] path) { + logger.trace("detectRecordElement(XMLStreamReader, String[])"); List descendantCandidates = new ArrayList(); - + Map immediateChildCandidateMap = new HashMap(); int textNodeCount = 0; int childElementNodeCount = 0; - + try { while (parser.hasNext()) { int eventType = parser.next(); @@ -168,18 +176,18 @@ public class XmlImportUtilities { } } else if (eventType == XMLStreamConstants.START_ELEMENT) { childElementNodeCount++; - + String tagName = parser.getLocalName(); - + immediateChildCandidateMap.put( - tagName, - immediateChildCandidateMap.containsKey(tagName) ? + tagName, + immediateChildCandidateMap.containsKey(tagName) ? immediateChildCandidateMap.get(tagName) + 1 : 1); - + String[] path2 = new String[path.length + 1]; System.arraycopy(path, 0, path2, 0, path.length); path2[path.length] = tagName; - + RecordElementCandidate c = detectRecordElement(parser, path2); if (c != null) { descendantCandidates.add(c); @@ -190,12 +198,12 @@ public class XmlImportUtilities { // silent // e.printStackTrace(); } - + if (textNodeCount > 0 && childElementNodeCount > 0) { // This is a mixed element return null; } - + if (immediateChildCandidateMap.size() > 0) { List immediateChildCandidates = new ArrayList(immediateChildCandidateMap.size()); for (Entry entry : immediateChildCandidateMap.entrySet()) { @@ -204,38 +212,39 @@ public class XmlImportUtilities { String[] path2 = new String[path.length + 1]; System.arraycopy(path, 0, path2, 0, path.length); path2[path.length] = entry.getKey(); - + RecordElementCandidate candidate = new RecordElementCandidate(); candidate.path = path2; candidate.count = count; immediateChildCandidates.add(candidate); } } - + if (immediateChildCandidates.size() > 0 && immediateChildCandidates.size() < 5) { - // There are some promising immediate child elements, but not many, + // There are some promising immediate child elements, but not many, // that can serve as record elements. - + sortRecordElementCandidates(immediateChildCandidates); - + RecordElementCandidate ourCandidate = immediateChildCandidates.get(0); + logger.trace("ourCandidate.count : " + ourCandidate.count + "; immediateChildCandidates.size() : " + immediateChildCandidates.size()); if (ourCandidate.count / immediateChildCandidates.size() > 5) { return ourCandidate; } - + descendantCandidates.add(ourCandidate); } } - + if (descendantCandidates.size() > 0) { sortRecordElementCandidates(descendantCandidates); - + RecordElementCandidate candidate = descendantCandidates.get(0); if (candidate.count / descendantCandidates.size() > 5) { return candidate; } } - + return null; } @@ -246,16 +255,16 @@ public class XmlImportUtilities { } }); } - + static public void importXml( - InputStream inputStream, - Project project, + InputStream inputStream, + Project project, String[] recordPath, ImportColumnGroup rootColumnGroup ) { try { XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(inputStream); - + while (parser.hasNext()) { int eventType = parser.next(); if (eventType == XMLStreamConstants.START_ELEMENT) { @@ -266,32 +275,32 @@ public class XmlImportUtilities { // silent } } - + static public void createColumnsFromImport( - Project project, + Project project, ImportColumnGroup columnGroup ) { int startColumnIndex = project.columnModel.columns.size(); - + List columns = new ArrayList(columnGroup.columns.values()); Collections.sort(columns, new Comparator() { public int compare(ImportColumn o1, ImportColumn o2) { if (o1.blankOnFirstRow != o2.blankOnFirstRow) { return o1.blankOnFirstRow ? 1 : -1; } - + int c = o2.nonBlankCount - o1.nonBlankCount; return c != 0 ? c : (o1.name.length() - o2.name.length()); } }); - + for (int i = 0; i < columns.size(); i++) { ImportColumn c = columns.get(i); - + Column column = new com.metaweb.gridworks.model.Column(c.cellIndex, c.name); project.columnModel.columns.add(column); } - + List subgroups = new ArrayList(columnGroup.subgroups.values()); Collections.sort(subgroups, new Comparator() { public int compare(ImportColumnGroup o1, ImportColumnGroup o2) { @@ -299,20 +308,20 @@ public class XmlImportUtilities { return c != 0 ? c : (o1.name.length() - o2.name.length()); } }); - + for (ImportColumnGroup g : subgroups) { createColumnsFromImport(project, g); } - + int endColumnIndex = project.columnModel.columns.size(); int span = endColumnIndex - startColumnIndex; if (span > 1 && span < project.columnModel.columns.size()) { project.columnModel.addColumnGroup(startColumnIndex, span, startColumnIndex); } } - + static protected void findRecord( - Project project, + Project project, XMLStreamReader parser, String[] recordPath, int pathIndex, @@ -336,7 +345,7 @@ public class XmlImportUtilities { skip(parser); } } - + static protected void skip(XMLStreamReader parser) throws XMLStreamException { while (parser.hasNext()) { int eventType = parser.next(); @@ -347,71 +356,71 @@ public class XmlImportUtilities { } } } - + static protected void processRecord( - Project project, + Project project, XMLStreamReader parser, ImportColumnGroup rootColumnGroup ) throws XMLStreamException { ImportRecord record = new ImportRecord(); - + processSubRecord(project, parser, rootColumnGroup, record); - + if (record.rows.size() > 0) { for (List row : record.rows) { Row realRow = new Row(row.size()); - + for (int c = 0; c < row.size(); c++) { Cell cell = row.get(c); if (cell != null) { realRow.setCell(c, cell); } } - + project.rows.add(realRow); } } } - + static protected String composeName(String prefix, String localName) { return prefix != null && prefix.length() > 0 ? (prefix + ":" + localName) : localName; } - + static protected void processSubRecord( - Project project, + Project project, XMLStreamReader parser, ImportColumnGroup columnGroup, ImportRecord record ) throws XMLStreamException { ImportColumnGroup thisColumnGroup = getColumnGroup( - project, - columnGroup, + project, + columnGroup, composeName(parser.getPrefix(), parser.getLocalName())); - + int commonStartingRowIndex = 0; for (ImportColumn column : thisColumnGroup.columns.values()) { if (column.cellIndex < record.columnEmptyRowIndices.size()) { commonStartingRowIndex = Math.max( - commonStartingRowIndex, + commonStartingRowIndex, record.columnEmptyRowIndices.get(column.cellIndex)); } } - + int attributeCount = parser.getAttributeCount(); for (int i = 0; i < attributeCount; i++) { String text = parser.getAttributeValue(i).trim(); if (text.length() > 0) { addCell( - project, - thisColumnGroup, - record, + project, + thisColumnGroup, + record, composeName(parser.getAttributePrefix(i), parser.getAttributeLocalName(i)), text, commonStartingRowIndex ); } } - + while (parser.hasNext()) { int eventType = parser.next(); if (eventType == XMLStreamConstants.START_ELEMENT) { @@ -421,14 +430,14 @@ public class XmlImportUtilities { thisColumnGroup, record ); - } else if (//eventType == XMLStreamConstants.CDATA || + } else if (//eventType == XMLStreamConstants.CDATA || eventType == XMLStreamConstants.CHARACTERS) { String text = parser.getText().trim(); if (text.length() > 0) { addCell( - project, - thisColumnGroup, - record, + project, + thisColumnGroup, + record, null, parser.getText(), commonStartingRowIndex @@ -438,10 +447,10 @@ public class XmlImportUtilities { break; } } - + if (commonStartingRowIndex < record.rows.size()) { List startingRow = record.rows.get(commonStartingRowIndex); - + for (ImportColumn c : thisColumnGroup.columns.values()) { int cellIndex = c.cellIndex; if (cellIndex >= startingRow.size() || startingRow.get(cellIndex) == null) { @@ -450,7 +459,7 @@ public class XmlImportUtilities { } } } - + static protected void addCell( Project project, ImportColumnGroup columnGroup, @@ -462,33 +471,33 @@ public class XmlImportUtilities { if (text == null || ((String) text).isEmpty()) { return; } - + Serializable value = ImporterUtilities.parseCellValue(text); - + ImportColumn column = getColumn(project, columnGroup, columnLocalName); int cellIndex = column.cellIndex; - + while (cellIndex >= record.columnEmptyRowIndices.size()) { record.columnEmptyRowIndices.add(commonStaringRowIndex); } int rowIndex = record.columnEmptyRowIndices.get(cellIndex); - + while (rowIndex >= record.rows.size()) { record.rows.add(new ArrayList()); } List row = record.rows.get(rowIndex); - + while (cellIndex >= row.size()) { row.add(null); } - + row.set(cellIndex, new Cell(value, null)); - + record.columnEmptyRowIndices.set(cellIndex, rowIndex + 1); - + column.nonBlankCount++; } - + static protected ImportColumn getColumn( Project project, ImportColumnGroup columnGroup, @@ -497,27 +506,27 @@ public class XmlImportUtilities { if (columnGroup.columns.containsKey(localName)) { return columnGroup.columns.get(localName); } - + ImportColumn column = createColumn(project, columnGroup, localName); columnGroup.columns.put(localName, column); - + return column; } - + static protected ImportColumn createColumn( Project project, ImportColumnGroup columnGroup, String localName ) { ImportColumn newColumn = new ImportColumn(); - - newColumn.name = - columnGroup.name.length() == 0 ? - (localName == null ? "Text" : localName) : + + newColumn.name = + columnGroup.name.length() == 0 ? + (localName == null ? "Text" : localName) : (localName == null ? columnGroup.name : (columnGroup.name + " - " + localName)); - + newColumn.cellIndex = project.columnModel.allocateNewCellIndex(); - + return newColumn; } @@ -529,25 +538,25 @@ public class XmlImportUtilities { if (columnGroup.subgroups.containsKey(localName)) { return columnGroup.subgroups.get(localName); } - + ImportColumnGroup subgroup = createColumnGroup(project, columnGroup, localName); columnGroup.subgroups.put(localName, subgroup); - + return subgroup; } - + static protected ImportColumnGroup createColumnGroup( Project project, ImportColumnGroup columnGroup, String localName ) { ImportColumnGroup newGroup = new ImportColumnGroup(); - - newGroup.name = - columnGroup.name.length() == 0 ? - (localName == null ? "Text" : localName) : + + newGroup.name = + columnGroup.name.length() == 0 ? + (localName == null ? "Text" : localName) : (localName == null ? columnGroup.name : (columnGroup.name + " - " + localName)); - + return newGroup; } } diff --git a/src/main/java/com/metaweb/gridworks/importers/XmlImporter.java b/src/main/java/com/metaweb/gridworks/importers/XmlImporter.java index c3c13ddf6..23c9a6369 100644 --- a/src/main/java/com/metaweb/gridworks/importers/XmlImporter.java +++ b/src/main/java/com/metaweb/gridworks/importers/XmlImporter.java @@ -6,11 +6,16 @@ import java.io.PushbackInputStream; import java.io.Reader; import java.util.Properties; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import com.metaweb.gridworks.importers.XmlImportUtilities.ImportColumnGroup; import com.metaweb.gridworks.model.Project; public class XmlImporter implements Importer { + final static Logger logger = LoggerFactory.getLogger("XmlImporter"); + public static final int BUFFER_SIZE = 64 * 1024; public boolean takesReader() { @@ -28,6 +33,7 @@ public class XmlImporter implements Importer { Project project, Properties options ) throws Exception { + logger.trace("XmlImporter.read"); PushbackInputStream pis = new PushbackInputStream(inputStream,BUFFER_SIZE); String[] recordPath = null; diff --git a/tests/java/src/com/metaweb/gridworks/tests/importers/XmlImporterTests.java b/tests/java/src/com/metaweb/gridworks/tests/importers/XmlImporterTests.java new file mode 100644 index 000000000..e4272b6bf --- /dev/null +++ b/tests/java/src/com/metaweb/gridworks/tests/importers/XmlImporterTests.java @@ -0,0 +1,207 @@ +package com.metaweb.gridworks.tests.importers; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.util.Properties; + +import static org.mockito.Mockito.mock; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.testng.Assert; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import com.metaweb.gridworks.importers.XmlImporter; +import com.metaweb.gridworks.model.Cell; +import com.metaweb.gridworks.model.Column; +import com.metaweb.gridworks.model.Project; +import com.metaweb.gridworks.model.Row; + + +public class XmlImporterTests { + final static Logger logger = LoggerFactory.getLogger("XmlImporterTests"); + + //dependencies + Project project = null; + Properties options = null; + ByteArrayInputStream inputStream = null; + + //System Under Test + XmlImporter SUT = null; + + + @BeforeMethod + public void SetUp(){ + SUT = new XmlImporter(); + project = new Project(); + options = mock(Properties.class); + } + + @AfterMethod + public void TearDown(){ + SUT = null; + project = null; + options = null; + } + + @Test + public void canParseSample(){ + + RunTest(getSample()); + + AssertGridCreate(project, 4, 6); + PrintProject(project); + + Row row = project.rows.get(0); + Assert.assertNotNull(row); + Assert.assertNotNull(row.cells); + Assert.assertNotNull(row.cells.get(2)); + Assert.assertEquals(row.cells.get(2).value, "Author 1, The"); + + + } + + @Test + public void testCanParseLineBreak(){ + + RunTest(getSampleWithLineBreak()); + + AssertGridCreate(project, 4, 6); + PrintProject(project); + + Row row = project.rows.get(3); + Assert.assertNotNull(row); + Assert.assertNotNull(row.cells); + Assert.assertNotNull(row.cells.get(2)); + Assert.assertEquals(row.cells.get(2).value, "With line\n break"); + } + + @Test(groups={"broken"}) + public void testElementsWithVaryingStructure(){ + + + RunTest(getSampleWithVaryingStructure()); + + AssertGridCreate(project, 5, 6); + PrintProject(project); + + Row row0 = project.rows.get(0); + Assert.assertNotNull(row0); + Assert.assertNotNull(row0.cells); + Assert.assertEquals(row0.cells.size(),6); + + Row row5 = project.rows.get(5); + Assert.assertNotNull(row5); + Assert.assertNotNull(row5.cells); + Assert.assertEquals(row5.cells.size(),6); + + + } + + //------------helper methods--------------- + + protected String getTypicalElement(int id){ + return "" + + "Author " + id + ", The" + + "Book title " + id + "" + + "2010-05-26" + + ""; + } + + protected String getSample(){ + StringBuilder sb = new StringBuilder(); + sb.append(""); + for(int i = 1; i < 7; i++){ + sb.append(getTypicalElement(i)); + } + sb.append(""); + return sb.toString(); + } + + protected String getSampleWithLineBreak(){ + StringBuilder sb = new StringBuilder(); + sb.append(""); + for(int i = 1; i < 4; i++){ + sb.append(getTypicalElement(i)); + } + sb.append("" + + "With line\n break" + + "Book title 4" + + "2010-05-26" + + ""); + sb.append(getTypicalElement(5)); + sb.append(getTypicalElement(6)); + sb.append(""); + return sb.toString(); + } + + protected String getSampleWithVaryingStructure(){ + StringBuilder sb = new StringBuilder(); + sb.append(""); + for(int i = 1; i < 6; i++){ + sb.append(getTypicalElement(i)); + } + sb.append("" + + "With line\n break" + + "Book title 6" + + "New element not seen in other records" + + "2010-05-26" + + ""); + sb.append(""); + return sb.toString(); + } + + private void RunTest(String testString){ + try { + inputStream = new ByteArrayInputStream( testString.getBytes( "UTF-8" ) ); + } catch (UnsupportedEncodingException e1) { + Assert.fail(); + } + + try { + SUT.read(inputStream, project, options); + } catch (Exception e) { + Assert.fail(); + } + + try { + inputStream.close(); + } catch (IOException e) { + Assert.fail(); + } + } + + private void AssertGridCreate(Project project, int numCols, int numRows){ + Assert.assertNotNull(project); + Assert.assertNotNull(project.columnModel); + Assert.assertNotNull(project.columnModel.columns); + Assert.assertEquals(project.columnModel.columns.size(), numCols); + Assert.assertNotNull(project.rows); + Assert.assertEquals(project.rows.size(), numRows); + } + + private void PrintProject(Project project){ + //some quick and dirty debugging + StringBuilder sb = new StringBuilder(); + for(Column c : project.columnModel.columns){ + sb.append(c.getName()); + sb.append("; "); + } + logger.info(sb.toString()); + for(Row r : project.rows){ + sb = new StringBuilder(); + for(Cell c : r.cells){ + if(c != null){ + sb.append(c.value); + sb.append("; "); + }else{ + sb.append("null; "); + } + } + logger.info(sb.toString()); + } + } +}