XmlImporter is now partially unit tested. One test is marked as broken: it covers the case where record elements contain varying numbers of nested elements. (This relates to Issue 61, which is still open at the time of this commit.)

XmlImportUtilities now logs a message when no RecordElementCandidate is found (i.e. there are too few similar XML elements).

git-svn-id: http://google-refine.googlecode.com/svn/trunk@862 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Iain Sproat 2010-05-26 19:22:38 +00:00
parent 1c47ff476b
commit 34cb1c4d07
3 changed files with 332 additions and 110 deletions

View File

@ -16,12 +16,17 @@ import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.Column;
import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.model.Row;
public class XmlImportUtilities {
final static Logger logger = LoggerFactory.getLogger("XmlImporterUtilities");
static protected class RecordElementCandidate {
String[] path;
int count;
@ -119,6 +124,7 @@ public class XmlImportUtilities {
}
static public String[] detectRecordElement(InputStream inputStream) {
logger.trace("detectRecordElement(inputStream)");
List<RecordElementCandidate> candidates = new ArrayList<RecordElementCandidate>();
try {
@ -147,10 +153,12 @@ public class XmlImportUtilities {
return candidates.get(0).path;
}
logger.info("No candidate elements were found in Xml - at least 6 similar elements are required");
return null;
}
static protected RecordElementCandidate detectRecordElement(XMLStreamReader parser, String[] path) {
logger.trace("detectRecordElement(XMLStreamReader, String[])");
List<RecordElementCandidate> descendantCandidates = new ArrayList<RecordElementCandidate>();
Map<String, Integer> immediateChildCandidateMap = new HashMap<String, Integer>();
@ -219,6 +227,7 @@ public class XmlImportUtilities {
sortRecordElementCandidates(immediateChildCandidates);
RecordElementCandidate ourCandidate = immediateChildCandidates.get(0);
logger.trace("ourCandidate.count : " + ourCandidate.count + "; immediateChildCandidates.size() : " + immediateChildCandidates.size());
if (ourCandidate.count / immediateChildCandidates.size() > 5) {
return ourCandidate;
}

View File

@ -6,11 +6,16 @@ import java.io.PushbackInputStream;
import java.io.Reader;
import java.util.Properties;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.metaweb.gridworks.importers.XmlImportUtilities.ImportColumnGroup;
import com.metaweb.gridworks.model.Project;
public class XmlImporter implements Importer {
final static Logger logger = LoggerFactory.getLogger("XmlImporter");
public static final int BUFFER_SIZE = 64 * 1024;
public boolean takesReader() {
@ -28,6 +33,7 @@ public class XmlImporter implements Importer {
Project project,
Properties options
) throws Exception {
logger.trace("XmlImporter.read");
PushbackInputStream pis = new PushbackInputStream(inputStream,BUFFER_SIZE);
String[] recordPath = null;

View File

@ -0,0 +1,207 @@
package com.metaweb.gridworks.tests.importers;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Properties;
import static org.mockito.Mockito.mock;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testng.Assert;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
import com.metaweb.gridworks.importers.XmlImporter;
import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.Column;
import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.model.Row;
/**
 * TestNG tests for {@link XmlImporter}.
 *
 * Each test builds a small XML document as a string, feeds it to
 * {@code XmlImporter.read} through a {@link ByteArrayInputStream}, and then
 * checks the number of columns and rows created in the {@link Project} as well
 * as selected cell values.
 */
public class XmlImporterTests {
    final static Logger logger = LoggerFactory.getLogger("XmlImporterTests");

    //dependencies
    Project project = null;
    Properties options = null;
    ByteArrayInputStream inputStream = null;

    //System Under Test
    XmlImporter SUT = null;

    /** Creates a fresh importer, an empty project and mocked options before every test. */
    @BeforeMethod
    public void SetUp(){
        SUT = new XmlImporter();
        project = new Project();
        options = mock(Properties.class);
    }

    /** Drops every per-test fixture so that no state leaks into the next test. */
    @AfterMethod
    public void TearDown(){
        SUT = null;
        project = null;
        options = null;
        inputStream = null;
    }

    /** Six identically structured book elements are expected to give a 4 column, 6 row grid. */
    @Test
    public void canParseSample(){
        RunTest(getSample());

        AssertGridCreate(project, 4, 6);
        PrintProject(project);

        Row row = project.rows.get(0);
        Assert.assertNotNull(row);
        Assert.assertNotNull(row.cells);
        // Cell index 2 of the first row is expected to hold the author text.
        Assert.assertNotNull(row.cells.get(2));
        Assert.assertEquals(row.cells.get(2).value, "Author 1, The");
    }

    /** A line break inside element text is expected to be kept in the cell value. */
    @Test
    public void testCanParseLineBreak(){
        RunTest(getSampleWithLineBreak());

        AssertGridCreate(project, 4, 6);
        PrintProject(project);

        // The fourth book (row index 3) is the one whose author contains the line break.
        Row row = project.rows.get(3);
        Assert.assertNotNull(row);
        Assert.assertNotNull(row.cells);
        Assert.assertNotNull(row.cells.get(2));
        Assert.assertEquals(row.cells.get(2).value, "With line\n break");
    }

    /**
     * Record elements with a varying number of nested elements.
     * Kept in the "broken" group: the commit message ties this to the open Issue 61.
     */
    @Test(groups={"broken"})
    public void testElementsWithVaryingStructure(){
        RunTest(getSampleWithVaryingStructure());

        AssertGridCreate(project, 5, 6);
        PrintProject(project);

        Row row0 = project.rows.get(0);
        Assert.assertNotNull(row0);
        Assert.assertNotNull(row0.cells);
        Assert.assertEquals(row0.cells.size(),6);

        // Index 5 is the last of the six rows, i.e. the book carrying the extra element.
        Row row5 = project.rows.get(5);
        Assert.assertNotNull(row5);
        Assert.assertNotNull(row5.cells);
        Assert.assertEquals(row5.cells.size(),6);
    }

    //------------helper methods---------------

    /**
     * Builds one {@code <book>} element with an id attribute and author, title
     * and publish_date children.
     *
     * @param id number used for the id attribute and embedded in author and title
     * @return the XML fragment for a single book
     */
    protected String getTypicalElement(int id){
        return "<book id=\"" + id + "\">" +
        "<author>Author " + id + ", The</author>" +
        "<title>Book title " + id + "</title>" +
        "<publish_date>2010-05-26</publish_date>" +
        "</book>";
    }

    /** @return a library document holding six typical books (ids 1 to 6) */
    protected String getSample(){
        StringBuilder sb = new StringBuilder();
        sb.append("<?xml version=\"1.0\"?><library>");
        for(int i = 1; i < 7; i++){
            sb.append(getTypicalElement(i));
        }
        sb.append("</library>");
        return sb.toString();
    }

    /** @return a library document of six books where book 4 has a line break in its author */
    protected String getSampleWithLineBreak(){
        StringBuilder sb = new StringBuilder();
        sb.append("<?xml version=\"1.0\"?><library>");
        for(int i = 1; i < 4; i++){
            sb.append(getTypicalElement(i));
        }
        sb.append("<book id=\"4\">" +
                "<author>With line\n break</author>" +
                "<title>Book title 4</title>" +
                "<publish_date>2010-05-26</publish_date>" +
                "</book>");
        sb.append(getTypicalElement(5));
        sb.append(getTypicalElement(6));
        sb.append("</library>");
        return sb.toString();
    }

    /** @return a library document of six books where book 6 has an additional genre element */
    protected String getSampleWithVaryingStructure(){
        StringBuilder sb = new StringBuilder();
        sb.append("<?xml version=\"1.0\"?><library>");
        for(int i = 1; i < 6; i++){
            sb.append(getTypicalElement(i));
        }
        sb.append("<book id=\"6\">" +
                "<author>With line\n break</author>" +
                "<title>Book title 6</title>" +
                "<genre>New element not seen in other records</genre>" +
                "<publish_date>2010-05-26</publish_date>" +
                "</book>");
        sb.append("</library>");
        return sb.toString();
    }

    /**
     * Encodes the given XML as UTF-8 and runs the importer on it against the
     * current project and options. Any exception fails the test, and the
     * exception is attached to the failure so the report shows the real cause.
     *
     * @param testString the XML document to import
     */
    private void RunTest(String testString){
        try {
            inputStream = new ByteArrayInputStream( testString.getBytes( "UTF-8" ) );
        } catch (UnsupportedEncodingException e1) {
            Assert.fail("UTF-8 encoding is not supported", e1);
        }

        try {
            SUT.read(inputStream, project, options);
        } catch (Exception e) {
            Assert.fail("XmlImporter.read threw an exception", e);
        }

        try {
            inputStream.close();
        } catch (IOException e) {
            Assert.fail("Could not close the input stream", e);
        }
    }

    /**
     * Asserts that the project has a column model and rows of the expected sizes.
     *
     * @param project the project populated by the importer
     * @param numCols expected number of columns
     * @param numRows expected number of rows
     */
    private void AssertGridCreate(Project project, int numCols, int numRows){
        Assert.assertNotNull(project);
        Assert.assertNotNull(project.columnModel);
        Assert.assertNotNull(project.columnModel.columns);
        Assert.assertEquals(project.columnModel.columns.size(), numCols);
        Assert.assertNotNull(project.rows);
        Assert.assertEquals(project.rows.size(), numRows);
    }

    /**
     * Logs the column names on one line and then each row's cell values on a
     * line of its own; null cells are written as "null".
     *
     * @param project the project to dump to the log
     */
    private void PrintProject(Project project){
        //some quick and dirty debugging
        StringBuilder sb = new StringBuilder();
        for(Column c : project.columnModel.columns){
            sb.append(c.getName());
            sb.append("; ");
        }
        logger.info(sb.toString());

        for(Row r : project.rows){
            sb = new StringBuilder();
            for(Cell c : r.cells){
                if(c != null){
                    sb.append(c.value);
                    sb.append("; ");
                }else{
                    sb.append("null; ");
                }
            }
            logger.info(sb.toString());
        }
    }
}