Fix: Data losses when importing multiple sheets from same Excell file (#2404)

* Fix loosing data when importing multiple sheets from same source Excell file

* Add test for importing multi sheets with different column size

* Fix space issues

* Restore old tests and implement new test cases for the new feature

* Restore unexpected delete

* Refactor fix

* Restore unexpected line delete

* Add new unit test for new feature
This commit is contained in:
chuhao zeng 2020-03-23 17:41:23 -04:00 committed by GitHub
parent 63bef81980
commit e484625adf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 183 additions and 3 deletions

View File

@ -139,6 +139,10 @@ public class ImporterUtilities {
// Already taken name
i++;
} else {
// Want to update currentFileColumnNames
if(! currentFileColumnNames.contains(columnName)){
currentFileColumnNames.add(columnName);
}
return column;
}
} else {

View File

@ -147,7 +147,7 @@ abstract public class TabularImportingParserBase extends ImportingParserBase {
}
} else { // data lines
Row row = new Row(columnNames.size());
if (storeBlankRows) {
rowsWithData++;
} else if (cells.size() > 0) {

View File

@ -74,6 +74,9 @@ public class ExcelImporterTests extends ImporterTest {
//private static final File xlsxFile = createSpreadsheet(true);
private static final File xlsFile = createSpreadsheet(false);
private static final File xlsxFile = createSpreadsheet(true);
private static final File xlsFileWithMultiSheets = createSheetsWithDifferentColumns(false);
private static final File xlsxFileWithMultiSheets = createSheetsWithDifferentColumns(true);
@Override
@BeforeTest
@ -176,7 +179,103 @@ public class ExcelImporterTests extends ImporterTest {
verify(options, times(1)).get("limit");
verify(options, times(1)).get("storeBlankCellsAsNulls");
}
@Test
public void readMultiSheetXls() throws FileNotFoundException, IOException{
ArrayNode sheets = ParsingUtilities.mapper.createArrayNode();
sheets.add(ParsingUtilities.mapper.readTree("{name: \"file-source#Test Sheet 0\", fileNameAndSheetIndex: \"file-source#0\", rows: 31, selected: true}"));
sheets.add(ParsingUtilities.mapper.readTree("{name: \"file-source#Test Sheet 1\", fileNameAndSheetIndex: \"file-source#1\", rows: 31, selected: true}"));
sheets.add(ParsingUtilities.mapper.readTree("{name: \"file-source#Test Sheet 2\", fileNameAndSheetIndex: \"file-source#2\", rows: 31, selected: true}"));
whenGetArrayOption("sheets", options, sheets);
whenGetIntegerOption("ignoreLines", options, 0);
whenGetIntegerOption("headerLines", options, 0);
whenGetIntegerOption("skipDataLines", options, 0);
whenGetIntegerOption("limit", options, -1);
whenGetBooleanOption("storeBlankCellsAsNulls",options,true);
InputStream stream = new FileInputStream(xlsFileWithMultiSheets);
try {
parseOneFile(SUT, stream);
} catch (Exception e) {
Assert.fail(e.getMessage());
}
Assert.assertEquals(project.rows.size(), ROWS * SHEETS);
Assert.assertEquals(project.rows.get(1).cells.size(), COLUMNS);
Assert.assertEquals(project.columnModel.columns.size(), COLUMNS + SHEETS - 1);
Assert.assertEquals(((Number)project.rows.get(1).getCellValue(0)).doubleValue(),1.1, EPSILON);
Assert.assertEquals(((Number)project.rows.get(2).getCellValue(0)).doubleValue(),2.2, EPSILON);
// Check the value read from the second sheet.
Assert.assertEquals(((Number)project.rows.get(ROWS).getCellValue(0)).doubleValue(),0.0, EPSILON);
Assert.assertEquals(((Number)project.rows.get(ROWS).getCellValue(COLUMNS)).doubleValue(),1.0, EPSILON);
Assert.assertFalse((Boolean)project.rows.get(1).getCellValue(1));
Assert.assertTrue((Boolean)project.rows.get(2).getCellValue(1));
Assert.assertEquals((String)project.rows.get(1).getCellValue(4)," Row 1 Col 5");
Assert.assertNull((String)project.rows.get(1).getCellValue(5));
// We will read SHEETS sheets from created xls file.
verify(options, times(SHEETS)).get("ignoreLines");
verify(options, times(SHEETS)).get("headerLines");
verify(options, times(SHEETS)).get("skipDataLines");
verify(options, times(SHEETS)).get("limit");
verify(options, times(SHEETS)).get("storeBlankCellsAsNulls");
}
@Test
public void readMultiSheetXlsx() throws FileNotFoundException, IOException{
ArrayNode sheets = ParsingUtilities.mapper.createArrayNode();
sheets.add(ParsingUtilities.mapper.readTree("{name: \"file-source#Test Sheet 0\", fileNameAndSheetIndex: \"file-source#0\", rows: 31, selected: true}"));
sheets.add(ParsingUtilities.mapper.readTree("{name: \"file-source#Test Sheet 1\", fileNameAndSheetIndex: \"file-source#1\", rows: 31, selected: true}"));
sheets.add(ParsingUtilities.mapper.readTree("{name: \"file-source#Test Sheet 2\", fileNameAndSheetIndex: \"file-source#2\", rows: 31, selected: true}"));
whenGetArrayOption("sheets", options, sheets);
whenGetIntegerOption("ignoreLines", options, 0);
whenGetIntegerOption("headerLines", options, 0);
whenGetIntegerOption("skipDataLines", options, 0);
whenGetIntegerOption("limit", options, -1);
whenGetBooleanOption("storeBlankCellsAsNulls",options,true);
InputStream stream = new FileInputStream(xlsxFileWithMultiSheets);
try {
parseOneFile(SUT, stream);
} catch (Exception e) {
Assert.fail(e.getMessage());
}
Assert.assertEquals(project.rows.size(), ROWS * SHEETS);
Assert.assertEquals(project.rows.get(1).cells.size(), COLUMNS);
Assert.assertEquals(project.columnModel.columns.size(), COLUMNS + SHEETS - 1);
Assert.assertEquals(((Number)project.rows.get(1).getCellValue(0)).doubleValue(),1.1, EPSILON);
Assert.assertEquals(((Number)project.rows.get(2).getCellValue(0)).doubleValue(),2.2, EPSILON);
// Check the value read from the second sheet.
Assert.assertEquals(((Number)project.rows.get(ROWS).getCellValue(0)).doubleValue(),0.0, EPSILON);
Assert.assertEquals(((Number)project.rows.get(ROWS).getCellValue(COLUMNS)).doubleValue(),1.0, EPSILON);
Assert.assertFalse((Boolean)project.rows.get(1).getCellValue(1));
Assert.assertTrue((Boolean)project.rows.get(2).getCellValue(1));
Assert.assertEquals((String)project.rows.get(1).getCellValue(4)," Row 1 Col 5");
Assert.assertNull((String)project.rows.get(1).getCellValue(5));
// We will read SHEETS sheets from created xls file.
verify(options, times(SHEETS)).get("ignoreLines");
verify(options, times(SHEETS)).get("headerLines");
verify(options, times(SHEETS)).get("skipDataLines");
verify(options, times(SHEETS)).get("limit");
verify(options, times(SHEETS)).get("storeBlankCellsAsNulls");
}
private static File createSpreadsheet(boolean xml) {
final Workbook wb = xml ? new XSSFWorkbook() : new HSSFWorkbook();
@ -227,7 +326,61 @@ public class ExcelImporterTests extends ImporterTest {
return null;
}
return file;
}
private static File createSheetsWithDifferentColumns(boolean xml) {
final Workbook wb = xml ? new XSSFWorkbook() : new HSSFWorkbook();
for (int s = 0; s < SHEETS; s++) {
Sheet sheet = wb.createSheet("Test Sheet " + s);
for (int row = 0; row < ROWS; row++) {
int col = 0;
Row r = sheet.createRow(row);
Cell c;
c = r.createCell(col++);
c.setCellValue(row * 1.1); // double
c = r.createCell(col++);
c.setCellValue(row % 2 == 0); // boolean
c = r.createCell(col++);
c.setCellValue(Calendar.getInstance()); // calendar
c = r.createCell(col++);
c.setCellValue(new Date()); // date
c = r.createCell(col++);
c.setCellValue(" Row " + row + " Col " + col); // string
c = r.createCell(col++);
c.setCellValue(""); // string
// Create extra columns to ensure sheet(i+1) has more columns than sheet(i)
for(int i = 0; i < s; i++){
c = r.createCell(col++);
c.setCellValue(i + s);
}
}
}
File file = null;
try {
file = File.createTempFile("openrefine-importer-test", xml ? ".xlsx" : ".xls");
file.deleteOnExit();
OutputStream outputStream = new FileOutputStream(file);
wb.write(outputStream);
outputStream.flush();
outputStream.close();
wb.close();
} catch (IOException e) {
return null;
}
return file;
}
}

View File

@ -52,6 +52,7 @@ import com.google.refine.importers.ImporterUtilities;
import com.google.refine.model.Cell;
import com.google.refine.model.Project;
import com.google.refine.model.Row;
import com.google.refine.model.Column;
public class ImporterUtilitiesTests extends RefineTest {
@ -157,4 +158,26 @@ public class ImporterUtilitiesTests extends RefineTest {
Assert.assertEquals( project.columnModel.columns.get(2).getName(), "Column");
}
@Test
public void testGetOrAllocateColumn(){
Project project = new Project();
List<String> columnNames = new ArrayList<String>();
columnNames.add("Column 1");
columnNames.add("Column 2");
columnNames.add("Column 3");
// Set up column names in project
ImporterUtilities.setupColumns(project, columnNames);
Assert.assertEquals( project.columnModel.columns.get(0).getName(), "Column 1" );
Assert.assertEquals( project.columnModel.columns.get(1).getName(), "Column 2" );
Assert.assertEquals( project.columnModel.columns.get(2).getName(), "Column 3");
// This will mock the situation of importing another sheet from the same file.
// Expect newColumnNames can be updated using column names.
List<String> newColumnNames = new ArrayList<String>();
Column c0 = ImporterUtilities.getOrAllocateColumn(project, newColumnNames, 0, false);
Column c1 = ImporterUtilities.getOrAllocateColumn(project, newColumnNames, 1, false);
Assert.assertEquals(c0.getName(), "Column 1");
Assert.assertEquals(c1.getName(), "Column 2");
Assert.assertEquals(newColumnNames.size(), 2);
}
}