Importer refactoring cleanup (#2984)

* Clean up importer refactoring

Remove an extra copy of filename setting.
Revert some additional API changes (retaining both versions)

* Revert archive file name changes & mark as deprecated
This commit is contained in:
Tom Morris 2020-09-06 17:46:08 -04:00 committed by GitHub
parent 15a069d3d5
commit eaf881ced7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 294 additions and 77 deletions

View File

@ -132,7 +132,7 @@ public class FixedWidthImporter extends TabularImportingParserBase {
}
};
TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions);
TabularImportingParserBase.readTable(project, job, dataReader, limit, options, exceptions);
}
/**

View File

@ -129,6 +129,6 @@ public class LineBasedImporter extends TabularImportingParserBase {
}
};
TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions);
TabularImportingParserBase.readTable(project, job, dataReader, limit, options, exceptions);
}
}

View File

@ -157,7 +157,7 @@ public class SeparatorBasedImporter extends TabularImportingParserBase {
}
};
TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions);
TabularImportingParserBase.readTable(project, job, dataReader, limit, options, exceptions);
}
static protected ArrayList<Object> getCells(String line, CSVParser parser, LineNumberReader lnReader)

View File

@ -76,6 +76,18 @@ abstract public class TabularImportingParserBase extends ImportingParserBase {
super(useInputStream);
}
/**
* @param project
* @param metadata
* @param job
* @param reader
* @param fileSource
* @param limit
* @param options
* @param exceptions
* @deprecated 2020-07-23 Use {@link TabularImportingParserBase#readTable(Project, ImportingJob, TableDataReader, int, ObjectNode, List)}
*/
@Deprecated
static public void readTable(
Project project,
ProjectMetadata metadata,
@ -85,6 +97,17 @@ abstract public class TabularImportingParserBase extends ImportingParserBase {
int limit,
ObjectNode options,
List<Exception> exceptions
) {
readTable(project, job, reader, limit, options, exceptions);
}
static public void readTable(
Project project,
ImportingJob job,
TableDataReader reader,
int limit,
ObjectNode options,
List<Exception> exceptions
) {
int ignoreLines = JSONUtilities.getInt(options, "ignoreLines", -1);
int headerLines = JSONUtilities.getInt(options, "headerLines", 1);

View File

@ -749,7 +749,7 @@ public class WikitextImporter extends TabularImportingParserBase {
// TODO this does not seem to do anything - maybe we need to pass it to OpenRefine in some other way?
}
TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions);
TabularImportingParserBase.readTable(project, job, dataReader, limit, options, exceptions);
// Add reconciliation statistics
if (dataReader.columnReconciled != null) {

View File

@ -27,32 +27,28 @@
package com.google.refine.importers.tree;
/**
* @deprecated 2020-07-23 Use the method signatures which take individual parameters instead of this
*/
@Deprecated
public class ImportParameters {
protected boolean trimStrings;
protected boolean storeEmptyStrings;
protected boolean guessDataType;
protected boolean includeFileSources = false;
protected String fileSource = null;
// TODO: What is the compatibility impact of including new fields
protected boolean includeArchiveFileName = false;
protected String archiveFileName = null;
@Deprecated
public ImportParameters(boolean trimStrings, boolean storeEmptyStrings, boolean guessCellValueTypes,
boolean includeFileSources, String fileSource, boolean includeArchiveFileName, String archiveFileName) {
boolean includeFileSources, String fileSource) {
this.trimStrings = trimStrings;
this.storeEmptyStrings = storeEmptyStrings;
this.guessDataType = guessCellValueTypes;
this.includeFileSources = includeFileSources;
this.fileSource = fileSource;
this.includeArchiveFileName = includeArchiveFileName;
this.archiveFileName = archiveFileName;
}
public ImportParameters(boolean trimStrings, boolean storeEmptyStrings, boolean guessCellValueTypes,
boolean includeFileSources, String fileSource) {
this(trimStrings, storeEmptyStrings, guessCellValueTypes, includeFileSources, fileSource, false, "");
}
@Deprecated
public ImportParameters(boolean trimStrings, boolean storeEmptyStrings, boolean guessCellValueTypes) {
this(trimStrings, storeEmptyStrings, guessCellValueTypes, false, "");
}

View File

@ -210,12 +210,10 @@ abstract public class TreeImportingParserBase extends ImportingParserBase {
boolean trimStrings = JSONUtilities.getBoolean(options, "trimStrings", true);
boolean storeEmptyStrings = JSONUtilities.getBoolean(options, "storeEmptyStrings", false);
boolean guessCellValueTypes = JSONUtilities.getBoolean(options, "guessCellValueTypes", true);
boolean includeFileSources = JSONUtilities.getBoolean(options, "includeFileSources", false);
try {
XmlImportUtilities.importTreeData(treeParser, project, recordPath, rootColumnGroup, limit2,
new ImportParameters(trimStrings, storeEmptyStrings, guessCellValueTypes, includeFileSources,
fileSource));
trimStrings, storeEmptyStrings, guessCellValueTypes);
} catch (Exception e){
exceptions.add(e);
}

View File

@ -245,6 +245,17 @@ public class XmlImportUtilities extends TreeImportUtilities {
return null;
}
/**
* @param parser
* @param project
* @param recordPath
* @param rootColumnGroup
* @param limit
* @param parameters
* @throws TreeReaderException
* @deprecated 2020-07-23 Use {@link XmlImportUtilities#importTreeData(TreeReader, Project, String[], ImportColumnGroup, int, boolean, boolean, boolean)}
*/
@Deprecated
static public void importTreeData(
TreeReader parser,
Project project,
@ -252,6 +263,19 @@ public class XmlImportUtilities extends TreeImportUtilities {
ImportColumnGroup rootColumnGroup,
int limit,
ImportParameters parameters
) throws TreeReaderException{
importTreeData(parser, project, recordPath, rootColumnGroup, limit, parameters.trimStrings, parameters.storeEmptyStrings, parameters.guessDataType);
}
static public void importTreeData(
TreeReader parser,
Project project,
String[] recordPath,
ImportColumnGroup rootColumnGroup,
int limit,
boolean trimStrings,
boolean storeEmptyStrings,
boolean guessDataTypes
) throws TreeReaderException{
if (logger.isTraceEnabled()) {
logger.trace("importTreeData(TreeReader, Project, String[], ImportColumnGroup)");
@ -260,7 +284,8 @@ public class XmlImportUtilities extends TreeImportUtilities {
while (parser.hasNext()) {
Token eventType = parser.next();
if (eventType == Token.StartEntity) {
findRecord(project, parser, recordPath, 0, rootColumnGroup, limit--,parameters);
findRecord(project, parser, recordPath, 0, rootColumnGroup, limit--, trimStrings, storeEmptyStrings,
guessDataTypes);
}
}
} catch (TreeReaderException e) {
@ -270,14 +295,17 @@ public class XmlImportUtilities extends TreeImportUtilities {
}
/**
*
* @param project
* @param parser
* @param recordPath
* @param pathIndex
* @param rootColumnGroup
* @throws ServletException
* @param limit
* @param parameters
* @throws TreeReaderException
* @deprecated Use {@link XmlImportUtilities#findRecord(Project, TreeReader, String[], int, ImportColumnGroup, int, boolean, boolean, boolean)}
*/
@Deprecated
static protected void findRecord(
Project project,
TreeReader parser,
@ -286,6 +314,33 @@ public class XmlImportUtilities extends TreeImportUtilities {
ImportColumnGroup rootColumnGroup,
int limit,
ImportParameters parameters
) throws TreeReaderException {
findRecord(project, parser, recordPath, pathIndex, rootColumnGroup, limit, parameters.trimStrings,
parameters.storeEmptyStrings, parameters.guessDataType);
}
/**
* @param project
* @param parser
* @param recordPath
* @param pathIndex
* @param rootColumnGroup
* @param limit
* @param trimStrings trim whitespace from strings if true
* @param storeEmptyStrings store empty strings if true
* @param guessDataTypes guess whether strings represent numbers and convert
* @throws TreeReaderException
*/
static protected void findRecord(
Project project,
TreeReader parser,
String[] recordPath,
int pathIndex,
ImportColumnGroup rootColumnGroup,
int limit,
boolean trimStrings,
boolean storeEmptyStrings,
boolean guessDataTypes
) throws TreeReaderException {
if (logger.isTraceEnabled()) {
logger.trace("findRecord(Project, TreeReader, String[], int, ImportColumnGroup - path:"+Arrays.toString(recordPath));
@ -305,7 +360,7 @@ public class XmlImportUtilities extends TreeImportUtilities {
Token eventType = parser.next();
if (eventType == Token.StartEntity) {
findRecord(project, parser, recordPath, pathIndex + 1, rootColumnGroup, limit--,
parameters);
trimStrings, storeEmptyStrings, guessDataTypes);
} else if (eventType == Token.EndEntity) {
break;
} else if (eventType == Token.Value) {
@ -314,13 +369,13 @@ public class XmlImportUtilities extends TreeImportUtilities {
String desiredFieldName = recordPath[pathIndex + 1];
String currentFieldName = parser.getFieldName();
if (desiredFieldName.equals(currentFieldName)) {
processFieldAsRecord(project, parser, rootColumnGroup,parameters);
processFieldAsRecord(project, parser, rootColumnGroup, trimStrings, storeEmptyStrings, guessDataTypes);
}
}
}
}
} else {
processRecord(project, parser, rootColumnGroup, parameters);
processRecord(project, parser, rootColumnGroup, trimStrings, storeEmptyStrings, guessDataTypes);
}
} else {
skip(parser);
@ -338,6 +393,23 @@ public class XmlImportUtilities extends TreeImportUtilities {
}
}
/**
* @param project
* @param parser
* @param rootColumnGroup
* @param parameter
* @throws TreeReaderException
* @deprecated Use {@link XmlImportUtilities#processRecord(Project, TreeReader, ImportColumnGroup, boolean, boolean, boolean)}
*/
@Deprecated
static protected void processRecord(
Project project,
TreeReader parser,
ImportColumnGroup rootColumnGroup,
ImportParameters parameter
) throws TreeReaderException {
processRecord(project, parser, rootColumnGroup, parameter.trimStrings, parameter.storeEmptyStrings, parameter.guessDataType);
}
/**
* processRecord parses Tree data for a single element and it's sub-elements,
@ -351,17 +423,34 @@ public class XmlImportUtilities extends TreeImportUtilities {
Project project,
TreeReader parser,
ImportColumnGroup rootColumnGroup,
ImportParameters parameter
boolean trimStrings,
boolean storeEmptyStrings,
boolean guessDataTypes
) throws TreeReaderException {
if (logger.isTraceEnabled()) {
logger.trace("processRecord(Project,TreeReader,ImportColumnGroup)");
}
ImportRecord record = new ImportRecord();
processSubRecord(project, parser, rootColumnGroup, record, 0, parameter);
addImportRecordToProject(record, project, parameter.includeFileSources, parameter.fileSource, parameter.includeArchiveFileName, parameter.archiveFileName);
processSubRecord(project, parser, rootColumnGroup, record, 0, trimStrings, storeEmptyStrings, guessDataTypes);
addImportRecordToProject(record, project);
}
/**
* processFieldAsRecord parses Tree data for a single element and it's sub-elements,
* adding the parsed data as a row to the project
* @param project
* @param parser
* @param rootColumnGroup
* @throws TreeReaderException
* @throws ServletException
*/
@Deprecated
static protected void processFieldAsRecord(Project project, TreeReader parser, ImportColumnGroup rootColumnGroup,
ImportParameters parameter) throws TreeReaderException {
processFieldAsRecord(project, parser, rootColumnGroup, parameter.trimStrings, parameter.storeEmptyStrings,
parameter.guessDataType);
}
/**
* processFieldAsRecord parses Tree data for a single element and it's sub-elements,
@ -375,7 +464,9 @@ public class XmlImportUtilities extends TreeImportUtilities {
Project project,
TreeReader parser,
ImportColumnGroup rootColumnGroup,
ImportParameters parameter
boolean trimStrings,
boolean storeEmptyStrings,
boolean guessDataType
) throws TreeReaderException {
if (logger.isTraceEnabled()) {
logger.trace("processFieldAsRecord(Project,TreeReader,ImportColumnGroup)");
@ -384,10 +475,10 @@ public class XmlImportUtilities extends TreeImportUtilities {
ImportRecord record = null;
if (value instanceof String) {
String text = (String) value;
if (parameter.trimStrings) {
if (trimStrings) {
text = text.trim();
}
if (text.length() > 0 | !parameter.storeEmptyStrings) {
if (text.length() > 0 | !storeEmptyStrings) {
record = new ImportRecord();
addCell(
project,
@ -395,8 +486,8 @@ public class XmlImportUtilities extends TreeImportUtilities {
record,
parser.getFieldName(),
(String) value,
parameter.storeEmptyStrings,
parameter.guessDataType
storeEmptyStrings,
guessDataType
);
}
} else {
@ -410,42 +501,42 @@ public class XmlImportUtilities extends TreeImportUtilities {
);
}
if (record != null) {
addImportRecordToProject(record, project,
parameter.includeFileSources, parameter.fileSource, parameter.includeArchiveFileName, parameter.archiveFileName);
addImportRecordToProject(record, project);
}
}
@Deprecated
static protected void addImportRecordToProject(ImportRecord record, Project project,
boolean includeFileSources, String fileSource, boolean includeArchiveFileName, String archiveFileName) {
int archiveColumnIndex = -1, fileSourceColumnIndex = -1;
if (includeArchiveFileName && archiveFileName != null) {
archiveColumnIndex = 0;
}
if (includeFileSources) {
fileSourceColumnIndex = archiveColumnIndex == 0? 1 : 0;
addImportRecordToProject(record, project);
}
static protected void addImportRecordToProject(ImportRecord record, Project project) {
for (List<Cell> row : record.rows) {
if (row.size() > 0) {
Row realRow = new Row(row.size());
for (int c = 0; c < row.size(); c++) {
if (c == archiveColumnIndex) {
realRow.setCell(archiveColumnIndex, new Cell(archiveFileName, null));
continue;
}
if (c == fileSourceColumnIndex) {
realRow.setCell(fileSourceColumnIndex, new Cell(fileSource, null));
continue;
}
Cell cell = row.get(c);
if (cell != null) {
realRow.setCell(c, cell);
}
}
if (realRow != null) {
project.rows.add(realRow);
}
}
}
@Deprecated
static protected void processSubRecord(
Project project,
TreeReader parser,
ImportColumnGroup columnGroup,
ImportRecord record,
int level,
ImportParameters parameter
) throws TreeReaderException {
processSubRecord(project, parser, columnGroup, record, level, parameter.trimStrings,
parameter.storeEmptyStrings, parameter.guessDataType);
}
/**
@ -462,7 +553,9 @@ public class XmlImportUtilities extends TreeImportUtilities {
ImportColumnGroup columnGroup,
ImportRecord record,
int level,
ImportParameters parameter
boolean trimStrings,
boolean storeEmptyStrings,
boolean guessDataType
) throws TreeReaderException {
if (logger.isTraceEnabled()) {
logger.trace("processSubRecord(Project,TreeReader,ImportColumnGroup,ImportRecord) lvl:"+level+" "+columnGroup);
@ -482,18 +575,18 @@ public class XmlImportUtilities extends TreeImportUtilities {
int attributeCount = parser.getAttributeCount();
for (int i = 0; i < attributeCount; i++) {
String text = parser.getAttributeValue(i);
if (parameter.trimStrings) {
if (trimStrings) {
text = text.trim();
}
if (text.length() > 0 | !parameter.storeEmptyStrings) {
if (text.length() > 0 | !storeEmptyStrings) {
addCell(
project,
thisColumnGroup,
record,
composeName(parser.getAttributePrefix(i), parser.getAttributeLocalName(i)),
text,
parameter.storeEmptyStrings,
parameter.guessDataType
storeEmptyStrings,
guessDataType
);
}
}
@ -507,7 +600,9 @@ public class XmlImportUtilities extends TreeImportUtilities {
thisColumnGroup,
record,
level+1,
parameter
trimStrings,
storeEmptyStrings,
guessDataType
);
} else if (//eventType == XMLStreamConstants.CDATA ||
eventType == Token.Value) { //XMLStreamConstants.CHARACTERS) {
@ -515,11 +610,11 @@ public class XmlImportUtilities extends TreeImportUtilities {
String colName = parser.getFieldName();
if (value instanceof String) {
String text = (String) value;
if(parameter.trimStrings) {
if(trimStrings) {
text = text.trim();
}
addCell(project, thisColumnGroup, record, colName, text,
parameter.storeEmptyStrings, parameter.guessDataType);
storeEmptyStrings, guessDataType);
} else {
addCell(project, thisColumnGroup, record, colName, value);
}

View File

@ -49,31 +49,50 @@ public class XmlImportUtilitiesStub extends XmlImportUtilities {
return super.detectRecordElement(parser, tag);
}
public void ProcessSubRecordWrapper(Project project, TreeReader parser, ImportColumnGroup columnGroup,
@Deprecated
public void processSubRecordWrapper(Project project, TreeReader parser, ImportColumnGroup columnGroup,
ImportRecord record, int level, ImportParameters parameter)
throws Exception {
super.processSubRecord(project, parser, columnGroup, record, level, parameter);
}
@Deprecated
public void findRecordWrapper(Project project, TreeReader parser, String[] recordPath, int pathIndex,
ImportColumnGroup rootColumnGroup, boolean trimStrings, boolean storeEmptyStrings, boolean guessDataType)
ImportColumnGroup rootColumnGroup, int limit, ImportParameters parameters)
throws Exception {
super.findRecord(project, parser, recordPath, pathIndex, rootColumnGroup, -1,
new ImportParameters(trimStrings, storeEmptyStrings, guessDataType));
super.findRecord(project, parser, recordPath, pathIndex, rootColumnGroup, limit, parameters);
}
public void findRecordWrapper(Project project, TreeReader parser, String[] recordPath, int pathIndex,
ImportColumnGroup rootColumnGroup, int limit, boolean trimStrings, boolean storeEmptyStrings,
boolean guessDataType) throws Exception {
super.findRecord(project, parser, recordPath, pathIndex, rootColumnGroup, limit, trimStrings, storeEmptyStrings,
guessDataType);
}
@Deprecated
public void processRecordWrapper(Project project, TreeReader parser, ImportColumnGroup rootColumnGroup,
ImportParameters parameters)
throws Exception {
super.processRecord(project, parser, rootColumnGroup, parameters);
}
public void processRecordWrapper(Project project, TreeReader parser, ImportColumnGroup rootColumnGroup,
boolean trimStrings, boolean storeEmptyStrings, boolean guessDataType)
throws Exception {
super.processRecord(project, parser, rootColumnGroup,
new ImportParameters(trimStrings, storeEmptyStrings, guessDataType));
super.processRecord(project, parser, rootColumnGroup, trimStrings, storeEmptyStrings, guessDataType);
}
public void addCellWrapper(Project project, ImportColumnGroup columnGroup, ImportRecord record, String columnLocalName, Serializable value, int commonStartingRowIndex) {
public void addCellWrapper(Project project, ImportColumnGroup columnGroup, ImportRecord record,
String columnLocalName, Serializable value, int commonStartingRowIndex) {
super.addCell(project, columnGroup, record, columnLocalName, value);
}
public void addCellWrapper(Project project, ImportColumnGroup columnGroup, ImportRecord record, String columnLocalName, String text, int commonStartingRowIndex, boolean trimStrings, boolean storeEmptyStrings) {
public void addCellWrapper(Project project, ImportColumnGroup columnGroup, ImportRecord record,
String columnLocalName, String text, int commonStartingRowIndex, boolean trimStrings,
boolean storeEmptyStrings) {
super.addCell(project, columnGroup, record, columnLocalName, text, trimStrings, storeEmptyStrings);
}
}

View File

@ -207,6 +207,33 @@ public class XmlImportUtilitiesTests extends RefineTest {
public void importTreeDataXmlTest(){
loadSampleXml();
String[] recordPath = new String[]{"library","book"};
try {
XmlImportUtilitiesStub.importTreeData(createXmlParser(), project, recordPath, columnGroup, -1,
false, true, false);
} catch (Exception e){
Assert.fail();
}
assertProjectCreated(project, 0, 6);
Assert.assertEquals(project.rows.get(0).cells.size(), 4);
Assert.assertEquals(columnGroup.subgroups.size(), 1);
Assert.assertNotNull(columnGroup.subgroups.get("book"));
Assert.assertEquals(columnGroup.subgroups.get("book").subgroups.size(), 3);
Assert.assertNotNull(columnGroup.subgroups.get("book").subgroups.get("author"));
Assert.assertNotNull(columnGroup.subgroups.get("book").subgroups.get("title"));
Assert.assertNotNull(columnGroup.subgroups.get("book").subgroups.get("publish_date"));
}
/**
* Test of deprecated method which can go away when it does
*/
@Test
public void importTreeDataXmlTestDeprecated(){
loadSampleXml();
String[] recordPath = new String[]{"library","book"};
try {
XmlImportUtilitiesStub.importTreeData(createXmlParser(), project, recordPath, columnGroup, -1,
@ -231,6 +258,39 @@ public class XmlImportUtilitiesTests extends RefineTest {
public void importXmlWithVaryingStructureTest(){
loadData(XmlImporterTests.getSampleWithVaryingStructure());
String[] recordPath = new String[]{"library", "book"};
try {
XmlImportUtilitiesStub.importTreeData(createXmlParser(), project, recordPath, columnGroup, -1,
false, true, false);
} catch (Exception e){
Assert.fail();
}
assertProjectCreated(project, 0, 6);
Assert.assertEquals(project.rows.get(0).cells.size(), 4);
Assert.assertEquals(project.rows.get(5).cells.size(), 5);
Assert.assertEquals(columnGroup.subgroups.size(), 1);
Assert.assertEquals(columnGroup.name, "");
ImportColumnGroup book = columnGroup.subgroups.get("book");
Assert.assertNotNull(book);
Assert.assertEquals(book.columns.size(), 1);
Assert.assertEquals(book.subgroups.size(), 4);
Assert.assertNotNull(book.subgroups.get("author"));
Assert.assertEquals(book.subgroups.get("author").columns.size(), 1);
Assert.assertNotNull(book.subgroups.get("title"));
Assert.assertNotNull(book.subgroups.get("publish_date"));
Assert.assertNotNull(book.subgroups.get("genre"));
}
/**
* Test using deprecated method which can go away when it does
*/
@Test
public void importXmlWithVaryingStructureTestDeprecated(){
loadData(XmlImporterTests.getSampleWithVaryingStructure());
String[] recordPath = new String[]{"library", "book"};
try {
XmlImportUtilitiesStub.importTreeData(createXmlParser(), project, recordPath, columnGroup, -1,
@ -289,7 +349,7 @@ public class XmlImportUtilitiesTests extends RefineTest {
int pathIndex = 0;
try {
SUT.findRecordWrapper(project, parser, recordPath, pathIndex, columnGroup,
SUT.findRecordWrapper(project, parser, recordPath, pathIndex, columnGroup, -1,
false, false, false);
} catch (Exception e) {
Assert.fail();
@ -301,6 +361,32 @@ public class XmlImportUtilitiesTests extends RefineTest {
//TODO
}
/**
* Test of deprecated wrapper method which can go away when it does
*/
@Test
public void findRecordTestXmlDeprecated(){
loadSampleXml();
createXmlParser();
ParserSkip();
String[] recordPath = new String[]{"library","book"};
int pathIndex = 0;
try {
SUT.findRecordWrapper(project, parser, recordPath, pathIndex, columnGroup, -1,
new ImportParameters(false, false, false));
} catch (Exception e) {
Assert.fail();
}
assertProjectCreated(project, 0, 6);
Assert.assertEquals(project.rows.get(0).cells.size(), 4);
//TODO
}
@Test
public void processRecordTestXml(){
loadData("<?xml version=\"1.0\"?><library><book id=\"1\"><author>author1</author><genre>genre1</genre></book></library>");
@ -377,7 +463,7 @@ public class XmlImportUtilitiesTests extends RefineTest {
ParserSkip();
try {
SUT.ProcessSubRecordWrapper(project, parser, columnGroup, record,0,
SUT.processSubRecordWrapper(project, parser, columnGroup, record,0,
new ImportParameters(false, false, false));
} catch (Exception e) {
Assert.fail();