Refactor importer APIs - Fixes #2963 (#2978)

* Make sure data directory is directory, not a file

* Add a test for zip archive import

Also tests the saving of the archive file name and source filename

* Add TODOs - no functional changes

* Cosmetic cleanups

* Revert importer API changes for archive file name parameter

Fixes #2963
- restore binary compatibility to the API
- hoist the handling of both fileSource and archiveFileName from
TabularImportingParserBase and TreeImportingParserBase to
ImportingParserBase so that there's only one copy. These 3 classes are
all part of the internal implementation, so there should be no
compatibility issue.

* Revert weird flow of control for import options metadata

This reverts the very convoluted control flow that was introduced
when adding the input options to the project metadata. Instead
the metadata is all handled in the importer framework rather than
having to change APIs or have individual importers worry about
it.

The feature never had test coverage, so that is still to be added.

* Add test for import options in project metadata & fix bug

Fixes bug where same options object was being reused and overwritten,
so all copies in the list ended up the same.
This commit is contained in:
Tom Morris 2020-07-23 12:36:14 -04:00 committed by GitHub
parent d5abaac6df
commit 83ed9ffdaf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
30 changed files with 231 additions and 281 deletions

View File

@ -285,7 +285,6 @@ public class DatabaseImportController implements ImportingController {
job,
new DBQueryResultPreviewReader(job, databaseService, querySource, columns, dbQueryInfo, 100),
querySource,
"", // archivefilename
limit,
options,
exceptions
@ -415,7 +414,6 @@ public class DatabaseImportController implements ImportingController {
job,
new DBQueryResultImportReader(job, databaseService, querySource, columns, dbQueryInfo, getCreateBatchSize()),
querySource,
"", //archivefilename,
limit,
options,
exceptions

View File

@ -142,7 +142,6 @@ public class GDataImporter {
job,
new WorksheetBatchRowReader(job, fileSource, service, spreadsheetId, worksheetEntry),
fileSource,
"", //archivefilename
limit,
options,
exceptions

View File

@ -71,7 +71,6 @@ public class PCAxisImporter extends TabularImportingParserBase {
ProjectMetadata metadata,
ImportingJob job,
String fileSource,
String archiveFileName,
Reader reader,
int limit,
ObjectNode options,
@ -89,8 +88,6 @@ public class PCAxisImporter extends TabularImportingParserBase {
TabularImportingParserBase.readTable(
project, metadata, job, dataReader,
fileSource, archiveFileName, limit, options, exceptions);
super.parseOneFile(project, metadata, job, fileSource, archiveFileName, reader, limit, options, exceptions);
fileSource, limit, options, exceptions);
}
}

View File

@ -46,7 +46,7 @@ public class WikidataRefineTest extends PowerMockTestCase {
SeparatorBasedImporter importer = new SeparatorBasedImporter();
List<Exception> exceptions = new ArrayList<Exception>();
importer.parseOneFile(project, metadata, job, "filesource", "archivefile", new StringReader(input), -1, options, exceptions);
importer.parseOneFile(project, metadata, job, "filesource", new StringReader(input), -1, options, exceptions);
project.update();
ProjectManager.singleton.registerProject(project, metadata);

View File

@ -128,7 +128,6 @@ public class ExcelImporter extends TabularImportingParserBase {
ProjectMetadata metadata,
ImportingJob job,
String fileSource,
String archiveFileName,
InputStream inputStream,
int limit,
ObjectNode options,
@ -216,19 +215,19 @@ public class ExcelImporter extends TabularImportingParserBase {
}
};
// TODO: Do we need to preserve the original filename? Take first piece before #?
// JSONUtilities.safePut(options, "fileSource", fileSource + "#" + sheet.getSheetName());
TabularImportingParserBase.readTable(
project,
metadata,
job,
dataReader,
fileSource + "#" + sheet.getSheetName(), archiveFileName,
fileSource + "#" + sheet.getSheetName(),
limit,
options,
exceptions
);
}
super.parseOneFile(project, metadata, job, fileSource, archiveFileName, inputStream, limit, options, exceptions);
}
static protected Cell extractCell(org.apache.poi.ss.usermodel.Cell cell) {

View File

@ -83,7 +83,6 @@ public class FixedWidthImporter extends TabularImportingParserBase {
ProjectMetadata metadata,
ImportingJob job,
String fileSource,
String archiveFileName,
Reader reader,
int limit,
ObjectNode options,
@ -133,9 +132,7 @@ public class FixedWidthImporter extends TabularImportingParserBase {
}
};
TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, archiveFileName, limit, options, exceptions);
super.parseOneFile(project, metadata, job, fileSource, archiveFileName, reader, limit, options, exceptions);
TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions);
}
/**

View File

@ -1,6 +1,7 @@
/*
Copyright 2011, Google Inc.
Copyright 2012,2020 OpenRefine contributors
All rights reserved.
Redistribution and use in source and binary forms, with or without
@ -39,6 +40,7 @@ import java.io.InputStream;
import java.io.Reader;
import java.util.List;
import org.apache.commons.lang.NotImplementedException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -49,9 +51,11 @@ import com.google.refine.importing.EncodingGuesser;
import com.google.refine.importing.ImportingJob;
import com.google.refine.importing.ImportingParser;
import com.google.refine.importing.ImportingUtilities;
import com.google.refine.model.Cell;
import com.google.refine.model.Column;
import com.google.refine.model.ModelException;
import com.google.refine.model.Project;
import com.google.refine.model.Row;
import com.google.refine.util.JSONUtilities;
import com.google.refine.util.ParsingUtilities;
@ -99,6 +103,7 @@ abstract public class ImportingParserBase implements ImportingParser {
}
}
// TODO: Make private? At least protected?
public void parseOneFile(
Project project,
ProjectMetadata metadata,
@ -112,24 +117,55 @@ abstract public class ImportingParserBase implements ImportingParser {
final File file = ImportingUtilities.getFile(job, fileRecord);
final String fileSource = ImportingUtilities.getFileSource(fileRecord);
final String archiveFileName = ImportingUtilities.getArchiveFileName(fileRecord);
int filenameColumnIndex = -1;
int archiveColumnIndex = -1;
int startingRowCount = project.rows.size();
progress.startFile(fileSource);
try {
InputStream inputStream = ImporterUtilities.openAndTrackFile(fileSource, file, progress);
try {
if (JSONUtilities.getBoolean(options, "includeArchiveFileName", false)
&& archiveFileName != null) {
archiveColumnIndex = addArchiveColumn(project);
}
if (JSONUtilities.getBoolean(options, "includeFileSources", false)) {
filenameColumnIndex = addFilenameColumn(project, archiveColumnIndex >=0);
}
if (useInputStream) {
parseOneFile(project, metadata, job, fileSource, archiveFileName, inputStream, limit, options, exceptions);
parseOneFile(project, metadata, job, fileSource, inputStream, limit, options, exceptions);
} else {
String commonEncoding = JSONUtilities.getString(options, "encoding", null);
if (commonEncoding != null && commonEncoding.isEmpty()) {
commonEncoding = null;
}
Reader reader = ImportingUtilities.getReaderFromStream(
inputStream, fileRecord, commonEncoding);
parseOneFile(project, metadata, job, fileSource, archiveFileName, reader, limit, options, exceptions);
inputStream, fileRecord, commonEncoding);
parseOneFile(project, metadata, job, fileSource, reader, limit, options, exceptions);
}
// Fill in filename and archive name column for all rows added from this file
int endingRowCount = project.rows.size();
for (int i = startingRowCount; i < endingRowCount; i++) {
Row row = project.rows.get(i);
if (archiveColumnIndex >= 0) {
row.setCell(archiveColumnIndex, new Cell(archiveFileName, null));
}
if (filenameColumnIndex >= 0) {
row.setCell(filenameColumnIndex, new Cell(fileSource, null));
}
}
ObjectNode fileOptions = options.deepCopy();
JSONUtilities.safePut(fileOptions, "fileSource", fileSource);
JSONUtilities.safePut(fileOptions, "archiveFileName", archiveFileName);
// TODO: This will save a separate copy for each file in the import, but they're
// going to be mostly the same
metadata.appendImportOptionMetadata(fileOptions);
} finally {
inputStream.close();
}
@ -138,51 +174,66 @@ abstract public class ImportingParserBase implements ImportingParser {
}
}
/**
* Parsing method to be implemented by Reader-based parsers.
* ie those initialized with useInputStream == false
*
* @param project
* @param metadata
* @param job
* @param fileSource
* @param reader
* @param limit
* @param options
* @param exceptions
*/
public void parseOneFile(
Project project,
ProjectMetadata metadata,
ImportingJob job,
String fileSource,
String archiveFileName,
Reader reader,
int limit,
ObjectNode options,
List<Exception> exceptions
) {
pushImportingOptions(metadata, "fileSource", fileSource, options);
pushImportingOptions(metadata, "archiveFileName", archiveFileName, options);
throw new NotImplementedException();
}
private void pushImportingOptions(ProjectMetadata metadata, String key, String value, ObjectNode options) {
options.put(key, value);
// set the import options to metadata:
metadata.appendImportOptionMetadata(options);
}
public void parseOneFile(
Project project,
ProjectMetadata metadata,
ImportingJob job,
String fileSource,
String archiveFileName,
InputStream inputStream,
int limit,
ObjectNode options,
List<Exception> exceptions
) {
pushImportingOptions(metadata, "fileSource", fileSource, options);
pushImportingOptions(metadata, "archiveFileName", archiveFileName, options);
throw new NotImplementedException();
}
/**
* @deprecated 2020-07-21 by tfmorris. This will become private in a future release.
*/
@Deprecated
protected static int addFilenameColumn(Project project, boolean archiveColumnAdded) {
String fileNameColumnName = "File";
String fileNameColumnName = "File"; // TODO: Localize?
int columnId = archiveColumnAdded? 1 : 0;
if (project.columnModel.getColumnByName(fileNameColumnName) == null) {
return addColumn(project, fileNameColumnName, columnId);
}
private static int addArchiveColumn(Project project) {
String columnName = "Archive"; // TODO: Localize?
return addColumn(project, columnName, 0);
}
private static int addColumn(Project project, String columnName, int columnId) {
if (project.columnModel.getColumnByName(columnName) == null) {
try {
project.columnModel.addColumn(
columnId, new Column(project.columnModel.allocateNewCellIndex(), fileNameColumnName), false);
columnId, new Column(project.columnModel.allocateNewCellIndex(), columnName), false);
return columnId;
} catch (ModelException e) {
// Shouldn't happen: We already checked for duplicate name.
@ -194,22 +245,5 @@ abstract public class ImportingParserBase implements ImportingParser {
}
}
protected static int addArchiveColumn(Project project) {
String ArchiveColumnName = "Archive";
if (project.columnModel.getColumnByName(ArchiveColumnName) == null) {
try {
project.columnModel.addColumn(
0, new Column(project.columnModel.allocateNewCellIndex(), ArchiveColumnName), false);
return 0;
} catch (ModelException e) {
// Shouldn't happen: We already checked for duplicate name.
logger.error("ModelException adding Filename column",e);
}
return -1;
} else {
return 0;
}
}
}

View File

@ -201,13 +201,11 @@ public class JsonImporter extends TreeImportingParserBase {
@Override
public void parseOneFile(Project project, ProjectMetadata metadata,
ImportingJob job, String fileSource, String archiveFileName, InputStream is,
ImportingJob job, String fileSource, InputStream is,
ImportColumnGroup rootColumnGroup, int limit, ObjectNode options, List<Exception> exceptions) {
parseOneFile(project, metadata, job, fileSource, archiveFileName,
parseOneFile(project, metadata, job, fileSource,
new JSONTreeReader(is), rootColumnGroup, limit, options, exceptions);
super.parseOneFile(project, metadata, job, fileSource, archiveFileName, is, rootColumnGroup, limit, options, exceptions);
}
static public class JSONTreeReader implements TreeReader {

View File

@ -66,7 +66,6 @@ public class LineBasedImporter extends TabularImportingParserBase {
ProjectMetadata metadata,
ImportingJob job,
String fileSource,
String archiveFileName,
Reader reader,
int limit,
ObjectNode options,
@ -130,8 +129,6 @@ public class LineBasedImporter extends TabularImportingParserBase {
}
};
TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, archiveFileName, limit, options, exceptions);
super.parseOneFile(project, metadata, job, fileSource, archiveFileName, reader, limit, options, exceptions);
TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions);
}
}

View File

@ -39,6 +39,7 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.List;
import org.marc4j.MarcPermissiveStreamReader;
import org.marc4j.MarcWriter;
@ -57,7 +58,7 @@ public class MarcImporter extends XmlImporter {
}
@Override
public ObjectNode createParserUIInitializationData(ImportingJob job, java.util.List<ObjectNode> fileRecords, String format) {
public ObjectNode createParserUIInitializationData(ImportingJob job, List<ObjectNode> fileRecords, String format) {
if (fileRecords.size() > 0) {
ObjectNode firstFileRecord = fileRecords.get(0);
File file = ImportingUtilities.getFile(job, firstFileRecord);

View File

@ -39,7 +39,6 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.time.ZoneOffset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@ -66,6 +65,7 @@ import com.google.refine.util.JSONUtilities;
import com.google.refine.util.ParsingUtilities;
@SuppressWarnings("deprecation")
public class OdsImporter extends TabularImportingParserBase {
final static Logger logger = LoggerFactory.getLogger("open office");
@ -127,7 +127,6 @@ public class OdsImporter extends TabularImportingParserBase {
ProjectMetadata metadata,
ImportingJob job,
String fileSource,
String archiveFileName,
InputStream inputStream,
int limit,
ObjectNode options,
@ -195,14 +194,11 @@ public class OdsImporter extends TabularImportingParserBase {
job,
dataReader,
fileSource + "#" + table.getTableName(),
archiveFileName,
limit,
options,
exceptions
);
}
super.parseOneFile(project, metadata, job, fileSource, archiveFileName, inputStream, limit, options, exceptions);
}
static protected Serializable extractCell(OdfTableCell cell) {

View File

@ -76,7 +76,8 @@ public class RdfTripleImporter extends ImportingParserBase {
this.mode = mode;
}
public void parseOneFile(Project project, ProjectMetadata metadata, ImportingJob job, String fileSource, String archiveFileName,
@Override
public void parseOneFile(Project project, ProjectMetadata metadata, ImportingJob job, String fileSource,
InputStream input, int limit, ObjectNode options, List<Exception> exceptions) {
// create an empty model
Model model = ModelFactory.createDefaultModel();
@ -164,7 +165,5 @@ public class RdfTripleImporter extends ImportingParserBase {
} catch (ModelException e) {
exceptions.add(e);
}
super.parseOneFile(project, metadata, job, fileSource, archiveFileName, input, limit, options, exceptions);
}
}

View File

@ -87,7 +87,6 @@ public class SeparatorBasedImporter extends TabularImportingParserBase {
ProjectMetadata metadata,
ImportingJob job,
String fileSource,
String archiveFileName,
Reader reader,
int limit,
ObjectNode options,
@ -148,18 +147,17 @@ public class SeparatorBasedImporter extends TabularImportingParserBase {
usedColumnNames = true;
return columnNames;
} else {
String line = lnReader.readLine();
if (line == null) {
return null;
} else {
return getCells(line, parser, lnReader);
}
String line = lnReader.readLine();
if (line == null) {
return null;
} else {
return getCells(line, parser, lnReader);
}
}
}
};
TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, archiveFileName, limit, options, exceptions);
super.parseOneFile(project, metadata, job, fileSource, archiveFileName, lnReader, limit, options, exceptions);
TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions);
}
static protected ArrayList<Object> getCells(String line, CSVParser parser, LineNumberReader lnReader)

View File

@ -34,7 +34,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package com.google.refine.importers;
import java.io.IOException;
import java.io.Reader;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
@ -83,7 +82,6 @@ abstract public class TabularImportingParserBase extends ImportingParserBase {
ImportingJob job,
TableDataReader reader,
String fileSource,
String archiveFileName,
int limit,
ObjectNode options,
List<Exception> exceptions
@ -104,18 +102,8 @@ abstract public class TabularImportingParserBase extends ImportingParserBase {
boolean storeBlankRows = JSONUtilities.getBoolean(options, "storeBlankRows", true);
boolean storeBlankCellsAsNulls = JSONUtilities.getBoolean(options, "storeBlankCellsAsNulls", true);
boolean includeFileSources = JSONUtilities.getBoolean(options, "includeFileSources", false);
boolean trimStrings = JSONUtilities.getBoolean(options, "trimStrings", false);
boolean includeArchiveFileName = JSONUtilities.getBoolean(options, "includeArchiveFileName", false);
int filenameColumnIndex = -1, archiveColumnIndex = -1;
if (includeArchiveFileName && archiveFileName != null) {
archiveColumnIndex = addArchiveColumn(project);
}
if (includeFileSources) {
filenameColumnIndex = addFilenameColumn(project, archiveColumnIndex >=0);
}
List<String> columnNames = new ArrayList<String>();
boolean hasOurOwnColumnNames = headerLines > 0;
@ -153,9 +141,7 @@ abstract public class TabularImportingParserBase extends ImportingParserBase {
} else { // data lines
Row row = new Row(cells.size());
if (storeBlankRows) {
rowsWithData++;
} else if (cells.size() > 0) {
if (storeBlankRows || cells.size() > 0) {
rowsWithData++;
}
@ -172,7 +158,7 @@ abstract public class TabularImportingParserBase extends ImportingParserBase {
} else if (ExpressionUtils.isNonBlankData(value)) {
Serializable storedValue;
if (value instanceof String) {
if(trimStrings) {
if(trimStrings) {
value = ((String) value).trim();
}
storedValue = guessCellValueTypes ?
@ -190,17 +176,11 @@ abstract public class TabularImportingParserBase extends ImportingParserBase {
row.setCell(column.getCellIndex(), null);
}
}
if (rowHasData || storeBlankRows) {
if (archiveColumnIndex >= 0) {
row.setCell(archiveColumnIndex, new Cell(archiveFileName, null));
}
if (filenameColumnIndex >= 0) {
row.setCell(filenameColumnIndex, new Cell(fileSource, null));
}
project.rows.add(row);
}
if (limit2 > 0 && project.rows.size() >= limit2) {
break;
}
@ -211,9 +191,4 @@ abstract public class TabularImportingParserBase extends ImportingParserBase {
exceptions.add(e);
}
}
public void parseOneFile(Project project, ProjectMetadata metadata, ImportingJob job, String fileSource, String archiveFileName,
Reader dataReader, int limit, ObjectNode options, List<Exception> exceptions) {
super.parseOneFile(project, metadata, job, fileSource, archiveFileName, dataReader, limit, options, exceptions);
}
}

View File

@ -691,7 +691,6 @@ public class WikitextImporter extends TabularImportingParserBase {
ProjectMetadata metadata,
ImportingJob job,
String fileSource,
String archiveFileName,
Reader reader,
int limit,
ObjectNode options,
@ -750,7 +749,7 @@ public class WikitextImporter extends TabularImportingParserBase {
// TODO this does not seem to do anything - maybe we need to pass it to OpenRefine in some other way?
}
TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, archiveFileName, limit, options, exceptions);
TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions);
// Add reconciliation statistics
if (dataReader.columnReconciled != null) {
@ -768,8 +767,6 @@ public class WikitextImporter extends TabularImportingParserBase {
exceptions.add(e1);
e1.printStackTrace();
}
super.parseOneFile(project, metadata, job, fileSource, archiveFileName, reader, limit, options, exceptions);
}
private StandardReconConfig getReconConfig(String url) {

View File

@ -193,15 +193,13 @@ public class XmlImporter extends TreeImportingParserBase {
@Override
public void parseOneFile(Project project, ProjectMetadata metadata,
ImportingJob job, String fileSource, String archiveFileName, InputStream inputStream,
ImportingJob job, String fileSource, InputStream inputStream,
ImportColumnGroup rootColumnGroup, int limit, ObjectNode options,
List<Exception> exceptions) {
try {
parseOneFile(project, metadata, job, fileSource, archiveFileName,
parseOneFile(project, metadata, job, fileSource,
new XmlParser(inputStream), rootColumnGroup, limit, options, exceptions);
super.parseOneFile(project, metadata, job, fileSource, archiveFileName, inputStream, rootColumnGroup, limit, options, exceptions);
} catch (XMLStreamException e) {
exceptions.add(e);
} catch (IOException e) {

View File

@ -31,10 +31,11 @@ public class ImportParameters {
protected boolean trimStrings;
protected boolean storeEmptyStrings;
protected boolean guessDataType;
protected boolean includeFileSources;
protected String fileSource;
protected boolean includeArchiveFileName;
protected String archiveFileName;
protected boolean includeFileSources = false;
protected String fileSource = null;
// TODO: What is the compatibility impact of including new fields
protected boolean includeArchiveFileName = false;
protected String archiveFileName = null;
public ImportParameters(boolean trimStrings, boolean storeEmptyStrings, boolean guessCellValueTypes,
boolean includeFileSources, String fileSource, boolean includeArchiveFileName, String archiveFileName) {
@ -47,8 +48,13 @@ public class ImportParameters {
this.archiveFileName = archiveFileName;
}
public ImportParameters(boolean trimStrings, boolean storeEmptyStrings, boolean guessCellValueTypes,
boolean includeFileSources, String fileSource) {
this(trimStrings, storeEmptyStrings, guessCellValueTypes, includeFileSources, fileSource, false, "");
}
public ImportParameters(boolean trimStrings, boolean storeEmptyStrings, boolean guessCellValueTypes) {
this(trimStrings, storeEmptyStrings, guessCellValueTypes, false, "", false, "");
this(trimStrings, storeEmptyStrings, guessCellValueTypes, false, "");
}
}

View File

@ -40,7 +40,6 @@ import java.io.Reader;
import java.util.List;
import org.apache.commons.lang.NotImplementedException;
import com.google.refine.importers.tree.TreeReaderException;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.google.refine.ProjectMetadata;
@ -112,14 +111,13 @@ abstract public class TreeImportingParserBase extends ImportingParserBase {
) throws IOException {
final File file = ImportingUtilities.getFile(job, fileRecord);
final String fileSource = ImportingUtilities.getFileSource(fileRecord);
final String archiveFileName = ImportingUtilities.getArchiveFileName(fileRecord);
progress.startFile(fileSource);
try {
InputStream inputStream = ImporterUtilities.openAndTrackFile(fileSource, file, progress);
try {
if (useInputStream) {
parseOneFile(project, metadata, job, fileSource, archiveFileName, inputStream,
parseOneFile(project, metadata, job, fileSource, inputStream,
rootColumnGroup, limit, options, exceptions);
} else {
String commonEncoding = JSONUtilities.getString(options, "encoding", null);
@ -128,7 +126,7 @@ abstract public class TreeImportingParserBase extends ImportingParserBase {
}
Reader reader = ImportingUtilities.getFileReader(file, fileRecord, commonEncoding);
parseOneFile(project, metadata, job, fileSource, archiveFileName, reader,
parseOneFile(project, metadata, job, fileSource, reader,
rootColumnGroup, limit, options, exceptions);
}
} finally {
@ -150,7 +148,6 @@ abstract public class TreeImportingParserBase extends ImportingParserBase {
ProjectMetadata metadata,
ImportingJob job,
String fileSource,
String archiveFileName,
Reader reader,
ImportColumnGroup rootColumnGroup,
int limit,
@ -171,15 +168,13 @@ abstract public class TreeImportingParserBase extends ImportingParserBase {
ProjectMetadata metadata,
ImportingJob job,
String fileSource,
String archiveFileName,
InputStream inputStream,
ImportColumnGroup rootColumnGroup,
int limit,
ObjectNode options,
List<Exception> exceptions
) {
// throw new NotImplementedException();
super.parseOneFile(project, metadata, job, fileSource, archiveFileName, inputStream, limit, options, exceptions);
throw new NotImplementedException();
}
/**
@ -191,7 +186,6 @@ abstract public class TreeImportingParserBase extends ImportingParserBase {
ProjectMetadata metadata,
ImportingJob job,
String fileSource,
String archiveFileName,
TreeReader treeParser,
ImportColumnGroup rootColumnGroup,
int limit,
@ -216,28 +210,12 @@ abstract public class TreeImportingParserBase extends ImportingParserBase {
boolean trimStrings = JSONUtilities.getBoolean(options, "trimStrings", true);
boolean storeEmptyStrings = JSONUtilities.getBoolean(options, "storeEmptyStrings", false);
boolean guessCellValueTypes = JSONUtilities.getBoolean(options, "guessCellValueTypes", true);
boolean includeFileSources = JSONUtilities.getBoolean(options, "includeFileSources", false);
boolean includeArchiveFileName = JSONUtilities.getBoolean(options, "includeArchiveFileName", false);
int filenameColumnIndex = -1, archiveColumnIndex = -1;
if (includeArchiveFileName && archiveFileName != null) {
archiveColumnIndex = addArchiveColumn(project);
assert archiveColumnIndex == 0;
}
if (includeFileSources) {
filenameColumnIndex = addFilenameColumn(project, includeArchiveFileName);
// If the column add fails for any reason, we'll end up overwriting data in the first column
if (includeArchiveFileName) {
assert filenameColumnIndex == 1;
} else {
assert filenameColumnIndex == 0;
}
}
try {
XmlImportUtilities.importTreeData(treeParser, project, recordPath, rootColumnGroup, limit2,
new ImportParameters(trimStrings, storeEmptyStrings, guessCellValueTypes, includeFileSources,
fileSource, includeArchiveFileName, archiveFileName));
fileSource));
} catch (Exception e){
exceptions.add(e);
}

View File

@ -393,6 +393,7 @@ public class ImportingUtilities {
calculateProgressPercent(update.totalExpectedSize, update.totalRetrievedSize));
JSONUtilities.safePut(fileRecord, "size", saveStreamToFile(stream, file, null));
// TODO: This needs to be refactored to be able to test import from archives
if (postProcessRetrievedFile(rawDataDir, file, fileRecord, fileRecords, progress)) {
archiveCount++;
}
@ -641,6 +642,7 @@ public class ImportingUtilities {
return null;
}
// FIXME: This is wasteful of space and time. We should try to process on the fly
static public boolean explodeArchive(
File rawDataDir,
InputStream archiveIS,

BIN
main/tests/data/movies.zip Normal file

Binary file not shown.

View File

@ -57,9 +57,6 @@ import com.fasterxml.jackson.databind.node.BooleanNode;
import com.fasterxml.jackson.databind.node.IntNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.databind.node.TextNode;
import com.google.refine.ProjectManager;
import com.google.refine.ProjectMetadata;
import com.google.refine.RefineServlet;
import com.google.refine.importers.SeparatorBasedImporter;
import com.google.refine.importing.ImportingJob;
import com.google.refine.importing.ImportingManager;
@ -176,7 +173,7 @@ public class RefineTest extends PowerMockTestCase {
SeparatorBasedImporter importer = new SeparatorBasedImporter();
List<Exception> exceptions = new ArrayList<Exception>();
importer.parseOneFile(project, metadata, job, "filesource", "archivefile", new StringReader(input), -1, options, exceptions);
importer.parseOneFile(project, metadata, job, "filesource", new StringReader(input), -1, options, exceptions);
project.update();
ProjectManager.singleton.registerProject(project, metadata);

View File

@ -63,7 +63,6 @@ import org.testng.annotations.BeforeTest;
import org.testng.annotations.Test;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.google.refine.importers.ExcelImporter;
import com.google.refine.util.ParsingUtilities;
public class ExcelImporterTests extends ImporterTest {
@ -146,35 +145,6 @@ public class ExcelImporterTests extends ImporterTest {
verify(options, times(1)).get("storeBlankCellsAsNulls");
}
@Test
public void readXlsFromArchiveFile() throws FileNotFoundException, IOException{
ArrayNode sheets = ParsingUtilities.mapper.createArrayNode();
sheets.add(ParsingUtilities.mapper.readTree("{name: \"file-source#Test Sheet 0\", fileNameAndSheetIndex: \"file-source#0\", rows: 31, selected: true}"));
whenGetArrayOption("sheets", options, sheets);
whenGetIntegerOption("ignoreLines", options, 0);
whenGetIntegerOption("headerLines", options, 0);
whenGetIntegerOption("skipDataLines", options, 0);
whenGetIntegerOption("limit", options, -1);
whenGetBooleanOption("storeBlankCellsAsNulls",options,true);
whenGetBooleanOption("includeArchiveFileName", options, true);
InputStream stream = new FileInputStream(xlsFile);
try {
parseOneFile(SUT, stream);
} catch (Exception e) {
Assert.fail(e.getMessage());
}
Assert.assertEquals(project.rows.get(0).cells.size(), COLUMNS + 1);
Assert.assertEquals(project.columnModel.columns.get(0).getName(), "Archive");
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "archive-file");
verify(options, times(1)).get("includeArchiveFileName");
}
@Test
public void readXlsx() throws FileNotFoundException, IOException{

View File

@ -37,7 +37,6 @@ import org.testng.annotations.BeforeTest;
import org.testng.annotations.Test;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.google.refine.importers.FixedWidthImporter;
import com.google.refine.util.JSONUtilities;
import com.google.refine.util.ParsingUtilities;
@ -108,37 +107,4 @@ public class FixedWidthImporterTests extends ImporterTest {
Assert.assertNull(project.rows.get(2).getCellValue(2));
}
@Test
public void readFixedWidthFromArchiveFile(){
StringReader reader = new StringReader(SAMPLE_ROW + "\nTooShort");
ArrayNode columnWidths = ParsingUtilities.mapper.createArrayNode();
JSONUtilities.append(columnWidths, 6);
JSONUtilities.append(columnWidths, 9);
JSONUtilities.append(columnWidths, 5);
whenGetArrayOption("columnWidths", options, columnWidths);
ArrayNode columnNames = ParsingUtilities.mapper.createArrayNode();
columnNames.add("Col 1");
columnNames.add("Col 2");
columnNames.add("Col 3");
whenGetArrayOption("columnNames", options, columnNames);
whenGetIntegerOption("ignoreLines", options, 0);
whenGetIntegerOption("headerLines", options, 0);
whenGetIntegerOption("skipDataLines", options, 0);
whenGetIntegerOption("limit", options, -1);
whenGetBooleanOption("storeBlankCellsAsNulls",options,true);
whenGetBooleanOption("includeArchiveFileName", options, true);
try {
parseOneFile(SUT, reader);
} catch (Exception e) {
Assert.fail(e.getMessage());
}
Assert.assertEquals(project.rows.get(0).cells.size(), 4);
Assert.assertEquals(project.columnModel.columns.get(0).getName(), "Archive");
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "archive-file");
}
}

View File

@ -43,7 +43,6 @@ import com.google.refine.ProjectMetadata;
import com.google.refine.RefineServlet;
import com.google.refine.RefineServletStub;
import com.google.refine.RefineTest;
import com.google.refine.importers.ImportingParserBase;
import com.google.refine.importers.tree.ImportColumnGroup;
import com.google.refine.importers.tree.TreeImportingParserBase;
import com.google.refine.importers.tree.XmlImportUtilities;
@ -58,9 +57,8 @@ public abstract class ImporterTest extends RefineTest {
protected ProjectMetadata metadata;
protected ImportingJob job;
protected RefineServlet servlet;
protected ObjectNode options;
public void setUp(){
//FIXME - should we try and use mock(Project.class); - seems unnecessary complexity
@ -92,7 +90,6 @@ public abstract class ImporterTest extends RefineTest {
metadata,
job,
"file-source",
"archive-file",
reader,
-1,
options,
@ -109,7 +106,6 @@ public abstract class ImporterTest extends RefineTest {
metadata,
job,
"file-source",
"archive-file",
inputStream,
-1,
options,
@ -126,7 +122,6 @@ public abstract class ImporterTest extends RefineTest {
metadata,
job,
"file-source",
"archive-file",
inputStream,
-1,
options,
@ -144,7 +139,6 @@ public abstract class ImporterTest extends RefineTest {
metadata,
job,
"file-source",
"archive-file",
reader,
rootColumnGroup,
-1,
@ -170,7 +164,6 @@ public abstract class ImporterTest extends RefineTest {
metadata,
job,
"file-source",
"archive-file",
inputStream,
rootColumnGroup,
-1,
@ -191,7 +184,6 @@ public abstract class ImporterTest extends RefineTest {
metadata,
job,
"file-source",
"archive-file",
reader,
rootColumnGroup,
-1,

View File

@ -52,7 +52,6 @@ import org.testng.annotations.Test;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.google.refine.importers.JsonImporter;
import com.google.refine.importers.JsonImporter.JSONTreeReader;
import com.google.refine.importers.tree.TreeImportingParserBase;
import com.google.refine.importers.tree.TreeReader.Token;
@ -135,7 +134,6 @@ public class JsonImporterTests extends ImporterTest {
metadata,
job,
"file-source",
"archive-file",
inputStream,
rootColumnGroup,
-1,

View File

@ -44,7 +44,6 @@ import org.testng.annotations.BeforeMethod;
import org.testng.annotations.BeforeTest;
import org.testng.annotations.Test;
import com.google.refine.importers.RdfTripleImporter;
import com.google.refine.util.JSONUtilities;
public class RdfTripleImporterTests extends ImporterTest {

View File

@ -43,7 +43,6 @@ import org.testng.annotations.BeforeTest;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import com.google.refine.importers.SeparatorBasedImporter;
import com.google.refine.util.ParsingUtilities;
public class TsvCsvImporterTests extends ImporterTest {
@ -118,27 +117,6 @@ public class TsvCsvImporterTests extends ImporterTest {
Assert.assertEquals(project.rows.get(0).cells.get(2).value, "data3");
}
@Test(dataProvider = "CSV-TSV-AutoDetermine")
public void readSimpleData_CSV_fromArchiveFileName(String sep){
//create input to test with
String inputSeparator = sep == null ? "\t" : sep;
String input = "col1" + inputSeparator + "col2" + inputSeparator + "col3\n" +
"data1" + inputSeparator + "data2" + inputSeparator + "data3";
try {
prepareOptions(sep, -1, 0, 0, 1, false, false,"\"","[]", true);
parseOneFile(SUT, new StringReader(input));
} catch (Exception e) {
Assert.fail("Exception during file parse",e);
}
Assert.assertEquals(project.columnModel.columns.size(), 4);
Assert.assertEquals(project.columnModel.columns.get(0).getName(), "Archive");
Assert.assertEquals(project.rows.get(0).cells.size(), 4);
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "archive-file");
}
@Test(dataProvider = "CSV-TSV-AutoDetermine")
public void readSimpleData_CSV_1Header_1Row_GuessValues(String sep){
//create input to test with

View File

@ -37,7 +37,7 @@ import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.LinkedList;
import java.util.Collections;
import org.slf4j.LoggerFactory;
import org.testng.Assert;
@ -48,7 +48,6 @@ import org.testng.annotations.Test;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.google.refine.importers.XmlImporter;
import com.google.refine.importers.tree.TreeImportingParserBase;
import com.google.refine.importing.ImportingJob;
import com.google.refine.model.ColumnGroup;
@ -248,7 +247,7 @@ public class XmlImporterTests extends ImporterTest {
public static ObjectNode getOptions(ImportingJob job, TreeImportingParserBase parser) {
ObjectNode options = parser.createParserUIInitializationData(
job, new LinkedList<>(), "text/json");
job, Collections.emptyList(), "text/xml");
ArrayNode path = ParsingUtilities.mapper.createArrayNode();
JSONUtilities.append(path, "library");
@ -260,7 +259,7 @@ public class XmlImporterTests extends ImporterTest {
public static ObjectNode getNestedOptions(ImportingJob job, TreeImportingParserBase parser) {
ObjectNode options = parser.createParserUIInitializationData(
job, new LinkedList<>(), "text/json");
job, Collections.emptyList(), "text/xml");
ArrayNode path = ParsingUtilities.mapper.createArrayNode();
JSONUtilities.append(path, "nest");

View File

@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright (C) 2018, OpenRefine contributors
* Copyright (C) 2018, 2020 OpenRefine contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -26,17 +26,25 @@
******************************************************************************/
package com.google.refine.importing;
import static org.mockito.Mockito.when;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;
import java.io.File;
import java.io.IOException;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.util.LinkedList;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import okhttp3.HttpUrl;
import okhttp3.mockwebserver.MockResponse;
import okhttp3.mockwebserver.MockWebServer;
import org.apache.commons.collections.IteratorUtils;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.mime.MultipartEntityBuilder;
@ -50,9 +58,9 @@ import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.google.refine.ProjectMetadata;
import com.google.refine.importers.ImporterTest;
import com.google.refine.importers.tree.TreeImportingParserBase;
import com.google.refine.importing.ImportingJob;
import com.google.refine.importing.ImportingUtilities;
import com.google.refine.importers.ImportingParserBase;
import com.google.refine.importers.SeparatorBasedImporter;
import com.google.refine.importing.ImportingUtilities.Progress;
import com.google.refine.util.JSONUtilities;
import com.google.refine.util.ParsingUtilities;
import com.google.refine.util.TestUtils;
@ -60,8 +68,6 @@ import com.google.refine.util.TestUtils;
import javax.servlet.ServletInputStream;
import javax.servlet.http.HttpServletRequest;
import static org.mockito.Mockito.when;
public class ImportingUtilitiesTests extends ImporterTest {
@Override
@ -80,24 +86,12 @@ public class ImportingUtilitiesTests extends ImporterTest {
Assert.assertEquals(pm.getEncoding(), "UTF-8");
Assert.assertTrue(pm.getTags().length == 0);
}
@Test(expectedExceptions=IllegalArgumentException.class)
public void testZipSlip() throws IOException {
File tempDir = TestUtils.createTempDirectory("openrefine-zip-slip-test");
File tempDir = TestUtils.createTempDirectory("openrefine-zip-slip-test");
// For CVE-2018-19859, issue #1840
ImportingUtilities.allocateFile(tempDir, "../../tmp/script.sh");
}
private ObjectNode getNestedOptions(ImportingJob job, TreeImportingParserBase parser) {
ObjectNode options = parser.createParserUIInitializationData(
job, new LinkedList<>(), "text/json");
ArrayNode path = ParsingUtilities.mapper.createArrayNode();
path.add("results");
path.add("result");
JSONUtilities.safePut(options, "recordPath", path);
return options;
ImportingUtilities.allocateFile(tempDir, "../../tmp/script.sh");
}
@Test
@ -154,6 +148,8 @@ public class ImportingUtilitiesTests extends ImporterTest {
Assert.fail("No Exception was thrown");
} catch (Exception exception) {
Assert.assertEquals(MESSAGE, exception.getMessage());
} finally {
server.close();
}
}
@ -171,4 +167,91 @@ public class ImportingUtilitiesTests extends ImporterTest {
}
}
/**
* This tests both exploding a zip archive into it's constituent files
* as well as importing them all (both) and making sure that the
* recording of archive names and file names works correctly.
*
* It's kind of a lot to have in one test, but it's a sequence
* of steps that need to be done in order.
*
* @throws IOException
*/
@SuppressWarnings("unchecked")
@Test
public void importArchive() throws IOException{
String filename = "movies.zip";
String filepath = ClassLoader.getSystemResource(filename).getPath();
// Make a copy in our data directory where it's expected
File tmp = File.createTempFile("openrefine-test-movies", ".zip", job.getRawDataDir());
tmp.deleteOnExit();
FileUtils.copyFile(new File(filepath), tmp);
Progress dummyProgress = new Progress() {
@Override
public void setProgress(String message, int percent) {}
@Override
public boolean isCanceled() {
return false;
}
};
ArrayNode fileRecords = ParsingUtilities.mapper.createArrayNode();
ObjectNode fileRecord = ParsingUtilities.mapper.createObjectNode();
JSONUtilities.safePut(fileRecord, "origin", "upload");
JSONUtilities.safePut(fileRecord, "declaredEncoding", "UTF-8");
JSONUtilities.safePut(fileRecord, "declaredMimeType", "application/x-zip-compressed");
JSONUtilities.safePut(fileRecord, "fileName", filename);
JSONUtilities.safePut(fileRecord, "location", tmp.getName());
assertTrue(ImportingUtilities.postProcessRetrievedFile(job.getRawDataDir(), tmp, fileRecord, fileRecords, dummyProgress));
assertEquals(fileRecords.size(), 2);
assertEquals(fileRecords.get(0).get("fileName").asText(), "movies-condensed.tsv");
assertEquals(fileRecords.get(0).get("archiveFileName").asText(), "movies.zip");
assertEquals(fileRecords.get(1).get("fileName").asText(), "movies.tsv");
ObjectNode options = ParsingUtilities.mapper.createObjectNode();
JSONUtilities.safePut(options, "includeArchiveFileName", true);
JSONUtilities.safePut(options, "includeFileSources", true);
ImportingParserBase parser = new SeparatorBasedImporter();
List<Exception> exceptions = new ArrayList<Exception>();
parser.parse(
project,
metadata,
job,
IteratorUtils.toList(fileRecords.iterator()),
"tsv",
-1,
options,
exceptions
);
assertEquals(exceptions.size(), 0);
project.update();
assertEquals(project.columnModel.columns.get(0).getName(),"Archive");
assertEquals(project.rows.get(0).getCell(0).getValue(),"movies.zip");
assertEquals(project.columnModel.columns.get(1).getName(),"File");
assertEquals(project.rows.get(0).getCell(1).getValue(),"movies-condensed.tsv");
assertEquals(project.columnModel.columns.get(2).getName(),"name");
assertEquals(project.rows.get(0).getCell(2).getValue(),"Wayne's World");
// Make sure we imported both files contained in the zip file
assertEquals(project.rows.size(), 252);
ArrayNode importOptionsArray = metadata.getImportOptionMetadata();
assertEquals(importOptionsArray.size(), 2);
ObjectNode importOptions = (ObjectNode)importOptionsArray.get(0);
assertEquals(importOptions.get("archiveFileName").asText(), "movies.zip");
assertEquals(importOptions.get("fileSource").asText(), "movies-condensed.tsv");
assertTrue(importOptions.get("includeFileSources").asBoolean());
assertTrue(importOptions.get("includeArchiveFileName").asBoolean());
importOptions = (ObjectNode)importOptionsArray.get(1);
assertEquals(importOptions.get("fileSource").asText(), "movies.tsv");
assertEquals(importOptions.get("archiveFileName").asText(), "movies.zip");
}
}

View File

@ -63,7 +63,6 @@ import com.google.refine.model.AbstractOperation;
import com.google.refine.model.ModelException;
import com.google.refine.model.Project;
import com.google.refine.operations.OperationRegistry;
import com.google.refine.operations.cell.KeyValueColumnizeOperation;
import com.google.refine.process.Process;
import com.google.refine.util.ParsingUtilities;
import com.google.refine.util.TestUtils;
@ -202,7 +201,7 @@ public class KeyValueColumnizeTests extends RefineTest {
+ "price,3.1\n";
prepareOptions(",", 20, 0, 0, 1, false, false);
List<Exception> exceptions = new ArrayList<Exception>();
importer.parseOneFile(project, pm, job, "filesource", "archivefile", new StringReader(csv), -1, options, exceptions);
importer.parseOneFile(project, pm, job, "filesource", new StringReader(csv), -1, options, exceptions);
project.update();
ProjectManager.singleton.registerProject(project, pm);