Improve Google Sheets upload (#2784)
* Support more than 26 columns

  Google Sheets defaults to just 26 columns (A-Z), so we need to explicitly add more columns when we need them. Fixes #2760

* Improve Google Sheets upload
  - upload in chunks instead of serializing the entire document at once
  - free up resources as we go
  - stop if an error occurs
  - reduce the batch size to try to stay within the 10MB request size limit (very wide sheets, or sheets with large values, probably need a more dynamic way to do this)

* Add basic test and do some cleanup
  - add test for columns > 26
  - refactor to allow testing and not depend on unnecessary fields
  - add i18n TODO for translating the spreadsheet description

* Preserve cell data types

  Fixes #2785
  - integers and floats are sent as Doubles
  - bools as Boolean
  - DateTimes as Strings
  - nulls as the empty string
  - anything else as Strings using .toString()

* Fix LGTM-flagged potential null pointer dereference
This commit is contained in:
parent
de309158c9
commit
3aa610d6aa
@ -2,6 +2,7 @@ package com.google.refine.extension.gdata;
|
|||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.UnsupportedEncodingException;
|
import java.io.UnsupportedEncodingException;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@ -11,6 +12,7 @@ import org.slf4j.LoggerFactory;
|
|||||||
import com.fasterxml.jackson.databind.JsonNode;
|
import com.fasterxml.jackson.databind.JsonNode;
|
||||||
import com.google.api.services.sheets.v4.Sheets;
|
import com.google.api.services.sheets.v4.Sheets;
|
||||||
import com.google.api.services.sheets.v4.model.AppendCellsRequest;
|
import com.google.api.services.sheets.v4.model.AppendCellsRequest;
|
||||||
|
import com.google.api.services.sheets.v4.model.AppendDimensionRequest;
|
||||||
import com.google.api.services.sheets.v4.model.BatchUpdateSpreadsheetRequest;
|
import com.google.api.services.sheets.v4.model.BatchUpdateSpreadsheetRequest;
|
||||||
import com.google.api.services.sheets.v4.model.BatchUpdateSpreadsheetResponse;
|
import com.google.api.services.sheets.v4.model.BatchUpdateSpreadsheetResponse;
|
||||||
import com.google.api.services.sheets.v4.model.ExtendedValue;
|
import com.google.api.services.sheets.v4.model.ExtendedValue;
|
||||||
@ -18,22 +20,20 @@ import com.google.api.services.sheets.v4.model.Request;
|
|||||||
import com.google.api.services.sheets.v4.model.RowData;
|
import com.google.api.services.sheets.v4.model.RowData;
|
||||||
import com.google.refine.exporters.TabularSerializer;
|
import com.google.refine.exporters.TabularSerializer;
|
||||||
|
|
||||||
final class SpreadsheetSerializer implements TabularSerializer {
|
class SpreadsheetSerializer implements TabularSerializer {
|
||||||
static final Logger logger = LoggerFactory.getLogger("SpreadsheetSerializer");
|
static final Logger logger = LoggerFactory.getLogger("SpreadsheetSerializer");
|
||||||
|
|
||||||
private static final int BATCH_SIZE = 1000;
|
private static final int BATCH_SIZE = 500;
|
||||||
|
|
||||||
private Sheets service;
|
private Sheets service;
|
||||||
private String spreadsheetId;
|
private String spreadsheetId;
|
||||||
private List<Exception> exceptions;
|
private List<Exception> exceptions;
|
||||||
|
|
||||||
// A list of updates to apply to the spreadsheet.
|
protected List<RowData> rows = new ArrayList<>();
|
||||||
private List<Request> requests = new ArrayList<>();
|
|
||||||
|
|
||||||
private Request batchRequest = null;
|
// FIXME: This is fragile. Can we find out how many columns we have rather than assuming
|
||||||
private int row = 0;
|
// it'll always be the default A-Z?
|
||||||
|
private int maxColumns = 26;
|
||||||
private List<RowData> rows;
|
|
||||||
|
|
||||||
SpreadsheetSerializer(Sheets service, String spreadsheetId, List<Exception> exceptions) {
|
SpreadsheetSerializer(Sheets service, String spreadsheetId, List<Exception> exceptions) {
|
||||||
this.service = service;
|
this.service = service;
|
||||||
@ -48,33 +48,13 @@ final class SpreadsheetSerializer implements TabularSerializer {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void endFile() {
|
public void endFile() {
|
||||||
if (batchRequest != null) {
|
if (rows.size() > 0) {
|
||||||
sendBatch(rows);
|
sendBatch(rows);
|
||||||
}
|
}
|
||||||
|
|
||||||
BatchUpdateSpreadsheetRequest requestBody = new BatchUpdateSpreadsheetRequest();
|
|
||||||
requestBody.setIncludeSpreadsheetInResponse(false);
|
|
||||||
requestBody.setRequests(requests);
|
|
||||||
|
|
||||||
Sheets.Spreadsheets.BatchUpdate request;
|
|
||||||
try {
|
|
||||||
logger.debug("spreadsheetId: " + spreadsheetId);
|
|
||||||
logger.debug("requestBody:" + requestBody.toString());
|
|
||||||
request = service.spreadsheets().batchUpdate(spreadsheetId, requestBody);
|
|
||||||
|
|
||||||
BatchUpdateSpreadsheetResponse response = request.execute();
|
|
||||||
logger.debug("response:" + response.toPrettyString());
|
|
||||||
} catch (IOException e) {
|
|
||||||
exceptions.add(e);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void addRow(List<CellData> cells, boolean isHeader) {
|
public void addRow(List<CellData> cells, boolean isHeader) {
|
||||||
if (batchRequest == null) {
|
|
||||||
batchRequest = new Request();
|
|
||||||
rows = new ArrayList<RowData>(BATCH_SIZE);
|
|
||||||
}
|
|
||||||
List<com.google.api.services.sheets.v4.model.CellData> cellDatas = new ArrayList<>();
|
List<com.google.api.services.sheets.v4.model.CellData> cellDatas = new ArrayList<>();
|
||||||
RowData rowData = new RowData();
|
RowData rowData = new RowData();
|
||||||
|
|
||||||
@ -85,10 +65,12 @@ final class SpreadsheetSerializer implements TabularSerializer {
|
|||||||
|
|
||||||
rowData.setValues(cellDatas);
|
rowData.setValues(cellDatas);
|
||||||
rows.add(rowData);
|
rows.add(rowData);
|
||||||
row++;
|
|
||||||
|
|
||||||
if (row % BATCH_SIZE == 0) {
|
if (rows.size() >= BATCH_SIZE) {
|
||||||
sendBatch(rows);
|
sendBatch(rows);
|
||||||
|
if (exceptions.size() > 0) {
|
||||||
|
throw new RuntimeException(exceptions.get(0));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -96,10 +78,26 @@ final class SpreadsheetSerializer implements TabularSerializer {
|
|||||||
com.google.api.services.sheets.v4.model.CellData sheetCellData = new com.google.api.services.sheets.v4.model.CellData();
|
com.google.api.services.sheets.v4.model.CellData sheetCellData = new com.google.api.services.sheets.v4.model.CellData();
|
||||||
|
|
||||||
ExtendedValue ev = new ExtendedValue();
|
ExtendedValue ev = new ExtendedValue();
|
||||||
if (cellData == null || cellData.value == null) {
|
if (cellData != null) {
|
||||||
ev.setStringValue("");
|
if (cellData.value instanceof String) {
|
||||||
|
ev.setStringValue((String) cellData.value);
|
||||||
|
} else if (cellData.value instanceof Integer) {
|
||||||
|
ev.setNumberValue(new Double((Integer) cellData.value));
|
||||||
|
} else if (cellData.value instanceof Double) {
|
||||||
|
ev.setNumberValue((Double) cellData.value);
|
||||||
|
} else if (cellData.value instanceof OffsetDateTime) {
|
||||||
|
// supposedly started internally as a double, but not sure how to transform correctly
|
||||||
|
// ev.setNumberValue((Double) cellData.value);
|
||||||
|
ev.setStringValue(cellData.value.toString());
|
||||||
|
} else if (cellData.value instanceof Boolean) {
|
||||||
|
ev.setBoolValue((Boolean) cellData.value);
|
||||||
|
} else if (cellData.value == null) {
|
||||||
|
ev.setStringValue("");
|
||||||
|
} else {
|
||||||
|
ev.setStringValue(cellData.value.toString());
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
ev.setStringValue(cellData.value.toString());
|
ev.setStringValue("");
|
||||||
}
|
}
|
||||||
|
|
||||||
sheetCellData.setUserEnteredValue(ev);
|
sheetCellData.setUserEnteredValue(ev);
|
||||||
@ -108,13 +106,54 @@ final class SpreadsheetSerializer implements TabularSerializer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private void sendBatch(List<RowData> rows) {
|
private void sendBatch(List<RowData> rows) {
|
||||||
|
List<Request> requests = prepareBatch(rows);
|
||||||
|
|
||||||
|
// FIXME: We have a 10MB cap on the request size, but I'm not sure we've got a good
|
||||||
|
// way to quickly tell how big our request is. Just reduce row count for now.
|
||||||
|
BatchUpdateSpreadsheetRequest requestBody = new BatchUpdateSpreadsheetRequest();
|
||||||
|
requestBody.setIncludeSpreadsheetInResponse(false);
|
||||||
|
requestBody.setRequests(requests);
|
||||||
|
|
||||||
|
Sheets.Spreadsheets.BatchUpdate request;
|
||||||
|
try {
|
||||||
|
logger.debug("spreadsheetId: " + spreadsheetId);
|
||||||
|
// logger.debug("requestBody:" + requestBody.toString());
|
||||||
|
request = service.spreadsheets().batchUpdate(spreadsheetId, requestBody);
|
||||||
|
BatchUpdateSpreadsheetResponse response = request.execute();
|
||||||
|
logger.debug("response:" + response.toPrettyString());
|
||||||
|
} catch (IOException e) {
|
||||||
|
exceptions.add(e);
|
||||||
|
} finally {
|
||||||
|
requestBody.clear();
|
||||||
|
requests.clear();
|
||||||
|
rows.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
protected List<Request> prepareBatch(List<RowData> rows) {
|
||||||
|
List<Request> requests = new ArrayList<>();
|
||||||
|
|
||||||
|
// If this row is wider than our sheet, add columns to the sheet
|
||||||
|
int columns = rows.get(0).getValues().size();
|
||||||
|
if (columns > maxColumns) {
|
||||||
|
AppendDimensionRequest adr = new AppendDimensionRequest();
|
||||||
|
adr.setDimension("COLUMNS");
|
||||||
|
adr.setLength(columns - maxColumns);
|
||||||
|
maxColumns = columns;
|
||||||
|
Request req = new Request();
|
||||||
|
req.setAppendDimension(adr);
|
||||||
|
requests.add(req);
|
||||||
|
}
|
||||||
AppendCellsRequest acr = new AppendCellsRequest();
|
AppendCellsRequest acr = new AppendCellsRequest();
|
||||||
acr.setFields("*");
|
acr.setFields("*");
|
||||||
acr.setSheetId(0);
|
acr.setSheetId(0);
|
||||||
acr.setRows(rows);
|
acr.setRows(rows);
|
||||||
batchRequest.setAppendCells(acr);
|
|
||||||
|
|
||||||
requests.add(batchRequest);
|
Request request = new Request();
|
||||||
|
request.setAppendCells(acr);
|
||||||
|
requests.add(request);
|
||||||
|
return requests;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getUrl() throws UnsupportedEncodingException {
|
public String getUrl() throws UnsupportedEncodingException {
|
||||||
|
@ -195,6 +195,7 @@ public class UploadCommand extends Command {
|
|||||||
try {
|
try {
|
||||||
File body = new File();
|
File body = new File();
|
||||||
body.setName(name);
|
body.setName(name);
|
||||||
|
// TODO: Internationalize (i18n)
|
||||||
body.setDescription("Spreadsheet uploaded from OpenRefine project: " + name);
|
body.setDescription("Spreadsheet uploaded from OpenRefine project: " + name);
|
||||||
body.setMimeType("application/vnd.google-apps.spreadsheet");
|
body.setMimeType("application/vnd.google-apps.spreadsheet");
|
||||||
|
|
||||||
|
@ -0,0 +1,120 @@
|
|||||||
|
package com.google.refine.extension.gdata;
|
||||||
|
|
||||||
|
import static org.mockito.Mockito.mock;
|
||||||
|
import static org.testng.Assert.assertEquals;
|
||||||
|
import static org.testng.Assert.fail;
|
||||||
|
|
||||||
|
import java.io.StringWriter;
|
||||||
|
import java.time.OffsetDateTime;
|
||||||
|
import java.time.ZoneId;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.testng.annotations.AfterMethod;
|
||||||
|
import org.testng.annotations.BeforeMethod;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.JsonNode;
|
||||||
|
import com.google.api.services.sheets.v4.Sheets;
|
||||||
|
import com.google.api.services.sheets.v4.model.AppendDimensionRequest;
|
||||||
|
import com.google.api.services.sheets.v4.model.ExtendedValue;
|
||||||
|
import com.google.api.services.sheets.v4.model.Request;
|
||||||
|
import com.google.api.services.sheets.v4.model.RowData;
|
||||||
|
import com.google.refine.exporters.TabularSerializer.CellData;
|
||||||
|
|
||||||
|
public class SpreadsheetSerializerTests {
|
||||||
|
|
||||||
|
private class SpreadsheetSerializerStub extends SpreadsheetSerializer {
|
||||||
|
|
||||||
|
SpreadsheetSerializerStub(Sheets service, String spreadsheetId, List<Exception> exceptions) {
|
||||||
|
super(service, spreadsheetId, exceptions);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected List<RowData> getRows() {
|
||||||
|
return rows;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// dependencies
|
||||||
|
StringWriter writer;
|
||||||
|
JsonNode options = null;
|
||||||
|
Sheets service;
|
||||||
|
List<Exception> exceptions = new ArrayList<>();
|
||||||
|
|
||||||
|
// System Under Test
|
||||||
|
SpreadsheetSerializerStub SUT;
|
||||||
|
|
||||||
|
@BeforeMethod
|
||||||
|
public void SetUp() {
|
||||||
|
service = mock(Sheets.class);
|
||||||
|
SUT = new SpreadsheetSerializerStub(service, "spreadsheet1", exceptions);
|
||||||
|
writer = new StringWriter();
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterMethod
|
||||||
|
public void TearDown() {
|
||||||
|
SUT = null;
|
||||||
|
service = null;
|
||||||
|
exceptions.clear();
|
||||||
|
writer = null;
|
||||||
|
options = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test30columns() {
|
||||||
|
SUT.startFile(options); // options is null, but unused
|
||||||
|
List<CellData> cells = new ArrayList<>();
|
||||||
|
for (int i = 0; i < 30; i++) {
|
||||||
|
String colnum = Integer.toString(i);
|
||||||
|
CellData cell = new CellData("col" + colnum, "text" + colnum, "text" + colnum, null);
|
||||||
|
cells.add(cell);
|
||||||
|
}
|
||||||
|
SUT.addRow(cells, true);
|
||||||
|
SUT.addRow(cells, false);
|
||||||
|
|
||||||
|
List<Request> requests = SUT.prepareBatch(SUT.getRows());
|
||||||
|
assertEquals(requests.size(), 2);
|
||||||
|
for (Request request : requests) {
|
||||||
|
if (request.getAppendDimension() instanceof AppendDimensionRequest) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fail("Failed to find AppendDimensionRequest for columns > 26");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDataTypes() {
|
||||||
|
SUT.startFile(options); // options is null, but unused
|
||||||
|
List<CellData> row = new ArrayList<>();
|
||||||
|
row.add(new CellData("null value", null, "null value", null));
|
||||||
|
row.add(new CellData("string value", "a string", "a string as string", null));
|
||||||
|
row.add(new CellData("integer value", 42, "42", null));
|
||||||
|
row.add(new CellData("double value", new Double(42), "42.0", null));
|
||||||
|
row.add(new CellData("boolean value", true, "true", null));
|
||||||
|
OffsetDateTime now = OffsetDateTime.now(ZoneId.of("Z"));
|
||||||
|
row.add(new CellData("datetime value", now, now.toString(), null));
|
||||||
|
|
||||||
|
SUT.addRow(row, false);
|
||||||
|
|
||||||
|
List<Request> requests = SUT.prepareBatch(SUT.getRows());
|
||||||
|
assertEquals(requests.size(), 1);
|
||||||
|
List<RowData> rows = requests.get(0).getAppendCells().getRows();
|
||||||
|
assertEquals(rows.size(), 1);
|
||||||
|
List<com.google.api.services.sheets.v4.model.CellData> values = rows.get(0).getValues();
|
||||||
|
assertEquals(values.size(), 6);
|
||||||
|
ExtendedValue value = values.get(0).getUserEnteredValue();
|
||||||
|
assertEquals(value.getStringValue(), "");
|
||||||
|
value = values.get(1).getUserEnteredValue();
|
||||||
|
assertEquals(value.getStringValue(), "a string");
|
||||||
|
value = values.get(2).getUserEnteredValue();
|
||||||
|
assertEquals(value.getNumberValue(), new Double(42));
|
||||||
|
value = values.get(3).getUserEnteredValue();
|
||||||
|
assertEquals(value.getNumberValue(), new Double(42));
|
||||||
|
value = values.get(4).getUserEnteredValue();
|
||||||
|
assertEquals(value.getBoolValue(), Boolean.TRUE);
|
||||||
|
value = values.get(5).getUserEnteredValue();
|
||||||
|
assertEquals(value.getStringValue(), now.toString());
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user