Improve Google Sheets upload (#2784)

* Support more than 26 columns

Google Sheets default to just 26 columns (A-Z) and we need to
explicitly add more columns if we need them.

Fixes #2760

* Improve Google Sheets upload

- upload in chunks instead of serializing the entire document at once
- Free up resources as we go
- stop if an error occurs
- reduce batch size to try and stay in 10MB request size limit
  (but need a more dynamic way to do this probably for very wide
   sheets or sheets with large values)

* Add basic test and do some cleanup

- add test for columns > 26
- refactor to allow testing and not depend on unnecessary fields
- add i18n TODO for translating spreadsheet description

* Preserve cell data types

Fixes #2785
- integers and floats are sent as Doubles
- bools as Boolean
- DateTimes as Strings
- nulls as the empty string
- anything else as Strings using .toString()

* Fix LGTM-flagged potentially null pointer dereference
This commit is contained in:
Tom Morris 2020-06-25 02:18:28 -04:00 committed by GitHub
parent de309158c9
commit 3aa610d6aa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 202 additions and 42 deletions

View File

@ -2,6 +2,7 @@ package com.google.refine.extension.gdata;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.time.OffsetDateTime;
import java.util.ArrayList;
import java.util.List;
@ -11,6 +12,7 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.JsonNode;
import com.google.api.services.sheets.v4.Sheets;
import com.google.api.services.sheets.v4.model.AppendCellsRequest;
import com.google.api.services.sheets.v4.model.AppendDimensionRequest;
import com.google.api.services.sheets.v4.model.BatchUpdateSpreadsheetRequest;
import com.google.api.services.sheets.v4.model.BatchUpdateSpreadsheetResponse;
import com.google.api.services.sheets.v4.model.ExtendedValue;
@ -18,22 +20,20 @@ import com.google.api.services.sheets.v4.model.Request;
import com.google.api.services.sheets.v4.model.RowData;
import com.google.refine.exporters.TabularSerializer;
final class SpreadsheetSerializer implements TabularSerializer {
class SpreadsheetSerializer implements TabularSerializer {
static final Logger logger = LoggerFactory.getLogger("SpreadsheetSerializer");
private static final int BATCH_SIZE = 1000;
private static final int BATCH_SIZE = 500;
private Sheets service;
private String spreadsheetId;
private List<Exception> exceptions;
// A list of updates to apply to the spreadsheet.
private List<Request> requests = new ArrayList<>();
protected List<RowData> rows = new ArrayList<>();
private Request batchRequest = null;
private int row = 0;
private List<RowData> rows;
// FIXME: This is fragile. Can we find out how many columns we have rather than assuming
// it'll always be the default A-Z?
private int maxColumns = 26;
SpreadsheetSerializer(Sheets service, String spreadsheetId, List<Exception> exceptions) {
this.service = service;
@ -48,33 +48,13 @@ final class SpreadsheetSerializer implements TabularSerializer {
@Override
public void endFile() {
if (batchRequest != null) {
if (rows.size() > 0) {
sendBatch(rows);
}
BatchUpdateSpreadsheetRequest requestBody = new BatchUpdateSpreadsheetRequest();
requestBody.setIncludeSpreadsheetInResponse(false);
requestBody.setRequests(requests);
Sheets.Spreadsheets.BatchUpdate request;
try {
logger.debug("spreadsheetId: " + spreadsheetId);
logger.debug("requestBody:" + requestBody.toString());
request = service.spreadsheets().batchUpdate(spreadsheetId, requestBody);
BatchUpdateSpreadsheetResponse response = request.execute();
logger.debug("response:" + response.toPrettyString());
} catch (IOException e) {
exceptions.add(e);
}
}
}
@Override
public void addRow(List<CellData> cells, boolean isHeader) {
if (batchRequest == null) {
batchRequest = new Request();
rows = new ArrayList<RowData>(BATCH_SIZE);
}
List<com.google.api.services.sheets.v4.model.CellData> cellDatas = new ArrayList<>();
RowData rowData = new RowData();
@ -85,10 +65,12 @@ final class SpreadsheetSerializer implements TabularSerializer {
rowData.setValues(cellDatas);
rows.add(rowData);
row++;
if (row % BATCH_SIZE == 0) {
if (rows.size() >= BATCH_SIZE) {
sendBatch(rows);
if (exceptions.size() > 0) {
throw new RuntimeException(exceptions.get(0));
}
}
}
@ -96,10 +78,26 @@ final class SpreadsheetSerializer implements TabularSerializer {
com.google.api.services.sheets.v4.model.CellData sheetCellData = new com.google.api.services.sheets.v4.model.CellData();
ExtendedValue ev = new ExtendedValue();
if (cellData == null || cellData.value == null) {
ev.setStringValue("");
if (cellData != null) {
if (cellData.value instanceof String) {
ev.setStringValue((String) cellData.value);
} else if (cellData.value instanceof Integer) {
ev.setNumberValue(new Double((Integer) cellData.value));
} else if (cellData.value instanceof Double) {
ev.setNumberValue((Double) cellData.value);
} else if (cellData.value instanceof OffsetDateTime) {
// supposedly started internally as a double, but not sure how to transform correctly
// ev.setNumberValue((Double) cellData.value);
ev.setStringValue(cellData.value.toString());
} else if (cellData.value instanceof Boolean) {
ev.setBoolValue((Boolean) cellData.value);
} else if (cellData.value == null) {
ev.setStringValue("");
} else {
ev.setStringValue(cellData.value.toString());
}
} else {
ev.setStringValue(cellData.value.toString());
ev.setStringValue("");
}
sheetCellData.setUserEnteredValue(ev);
@ -108,13 +106,54 @@ final class SpreadsheetSerializer implements TabularSerializer {
}
private void sendBatch(List<RowData> rows) {
List<Request> requests = prepareBatch(rows);
// FIXME: We have a 10MB cap on the request size, but I'm not sure we've got a good
// way to quickly tell how big our request is. Just reduce row count for now.
BatchUpdateSpreadsheetRequest requestBody = new BatchUpdateSpreadsheetRequest();
requestBody.setIncludeSpreadsheetInResponse(false);
requestBody.setRequests(requests);
Sheets.Spreadsheets.BatchUpdate request;
try {
logger.debug("spreadsheetId: " + spreadsheetId);
// logger.debug("requestBody:" + requestBody.toString());
request = service.spreadsheets().batchUpdate(spreadsheetId, requestBody);
BatchUpdateSpreadsheetResponse response = request.execute();
logger.debug("response:" + response.toPrettyString());
} catch (IOException e) {
exceptions.add(e);
} finally {
requestBody.clear();
requests.clear();
rows.clear();
}
}
protected List<Request> prepareBatch(List<RowData> rows) {
List<Request> requests = new ArrayList<>();
// If this row is wider than our sheet, add columns to the sheet
int columns = rows.get(0).getValues().size();
if (columns > maxColumns) {
AppendDimensionRequest adr = new AppendDimensionRequest();
adr.setDimension("COLUMNS");
adr.setLength(columns - maxColumns);
maxColumns = columns;
Request req = new Request();
req.setAppendDimension(adr);
requests.add(req);
}
AppendCellsRequest acr = new AppendCellsRequest();
acr.setFields("*");
acr.setSheetId(0);
acr.setRows(rows);
batchRequest.setAppendCells(acr);
requests.add(batchRequest);
Request request = new Request();
request.setAppendCells(acr);
requests.add(request);
return requests;
}
public String getUrl() throws UnsupportedEncodingException {

View File

@ -195,6 +195,7 @@ public class UploadCommand extends Command {
try {
File body = new File();
body.setName(name);
// TODO: Internationalize (i18n)
body.setDescription("Spreadsheet uploaded from OpenRefine project: " + name);
body.setMimeType("application/vnd.google-apps.spreadsheet");

View File

@ -0,0 +1,120 @@
package com.google.refine.extension.gdata;
import static org.mockito.Mockito.mock;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.fail;
import java.io.StringWriter;
import java.time.OffsetDateTime;
import java.time.ZoneId;
import java.util.ArrayList;
import java.util.List;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
import com.fasterxml.jackson.databind.JsonNode;
import com.google.api.services.sheets.v4.Sheets;
import com.google.api.services.sheets.v4.model.AppendDimensionRequest;
import com.google.api.services.sheets.v4.model.ExtendedValue;
import com.google.api.services.sheets.v4.model.Request;
import com.google.api.services.sheets.v4.model.RowData;
import com.google.refine.exporters.TabularSerializer.CellData;
public class SpreadsheetSerializerTests {
private class SpreadsheetSerializerStub extends SpreadsheetSerializer {
SpreadsheetSerializerStub(Sheets service, String spreadsheetId, List<Exception> exceptions) {
super(service, spreadsheetId, exceptions);
}
protected List<RowData> getRows() {
return rows;
}
}
// dependencies
StringWriter writer;
JsonNode options = null;
Sheets service;
List<Exception> exceptions = new ArrayList<>();
// System Under Test
SpreadsheetSerializerStub SUT;
@BeforeMethod
public void SetUp() {
service = mock(Sheets.class);
SUT = new SpreadsheetSerializerStub(service, "spreadsheet1", exceptions);
writer = new StringWriter();
}
@AfterMethod
public void TearDown() {
SUT = null;
service = null;
exceptions.clear();
writer = null;
options = null;
}
@Test
public void test30columns() {
SUT.startFile(options); // options is null, but unused
List<CellData> cells = new ArrayList<>();
for (int i = 0; i < 30; i++) {
String colnum = Integer.toString(i);
CellData cell = new CellData("col" + colnum, "text" + colnum, "text" + colnum, null);
cells.add(cell);
}
SUT.addRow(cells, true);
SUT.addRow(cells, false);
List<Request> requests = SUT.prepareBatch(SUT.getRows());
assertEquals(requests.size(), 2);
for (Request request : requests) {
if (request.getAppendDimension() instanceof AppendDimensionRequest) {
return;
}
}
fail("Failed to find AppendDimensionRequest for columns > 26");
}
@Test
public void testDataTypes() {
SUT.startFile(options); // options is null, but unused
List<CellData> row = new ArrayList<>();
row.add(new CellData("null value", null, "null value", null));
row.add(new CellData("string value", "a string", "a string as string", null));
row.add(new CellData("integer value", 42, "42", null));
row.add(new CellData("double value", new Double(42), "42.0", null));
row.add(new CellData("boolean value", true, "true", null));
OffsetDateTime now = OffsetDateTime.now(ZoneId.of("Z"));
row.add(new CellData("datetime value", now, now.toString(), null));
SUT.addRow(row, false);
List<Request> requests = SUT.prepareBatch(SUT.getRows());
assertEquals(requests.size(), 1);
List<RowData> rows = requests.get(0).getAppendCells().getRows();
assertEquals(rows.size(), 1);
List<com.google.api.services.sheets.v4.model.CellData> values = rows.get(0).getValues();
assertEquals(values.size(), 6);
ExtendedValue value = values.get(0).getUserEnteredValue();
assertEquals(value.getStringValue(), "");
value = values.get(1).getUserEnteredValue();
assertEquals(value.getStringValue(), "a string");
value = values.get(2).getUserEnteredValue();
assertEquals(value.getNumberValue(), new Double(42));
value = values.get(3).getUserEnteredValue();
assertEquals(value.getNumberValue(), new Double(42));
value = values.get(4).getUserEnteredValue();
assertEquals(value.getBoolValue(), Boolean.TRUE);
value = values.get(5).getUserEnteredValue();
assertEquals(value.getStringValue(), now.toString());
}
}