Improve Google Sheets upload (#2784)
* Support more than 26 columns Google Sheets default to just 26 columns (A-Z) and we need to explicitly add more columns if we need them. Fixes #2760 * Improve Google Sheets upload - upload in chunks instead of serializing the entire document at once - Free up resources as we go - stop if an error occurs - reduce batch size to try and stay in 10MB request size limit (but need a more dynamic way to do this probably for very wide sheets or sheets with large values) * Add basic test and do some cleanup - add test for columns > 26 - refactor to allow testing and not depend on unnecessary fields - add i18n TODO for translating spreadsheet description * Preserve cell data types Fixes #2785 - integers and floats are sent as Doubles - bools as Boolean - DateTimes as Strings - nulls as the empty string - anything else as Strings using .toString() * Fix LGTM-flagged potentially null pointer dereference
This commit is contained in:
parent
de309158c9
commit
3aa610d6aa
@ -2,6 +2,7 @@ package com.google.refine.extension.gdata;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@ -11,6 +12,7 @@ import org.slf4j.LoggerFactory;
|
||||
import com.fasterxml.jackson.databind.JsonNode;
|
||||
import com.google.api.services.sheets.v4.Sheets;
|
||||
import com.google.api.services.sheets.v4.model.AppendCellsRequest;
|
||||
import com.google.api.services.sheets.v4.model.AppendDimensionRequest;
|
||||
import com.google.api.services.sheets.v4.model.BatchUpdateSpreadsheetRequest;
|
||||
import com.google.api.services.sheets.v4.model.BatchUpdateSpreadsheetResponse;
|
||||
import com.google.api.services.sheets.v4.model.ExtendedValue;
|
||||
@ -18,22 +20,20 @@ import com.google.api.services.sheets.v4.model.Request;
|
||||
import com.google.api.services.sheets.v4.model.RowData;
|
||||
import com.google.refine.exporters.TabularSerializer;
|
||||
|
||||
final class SpreadsheetSerializer implements TabularSerializer {
|
||||
class SpreadsheetSerializer implements TabularSerializer {
|
||||
static final Logger logger = LoggerFactory.getLogger("SpreadsheetSerializer");
|
||||
|
||||
private static final int BATCH_SIZE = 1000;
|
||||
private static final int BATCH_SIZE = 500;
|
||||
|
||||
private Sheets service;
|
||||
private String spreadsheetId;
|
||||
private List<Exception> exceptions;
|
||||
|
||||
// A list of updates to apply to the spreadsheet.
|
||||
private List<Request> requests = new ArrayList<>();
|
||||
protected List<RowData> rows = new ArrayList<>();
|
||||
|
||||
private Request batchRequest = null;
|
||||
private int row = 0;
|
||||
|
||||
private List<RowData> rows;
|
||||
// FIXME: This is fragile. Can we find out how many columns we have rather than assuming
|
||||
// it'll always be the default A-Z?
|
||||
private int maxColumns = 26;
|
||||
|
||||
SpreadsheetSerializer(Sheets service, String spreadsheetId, List<Exception> exceptions) {
|
||||
this.service = service;
|
||||
@ -48,33 +48,13 @@ final class SpreadsheetSerializer implements TabularSerializer {
|
||||
|
||||
@Override
|
||||
public void endFile() {
|
||||
if (batchRequest != null) {
|
||||
if (rows.size() > 0) {
|
||||
sendBatch(rows);
|
||||
}
|
||||
|
||||
BatchUpdateSpreadsheetRequest requestBody = new BatchUpdateSpreadsheetRequest();
|
||||
requestBody.setIncludeSpreadsheetInResponse(false);
|
||||
requestBody.setRequests(requests);
|
||||
|
||||
Sheets.Spreadsheets.BatchUpdate request;
|
||||
try {
|
||||
logger.debug("spreadsheetId: " + spreadsheetId);
|
||||
logger.debug("requestBody:" + requestBody.toString());
|
||||
request = service.spreadsheets().batchUpdate(spreadsheetId, requestBody);
|
||||
|
||||
BatchUpdateSpreadsheetResponse response = request.execute();
|
||||
logger.debug("response:" + response.toPrettyString());
|
||||
} catch (IOException e) {
|
||||
exceptions.add(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addRow(List<CellData> cells, boolean isHeader) {
|
||||
if (batchRequest == null) {
|
||||
batchRequest = new Request();
|
||||
rows = new ArrayList<RowData>(BATCH_SIZE);
|
||||
}
|
||||
List<com.google.api.services.sheets.v4.model.CellData> cellDatas = new ArrayList<>();
|
||||
RowData rowData = new RowData();
|
||||
|
||||
@ -85,10 +65,12 @@ final class SpreadsheetSerializer implements TabularSerializer {
|
||||
|
||||
rowData.setValues(cellDatas);
|
||||
rows.add(rowData);
|
||||
row++;
|
||||
|
||||
if (row % BATCH_SIZE == 0) {
|
||||
if (rows.size() >= BATCH_SIZE) {
|
||||
sendBatch(rows);
|
||||
if (exceptions.size() > 0) {
|
||||
throw new RuntimeException(exceptions.get(0));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -96,10 +78,26 @@ final class SpreadsheetSerializer implements TabularSerializer {
|
||||
com.google.api.services.sheets.v4.model.CellData sheetCellData = new com.google.api.services.sheets.v4.model.CellData();
|
||||
|
||||
ExtendedValue ev = new ExtendedValue();
|
||||
if (cellData == null || cellData.value == null) {
|
||||
ev.setStringValue("");
|
||||
if (cellData != null) {
|
||||
if (cellData.value instanceof String) {
|
||||
ev.setStringValue((String) cellData.value);
|
||||
} else if (cellData.value instanceof Integer) {
|
||||
ev.setNumberValue(new Double((Integer) cellData.value));
|
||||
} else if (cellData.value instanceof Double) {
|
||||
ev.setNumberValue((Double) cellData.value);
|
||||
} else if (cellData.value instanceof OffsetDateTime) {
|
||||
// supposedly started internally as a double, but not sure how to transform correctly
|
||||
// ev.setNumberValue((Double) cellData.value);
|
||||
ev.setStringValue(cellData.value.toString());
|
||||
} else if (cellData.value instanceof Boolean) {
|
||||
ev.setBoolValue((Boolean) cellData.value);
|
||||
} else if (cellData.value == null) {
|
||||
ev.setStringValue("");
|
||||
} else {
|
||||
ev.setStringValue(cellData.value.toString());
|
||||
}
|
||||
} else {
|
||||
ev.setStringValue(cellData.value.toString());
|
||||
ev.setStringValue("");
|
||||
}
|
||||
|
||||
sheetCellData.setUserEnteredValue(ev);
|
||||
@ -108,13 +106,54 @@ final class SpreadsheetSerializer implements TabularSerializer {
|
||||
}
|
||||
|
||||
private void sendBatch(List<RowData> rows) {
|
||||
List<Request> requests = prepareBatch(rows);
|
||||
|
||||
// FIXME: We have a 10MB cap on the request size, but I'm not sure we've got a good
|
||||
// way to quickly tell how big our request is. Just reduce row count for now.
|
||||
BatchUpdateSpreadsheetRequest requestBody = new BatchUpdateSpreadsheetRequest();
|
||||
requestBody.setIncludeSpreadsheetInResponse(false);
|
||||
requestBody.setRequests(requests);
|
||||
|
||||
Sheets.Spreadsheets.BatchUpdate request;
|
||||
try {
|
||||
logger.debug("spreadsheetId: " + spreadsheetId);
|
||||
// logger.debug("requestBody:" + requestBody.toString());
|
||||
request = service.spreadsheets().batchUpdate(spreadsheetId, requestBody);
|
||||
BatchUpdateSpreadsheetResponse response = request.execute();
|
||||
logger.debug("response:" + response.toPrettyString());
|
||||
} catch (IOException e) {
|
||||
exceptions.add(e);
|
||||
} finally {
|
||||
requestBody.clear();
|
||||
requests.clear();
|
||||
rows.clear();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
protected List<Request> prepareBatch(List<RowData> rows) {
|
||||
List<Request> requests = new ArrayList<>();
|
||||
|
||||
// If this row is wider than our sheet, add columns to the sheet
|
||||
int columns = rows.get(0).getValues().size();
|
||||
if (columns > maxColumns) {
|
||||
AppendDimensionRequest adr = new AppendDimensionRequest();
|
||||
adr.setDimension("COLUMNS");
|
||||
adr.setLength(columns - maxColumns);
|
||||
maxColumns = columns;
|
||||
Request req = new Request();
|
||||
req.setAppendDimension(adr);
|
||||
requests.add(req);
|
||||
}
|
||||
AppendCellsRequest acr = new AppendCellsRequest();
|
||||
acr.setFields("*");
|
||||
acr.setSheetId(0);
|
||||
acr.setRows(rows);
|
||||
batchRequest.setAppendCells(acr);
|
||||
|
||||
requests.add(batchRequest);
|
||||
Request request = new Request();
|
||||
request.setAppendCells(acr);
|
||||
requests.add(request);
|
||||
return requests;
|
||||
}
|
||||
|
||||
public String getUrl() throws UnsupportedEncodingException {
|
||||
|
@ -195,6 +195,7 @@ public class UploadCommand extends Command {
|
||||
try {
|
||||
File body = new File();
|
||||
body.setName(name);
|
||||
// TODO: Internationalize (i18n)
|
||||
body.setDescription("Spreadsheet uploaded from OpenRefine project: " + name);
|
||||
body.setMimeType("application/vnd.google-apps.spreadsheet");
|
||||
|
||||
|
@ -0,0 +1,120 @@
|
||||
package com.google.refine.extension.gdata;
|
||||
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.testng.Assert.assertEquals;
|
||||
import static org.testng.Assert.fail;
|
||||
|
||||
import java.io.StringWriter;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.time.ZoneId;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.testng.annotations.AfterMethod;
|
||||
import org.testng.annotations.BeforeMethod;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.JsonNode;
|
||||
import com.google.api.services.sheets.v4.Sheets;
|
||||
import com.google.api.services.sheets.v4.model.AppendDimensionRequest;
|
||||
import com.google.api.services.sheets.v4.model.ExtendedValue;
|
||||
import com.google.api.services.sheets.v4.model.Request;
|
||||
import com.google.api.services.sheets.v4.model.RowData;
|
||||
import com.google.refine.exporters.TabularSerializer.CellData;
|
||||
|
||||
public class SpreadsheetSerializerTests {
|
||||
|
||||
private class SpreadsheetSerializerStub extends SpreadsheetSerializer {
|
||||
|
||||
SpreadsheetSerializerStub(Sheets service, String spreadsheetId, List<Exception> exceptions) {
|
||||
super(service, spreadsheetId, exceptions);
|
||||
}
|
||||
|
||||
protected List<RowData> getRows() {
|
||||
return rows;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// dependencies
|
||||
StringWriter writer;
|
||||
JsonNode options = null;
|
||||
Sheets service;
|
||||
List<Exception> exceptions = new ArrayList<>();
|
||||
|
||||
// System Under Test
|
||||
SpreadsheetSerializerStub SUT;
|
||||
|
||||
@BeforeMethod
|
||||
public void SetUp() {
|
||||
service = mock(Sheets.class);
|
||||
SUT = new SpreadsheetSerializerStub(service, "spreadsheet1", exceptions);
|
||||
writer = new StringWriter();
|
||||
}
|
||||
|
||||
@AfterMethod
|
||||
public void TearDown() {
|
||||
SUT = null;
|
||||
service = null;
|
||||
exceptions.clear();
|
||||
writer = null;
|
||||
options = null;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test30columns() {
|
||||
SUT.startFile(options); // options is null, but unused
|
||||
List<CellData> cells = new ArrayList<>();
|
||||
for (int i = 0; i < 30; i++) {
|
||||
String colnum = Integer.toString(i);
|
||||
CellData cell = new CellData("col" + colnum, "text" + colnum, "text" + colnum, null);
|
||||
cells.add(cell);
|
||||
}
|
||||
SUT.addRow(cells, true);
|
||||
SUT.addRow(cells, false);
|
||||
|
||||
List<Request> requests = SUT.prepareBatch(SUT.getRows());
|
||||
assertEquals(requests.size(), 2);
|
||||
for (Request request : requests) {
|
||||
if (request.getAppendDimension() instanceof AppendDimensionRequest) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
fail("Failed to find AppendDimensionRequest for columns > 26");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDataTypes() {
|
||||
SUT.startFile(options); // options is null, but unused
|
||||
List<CellData> row = new ArrayList<>();
|
||||
row.add(new CellData("null value", null, "null value", null));
|
||||
row.add(new CellData("string value", "a string", "a string as string", null));
|
||||
row.add(new CellData("integer value", 42, "42", null));
|
||||
row.add(new CellData("double value", new Double(42), "42.0", null));
|
||||
row.add(new CellData("boolean value", true, "true", null));
|
||||
OffsetDateTime now = OffsetDateTime.now(ZoneId.of("Z"));
|
||||
row.add(new CellData("datetime value", now, now.toString(), null));
|
||||
|
||||
SUT.addRow(row, false);
|
||||
|
||||
List<Request> requests = SUT.prepareBatch(SUT.getRows());
|
||||
assertEquals(requests.size(), 1);
|
||||
List<RowData> rows = requests.get(0).getAppendCells().getRows();
|
||||
assertEquals(rows.size(), 1);
|
||||
List<com.google.api.services.sheets.v4.model.CellData> values = rows.get(0).getValues();
|
||||
assertEquals(values.size(), 6);
|
||||
ExtendedValue value = values.get(0).getUserEnteredValue();
|
||||
assertEquals(value.getStringValue(), "");
|
||||
value = values.get(1).getUserEnteredValue();
|
||||
assertEquals(value.getStringValue(), "a string");
|
||||
value = values.get(2).getUserEnteredValue();
|
||||
assertEquals(value.getNumberValue(), new Double(42));
|
||||
value = values.get(3).getUserEnteredValue();
|
||||
assertEquals(value.getNumberValue(), new Double(42));
|
||||
value = values.get(4).getUserEnteredValue();
|
||||
assertEquals(value.getBoolValue(), Boolean.TRUE);
|
||||
value = values.get(5).getUserEnteredValue();
|
||||
assertEquals(value.getStringValue(), now.toString());
|
||||
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user