diff --git a/extensions/gdata/src/com/google/refine/extension/gdata/GDataImporter.java b/extensions/gdata/src/com/google/refine/extension/gdata/GDataImporter.java index 5e72014bc..7e1284fa5 100644 --- a/extensions/gdata/src/com/google/refine/extension/gdata/GDataImporter.java +++ b/extensions/gdata/src/com/google/refine/extension/gdata/GDataImporter.java @@ -32,10 +32,17 @@ import java.io.IOException; import java.io.Serializable; import java.net.MalformedURLException; import java.net.URL; +import java.net.URLEncoder; import java.util.ArrayList; import java.util.List; import java.util.Properties; +import java.util.Scanner; +import java.util.regex.MatchResult; +import java.util.regex.Pattern; +import com.google.gdata.client.GoogleService; +import com.google.gdata.client.Service.GDataRequest; +import com.google.gdata.client.Service.GDataRequest.RequestType; import com.google.gdata.client.spreadsheet.CellQuery; import com.google.gdata.client.spreadsheet.FeedURLFactory; import com.google.gdata.client.spreadsheet.SpreadsheetService; @@ -47,6 +54,7 @@ import com.google.gdata.data.spreadsheet.SpreadsheetEntry; import com.google.gdata.data.spreadsheet.SpreadsheetFeed; import com.google.gdata.data.spreadsheet.WorksheetEntry; import com.google.gdata.data.spreadsheet.WorksheetFeed; +import com.google.gdata.util.ContentType; import com.google.gdata.util.InvalidEntryException; import com.google.gdata.util.ServiceException; import com.google.refine.ProjectMetadata; @@ -72,22 +80,14 @@ public class GDataImporter implements UrlImporter { private FeedURLFactory factory; public GDataImporter() { - // Careful - this is done at server init time and is shared by everyone + // Careful - this constructor is called at server init time + // and is shared by everyone. 
factory = FeedURLFactory.getDefault(); } @Override public void read(URL url, Project project, ProjectMetadata metadata, Properties options) throws Exception { - // Start fresh for each read so that we're not caching authorization or - // anything - SpreadsheetService service = new SpreadsheetService( - SERVICE_APP_NAME); - - // String token = TokenCookie.getToken(request); - // if (token != null) { - // service.setAuthSubToken(token); - // } int ignoreLines = ImporterUtilities.getIntegerOption("ignore", options, -1); int headerLines = ImporterUtilities.getIntegerOption("header-lines", options, 1); @@ -103,7 +103,31 @@ public class GDataImporter implements UrlImporter { // TODO: Put this in a namespace? metadata.setCustomMetadata("source-url", url.toExternalForm()); - String spreadsheetKey = getKey(url); + // Start fresh for each read so that we're not caching authorization or + // anything + if (isSpreadsheetURL(url)) { + importSpreadsheet(url, project, ignoreLines, headerLines, limit, + dataStart, guessValueType); + } else if (isFusionTableURL(url)) { + importFusionTable(url, project, ignoreLines, headerLines, limit, + dataStart, guessValueType); + } else { + // should never happen (famous last words) + throw new IllegalArgumentException( + "Got invalid format URL in GDataImporter.read()"); + } + } + + private void importSpreadsheet(URL url, Project project, int ignoreLines, + int headerLines, int limit, int dataStart, boolean guessValueType) + throws MalformedURLException, IOException, ServiceException, + Exception { + SpreadsheetService service = new SpreadsheetService(SERVICE_APP_NAME); + // String token = TokenCookie.getToken(request); + // if (token != null) { + // service.setAuthSubToken(token); + // } + String spreadsheetKey = getSpreadsheetKey(url); WorksheetEntry worksheet; try { worksheet = getWorksheetEntries(service, spreadsheetKey).get(0); @@ -190,6 +214,96 @@ public class GDataImporter implements UrlImporter { project.rows.add(row); } } + + private 
void importFusionTable(URL url, Project project, int ignoreLines,
+            int headerLines, int limit, int dataStart, boolean guessValueType)
+            throws MalformedURLException, IOException, ServiceException,
+            Exception {
+        // Downloads the table named by the URL's "dsrcid" parameter through
+        // the Fusion Tables query API and appends its columns and rows to
+        // the project.
+        //   limit          - added to the query as "limit" when > 0
+        //   dataStart      - added to the query as "offset" (negative values
+        //                    are treated as 0)
+        //   guessValueType - when true, cell text is passed through
+        //                    ImporterUtilities.parseCellValue()
+        //   ignoreLines, headerLines - not used by this method
+        GoogleService service = new GoogleService("fusiontables", SERVICE_APP_NAME);
+        // String token = TokenCookie.getToken(request);
+        // if (token != null) {
+        // service.setAuthSubToken(token);
+        // }
+        String tableId = getFusionTableKey(url);
+        // The id is concatenated into the query text below, so refuse a
+        // missing id or one containing whitespace (which could append extra
+        // clauses to the query).
+        if (tableId == null || !tableId.matches("\\S+")) {
+            throw new IllegalArgumentException(
+                    "Unable to find a valid Fusion Table id in URL " + url);
+        }
+
+        final String SERVICE_URL =
+            "http://www.google.com/fusiontables/api/query";
+        // A negative offset is never meaningful, so clamp it to zero.
+        final String selectQuery = "select * from " + tableId
+            + " offset " + Math.max(dataStart, 0)
+            + (limit > 0 ? (" limit " + limit) : "");
+
+        URL queryUrl = new URL(
+            SERVICE_URL + "?sql=" + URLEncoder.encode(selectQuery, "UTF-8"));
+        GDataRequest queryRequest = service.getRequestFactory().getRequest(
+            RequestType.QUERY, queryUrl, ContentType.TEXT_PLAIN);
+        queryRequest.execute();
+
+        Scanner scanner = new Scanner(queryRequest.getResponseStream(), "UTF-8");
+        try {
+            // TODO: Just use the first row of data as column headers for now
+            List<String> columnHeaders = getTableRow(scanner);
+            if (columnHeaders == null) {
+                // Empty response: there is nothing to import.
+                return;
+            }
+
+            // Create columns
+            int columnCount = columnHeaders.size();
+            project.columnModel.setMaxCellIndex(columnCount);
+            int index = 0;
+            for (String name : columnHeaders) {
+                Column column = new Column(index, name + " " + index);
+                project.columnModel.columns.add(column);
+                index++;
+            }
+
+            // Create data rows & cells
+            // NOTE(review): the header row is also imported as the first data
+            // row, exactly as before - confirm that this is intended.
+            List<String> values = columnHeaders;
+            while (values != null) {
+                Row row = new Row(columnCount);
+                for (String valString : values) {
+                    valString = valString.trim();
+                    if (ExpressionUtils.isNonBlankData(valString)) {
+                        Serializable value = guessValueType
+                                ? ImporterUtilities.parseCellValue(valString)
+                                : valString;
+                        row.cells.add(new Cell(value, null));
+                    } else {
+                        row.cells.add(null);
+                    }
+                }
+                // Add the finished row once, then advance to the next record.
+                project.rows.add(row);
+                values = getTableRow(scanner);
+            }
+        } finally {
+            // Closing the scanner also closes the response stream it wraps.
+            scanner.close();
+        }
+    }
+
+    /**
+     * CSV values are terminated by comma, end-of-line or end-of-input and
+     * consist either of plain text without commas or quotes, or a quoted
+     * expression, where inner quotes are escaped by doubling.
+     */
+    private static final Pattern CSV_VALUE_PATTERN =
+        Pattern.compile("([^,\\r\\n\"]*|\"(([^\"]*\"\")*[^\"]*)\")(,|\\r?\\n|\\z)");
+
+    /**
+     * Reads the next CSV record from the scanner.
+     *
+     * @param scanner source positioned at the start of a record
+     * @return the decoded values of the record, or null when the input is
+     *         exhausted
+     */
+    private List<String> getTableRow(Scanner scanner) {
+        if (!scanner.hasNextLine()) {
+            return null;
+        }
+
+        List<String> result = new ArrayList<String>();
+        while (scanner.hasNextLine()) {
+            if (scanner.findWithinHorizon(CSV_VALUE_PATTERN, 0) == null) {
+                // No further value could be parsed; scanner.match() would
+                // throw IllegalStateException here.
+                break;
+            }
+            MatchResult match = scanner.match();
+            String quotedString = match.group(2);
+            String decoded = quotedString == null ? match.group(1)
+                    : quotedString.replace("\"\"", "\"");
+            result.add(decoded);
+            // Anything other than a comma terminator ends the record.
+            if (!match.group(4).equals(",")) {
+                break;
+            }
+        }
+        return result.isEmpty() ? null : result;
+    }
 
     /**
      * Retrieves the spreadsheets that an authenticated user has access to.
Not
@@ -296,17 +410,37 @@ public class GDataImporter implements UrlImporter {
 
     @Override
     public boolean canImportData(URL url) {
-        // http://spreadsheets.google.com/ccc?key=tI36b9Fxk1lFBS83iR_3XQA&hl=en
-        if (url.getHost().endsWith(".google.com")) {
-            return true;
-        } else {
-            return false;
-        }
-
+        return isSpreadsheetURL(url) || isFusionTableURL(url);
     }
 
+    /**
+     * Tells whether the URL points at a Google Spreadsheets host.
+     *
+     * @param url URL to test
+     * @return true when the host ends with ".google.com" and contains
+     *         "spreadsheet"
+     */
+    private boolean isSpreadsheetURL(URL url) {
+        String host = url.getHost();
+        // http://spreadsheets.google.com/ccc?key=tI36b9Fxk1lFBS83iR_3XQA&hl=en
+        return host.endsWith(".google.com") && host.contains("spreadsheet");
+    }
+
+    /**
+     * Tells whether the URL points at a Google Fusion Tables data source.
+     *
+     * @param url URL to test
+     * @return true when the host ends with ".google.com" and the path starts
+     *         with "/fusiontables/"
+     */
+    private boolean isFusionTableURL(URL url) {
+        // http://www.google.com/fusiontables/DataSource?dsrcid=1219
+        return url.getHost().endsWith(".google.com")
+                && url.getPath().startsWith("/fusiontables/");
+    }
+
     // Modified version of FeedURLFactor.getSpreadsheetKeyFromUrl()
-    private String getKey(URL url) {
+    private String getSpreadsheetKey(URL url) {
         String query = url.getQuery();
         if (query != null) {
             String[] parts = query.split("&");
@@ -342,4 +476,29 @@ public class GDataImporter implements UrlImporter {
         return null;
     }
 
-}
+    /**
+     * Extracts the Fusion Tables id from the "dsrcid" query parameter.
+     *
+     * @param url URL whose query string is searched
+     * @return the raw text following "dsrcid=" (not URL-decoded), or null
+     *         when the URL has no query or no such parameter
+     */
+    private String getFusionTableKey(URL url) {
+        String query = url.getQuery();
+        if (query != null) {
+            String[] parts = query.split("&");
+            for (String part : parts) {
+                if (part.startsWith("dsrcid=")) {
+                    int offset = ("dsrcid=").length();
+                    String tableId = part.substring(offset);
+                    // TODO: Any special id format considerations to worry about?
+//                    if (tableId.startsWith("p") || !tableId.contains(".")) {
+//                        return tableId;
+//                    }
+                    return tableId;
+                }
+            }
+        }
+        return null;
+    }
+}
\ No newline at end of file