Initial Fusion Tables implementation

git-svn-id: http://google-refine.googlecode.com/svn/trunk@1889 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Tom Morris 2010-11-17 08:26:46 +00:00
parent 6a68fc9da7
commit acbed0c1ba

View File

@ -32,10 +32,17 @@ import java.io.IOException;
import java.io.Serializable;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.Scanner;
import java.util.regex.MatchResult;
import java.util.regex.Pattern;
import com.google.gdata.client.GoogleService;
import com.google.gdata.client.Service.GDataRequest;
import com.google.gdata.client.Service.GDataRequest.RequestType;
import com.google.gdata.client.spreadsheet.CellQuery;
import com.google.gdata.client.spreadsheet.FeedURLFactory;
import com.google.gdata.client.spreadsheet.SpreadsheetService;
@ -47,6 +54,7 @@ import com.google.gdata.data.spreadsheet.SpreadsheetEntry;
import com.google.gdata.data.spreadsheet.SpreadsheetFeed;
import com.google.gdata.data.spreadsheet.WorksheetEntry;
import com.google.gdata.data.spreadsheet.WorksheetFeed;
import com.google.gdata.util.ContentType;
import com.google.gdata.util.InvalidEntryException;
import com.google.gdata.util.ServiceException;
import com.google.refine.ProjectMetadata;
@ -72,22 +80,14 @@ public class GDataImporter implements UrlImporter {
private FeedURLFactory factory;
/**
 * Creates the importer and captures the default GData feed URL factory.
 */
public GDataImporter() {
    // Careful - this constructor is called at server init time
    // and is shared by everyone.
    this.factory = FeedURLFactory.getDefault();
}
@Override
public void read(URL url, Project project, ProjectMetadata metadata,
Properties options) throws Exception {
// Start fresh for each read so that we're not caching authorization or
// anything
SpreadsheetService service = new SpreadsheetService(
SERVICE_APP_NAME);
// String token = TokenCookie.getToken(request);
// if (token != null) {
// service.setAuthSubToken(token);
// }
int ignoreLines = ImporterUtilities.getIntegerOption("ignore", options, -1);
int headerLines = ImporterUtilities.getIntegerOption("header-lines", options, 1);
@ -103,7 +103,31 @@ public class GDataImporter implements UrlImporter {
// TODO: Put this in a namespace?
metadata.setCustomMetadata("source-url", url.toExternalForm());
String spreadsheetKey = getKey(url);
// Start fresh for each read so that we're not caching authorization or
// anything
if (isSpreadsheetURL(url)) {
importSpreadsheet(url, project, ignoreLines, headerLines, limit,
dataStart, guessValueType);
} else if (isFusionTableURL(url)) {
importFusionTable(url, project, ignoreLines, headerLines, limit,
dataStart, guessValueType);
} else {
// should never happen (famous last words)
throw new IllegalArgumentException(
"Got invalid format URL in GDataImporter.read()");
}
}
private void importSpreadsheet(URL url, Project project, int ignoreLines,
int headerLines, int limit, int dataStart, boolean guessValueType)
throws MalformedURLException, IOException, ServiceException,
Exception {
SpreadsheetService service = new SpreadsheetService(SERVICE_APP_NAME);
// String token = TokenCookie.getToken(request);
// if (token != null) {
// service.setAuthSubToken(token);
// }
String spreadsheetKey = getSpreadsheetKey(url);
WorksheetEntry worksheet;
try {
worksheet = getWorksheetEntries(service, spreadsheetKey).get(0);
@ -191,6 +215,96 @@ public class GDataImporter implements UrlImporter {
}
}
/**
 * Imports the contents of a Google Fusion Table into {@code project}.
 * <p>
 * The table id is taken from the {@code dsrcid} query parameter of
 * {@code url}; a {@code select *} query is sent to the Fusion Tables query
 * service and the CSV response is parsed one record at a time with
 * {@link #getTableRow(Scanner)}.
 *
 * @param url            Fusion Tables URL carrying a {@code dsrcid} parameter
 * @param project        project that receives the columns and rows
 * @param ignoreLines    currently unused by this importer
 * @param headerLines    currently unused by this importer
 * @param limit          maximum number of rows to request; values &lt;= 0
 *                       mean no limit clause is sent
 * @param dataStart      value sent as the query's {@code offset}
 * @param guessValueType when true, cell text is converted with
 *                       {@code ImporterUtilities.parseCellValue}; otherwise
 *                       cells are stored as strings
 * @throws IllegalArgumentException if {@code url} carries no table id
 * @throws IOException      on transport failures
 * @throws ServiceException if the GData request fails
 */
private void importFusionTable(URL url, Project project, int ignoreLines,
        int headerLines, int limit, int dataStart, boolean guessValueType)
        throws MalformedURLException, IOException, ServiceException,
        Exception {
    GoogleService service = new GoogleService("fusiontables", SERVICE_APP_NAME);
    // String token = TokenCookie.getToken(request);
    // if (token != null) {
    // service.setAuthSubToken(token);
    // }
    String tableId = getFusionTableKey(url);
    if (tableId == null) {
        // Without an id the query would read "select * from null".
        throw new IllegalArgumentException(
                "No Fusion Table id (dsrcid) found in URL: " + url);
    }
    final String SERVICE_URL =
        "http://www.google.com/fusiontables/api/query";
    final String selectQuery = "select * from " + tableId
        + " offset " + (dataStart) + (limit>0 ? (" limit " + limit):"");
    URL queryUrl = new URL(
            SERVICE_URL + "?sql=" + URLEncoder.encode(selectQuery, "UTF-8"));
    GDataRequest queryRequest = service.getRequestFactory().getRequest(
            RequestType.QUERY, queryUrl, ContentType.TEXT_PLAIN);
    queryRequest.execute();

    Scanner scanner = new Scanner(queryRequest.getResponseStream(), "UTF-8");
    try {
        // TODO: Just use the first row of data as column headers for now
        List<String> columnHeaders = getTableRow(scanner);
        if (columnHeaders == null) {
            // Empty response: nothing to import.
            return;
        }

        // Create columns
        int columnCount = columnHeaders.size();
        project.columnModel.setMaxCellIndex(columnCount);
        int index = 0;
        for (String name : columnHeaders) {
            // The index suffix is kept from the original implementation
            // (presumably to keep column names unique).
            project.columnModel.columns.add(new Column(index, name + " " + index));
            index++;
        }

        // Create data rows & cells.
        // NOTE(review): as in the original, the header record is also
        // stored as the first data row - confirm whether that is intended.
        List<String> values = columnHeaders;
        while (values != null) {
            Row row = new Row(columnCount);
            for (String valString : values) {
                String trimmed = valString.trim();
                if (ExpressionUtils.isNonBlankData(trimmed)) {
                    Serializable value = guessValueType
                            ? ImporterUtilities.parseCellValue(trimmed)
                            : trimmed;
                    row.cells.add(new Cell(value, null));
                } else {
                    row.cells.add(null);
                }
            }
            // Add the row and advance to the next record once per record,
            // not once per cell.
            project.rows.add(row);
            values = getTableRow(scanner);
        }
    } finally {
        // Closing the scanner also closes the underlying response stream.
        scanner.close();
    }
}
/**
 * Matches one CSV value together with its terminator.
 * <p>
 * A value is either plain text without commas, quotes or line breaks, or a
 * quoted expression in which inner quotes are escaped by doubling. It is
 * terminated by a comma, an end-of-line, or the end of the input (so that a
 * final record without a trailing newline is still recognised).
 * Group 1 is the raw value, group 2 the content of a quoted value (null for
 * plain values) and group 4 the terminator.
 */
private static final Pattern CSV_VALUE_PATTERN =
    Pattern.compile("([^,\\r\\n\"]*|\"(([^\"]*\"\")*[^\"]*)\")(,|\\r?\\n|\\z)");

/**
 * Reads the next CSV record from {@code scanner}.
 *
 * @param scanner source positioned at the start of a record
 * @return the decoded values of the record, or {@code null} when the
 *         scanner has no more input
 */
private List<String> getTableRow(Scanner scanner) {
    if (!scanner.hasNextLine()) {
        return null;
    }
    List<String> result = new ArrayList<String>();
    while (scanner.hasNextLine()) {
        if (scanner.findWithinHorizon(CSV_VALUE_PATTERN, 0) == null) {
            // No further value could be matched; never call match() here,
            // it would throw IllegalStateException.
            break;
        }
        MatchResult match = scanner.match();
        String quotedString = match.group(2);
        // Quoted values have their doubled quotes collapsed; plain values
        // are used verbatim.
        String decoded = quotedString == null ? match.group(1)
                : quotedString.replace("\"\"", "\"");
        result.add(decoded);
        // Anything other than a comma (line break or end of input) ends
        // the record.
        if (!",".equals(match.group(4))) {
            break;
        }
    }
    return result;
}
/**
* Retrieves the spreadsheets that an authenticated user has access to. Not
* valid for unauthenticated access.
@ -296,17 +410,23 @@ public class GDataImporter implements UrlImporter {
/**
 * Reports whether this importer can handle {@code url}.
 *
 * @param url candidate data source
 * @return true when {@code url} is recognised as a Google Spreadsheets URL
 *         or a Google Fusion Tables URL
 */
@Override
public boolean canImportData(URL url) {
    // Only accept the two URL shapes read() can dispatch on; a bare
    // "*.google.com" host check would accept URLs that read() rejects.
    return isSpreadsheetURL(url) || isFusionTableURL(url);
}
/**
 * Tests whether {@code url} points at a Google Spreadsheets host, e.g.
 * http://spreadsheets.google.com/ccc?key=tI36b9Fxk1lFBS83iR_3XQA&hl=en
 *
 * @param url URL to inspect
 * @return true when the host ends with ".google.com" and contains
 *         "spreadsheet"
 */
private boolean isSpreadsheetURL(URL url) {
    final String hostName = url.getHost();
    if (!hostName.endsWith(".google.com")) {
        return false;
    }
    return hostName.contains("spreadsheet");
}
/**
 * Tests whether {@code url} points at Google Fusion Tables, e.g.
 * http://www.google.com/fusiontables/DataSource?dsrcid=1219
 *
 * @param url URL to inspect
 * @return true when the host ends with ".google.com" and the path starts
 *         with "/fusiontables/"
 */
private boolean isFusionTableURL(URL url) {
    if (!url.getHost().endsWith(".google.com")) {
        return false;
    }
    return url.getPath().startsWith("/fusiontables/");
}
// Modified version of FeedURLFactor.getSpreadsheetKeyFromUrl()
private String getKey(URL url) {
private String getSpreadsheetKey(URL url) {
String query = url.getQuery();
if (query != null) {
String[] parts = query.split("&");
@ -342,4 +462,22 @@ public class GDataImporter implements UrlImporter {
return null;
}
/**
 * Extracts the Fusion Table id from the {@code dsrcid} query parameter.
 *
 * @param url Fusion Tables URL
 * @return the text following "dsrcid=" in the first matching query
 *         parameter, or null when the URL has no query or no such parameter
 */
private String getFusionTableKey(URL url) {
    final String queryString = url.getQuery();
    if (queryString == null) {
        return null;
    }
    final String prefix = "dsrcid=";
    for (String param : queryString.split("&")) {
        if (param.startsWith(prefix)) {
            // TODO: Any special id format considerations to worry about?
            return param.substring(prefix.length());
        }
    }
    return null;
}
}