Initial Fusion Tables implementation
git-svn-id: http://google-refine.googlecode.com/svn/trunk@1889 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
6a68fc9da7
commit
acbed0c1ba
@ -32,10 +32,17 @@ import java.io.IOException;
|
|||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.net.MalformedURLException;
|
import java.net.MalformedURLException;
|
||||||
import java.net.URL;
|
import java.net.URL;
|
||||||
|
import java.net.URLEncoder;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
|
import java.util.Scanner;
|
||||||
|
import java.util.regex.MatchResult;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import com.google.gdata.client.GoogleService;
|
||||||
|
import com.google.gdata.client.Service.GDataRequest;
|
||||||
|
import com.google.gdata.client.Service.GDataRequest.RequestType;
|
||||||
import com.google.gdata.client.spreadsheet.CellQuery;
|
import com.google.gdata.client.spreadsheet.CellQuery;
|
||||||
import com.google.gdata.client.spreadsheet.FeedURLFactory;
|
import com.google.gdata.client.spreadsheet.FeedURLFactory;
|
||||||
import com.google.gdata.client.spreadsheet.SpreadsheetService;
|
import com.google.gdata.client.spreadsheet.SpreadsheetService;
|
||||||
@ -47,6 +54,7 @@ import com.google.gdata.data.spreadsheet.SpreadsheetEntry;
|
|||||||
import com.google.gdata.data.spreadsheet.SpreadsheetFeed;
|
import com.google.gdata.data.spreadsheet.SpreadsheetFeed;
|
||||||
import com.google.gdata.data.spreadsheet.WorksheetEntry;
|
import com.google.gdata.data.spreadsheet.WorksheetEntry;
|
||||||
import com.google.gdata.data.spreadsheet.WorksheetFeed;
|
import com.google.gdata.data.spreadsheet.WorksheetFeed;
|
||||||
|
import com.google.gdata.util.ContentType;
|
||||||
import com.google.gdata.util.InvalidEntryException;
|
import com.google.gdata.util.InvalidEntryException;
|
||||||
import com.google.gdata.util.ServiceException;
|
import com.google.gdata.util.ServiceException;
|
||||||
import com.google.refine.ProjectMetadata;
|
import com.google.refine.ProjectMetadata;
|
||||||
@ -72,22 +80,14 @@ public class GDataImporter implements UrlImporter {
|
|||||||
private FeedURLFactory factory;
|
private FeedURLFactory factory;
|
||||||
|
|
||||||
/**
 * Creates the importer and looks up the default GData feed URL factory.
 */
public GDataImporter() {
    // Careful - this constructor is called at server init time
    // and the resulting instance is shared by everyone.
    this.factory = FeedURLFactory.getDefault();
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void read(URL url, Project project, ProjectMetadata metadata,
|
public void read(URL url, Project project, ProjectMetadata metadata,
|
||||||
Properties options) throws Exception {
|
Properties options) throws Exception {
|
||||||
// Start fresh for each read so that we're not caching authorization or
|
|
||||||
// anything
|
|
||||||
SpreadsheetService service = new SpreadsheetService(
|
|
||||||
SERVICE_APP_NAME);
|
|
||||||
|
|
||||||
// String token = TokenCookie.getToken(request);
|
|
||||||
// if (token != null) {
|
|
||||||
// service.setAuthSubToken(token);
|
|
||||||
// }
|
|
||||||
|
|
||||||
int ignoreLines = ImporterUtilities.getIntegerOption("ignore", options, -1);
|
int ignoreLines = ImporterUtilities.getIntegerOption("ignore", options, -1);
|
||||||
int headerLines = ImporterUtilities.getIntegerOption("header-lines", options, 1);
|
int headerLines = ImporterUtilities.getIntegerOption("header-lines", options, 1);
|
||||||
@ -103,7 +103,31 @@ public class GDataImporter implements UrlImporter {
|
|||||||
// TODO: Put this in a namespace?
|
// TODO: Put this in a namespace?
|
||||||
metadata.setCustomMetadata("source-url", url.toExternalForm());
|
metadata.setCustomMetadata("source-url", url.toExternalForm());
|
||||||
|
|
||||||
String spreadsheetKey = getKey(url);
|
// Start fresh for each read so that we're not caching authorization or
|
||||||
|
// anything
|
||||||
|
if (isSpreadsheetURL(url)) {
|
||||||
|
importSpreadsheet(url, project, ignoreLines, headerLines, limit,
|
||||||
|
dataStart, guessValueType);
|
||||||
|
} else if (isFusionTableURL(url)) {
|
||||||
|
importFusionTable(url, project, ignoreLines, headerLines, limit,
|
||||||
|
dataStart, guessValueType);
|
||||||
|
} else {
|
||||||
|
// should never happen (famous last words)
|
||||||
|
throw new IllegalArgumentException(
|
||||||
|
"Got invalid format URL in GDataImporter.read()");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void importSpreadsheet(URL url, Project project, int ignoreLines,
|
||||||
|
int headerLines, int limit, int dataStart, boolean guessValueType)
|
||||||
|
throws MalformedURLException, IOException, ServiceException,
|
||||||
|
Exception {
|
||||||
|
SpreadsheetService service = new SpreadsheetService(SERVICE_APP_NAME);
|
||||||
|
// String token = TokenCookie.getToken(request);
|
||||||
|
// if (token != null) {
|
||||||
|
// service.setAuthSubToken(token);
|
||||||
|
// }
|
||||||
|
String spreadsheetKey = getSpreadsheetKey(url);
|
||||||
WorksheetEntry worksheet;
|
WorksheetEntry worksheet;
|
||||||
try {
|
try {
|
||||||
worksheet = getWorksheetEntries(service, spreadsheetKey).get(0);
|
worksheet = getWorksheetEntries(service, spreadsheetKey).get(0);
|
||||||
@ -190,6 +214,96 @@ public class GDataImporter implements UrlImporter {
|
|||||||
project.rows.add(row);
|
project.rows.add(row);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void importFusionTable(URL url, Project project, int ignoreLines,
|
||||||
|
int headerLines, int limit, int dataStart, boolean guessValueType)
|
||||||
|
throws MalformedURLException, IOException, ServiceException,
|
||||||
|
Exception {
|
||||||
|
GoogleService service = new GoogleService("fusiontables", SERVICE_APP_NAME);
|
||||||
|
// String token = TokenCookie.getToken(request);
|
||||||
|
// if (token != null) {
|
||||||
|
// service.setAuthSubToken(token);
|
||||||
|
// }
|
||||||
|
String tableId = getFusionTableKey(url);
|
||||||
|
|
||||||
|
final String SERVICE_URL =
|
||||||
|
"http://www.google.com/fusiontables/api/query";
|
||||||
|
final String selectQuery = "select * from " + tableId
|
||||||
|
+ " offset " + (dataStart) + (limit>0 ? (" limit " + limit):"");
|
||||||
|
|
||||||
|
URL queryUrl = new URL(
|
||||||
|
SERVICE_URL + "?sql=" + URLEncoder.encode(selectQuery, "UTF-8"));
|
||||||
|
GDataRequest queryRequest = service.getRequestFactory().getRequest(
|
||||||
|
RequestType.QUERY, queryUrl, ContentType.TEXT_PLAIN);
|
||||||
|
queryRequest.execute();
|
||||||
|
|
||||||
|
Scanner scanner = new Scanner(queryRequest.getResponseStream(),"UTF-8");
|
||||||
|
|
||||||
|
// TODO: Just use the first row of data as column headers for now
|
||||||
|
List<String> columnHeaders = getTableRow(scanner);
|
||||||
|
|
||||||
|
// Create columns
|
||||||
|
int columnCount = columnHeaders.size();
|
||||||
|
project.columnModel.setMaxCellIndex(columnCount);
|
||||||
|
boolean validColumn[] = new boolean[columnCount];
|
||||||
|
int index = 0;
|
||||||
|
for (String name : columnHeaders) {
|
||||||
|
Column column = new Column(index, name + " " + index);
|
||||||
|
project.columnModel.columns.add(column);
|
||||||
|
validColumn[index++] = true;
|
||||||
|
}
|
||||||
|
for (int i = index; index < columnCount; index++) {
|
||||||
|
Column column = new Column(index, "Column " + index);
|
||||||
|
project.columnModel.columns.add(column);
|
||||||
|
validColumn[i] = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create data rows & cells
|
||||||
|
List<String> values = columnHeaders;
|
||||||
|
while (values != null) {
|
||||||
|
Row row = new Row(columnCount);
|
||||||
|
for (String valString : values) {
|
||||||
|
valString = valString.trim();
|
||||||
|
if (ExpressionUtils.isNonBlankData(valString)) {
|
||||||
|
Serializable value = guessValueType ? ImporterUtilities
|
||||||
|
.parseCellValue(valString) : valString;
|
||||||
|
row.cells.add(new Cell(value, null));
|
||||||
|
} else {
|
||||||
|
row.cells.add(null);
|
||||||
|
}
|
||||||
|
project.rows.add(row);
|
||||||
|
values = getTableRow(scanner);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<String> getTableRow(Scanner scanner) {
|
||||||
|
/**
|
||||||
|
* CSV values are terminated by comma or end-of-line and consist either of
|
||||||
|
* plain text without commas or quotes, or a quoted expression, where inner
|
||||||
|
* quotes are escaped by doubling.
|
||||||
|
*/
|
||||||
|
final Pattern CSV_VALUE_PATTERN =
|
||||||
|
Pattern.compile("([^,\\r\\n\"]*|\"(([^\"]*\"\")*[^\"]*)\")(,|\\r?\\n)");
|
||||||
|
|
||||||
|
if (!scanner.hasNextLine()) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
List<String> result = new ArrayList<String>();
|
||||||
|
while (scanner.hasNextLine()) {
|
||||||
|
scanner.findWithinHorizon(CSV_VALUE_PATTERN, 0);
|
||||||
|
MatchResult match = scanner.match();
|
||||||
|
String quotedString = match.group(2);
|
||||||
|
String decoded = quotedString == null ? match.group(1)
|
||||||
|
: quotedString.replaceAll("\"\"", "\"");
|
||||||
|
result.add(decoded);
|
||||||
|
if (!match.group(4).equals(",")) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Retrieves the spreadsheets that an authenticated user has access to. Not
|
* Retrieves the spreadsheets that an authenticated user has access to. Not
|
||||||
@ -296,17 +410,23 @@ public class GDataImporter implements UrlImporter {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean canImportData(URL url) {
|
public boolean canImportData(URL url) {
|
||||||
// http://spreadsheets.google.com/ccc?key=tI36b9Fxk1lFBS83iR_3XQA&hl=en
|
return isSpreadsheetURL(url) | isFusionTableURL(url);
|
||||||
if (url.getHost().endsWith(".google.com")) {
|
|
||||||
return true;
|
|
||||||
} else {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean isSpreadsheetURL(URL url) {
|
||||||
|
String host = url.getHost();
|
||||||
|
// http://spreadsheets.google.com/ccc?key=tI36b9Fxk1lFBS83iR_3XQA&hl=en
|
||||||
|
return host.endsWith(".google.com") && host.contains("spreadsheet");
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isFusionTableURL(URL url) {
|
||||||
|
// http://www.google.com/fusiontables/DataSource?dsrcid=1219
|
||||||
|
return url.getHost().endsWith(".google.com")
|
||||||
|
&& url.getPath().startsWith("/fusiontables/");
|
||||||
|
}
|
||||||
|
|
||||||
// Modified version of FeedURLFactory.getSpreadsheetKeyFromUrl()
|
// Modified version of FeedURLFactory.getSpreadsheetKeyFromUrl()
|
||||||
private String getKey(URL url) {
|
private String getSpreadsheetKey(URL url) {
|
||||||
String query = url.getQuery();
|
String query = url.getQuery();
|
||||||
if (query != null) {
|
if (query != null) {
|
||||||
String[] parts = query.split("&");
|
String[] parts = query.split("&");
|
||||||
@ -342,4 +462,22 @@ public class GDataImporter implements UrlImporter {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
private String getFusionTableKey(URL url) {
|
||||||
|
String query = url.getQuery();
|
||||||
|
if (query != null) {
|
||||||
|
String[] parts = query.split("&");
|
||||||
|
for (String part : parts) {
|
||||||
|
if (part.startsWith("dsrcid=")) {
|
||||||
|
int offset = ("dsrcid=").length();
|
||||||
|
String tableId = part.substring(offset);
|
||||||
|
// TODO: Any special id format considerations to worry about?
|
||||||
|
// if (tableId.startsWith("p") || !tableId.contains(".")) {
|
||||||
|
// return tableId;
|
||||||
|
// }
|
||||||
|
return tableId;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user