diff --git a/extensions/gdata/module/MOD-INF/controller.js b/extensions/gdata/module/MOD-INF/controller.js index 7884cda14..2cc5f6c97 100644 --- a/extensions/gdata/module/MOD-INF/controller.js +++ b/extensions/gdata/module/MOD-INF/controller.js @@ -33,8 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. var html = "text/html"; var encoding = "UTF-8"; -var version="0.2" -var ClientSideResourceManager = Packages.com.google.refine.ClientSideResourceManager; +var version = "0.2"; /* * Function invoked to initialize the extension. @@ -43,21 +42,24 @@ function init() { // Packages.java.lang.System.err.println("Initializing gData extension"); // Packages.java.lang.System.err.println(module.getMountPoint()); - Packages.com.google.refine.RefineServlet.registerCommand( - module, "authorize", Packages.com.google.refine.extension.gdata.AuthorizeCommand()); - Packages.com.google.refine.RefineServlet.registerCommand( - module, "authorize2", Packages.com.google.refine.extension.gdata.AuthorizeCommand2()); - Packages.com.google.refine.RefineServlet.registerCommand( - module, "deauthorize", Packages.com.google.refine.extension.gdata.DeAuthorizeCommand()); - + var RS = Packages.com.google.refine.RefineServlet; + RS.registerCommand(module, "authorize", Packages.com.google.refine.extension.gdata.AuthorizeCommand()); + RS.registerCommand(module, "authorize2", Packages.com.google.refine.extension.gdata.AuthorizeCommand2()); + RS.registerCommand(module, "deauthorize", Packages.com.google.refine.extension.gdata.DeAuthorizeCommand()); + // Register importer and exporter - Packages.com.google.refine.importers.ImporterRegistry.registerImporter( - "gdata-importer", new Packages.com.google.refine.extension.gdata.GDataImporter()); + var IM = Packages.com.google.refine.importing.ImportingManager; + IM.registerFormat("service/gdata", "GData services"); // generic format, no parser to handle it + IM.registerFormat("service/gdata/spreadsheet", "Google spreadsheets", false, "GoogleSpreadsheetParserUI", + new Packages.com.google.refine.extension.gdata.GDataImporter()); + IM.registerUrlRewriter(new Packages.com.google.refine.extension.gdata.GDataUrlRewriter()) + IM.registerUrlRewriter(new Packages.com.google.refine.extension.gdata.FusionTablesUrlRewriter()) // Packages.com.google.refine.exporters.ExporterRegistry.registerExporter( -// "gdata-exporter", new Packages.com.google.refine.extension.gdata.GDataExporter()); +// "gdata-exporter", new Packages.com.google.refine.extension.gdata.GDataExporter()); // Script files to inject into /project page + var ClientSideResourceManager = Packages.com.google.refine.ClientSideResourceManager; ClientSideResourceManager.addPaths( "project/scripts", module, @@ -82,7 +84,7 @@ function init() { */ function process(path, request, response) { // Analyze path and handle this request yourself. - + if (path == "/" || path == "") { var context = {}; // here's how to pass things into the .vt templates diff --git a/extensions/gdata/src/com/google/refine/extension/gdata/FusionTablesUrlRewriter.java b/extensions/gdata/src/com/google/refine/extension/gdata/FusionTablesUrlRewriter.java new file mode 100644 index 000000000..5cd425e8e --- /dev/null +++ b/extensions/gdata/src/com/google/refine/extension/gdata/FusionTablesUrlRewriter.java @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2010, Thomas F. Morris + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * - Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * Neither the name of Google nor the names of its contributors may be used to + * endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +package com.google.refine.extension.gdata; + +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLEncoder; + +import com.google.gdata.client.GoogleService; +import com.google.gdata.client.Service.GDataRequest; +import com.google.gdata.client.Service.GDataRequest.RequestType; +import com.google.gdata.util.ContentType; +import com.google.gdata.util.ServiceException; +import com.google.refine.importing.UrlRewriter; + +/** + * @author Tom Morris + * @copyright 2010 Thomas F. Morris + * @license New BSD http://www.opensource.org/licenses/bsd-license.php + */ +public class FusionTablesUrlRewriter implements UrlRewriter { + + @Override + public Result rewrite(String urlString) { + try { + URL url = new URL(urlString); + if (isFusionTableURL(url)) { + Result result = new Result(); + try { + result.rewrittenUrl = generateQueryUrl(url, 0, -1).toExternalForm(); + result.format = "text/line-based/*sv"; + result.download = true; + return result; + } catch (UnsupportedEncodingException e) { + // TODO: what do we do here? + } + } + } catch (MalformedURLException e) { + // Ignore + } + return null; + } + + static public boolean isFusionTableURL(URL url) { + // http://www.google.com/fusiontables/DataSource?dsrcid=1219 + String query = url.getQuery(); + if (query == null) { + query = ""; + } + return url.getHost().endsWith(".google.com") + && url.getPath().startsWith("/fusiontables/DataSource") + && query.contains("dsrcid="); + } + + static public URL generateQueryUrl(URL url, int start, int limit) + throws MalformedURLException, UnsupportedEncodingException { + + String tableId = getFusionTableKey(url); + + final String SERVICE_URL = + "http://www.google.com/fusiontables/api/query"; + final String selectQuery = "select * from " + tableId + + " offset " + (start) + (limit > 0 ? 
(" limit " + limit) : ""); + + return new URL(SERVICE_URL + "?sql=" + URLEncoder.encode(selectQuery, "UTF-8")); + } + + static public InputStream openInputStream(URL queryUrl) throws IOException, ServiceException { + GoogleService service = new GoogleService("fusiontables", GDataExtension.SERVICE_APP_NAME); + // String token = TokenCookie.getToken(request); + // if (token != null) { + // service.setAuthSubToken(token); + // } + GDataRequest queryRequest = service.getRequestFactory().getRequest( + RequestType.QUERY, queryUrl, ContentType.TEXT_PLAIN); + queryRequest.execute(); + + return queryRequest.getResponseStream(); + } + + static private String getFusionTableKey(URL url) { + String query = url.getQuery(); + if (query != null) { + String[] parts = query.split("&"); + for (String part : parts) { + if (part.startsWith("dsrcid=")) { + int offset = ("dsrcid=").length(); + String tableId = part.substring(offset); + // TODO: Any special id format considerations to worry about? + // if (tableId.startsWith("p") || !tableId.contains(".")) { + // return tableId; + // } + return tableId; + } + } + } + return null; + } +} diff --git a/extensions/gdata/src/com/google/refine/extension/gdata/GDataExtension.java b/extensions/gdata/src/com/google/refine/extension/gdata/GDataExtension.java new file mode 100644 index 000000000..1997ba2b1 --- /dev/null +++ b/extensions/gdata/src/com/google/refine/extension/gdata/GDataExtension.java @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2010, Thomas F. Morris + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * - Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * Neither the name of Google nor the names of its contributors may be used to + * endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +package com.google.refine.extension.gdata; + +import com.google.gdata.client.spreadsheet.FeedURLFactory; + +/** + * @author Tom Morris + * @copyright 2010 Thomas F. 
Morris + * @license New BSD http://www.opensource.org/licenses/bsd-license.php + */ +abstract public class GDataExtension { + static final String SERVICE_APP_NAME = "Google-Refine-GData-Extension"; + + static private FeedURLFactory factory; + static public FeedURLFactory getFeedUrlFactory() { + if (factory == null) { + // Careful - this is shared by everyone. + factory = FeedURLFactory.getDefault(); + } + return factory; + } +} diff --git a/extensions/gdata/src/com/google/refine/extension/gdata/GDataImporter.java b/extensions/gdata/src/com/google/refine/extension/gdata/GDataImporter.java index 4e8175972..285c81fc5 100644 --- a/extensions/gdata/src/com/google/refine/extension/gdata/GDataImporter.java +++ b/extensions/gdata/src/com/google/refine/extension/gdata/GDataImporter.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010,2011. Thomas F. Morris + * Copyright (c) 2010, Thomas F. Morris * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -29,282 +29,126 @@ package com.google.refine.extension.gdata; import java.io.IOException; -import java.io.Serializable; import java.net.MalformedURLException; import java.net.URL; -import java.net.URLEncoder; import java.util.ArrayList; import java.util.List; -import java.util.Properties; -import java.util.Scanner; -import java.util.regex.MatchResult; -import java.util.regex.Pattern; -import com.google.gdata.client.GoogleService; -import com.google.gdata.client.Service.GDataRequest; -import com.google.gdata.client.Service.GDataRequest.RequestType; +import org.json.JSONObject; + import com.google.gdata.client.spreadsheet.CellQuery; -import com.google.gdata.client.spreadsheet.FeedURLFactory; import com.google.gdata.client.spreadsheet.SpreadsheetService; +import com.google.gdata.data.spreadsheet.Cell; import com.google.gdata.data.spreadsheet.CellEntry; import com.google.gdata.data.spreadsheet.CellFeed; -import com.google.gdata.data.spreadsheet.ListEntry; -import com.google.gdata.data.spreadsheet.ListFeed; import com.google.gdata.data.spreadsheet.SpreadsheetEntry; import com.google.gdata.data.spreadsheet.SpreadsheetFeed; import com.google.gdata.data.spreadsheet.WorksheetEntry; import com.google.gdata.data.spreadsheet.WorksheetFeed; -import com.google.gdata.util.ContentType; -import com.google.gdata.util.InvalidEntryException; import com.google.gdata.util.ServiceException; import com.google.refine.ProjectMetadata; -import com.google.refine.expr.ExpressionUtils; -import com.google.refine.importers.ImporterUtilities; -import com.google.refine.importers.UrlImporter; -import com.google.refine.model.Cell; -import com.google.refine.model.Column; +import com.google.refine.importers.TabularImportingParserBase; +import com.google.refine.importing.ImportingJob; +import com.google.refine.importing.ImportingUtilities; import com.google.refine.model.Project; -import com.google.refine.model.Row; +import com.google.refine.util.JSONUtilities; /** - * Google Refine importer for Google Spreadsheets. + * Google Refine parser for Google Spreadsheets. * * @author Tom Morris * @copyright 2010 Thomas F. Morris * @license New BSD http://www.opensource.org/licenses/bsd-license.php */ -public class GDataImporter implements UrlImporter { - - static final String SERVICE_APP_NAME = "Google-Refine-GData-Extension"; - - private FeedURLFactory factory; - +public class GDataImporter extends TabularImportingParserBase { public GDataImporter() { - // Careful - this constructor is called at server init time - // and is shared by everyone. 
- factory = FeedURLFactory.getDefault(); + super(false); } + + public void parseOneFile( + Project project, + ProjectMetadata metadata, + ImportingJob job, + JSONObject fileRecord, + int limit, + JSONObject options, + List<Exception> exceptions + ) throws IOException { + String fileSource = ImportingUtilities.getFileSource(fileRecord); + String urlString = JSONUtilities.getString(fileRecord, "url", null); + URL url = new URL(urlString); - @Override - public void read(URL url, Project project, ProjectMetadata metadata, - Properties options) throws Exception { - - int ignoreLines = ImporterUtilities.getIntegerOption("ignore", options, -1); - int headerLines = ImporterUtilities.getIntegerOption("header-lines", options, 1); - int limit = ImporterUtilities.getIntegerOption("limit", options, -1); - - // Note: Unlike TSV/CSV importer, we count all rows towards skip, not - // just "data" rows - int skip = ImporterUtilities.getIntegerOption("skip", options, 0); - int dataStart = ignoreLines + headerLines + skip; - boolean guessValueType = ImporterUtilities.getBooleanOption( - "guess-value-type", options, true); - - // TODO: Put this in a namespace? - metadata.setCustomMetadata("source-url", url.toExternalForm()); - - // Start fresh for each read so that we're not caching authorization or - // anything - if (isSpreadsheetURL(url)) { - importSpreadsheet(url, project, ignoreLines, headerLines, limit, - dataStart, guessValueType); - } else if (isFusionTableURL(url)) { - importFusionTable(url, project, ignoreLines, headerLines, limit, - dataStart, guessValueType); - } else { - // should never happen (famous last words) - throw new IllegalArgumentException( - "Got invalid format URL in GDataImporter.read()"); - } - } - - private void importSpreadsheet(URL url, Project project, int ignoreLines, - int headerLines, int limit, int dataStart, boolean guessValueType) - throws MalformedURLException, IOException, ServiceException, - Exception { - SpreadsheetService service = new SpreadsheetService(SERVICE_APP_NAME); + SpreadsheetService service = new SpreadsheetService(GDataExtension.SERVICE_APP_NAME); // String token = TokenCookie.getToken(request); // if (token != null) { // service.setAuthSubToken(token); // } String spreadsheetKey = getSpreadsheetKey(url); - WorksheetEntry worksheet; - try { - worksheet = getWorksheetEntries(service, spreadsheetKey).get(0); - } catch (InvalidEntryException e) { - throw new RuntimeException("Failed to open spreadsheet " - + e.getResponseBody(), e); - } - - // Create columns - List<String> columnHeaders = getColumnHeaders(service, worksheet, - ignoreLines, headerLines); - - int columnCount = worksheet.getColCount(); - project.columnModel.setMaxCellIndex(columnCount); - boolean validColumn[] = new boolean[columnCount]; - int index = 0; - for (String name : columnHeaders) { - Column column = new Column(index, name + " " + index); - project.columnModel.columns.add(column); - validColumn[index++] = true; - } - for (int i = index; index < columnCount; index++) { - Column column = new Column(index, "Column " + index); - project.columnModel.columns.add(column); - validColumn[i] = true; - } - - // Create data rows & cells - int previousRow = dataStart - 1; - int previousCol = -1; - List<CellEntry> cellEntries = getCells(service, worksheet, dataStart); - Row row = null; - for (CellEntry cellEntry : cellEntries) { - com.google.gdata.data.spreadsheet.Cell cell = cellEntry.getCell(); - if (cell == null) { + + int[] sheets = JSONUtilities.getIntArray(options, "sheets"); + for (int sheetIndex : sheets) { +
WorksheetEntry worksheet; + try { + worksheet = getWorksheetEntries(service, spreadsheetKey).get(sheetIndex); + } catch (ServiceException e) { + exceptions.add(e); continue; } - int r = cell.getRow() - 1; // convert from 1-based to 0-based - int c = cell.getCol() - 1; - - if (limit > 0 && r > limit) { - break; - } - - // Handle gaps in rows - if (r > previousRow) { - // Finish and add current row - if (row != null) { - project.rows.add(row); - // project.columnModel.setMaxCellIndex(row.cells.size()); // - // TODO: ??? - } - - // Add empty rows for skipped rows - while (previousRow < r - 1) { - project.rows.add(new Row(columnCount)); - previousRow++; - } - row = new Row(columnCount); - previousRow = r; - previousCol = 0; - } - - // Add blank cells for any that were skipped before the current one - for (int col = previousCol + 1; col < c; col++) { - row.cells.add(new Cell("", null)); - } - previousCol = c; - - String s = cell.getValue(); - if (s != null) { - s = s.trim(); - } - if (ExpressionUtils.isNonBlankData(s)) { - Serializable value = guessValueType ? ImporterUtilities - .parseCellValue(s) : s; - row.cells.add(new Cell(value, null)); - } else { - row.cells.add(null); - } - } - // Add last row - if (row != null) { - project.rows.add(row); + + readTable( + project, + metadata, + job, + new BatchRowReader(service, worksheet, 20), + fileSource + "#" + worksheet.getTitle().getPlainText(), + limit, + options, + exceptions + ); } } - private void importFusionTable(URL url, Project project, int ignoreLines, - int headerLines, int limit, int dataStart, boolean guessValueType) - throws MalformedURLException, IOException, ServiceException, - Exception { - GoogleService service = new GoogleService("fusiontables", SERVICE_APP_NAME); - // String token = TokenCookie.getToken(request); - // if (token != null) { - // service.setAuthSubToken(token); - // } - String tableId = getFusionTableKey(url); + static private class BatchRowReader implements TableDataReader { + final int batchSize; + final SpreadsheetService service; + final WorksheetEntry worksheet; + final int totalRowCount; - final String SERVICE_URL = - "http://www.google.com/fusiontables/api/query"; - final String selectQuery = "select * from " + tableId - + " offset " + (dataStart) + (limit>0 ? 
(" limit " + limit):""); - - URL queryUrl = new URL( - SERVICE_URL + "?sql=" + URLEncoder.encode(selectQuery, "UTF-8")); - GDataRequest queryRequest = service.getRequestFactory().getRequest( - RequestType.QUERY, queryUrl, ContentType.TEXT_PLAIN); - queryRequest.execute(); - - Scanner scanner = new Scanner(queryRequest.getResponseStream(),"UTF-8"); - - // TODO: Just use the first row of data as column headers for now - List columnHeaders = getTableRow(scanner); - - // Create columns - int columnCount = columnHeaders.size(); - project.columnModel.setMaxCellIndex(columnCount); - boolean validColumn[] = new boolean[columnCount]; - int index = 0; - for (String name : columnHeaders) { - Column column = new Column(index, name + " " + index); - project.columnModel.columns.add(column); - validColumn[index++] = true; + int nextRow = 0; // 0-based + int batchRowStart = -1; // 0-based + List> rowsOfCells = null; + + public BatchRowReader(SpreadsheetService service, WorksheetEntry worksheet, int batchSize) { + this.service = service; + this.worksheet = worksheet; + this.batchSize = batchSize; + this.totalRowCount = worksheet.getRowCount(); } - for (int i = index; index < columnCount; index++) { - Column column = new Column(index, "Column " + index); - project.columnModel.columns.add(column); - validColumn[i] = true; - } - - // Create data rows & cells - List values = columnHeaders; - while (values != null) { - Row row = new Row(columnCount); - for (String valString : values) { - valString = valString.trim(); - if (ExpressionUtils.isNonBlankData(valString)) { - Serializable value = guessValueType ? ImporterUtilities - .parseCellValue(valString) : valString; - row.cells.add(new Cell(value, null)); + + @Override + public List getNextRowOfCells() throws IOException { + if (rowsOfCells == null || nextRow > batchRowStart + rowsOfCells.size()) { + batchRowStart = batchRowStart + (rowsOfCells == null ? 0 : rowsOfCells.size()); + if (batchRowStart < totalRowCount) { + try { + rowsOfCells = getRowsOfCells(service, worksheet, batchRowStart + 1, batchSize); + } catch (ServiceException e) { + rowsOfCells = null; + throw new IOException(e); + } } else { - row.cells.add(null); + rowsOfCells = null; } } - project.rows.add(row); - values = getTableRow(scanner); - } - } - - private List getTableRow(Scanner scanner) { - /** - * CSV values are terminated by comma or end-of-line and consist either of - * plain text without commas or quotes, or a quoted expression, where inner - * quotes are escaped by doubling. - */ - final Pattern CSV_VALUE_PATTERN = - Pattern.compile("([^,\\r\\n\"]*|\"(([^\"]*\"\")*[^\"]*)\")(,|\\r?\\n)"); - - if (!scanner.hasNextLine()) { - return null; - } - - List result = new ArrayList(); - while (scanner.hasNextLine()) { - scanner.findWithinHorizon(CSV_VALUE_PATTERN, 0); - MatchResult match = scanner.match(); - String quotedString = match.group(2); - String decoded = quotedString == null ? match.group(1) - : quotedString.replaceAll("\"\"", "\""); - result.add(decoded); - if (!match.group(4).equals(",")) { - break; + + if (rowsOfCells != null && nextRow - batchRowStart < rowsOfCells.size()) { + return rowsOfCells.get(nextRow++ - batchRowStart); + } else { + return null; } } - return result; } - + /** * Retrieves the spreadsheets that an authenticated user has access to. Not * valid for unauthenticated access. 
@@ -313,130 +157,67 @@ public class GDataImporter implements UrlImporter { * @throws Exception * if error in retrieving the spreadsheet information */ - public List<SpreadsheetEntry> getSpreadsheetEntries( - SpreadsheetService service) throws Exception { + static public List<SpreadsheetEntry> getSpreadsheetEntries( + SpreadsheetService service + ) throws Exception { SpreadsheetFeed feed = service.getFeed( - factory.getSpreadsheetsFeedUrl(), SpreadsheetFeed.class); + GDataExtension.getFeedUrlFactory().getSpreadsheetsFeedUrl(), + SpreadsheetFeed.class); return feed.getEntries(); } - - public List<WorksheetEntry> getWorksheetEntries(SpreadsheetService service, - String spreadsheetKey) throws MalformedURLException, IOException, - ServiceException { - WorksheetFeed feed = service - .getFeed(factory.getWorksheetFeedUrl(spreadsheetKey, "public", - "values"), WorksheetFeed.class); + + static public List<WorksheetEntry> getWorksheetEntries( + SpreadsheetService service, String spreadsheetKey + ) throws MalformedURLException, IOException, ServiceException { + WorksheetFeed feed = service.getFeed( + GDataExtension.getFeedUrlFactory().getWorksheetFeedUrl(spreadsheetKey, "public", "values"), + WorksheetFeed.class); return feed.getEntries(); } - - /** - * Retrieves the columns headers from the cell feed of the worksheet entry. - * - * @param worksheet - * worksheet entry containing the cell feed in question - * @return a list of column headers - * @throws Exception - * if error in retrieving the spreadsheet information - */ - public List<String> getColumnHeaders(SpreadsheetService service, - WorksheetEntry worksheet, int startRow, int rows) throws Exception { - List<String> headers = new ArrayList<String>(); - - // Get the appropriate URL for a cell feed + + static public List<List<Object>> getRowsOfCells( + SpreadsheetService service, + WorksheetEntry worksheet, + int startRow, // 1-based + int rowCount + ) throws IOException, ServiceException { URL cellFeedUrl = worksheet.getCellFeedUrl(); - // Create a query for the cells in the header row(s) (1-based) - CellQuery cellQuery = new CellQuery(cellFeedUrl); - if (startRow > 0) { - cellQuery.setMinimumRow(startRow + 1); - } - cellQuery.setMaximumRow(startRow + rows); - - // Get the cell feed matching the query - CellFeed topRowCellFeed = service.query(cellQuery, CellFeed.class); - - // Get the cell entries from the feed - List<CellEntry> cellEntries = topRowCellFeed.getEntries(); - for (CellEntry entry : cellEntries) { - - // Get the cell element from the entry - com.google.gdata.data.spreadsheet.Cell cell = entry.getCell(); - int r = cell.getRow() - 1; - if (cell != null) { - if (r == startRow) { - headers.add(cell.getValue().trim()); - } else if (r < startRow + rows) { - headers.set(r, headers.get(r) + " " - + cell.getValue().trim()); - } - } - } - - return headers; - } - - public List<CellEntry> getCells(SpreadsheetService service, - WorksheetEntry worksheet, int startRow) throws IOException, - ServiceException { - - URL cellFeedUrl = worksheet.getCellFeedUrl(); - - // Create a query skipping the desired number of rows - CellQuery cellQuery = new CellQuery(cellFeedUrl); - cellQuery.setMinimumRow(startRow + 1); // 1-based - int rows = worksheet.getRowCount(); - cellQuery.setMaximumRow(rows); - // cellQuery.setMinimumCol(1); + int minRow = Math.max(1, startRow); + int maxRow = Math.min(worksheet.getRowCount(), startRow + rowCount - 1); + int rows = maxRow - minRow + 1; int cols = worksheet.getColCount(); + + CellQuery cellQuery = new CellQuery(cellFeedUrl); + cellQuery.setMinimumRow(minRow); + cellQuery.setMaximumRow(maxRow); cellQuery.setMaximumCol(cols);
cellQuery.setMaxResults(rows * cols); cellQuery.setReturnEmpty(false); - + CellFeed cellFeed = service.query(cellQuery, CellFeed.class); - return cellFeed.getEntries(); - } - - List<ListEntry> getListEntries(SpreadsheetService service, - WorksheetEntry worksheet) throws IOException, ServiceException { - URL listFeedUrl = worksheet.getListFeedUrl(); - ListFeed feed = service.getFeed(listFeedUrl, ListFeed.class); - return feed.getEntries(); - } - - @Override - public boolean canImportData(String contentType, String filename) { - return false; - } - - @Override - public boolean canImportData(URL url) { - return isSpreadsheetURL(url) || isFusionTableURL(url); - } - - private boolean isSpreadsheetURL(URL url) { - String host = url.getHost(); - String query = url.getQuery(); - if (query == null) { - query = ""; + List<CellEntry> cellEntries = cellFeed.getEntries(); + + List<List<Object>> rowsOfCells = new ArrayList<List<Object>>(rows); + for (CellEntry cellEntry : cellEntries) { + Cell cell = cellEntry.getCell(); + int row = cell.getRow(); + int col = cell.getCol(); + + while (row > rowsOfCells.size()) { + rowsOfCells.add(new ArrayList<Object>(cols)); + } + List<Object> rowOfCells = rowsOfCells.get(row - 1); // 1-based + + while (col > rowOfCells.size()) { + rowOfCells.add(null); + } + rowOfCells.set(col - 1, cell.getValue()); } - // http://spreadsheets.google.com/ccc?key=tI36b9Fxk1lFBS83iR_3XQA&hl=en - return host.endsWith(".google.com") - && host.contains("spreadsheet") - && getSpreadsheetKey(url) != null; + return rowsOfCells; } - - private boolean isFusionTableURL(URL url) { - // http://www.google.com/fusiontables/DataSource?dsrcid=1219 - String query = url.getQuery(); - if (query == null) { - query = ""; - } - return url.getHost().endsWith(".google.com") - && url.getPath().startsWith("/fusiontables/DataSource") - && getFusionTableKey(url) != null; - } - - // Modified version of FeedURLFactor.getSpreadsheetKeyFromUrl() + + // Modified version of FeedURLFactory.getSpreadsheetKeyFromUrl() private String getSpreadsheetKey(URL url) { String query = url.getQuery(); if (query != null) { @@ -472,23 +253,4 @@ } return null; } - - private String getFusionTableKey(URL url) { - String query = url.getQuery(); - if (query != null) { - String[] parts = query.split("&"); - for (String part : parts) { - if (part.startsWith("dsrcid=")) { - int offset = ("dsrcid=").length(); - String tableId = part.substring(offset); - // TODO: Any special id format considerations to worry about? -// if (tableId.startsWith("p") || !tableId.contains(".")) { -// return tableId; -// } - return tableId; - } - } - } - return null; - } } \ No newline at end of file diff --git a/extensions/gdata/src/com/google/refine/extension/gdata/GDataUrlRewriter.java b/extensions/gdata/src/com/google/refine/extension/gdata/GDataUrlRewriter.java new file mode 100644 index 000000000..c054f6187 --- /dev/null +++ b/extensions/gdata/src/com/google/refine/extension/gdata/GDataUrlRewriter.java @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2010, Thomas F. Morris + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * - Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * Neither the name of Google nor the names of its contributors may be used to + * endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +package com.google.refine.extension.gdata; + +import java.net.MalformedURLException; +import java.net.URL; + +import com.google.refine.importing.UrlRewriter; + +/** + * @author Tom Morris + * @copyright 2010 Thomas F. Morris + * @license New BSD http://www.opensource.org/licenses/bsd-license.php + */ +public class GDataUrlRewriter implements UrlRewriter { + + @Override + public Result rewrite(String urlString) { + try { + URL url = new URL(urlString); + if (isSpreadsheetURL(url)) { + Result result = new Result(); + result.rewrittenUrl = urlString; + result.format = "service/gdata/spreadsheet"; + result.download = false; + return result; + } + } catch (MalformedURLException e) { + // Ignore + } + return null; + } + + static public boolean isSpreadsheetURL(URL url) { + String host = url.getHost(); + String query = url.getQuery(); + if (query == null) { + query = ""; + } + // http://spreadsheets.google.com/ccc?key=tI36b9Fxk1lFBS83iR_3XQA&hl=en + return host.endsWith(".google.com") && host.contains("spreadsheet") && query.contains("key="); + } +} diff --git a/main/src/com/google/refine/HttpResponder.java b/main/src/com/google/refine/HttpResponder.java new file mode 100644 index 000000000..3ed70a883 --- /dev/null +++ b/main/src/com/google/refine/HttpResponder.java @@ -0,0 +1,19 @@ +package com.google.refine; + +import java.io.IOException; + +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import com.google.refine.RefineServlet; + +public interface HttpResponder { + public void init(RefineServlet servlet); + + public void doPost(HttpServletRequest request, HttpServletResponse response) + throws ServletException, IOException; + + public void doGet(HttpServletRequest request, HttpServletResponse response) + throws ServletException, IOException; +} diff --git a/main/src/com/google/refine/RefineServlet.java b/main/src/com/google/refine/RefineServlet.java index 12e0f1fd6..b7517e871 100644 --- a/main/src/com/google/refine/RefineServlet.java +++ b/main/src/com/google/refine/RefineServlet.java @@ -50,7 +50,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.refine.commands.Command; -import com.google.refine.commands.importing.ImportManager; +import com.google.refine.importing.ImportingManager; 
import com.google.refine.io.FileProjectManager; import edu.mit.simile.butterfly.Butterfly; @@ -125,7 +125,7 @@ public class RefineServlet extends Butterfly { s_dataDir = new File(data); FileProjectManager.initialize(s_dataDir); - ImportManager.initialize(this); + ImportingManager.initialize(this); if (_timer == null) { _timer = new Timer("autosave"); diff --git a/main/src/com/google/refine/commands/HttpUtilities.java b/main/src/com/google/refine/commands/HttpUtilities.java new file mode 100644 index 000000000..a7495652d --- /dev/null +++ b/main/src/com/google/refine/commands/HttpUtilities.java @@ -0,0 +1,180 @@ +package com.google.refine.commands; + +import java.io.IOException; +import java.io.PrintWriter; +import java.io.StringWriter; +import java.io.Writer; +import java.util.Properties; + +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.apache.velocity.VelocityContext; +import org.json.JSONException; +import org.json.JSONObject; +import org.json.JSONWriter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.refine.Jsonizable; +import com.google.refine.RefineServlet; +import com.google.refine.util.ParsingUtilities; + +abstract public class HttpUtilities { + final static protected Logger logger = LoggerFactory.getLogger("command"); + + static public void respond(HttpServletResponse response, String content) + throws IOException, ServletException { + + response.setCharacterEncoding("UTF-8"); + response.setStatus(HttpServletResponse.SC_OK); + Writer w = response.getWriter(); + if (w != null) { + w.write(content); + w.flush(); + w.close(); + } else { + throw new ServletException("response returned a null writer"); + } + } + + static public void respond(HttpServletResponse response, String status, String message) + throws IOException { + + Writer w = response.getWriter(); + try { + JSONWriter writer = new JSONWriter(w); + writer.object(); + writer.key("status"); writer.value(status); + writer.key("message"); writer.value(message); + writer.endObject(); + w.flush(); + w.close(); + } catch (JSONException e) { + // This can never occur + } + } + + static public void respondJSON(HttpServletResponse response, Jsonizable o) + throws IOException, JSONException { + + respondJSON(response, o, new Properties()); + } + + static public void respondJSON( + HttpServletResponse response, Jsonizable o, Properties options) + throws IOException, JSONException { + + response.setCharacterEncoding("UTF-8"); + response.setHeader("Content-Type", "application/json"); + + Writer w = response.getWriter(); + JSONWriter writer = new JSONWriter(w); + + o.write(writer, options); + w.flush(); + w.close(); + } + + static public void respondException(HttpServletResponse response, Exception e) + throws IOException, ServletException { + + logger.warn("Exception caught", e); + + if (response == null) { + throw new ServletException("Response object can't be null"); + } + + try { + JSONObject o = new JSONObject(); + o.put("code", "error"); + o.put("message", e.getMessage()); + + StringWriter sw = new StringWriter(); + PrintWriter pw = new PrintWriter(sw); + e.printStackTrace(pw); + pw.flush(); + sw.flush(); + + o.put("stack", sw.toString()); + + response.setCharacterEncoding("UTF-8"); + response.setHeader("Content-Type", "application/json"); + respond(response, o.toString()); + } catch (JSONException e1) { + e.printStackTrace(response.getWriter()); + } + } + + static public void
redirect(HttpServletResponse response, String url) throws IOException { + response.sendRedirect(url); + } + + static public int getIntegerParameter(HttpServletRequest request, String name, int def) { + if (request == null) throw new IllegalArgumentException("parameter 'request' should not be null"); + try { + return Integer.parseInt(request.getParameter(name)); + } catch (Exception e) { + logger.warn("Error getting integer parameter", e); + } + return def; + } + + static public JSONObject getJsonParameter(HttpServletRequest request, String name) { + if (request == null) throw new IllegalArgumentException("parameter 'request' should not be null"); + String value = request.getParameter(name); + if (value != null) { + try { + return ParsingUtilities.evaluateJsonStringToObject(value); + } catch (JSONException e) { + logger.warn("Error getting json parameter", e); + } + } + return null; + } + + static public void respondWithErrorPage( + RefineServlet servlet, + HttpServletRequest request, + HttpServletResponse response, + String message, + Throwable e + ) { + respondWithErrorPage(servlet, request, response, message, + HttpServletResponse.SC_INTERNAL_SERVER_ERROR, e); + } + + static public void respondWithErrorPage( + RefineServlet servlet, + HttpServletRequest request, + HttpServletResponse response, + String message, + int status, + Throwable e + ) { + VelocityContext context = new VelocityContext(); + + context.put("message", message); + + if (e != null) { + StringWriter writer = new StringWriter(); + + e.printStackTrace(new PrintWriter(writer)); + + context.put("stack", writer.toString()); + } else { + context.put("stack", ""); + } + + try { + response.setStatus(status); + + servlet.getModule("core").sendTextFromTemplate( + request, response, context, "error.vt", "UTF-8", "text/html", true); + + } catch (Exception e1) { + e1.printStackTrace(); + } + } +} diff --git a/main/src/com/google/refine/commands/importing/CancelImportingJobCommand.java b/main/src/com/google/refine/commands/importing/CancelImportingJobCommand.java new file mode 100644 index 000000000..2b63c5897 --- /dev/null +++ b/main/src/com/google/refine/commands/importing/CancelImportingJobCommand.java @@ -0,0 +1,61 @@ +/* + +Copyright 2011, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +package com.google.refine.commands.importing; + +import java.io.IOException; + +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import com.google.refine.commands.Command; +import com.google.refine.commands.HttpUtilities; +import com.google.refine.importing.ImportingJob; +import com.google.refine.importing.ImportingManager; + +public class CancelImportingJobCommand extends Command { + @Override + public void doPost(HttpServletRequest request, HttpServletResponse response) + throws ServletException, IOException { + + long jobID = Long.parseLong(request.getParameter("jobID")); + ImportingJob job = ImportingManager.getJob(jobID); + if (job == null) { + HttpUtilities.respond(response, "error", "No such import job"); + } else { + job.canceled = true; + HttpUtilities.respond(response, "ok", "Job canceled"); + } + } +} diff --git a/main/src/com/google/refine/commands/importing/CreateImportJobCommand.java b/main/src/com/google/refine/commands/importing/CreateImportingJobCommand.java similarity index 93% rename from main/src/com/google/refine/commands/importing/CreateImportJobCommand.java rename to main/src/com/google/refine/commands/importing/CreateImportingJobCommand.java index e5dc135e9..27ed84979 100644 --- a/main/src/com/google/refine/commands/importing/CreateImportJobCommand.java +++ b/main/src/com/google/refine/commands/importing/CreateImportingJobCommand.java @@ -43,8 +43,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.refine.commands.Command; +import com.google.refine.importing.ImportingManager; -public class CreateImportJobCommand extends Command { +public class CreateImportingJobCommand extends Command { final static Logger logger = LoggerFactory.getLogger("create-import-job_command"); @@ -52,7 +53,7 @@ public class CreateImportJobCommand extends Command { public void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { - long id = ImportManager.singleton().createJob().id; + long id = ImportingManager.createJob().id; response.setCharacterEncoding("UTF-8"); response.setHeader("Content-Type", "application/json"); diff --git a/main/src/com/google/refine/importers/StreamImporter.java b/main/src/com/google/refine/commands/importing/GetImportingConfigurationCommand.java similarity index 59% rename from main/src/com/google/refine/importers/StreamImporter.java rename to main/src/com/google/refine/commands/importing/GetImportingConfigurationCommand.java index 71fb166dc..19a7b40ec 100644 --- a/main/src/com/google/refine/importers/StreamImporter.java +++ b/main/src/com/google/refine/commands/importing/GetImportingConfigurationCommand.java @@ -1,6 +1,6 @@ /* -Copyright 2010, Google Inc. +Copyright 2011, Google Inc. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -31,24 +31,38 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -package com.google.refine.importers; +package com.google.refine.commands.importing; -import java.io.InputStream; +import java.io.IOException; +import java.io.Writer; import java.util.Properties; -import com.google.refine.ProjectMetadata; -import com.google.refine.model.Project; +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; -public interface StreamImporter extends Importer { +import org.json.JSONException; +import org.json.JSONWriter; - /** - * @param inputStream stream to be imported - * @param project project to import stream into - * @param metadata metadata of new project - * @param options - * @throws ImportException - */ - public void read(InputStream inputStream, Project project, - ProjectMetadata metadata, Properties options) throws ImportException; +import com.google.refine.commands.Command; +import com.google.refine.importing.ImportingManager; +public class GetImportingConfigurationCommand extends Command { + @Override + public void doPost(HttpServletRequest request, HttpServletResponse response) + throws ServletException, IOException { + + Writer w = response.getWriter(); + JSONWriter writer = new JSONWriter(w); + try { + writer.object(); + writer.key("config"); ImportingManager.writeConfiguration(writer, new Properties()); + writer.endObject(); + } catch (JSONException e) { + throw new ServletException(e); + } finally { + w.flush(); + w.close(); + } + } } diff --git a/main/src/com/google/refine/commands/importing/GetImportJobStatusCommand.java b/main/src/com/google/refine/commands/importing/GetImportingJobStatusCommand.java similarity index 63% rename from main/src/com/google/refine/commands/importing/GetImportJobStatusCommand.java rename to main/src/com/google/refine/commands/importing/GetImportingJobStatusCommand.java index d9d2b997e..7c982c35a 100644 --- a/main/src/com/google/refine/commands/importing/GetImportJobStatusCommand.java +++ b/main/src/com/google/refine/commands/importing/GetImportingJobStatusCommand.java @@ -34,9 +34,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
package com.google.refine.commands.importing; import java.io.IOException; -import java.io.PrintWriter; -import java.io.StringWriter; import java.io.Writer; +import java.util.Properties; import javax.servlet.ServletException; import javax.servlet.http.HttpServletRequest; @@ -44,22 +43,18 @@ import javax.servlet.http.HttpServletResponse; import org.json.JSONException; import org.json.JSONWriter; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import com.google.refine.commands.Command; -import com.google.refine.commands.importing.ImportJob.State; - -public class GetImportJobStatusCommand extends Command { - - final static Logger logger = LoggerFactory.getLogger("get-import-job-status_command"); +import com.google.refine.importing.ImportingJob; +import com.google.refine.importing.ImportingManager; +public class GetImportingJobStatusCommand extends Command { @Override public void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { long jobID = Long.parseLong(request.getParameter("jobID")); - ImportJob job = ImportManager.singleton().getJob(jobID); + ImportingJob job = ImportingManager.getJob(jobID); Writer w = response.getWriter(); JSONWriter writer = new JSONWriter(w); @@ -70,32 +65,11 @@ public class GetImportJobStatusCommand extends Command { writer.key("message"); writer.value("No such import job"); } else { writer.key("code"); writer.value("ok"); - writer.key("state"); - if (job.state == State.NEW) { - writer.value("new"); - } else if (job.state == State.RETRIEVING_DATA) { - writer.value("retrieving"); - writer.key("progress"); writer.value(job.retrievingProgress); - writer.key("bytesSaved"); writer.value(job.bytesSaved); - } else if (job.state == State.READY) { - writer.value("ready"); - } else if (job.state == State.ERROR) { - writer.value("error"); - writer.key("message"); writer.value(job.errorMessage); - if (job.exception != null) { - StringWriter sw = new StringWriter(); - PrintWriter pw = new PrintWriter(sw); - job.exception.printStackTrace(pw); - pw.flush(); - sw.flush(); - - writer.key("stack"); writer.value(sw.toString()); - } - } + writer.key("job"); job.write(writer, new Properties()); } writer.endObject(); } catch (JSONException e) { - throw new IOException(e); + throw new ServletException(e); } finally { w.flush(); w.close(); diff --git a/main/src/com/google/refine/commands/importing/ImportJob.java b/main/src/com/google/refine/commands/importing/ImportJob.java deleted file mode 100644 index 4ff2cd1f0..000000000 --- a/main/src/com/google/refine/commands/importing/ImportJob.java +++ /dev/null @@ -1,49 +0,0 @@ -package com.google.refine.commands.importing; - -import java.io.File; -import java.io.IOException; - -import org.apache.commons.io.FileUtils; - -import com.google.refine.model.meta.ImportSource; - -public class ImportJob { - static public enum State { - NEW, - RETRIEVING_DATA, - READY, - ERROR - } - - final public long id; - final public File dir; - - public long lastTouched; - public State state = State.NEW; - - // Data for retrieving phase - public int retrievingProgress = 0; // from 0 to 100 - public long bytesSaved = 0; // in case percentage is unknown - public String errorMessage; - public Throwable exception; - - public ImportSource importSource; - - public ImportJob(long id, File dir) { - this.id = id; - this.dir = dir; - - dir.mkdirs(); - } - - public void touch() { - lastTouched = System.currentTimeMillis(); - } - - public void dispose() { - try { - FileUtils.deleteDirectory(dir); - } catch 
(IOException e) { - } - } -} diff --git a/main/src/com/google/refine/commands/importing/ImportManager.java b/main/src/com/google/refine/commands/importing/ImportManager.java deleted file mode 100644 index d5d511744..000000000 --- a/main/src/com/google/refine/commands/importing/ImportManager.java +++ /dev/null @@ -1,101 +0,0 @@ -package com.google.refine.commands.importing; - -import java.io.File; -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; - -import org.apache.commons.io.FileUtils; - -import com.google.refine.RefineServlet; -import com.google.refine.model.meta.ImportSource; - -public class ImportManager { - static final private Map> nameToImportSourceClass = - new HashMap>(); - - static final private Map importSourceClassNameToName = - new HashMap(); - - /** - * Register a single import source class. - * - * @param name importer verb for importer - * @param importerObject object implementing the importer - * - * @return true if importer was loaded and registered successfully - */ - static public boolean registerImportSourceClass(String name, Class klass) { - if (nameToImportSourceClass.containsKey(name)) { - return false; - } - nameToImportSourceClass.put(name, klass); - importSourceClassNameToName.put(klass.getName(), name); - return true; - } - - static public Class getImportSourceClass(String name) { - return nameToImportSourceClass.get(name); - } - - static public String getImportSourceClassName(Class klass) { - return importSourceClassNameToName.get(klass.getName()); - } - - final private RefineServlet servlet; - final private Map jobs = new HashMap(); - private File importDir; - - static private ImportManager singleton; - - static public void initialize(RefineServlet servlet) { - singleton = new ImportManager(servlet); - } - - static public ImportManager singleton() { - return singleton; - } - - private ImportManager(RefineServlet servlet) { - this.servlet = servlet; - } - - private File getImportDir() { - if (importDir == null) { - File tempDir = servlet.getTempDir(); - importDir = tempDir == null ? 
new File(".import-temp") : new File(tempDir, "import"); - - if (importDir.exists()) { - try { - // start fresh - FileUtils.deleteDirectory(importDir); - } catch (IOException e) { - } - } - importDir.mkdirs(); - } - return importDir; - } - - public ImportJob createJob() { - long id = System.currentTimeMillis() + (long) (Math.random() * 1000000); - File jobDir = new File(getImportDir(), Long.toString(id)); - - ImportJob job = new ImportJob(id, jobDir); - jobs.put(id, job); - - return job; - } - - public ImportJob getJob(long id) { - return jobs.get(id); - } - - public void disposeJob(long id) { - ImportJob job = getJob(id); - if (job != null) { - job.dispose(); - jobs.remove(id); - } - } -} diff --git a/main/src/com/google/refine/commands/importing/RetrieveImportContentCommand.java b/main/src/com/google/refine/commands/importing/ImportingControllerCommand.java similarity index 61% rename from main/src/com/google/refine/commands/importing/RetrieveImportContentCommand.java rename to main/src/com/google/refine/commands/importing/ImportingControllerCommand.java index 5d446a244..02a279db9 100644 --- a/main/src/com/google/refine/commands/importing/RetrieveImportContentCommand.java +++ b/main/src/com/google/refine/commands/importing/ImportingControllerCommand.java @@ -44,18 +44,40 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.refine.commands.Command; -import com.google.refine.commands.importing.ImportJob.State; -import com.google.refine.model.meta.ImportSource; +import com.google.refine.commands.HttpUtilities; +import com.google.refine.importing.ImportingController; +import com.google.refine.importing.ImportingManager; import com.google.refine.util.ParsingUtilities; -public class RetrieveImportContentCommand extends Command { - - final static Logger logger = LoggerFactory.getLogger("retrieve-import-content_command"); +public class ImportingControllerCommand extends Command { + final static Logger logger = LoggerFactory.getLogger("importing-controller_command"); + @Override public void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + ImportingController controller = getController(request); + if (controller != null) { + controller.doPost(request, response); + } else { + HttpUtilities.respond(response, "error", "No such import controller"); + } + } + + @Override + public void doGet(HttpServletRequest request, HttpServletResponse response) + throws ServletException, IOException { + + ImportingController controller = getController(request); + if (controller != null) { + controller.doPost(request, response); + } else { + HttpUtilities.respond(response, "error", "No such import controller"); + } + } + + private ImportingController getController(HttpServletRequest request) { /* * The uploaded file is in the POST body as a "file part". If * we call request.getParameter() then the POST body will get @@ -64,39 +86,10 @@ public class RetrieveImportContentCommand extends Command { * Don't call request.getParameter() before calling internalImport(). 
*/ Properties options = ParsingUtilities.parseUrlParameters(request); - - long jobID = Long.parseLong(options.getProperty("jobID")); - ImportJob job = ImportManager.singleton().getJob(jobID); - if (job == null) { - respondWithErrorPage(request, response, "No such import job", null); - return; - } else if (job.state != State.NEW) { - respondWithErrorPage(request, response, "Import job already started", null); - return; - } - - Class importSourceClass = - ImportManager.getImportSourceClass(options.getProperty("source")); - if (importSourceClass == null) { - respondWithErrorPage(request, response, "No such import source class", null); - return; - } - - try { - ImportSource importSource = importSourceClass.newInstance(); - job.importSource = importSource; - job.state = State.RETRIEVING_DATA; - - importSource.retrieveContent(request, options, job); - - job.retrievingProgress = 100; - job.state = State.READY; - } catch (Throwable e) {e.printStackTrace(); - job.state = State.ERROR; - job.errorMessage = e.getLocalizedMessage(); - job.exception = e; - - respondWithErrorPage(request, response, "Failed to kick start import job", e); + String name = options.getProperty("controller"); + if (name != null) { + return ImportingManager.controllers.get(name); } + return null; } } diff --git a/main/src/com/google/refine/commands/project/CreateProjectCommand.java b/main/src/com/google/refine/commands/project/CreateProjectCommand.java index bdf75e28f..05aeba61e 100644 --- a/main/src/com/google/refine/commands/project/CreateProjectCommand.java +++ b/main/src/com/google/refine/commands/project/CreateProjectCommand.java @@ -1,6 +1,6 @@ /* -Copyright 2010,2011. Google Inc. +Copyright 2010, Google Inc. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,60 +33,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
package com.google.refine.commands.project; -import java.io.BufferedInputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FilterInputStream; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.Reader; -import java.io.Serializable; -import java.io.UnsupportedEncodingException; -import java.net.URI; -import java.net.URL; -import java.net.URLConnection; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; import java.util.Properties; -import java.util.zip.GZIPInputStream; -import java.util.zip.ZipEntry; -import java.util.zip.ZipInputStream; import javax.servlet.ServletException; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; -import org.apache.commons.fileupload.FileItemIterator; -import org.apache.commons.fileupload.FileItemStream; -import org.apache.commons.fileupload.servlet.ServletFileUpload; -import org.apache.commons.fileupload.util.Streams; -import org.apache.tools.bzip2.CBZip2InputStream; -import org.apache.tools.tar.TarEntry; -import org.apache.tools.tar.TarInputStream; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.refine.ProjectManager; import com.google.refine.ProjectMetadata; import com.google.refine.commands.Command; -import com.google.refine.importers.Importer; -import com.google.refine.importers.ImporterRegistry; -import com.google.refine.importers.ReaderImporter; -import com.google.refine.importers.StreamImporter; -import com.google.refine.importers.TsvCsvImporter; -import com.google.refine.importers.UrlImporter; +import com.google.refine.commands.HttpUtilities; import com.google.refine.model.Project; -import com.google.refine.util.IOUtils; import com.google.refine.util.ParsingUtilities; -import com.ibm.icu.text.CharsetDetector; -import com.ibm.icu.text.CharsetMatch; public class CreateProjectCommand extends Command { @@ -99,13 +61,6 @@ public class CreateProjectCommand extends Command { ProjectManager.singleton.setBusy(true); try { - /* - * Set UTF-8 as request encoding, then ServletFileUpload will use it as default encoding - */ - if (request.getCharacterEncoding() == null) { - request.setCharacterEncoding("UTF-8"); - } - /* * The uploaded file is in the POST body as a "file part". 
If * we call request.getParameter() then the POST body will get @@ -118,7 +73,7 @@ public class CreateProjectCommand extends Command { Project project = new Project(); ProjectMetadata pm = new ProjectMetadata(); - internalImport(request, project, pm, options); + //internalImport(request, project, pm, options); /* * The import process above populates options with parameters @@ -133,382 +88,11 @@ public class CreateProjectCommand extends Command { project.update(); - redirect(response, "/project?project=" + project.id); + HttpUtilities.redirect(response, "/project?project=" + project.id); } catch (Exception e) { respondWithErrorPage(request, response, "Failed to import file", e); } finally { ProjectManager.singleton.setBusy(false); } } - - protected void internalImport( - HttpServletRequest request, - Project project, - ProjectMetadata metadata, - Properties options - ) throws Exception { - - ServletFileUpload upload = new ServletFileUpload(); - String url = options.getProperty("url"); - boolean imported = false; - - FileItemIterator iter = upload.getItemIterator(request); - while (iter.hasNext()) { - FileItemStream item = iter.next(); - String name = item.getFieldName().toLowerCase(); - InputStream stream = item.openStream(); - if (item.isFormField()) { - if (name.equals("raw-text")) { - Reader reader = new InputStreamReader(stream,request.getCharacterEncoding()); - try { - internalInvokeImporter(project, new TsvCsvImporter(), metadata, options, reader); - imported = true; - } finally { - reader.close(); - } - } else if (name.equals("project-url")) { - url = Streams.asString(stream, request.getCharacterEncoding()); - } else { - options.put(name, Streams.asString(stream, request.getCharacterEncoding())); - } - } else { - String fileName = item.getName().toLowerCase(); - if (fileName.length() > 0) { - try { - internalImportFile(project, metadata, options, fileName, stream); - imported = true; - } finally { - stream.close(); - } - } - } - } - - if (!imported && url != null && url.length() > 0) { - internalImportURL(request, project, metadata, options, url); - } - } - - static class SafeInputStream extends FilterInputStream { - public SafeInputStream(InputStream stream) { - super(stream); - } - - @Override - public void close() { - // some libraries attempt to close the input stream while they can't - // read anymore from it... 
unfortunately this behavior prevents - // the zip input stream from functioning correctly so we just have - // to ignore those close() calls and just close it ourselves - // forcefully later - } - - public void reallyClose() throws IOException { - super.close(); - } - } - - protected void internalImportFile( - Project project, - ProjectMetadata metadata, - Properties options, - String fileName, - InputStream inputStream - ) throws Exception { - - logger.info("Importing '{}'", fileName); - - if (fileName.endsWith(".zip") || fileName.endsWith(".tar") || fileName.endsWith(".tar.gz") || fileName.endsWith(".tgz") || fileName.endsWith(".tar.bz2")) { - - // first, save the file on disk, since we need two passes and we might - // not have enough memory to keep it all in there - File file = save(inputStream); - - // in the first pass, gather statistics about what files are in there - // unfortunately, we have to rely on files extensions, which is horrible but - // better than nothing - HashMap ext_map = new HashMap(); - - FileInputStream fis = new FileInputStream(file); - InputStream is = getStream(fileName, fis); - - // NOTE(SM): unfortunately, java.io does not provide any generalized class for - // archive-like input streams so while both TarInputStream and ZipInputStream - // behave precisely the same, there is no polymorphic behavior so we have - // to treat each instance explicitly... one of those times you wish you had - // closures - try { - if (is instanceof TarInputStream) { - TarInputStream tis = (TarInputStream) is; - TarEntry te; - while ((te = tis.getNextEntry()) != null) { - if (!te.isDirectory()) { - mapExtension(te.getName(),ext_map); - } - } - } else if (is instanceof ZipInputStream) { - ZipInputStream zis = (ZipInputStream) is; - ZipEntry ze; - while ((ze = zis.getNextEntry()) != null) { - if (!ze.isDirectory()) { - mapExtension(ze.getName(),ext_map); - } - } - } - } finally { - try { - is.close(); - fis.close(); - } catch (IOException e) {} - } - - // sort extensions by how often they appear - List> values = new ArrayList>(ext_map.entrySet()); - Collections.sort(values, new ValuesComparator()); - - if (values.size() == 0) { - throw new RuntimeException("The archive contains no files."); - } - - // this will contain the set of extensions we'll load from the archive - HashSet exts = new HashSet(); - - // find the extension that is most frequent or those who share the highest frequency value - if (values.size() == 1) { - exts.add(values.get(0).getKey()); - } else { - Entry most_frequent = values.get(0); - Entry second_most_frequent = values.get(1); - if (most_frequent.getValue() > second_most_frequent.getValue()) { // we have a winner - exts.add(most_frequent.getKey()); - } else { // multiple extensions have the same frequency - int winning_frequency = most_frequent.getValue(); - for (Entry e : values) { - if (e.getValue() == winning_frequency) { - exts.add(e.getKey()); - } - } - } - } - - logger.info("Most frequent extensions: {}", exts.toString()); - - // second pass, load the data for real - is = getStream(fileName, new FileInputStream(file)); - SafeInputStream sis = new SafeInputStream(is); - try { - if (is instanceof TarInputStream) { - TarInputStream tis = (TarInputStream) is; - TarEntry te; - while ((te = tis.getNextEntry()) != null) { - if (!te.isDirectory()) { - String name = te.getName(); - String ext = getExtension(name)[1]; - if (exts.contains(ext)) { - internalImportFile(project, metadata, options, name, sis); - } - } - } - } else if (is instanceof ZipInputStream) { - 
ZipInputStream zis = (ZipInputStream) is; - ZipEntry ze; - while ((ze = zis.getNextEntry()) != null) { - if (!ze.isDirectory()) { - String name = ze.getName(); - String ext = getExtension(name)[1]; - if (exts.contains(ext)) { - internalImportFile(project, metadata, options, name, sis); - } - } - } - } - } finally { - try { - sis.reallyClose(); - } catch (IOException e) {} - } - - } else if (fileName.endsWith(".gz")) { - internalImportFile(project, metadata, options, getExtension(fileName)[0], new GZIPInputStream(inputStream)); - } else if (fileName.endsWith(".bz2")) { - internalImportFile(project, metadata, options, getExtension(fileName)[0], new CBZip2InputStream(inputStream)); - } else { - load(project, metadata, options, fileName, inputStream); - } - } - - public static class ValuesComparator implements Comparator>, Serializable { - private static final long serialVersionUID = 8845863616149837657L; - - public int compare(Entry o1, Entry o2) { - return o2.getValue() - o1.getValue(); - } - } - - private void load(Project project, ProjectMetadata metadata, Properties options, String fileName, InputStream inputStream) throws Exception { - Importer importer = ImporterRegistry.guessImporter(null, fileName); - internalInvokeImporter(project, importer, metadata, options, inputStream, null); - } - - private File save(InputStream is) throws IOException { - File temp = this.servlet.getTempFile(Long.toString(System.currentTimeMillis())); - temp.deleteOnExit(); - IOUtils.copy(is,temp); - is.close(); - return temp; - } - - private void mapExtension(String name, Map ext_map) { - String ext = getExtension(name)[1]; - if (ext_map.containsKey(ext)) { - ext_map.put(ext, ext_map.get(ext) + 1); - } else { - ext_map.put(ext, 1); - } - } - - private InputStream getStream(String fileName, InputStream is) throws IOException { - if (fileName.endsWith(".tar.gz") || fileName.endsWith(".tgz")) { - return new TarInputStream(new GZIPInputStream(is)); - } else if (fileName.endsWith(".tar.bz2")) { - return new TarInputStream(new CBZip2InputStream(is)); - } else if (fileName.endsWith(".tar")) { - return new TarInputStream(is); - } else { - return new ZipInputStream(is); - } - } - - private String[] getExtension(String filename) { - String[] result = new String[2]; - int ext_index = filename.lastIndexOf('.'); - result[0] = (ext_index == -1) ? filename : filename.substring(0,ext_index); - result[1] = (ext_index == -1) ? "" : filename.substring(ext_index + 1); - return result; - } - - protected void internalImportURL( - HttpServletRequest request, - Project project, - ProjectMetadata metadata, - Properties options, - String urlString) throws Exception { - - // Little dance to get URL properly encoded (e.g. 
for funky Fusion Tables queries) - URL url = new URL(urlString); - url = new URI(url.getProtocol(), url.getHost(), url.getPath(), url.getQuery(), null).toURL(); - - URLConnection connection = null; - - // Try for a URL importer first - Importer importer = ImporterRegistry.guessUrlImporter(url); - if (importer instanceof UrlImporter) { - ((UrlImporter) importer).read(url, project, metadata, options); - } else { - // If we couldn't find one, try opening URL and treating as a stream - try { - connection = url.openConnection(); - connection.setConnectTimeout(5000); - connection.connect(); - } catch (Exception e) { - throw new Exception("Cannot connect to " + urlString, e); - } - - InputStream inputStream = null; - try { - inputStream = connection.getInputStream(); - } catch (Exception e) { - throw new Exception("Cannot retrieve content from " + url, e); - } - - try { - String contentType = connection.getContentType(); - int semicolon = contentType.indexOf(';'); - if (semicolon >= 0) { - contentType = contentType.substring(0, semicolon); - } - - importer = ImporterRegistry.guessImporter(contentType, url.getPath()); - - internalInvokeImporter(project, importer, metadata, options, inputStream, connection.getContentEncoding()); - } finally { - inputStream.close(); - } - } - } - - protected void internalInvokeImporter( - Project project, - Importer importer, - ProjectMetadata metadata, - Properties options, - InputStream rawInputStream, - String encoding - ) throws Exception { - if (importer instanceof ReaderImporter) { - - // NOTE: The ICU4J char detection code requires the input stream to support mark/reset. - InputStream inputStream = rawInputStream; - if (!inputStream.markSupported()) { - inputStream = new BufferedInputStream(rawInputStream); - } - - CharsetDetector detector = new CharsetDetector(); - detector.setDeclaredEncoding("utf8"); // most of the content on the web is encoded in UTF-8 so start with that - options.setProperty("encoding_confidence", "0"); // in case we don't find anything suitable - - InputStreamReader reader = null; - CharsetMatch[] charsetMatches = detector.setText(inputStream).detectAll(); - for (CharsetMatch charsetMatch : charsetMatches) { // matches are ordered - first is best match - String matchName = charsetMatch.getName(); - int confidence = charsetMatch.getConfidence(); - // Threshold was 50. Do we ever want to not use our best guess even if it's low confidence? - tfmorris - if (confidence >= 20) { - logger.info("Encoding guess: {} [confidence: {}]", matchName, confidence); - try { - reader = new InputStreamReader(inputStream, matchName); - } catch (UnsupportedEncodingException e) { - logger.debug("Unsupported InputStreamReader charset encoding: {} [confidence: {}]; skipping", matchName, confidence); - continue; - } - // Encoding will be set later at common exit point - options.setProperty("encoding_confidence", Integer.toString(confidence)); - break; - } else { - logger.debug("Poor encoding guess: {} [confidence: {}]; skipping", matchName, confidence); - } - } - - if (reader == null) { // when all else fails - if (encoding != null) { - reader = new InputStreamReader(inputStream, encoding); - } else { - reader = new InputStreamReader(inputStream); - } - } - // Get the actual encoding which will be used and save it for project metadata - options.setProperty("encoding", reader.getEncoding()); - - ((ReaderImporter) importer).read(reader, project, metadata, options); - } else { - // TODO: How do we set character encoding here? 
- // Things won't work right if it's not set, so pick some arbitrary values - if (encoding != null) { - options.setProperty("encoding", encoding); - } - options.setProperty("encoding_confidence", "0"); - ((StreamImporter) importer).read(rawInputStream, project, metadata, options); - } - } - - protected void internalInvokeImporter( - Project project, - ReaderImporter importer, - ProjectMetadata metadata, - Properties options, - Reader reader - ) throws Exception { - importer.read(reader, project, metadata, options); - } - } diff --git a/main/src/com/google/refine/commands/project/GetModelsCommand.java b/main/src/com/google/refine/commands/project/GetModelsCommand.java index 80af20a23..7276f9848 100644 --- a/main/src/com/google/refine/commands/project/GetModelsCommand.java +++ b/main/src/com/google/refine/commands/project/GetModelsCommand.java @@ -44,17 +44,43 @@ import org.json.JSONException; import org.json.JSONWriter; import com.google.refine.commands.Command; +import com.google.refine.commands.HttpUtilities; import com.google.refine.expr.MetaParser; import com.google.refine.expr.MetaParser.LanguageInfo; +import com.google.refine.importing.ImportingJob; +import com.google.refine.importing.ImportingManager; import com.google.refine.model.OverlayModel; import com.google.refine.model.Project; public class GetModelsCommand extends Command { @Override - public void doGet(HttpServletRequest request, HttpServletResponse response) + public void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + internalRespond(request, response); + } + + @Override + public void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + internalRespond(request, response); + } + + protected void internalRespond(HttpServletRequest request, HttpServletResponse response) + throws ServletException, IOException { - Project project = getProject(request); + Project project = null; + + // This command also supports retrieving rows for an importing job. 
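+        // Parse the job ID, look it up via ImportingManager, and use the job's in-progress project
+        // when available; otherwise fall back to the regular getProject(request) lookup below.
+        // (GetRowsCommand applies the same importingJobID convention.)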
+ String importingJobID = request.getParameter("importingJobID"); + if (importingJobID != null) { + long jobID = Long.parseLong(importingJobID); + ImportingJob job = ImportingManager.getJob(jobID); + if (job != null) { + project = job.project; + } + } + if (project == null) { + project = getProject(request); + } try { response.setCharacterEncoding("UTF-8"); @@ -92,7 +118,7 @@ public class GetModelsCommand extends Command { writer.endObject(); } catch (JSONException e) { - respondException(response, e); + HttpUtilities.respondException(response, e); } } diff --git a/main/src/com/google/refine/commands/row/GetRowsCommand.java b/main/src/com/google/refine/commands/row/GetRowsCommand.java index 44bb31bca..3edb379c3 100644 --- a/main/src/com/google/refine/commands/row/GetRowsCommand.java +++ b/main/src/com/google/refine/commands/row/GetRowsCommand.java @@ -52,6 +52,8 @@ import com.google.refine.browsing.RecordVisitor; import com.google.refine.browsing.RowVisitor; import com.google.refine.browsing.Engine.Mode; import com.google.refine.commands.Command; +import com.google.refine.importing.ImportingJob; +import com.google.refine.importing.ImportingManager; import com.google.refine.model.Project; import com.google.refine.model.Record; import com.google.refine.model.Row; @@ -61,7 +63,7 @@ import com.google.refine.util.ParsingUtilities; import com.google.refine.util.Pool; public class GetRowsCommand extends Command { - + @Override public void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { @@ -77,7 +79,21 @@ public class GetRowsCommand extends Command { throws ServletException, IOException { try { - Project project = getProject(request); + Project project = null; + + // This command also supports retrieving rows for an importing job. + String importingJobID = request.getParameter("importingJobID"); + if (importingJobID != null) { + long jobID = Long.parseLong(importingJobID); + ImportingJob job = ImportingManager.getJob(jobID); + if (job != null) { + project = job.project; + } + } + if (project == null) { + project = getProject(request); + } + Engine engine = getEngine(request, project); String callback = request.getParameter("callback"); @@ -108,7 +124,7 @@ public class GetRowsCommand extends Command { try{ String json = request.getParameter("sorting"); sortingJson = (json == null) ? 
null : - ParsingUtilities.evaluateJsonStringToObject(json); + ParsingUtilities.evaluateJsonStringToObject(json); } catch (JSONException e) { } @@ -117,12 +133,12 @@ public class GetRowsCommand extends Command { RowVisitor visitor = rwv; if (sortingJson != null) { - SortingRowVisitor srv = new SortingRowVisitor(visitor); - - srv.initializeFromJSON(project, sortingJson); - if (srv.hasCriteria()) { - visitor = srv; - } + SortingRowVisitor srv = new SortingRowVisitor(visitor); + + srv.initializeFromJSON(project, sortingJson); + if (srv.hasCriteria()) { + visitor = srv; + } } jsonWriter.key("mode"); jsonWriter.value("row-based"); @@ -136,12 +152,12 @@ public class GetRowsCommand extends Command { RecordVisitor visitor = rwv; if (sortingJson != null) { - SortingRecordVisitor srv = new SortingRecordVisitor(visitor); - - srv.initializeFromJSON(project, sortingJson); - if (srv.hasCriteria()) { - visitor = srv; - } + SortingRecordVisitor srv = new SortingRecordVisitor(visitor); + + srv.initializeFromJSON(project, sortingJson); + if (srv.hasCriteria()) { + visitor = srv; + } } jsonWriter.key("mode"); jsonWriter.value("record-based"); @@ -168,8 +184,8 @@ public class GetRowsCommand extends Command { } static protected class RowWritingVisitor implements RowVisitor, RecordVisitor { - final int start; - final int limit; + final int start; + final int limit; final JSONWriter writer; final Properties options; @@ -184,20 +200,20 @@ public class GetRowsCommand extends Command { @Override public void start(Project project) { - // nothing to do + // nothing to do } @Override public void end(Project project) { - // nothing to do + // nothing to do } public boolean visit(Project project, int rowIndex, Row row) { if (total >= start && total < start + limit) { internalVisit(project, rowIndex, row); } - total++; - + total++; + return false; } @@ -206,8 +222,8 @@ public class GetRowsCommand extends Command { if (total >= start && total < start + limit) { internalVisit(project, record); } - total++; - + total++; + return false; } @@ -223,10 +239,10 @@ public class GetRowsCommand extends Command { protected boolean internalVisit(Project project, Record record) { options.put("recordIndex", record.recordIndex); - for (int r = record.fromRowIndex; r < record.toRowIndex; r++) { + for (int r = record.fromRowIndex; r < record.toRowIndex; r++) { try { - Row row = project.rows.get(r); - + Row row = project.rows.get(r); + options.put("rowIndex", r); row.write(writer, options); @@ -235,8 +251,8 @@ public class GetRowsCommand extends Command { } options.remove("recordIndex"); - } - return false; + } + return false; } } } diff --git a/main/src/com/google/refine/importers/ExcelImporter.java b/main/src/com/google/refine/importers/ExcelImporter.java index d532ec82b..1b483200a 100644 --- a/main/src/com/google/refine/importers/ExcelImporter.java +++ b/main/src/com/google/refine/importers/ExcelImporter.java @@ -33,16 +33,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
package com.google.refine.importers; +import java.io.File; +import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Properties; -import java.util.Set; import org.apache.poi.common.usermodel.Hyperlink; import org.apache.poi.hssf.usermodel.HSSFDateUtil; @@ -51,184 +50,152 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.json.JSONArray; +import org.json.JSONObject; import com.google.refine.ProjectMetadata; +import com.google.refine.importing.ImportingJob; +import com.google.refine.importing.ImportingUtilities; import com.google.refine.model.Cell; -import com.google.refine.model.Column; import com.google.refine.model.Project; import com.google.refine.model.Recon; import com.google.refine.model.ReconCandidate; -import com.google.refine.model.Row; import com.google.refine.model.Recon.Judgment; +import com.google.refine.util.JSONUtilities; -public class ExcelImporter implements StreamImporter { - protected boolean _xmlBased; - +public class ExcelImporter extends TabularImportingParserBase { + public ExcelImporter() { + super(true); + } + @Override - public void read(InputStream inputStream, Project project, ProjectMetadata metadata, Properties options) throws ImportException { - int ignoreLines = ImporterUtilities.getIntegerOption("ignore", options, -1); - int headerLines = ImporterUtilities.getIntegerOption("header-lines", options, 1); - int limit = ImporterUtilities.getIntegerOption("limit", options, -1); - int skip = ImporterUtilities.getIntegerOption("skip", options, 0); - + public JSONObject createParserUIInitializationData( + ImportingJob job, List fileRecords, String format) { + JSONObject options = super.createParserUIInitializationData(job, fileRecords, format); + + boolean xmlBased = "text/xml/xlsx".equals(format); + JSONUtilities.safePut(options, "xmlBased", xmlBased); + + JSONArray sheetRecords = new JSONArray(); + JSONUtilities.safePut(options, "sheetRecords", sheetRecords); + try { + JSONObject firstFileRecord = fileRecords.get(0); + File file = ImportingUtilities.getFile(job, firstFileRecord); + InputStream is = new FileInputStream(file); + try { + Workbook wb = xmlBased ? 
+ new XSSFWorkbook(is) : + new HSSFWorkbook(new POIFSFileSystem(is)); + + int sheetCount = wb.getNumberOfSheets(); + boolean hasData = false; + for (int i = 0; i < sheetCount; i++) { + Sheet sheet = wb.getSheetAt(i); + int rows = sheet.getLastRowNum() - sheet.getFirstRowNum() + 1; + + JSONObject sheetRecord = new JSONObject(); + JSONUtilities.safePut(sheetRecord, "name", sheet.getSheetName()); + JSONUtilities.safePut(sheetRecord, "rows", rows); + if (hasData) { + JSONUtilities.safePut(sheetRecord, "selected", false); + } else if (rows > 1) { + JSONUtilities.safePut(sheetRecord, "selected", true); + hasData = true; + } + JSONUtilities.append(sheetRecords, sheetRecord); + } + } finally { + is.close(); + } + } catch (IOException e) { + // Ignore + } + + return options; + } + + @Override + public void parseOneFile( + Project project, + ProjectMetadata metadata, + ImportingJob job, + String fileSource, + InputStream inputStream, + int limit, + JSONObject options, + List exceptions + ) { + boolean xmlBased = JSONUtilities.getBoolean(options, "xmlBased", false); Workbook wb = null; try { - wb = _xmlBased ? + wb = xmlBased ? new XSSFWorkbook(inputStream) : new HSSFWorkbook(new POIFSFileSystem(inputStream)); } catch (IOException e) { - throw new ImportException( + exceptions.add(new ImportException( "Attempted to parse as an Excel file but failed. " + "Try to use Excel to re-save the file as a different Excel version or as TSV and upload again.", e - ); + )); + return; } catch (ArrayIndexOutOfBoundsException e){ - throw new ImportException( - "Attempted to parse file as an Excel file but failed. " + - "This is probably caused by a corrupt excel file, or due to the file having previously been created or saved by a non-Microsoft application. " + - "Please try opening the file in Microsoft Excel and resaving it, then try re-uploading the file. " + - "See https://issues.apache.org/bugzilla/show_bug.cgi?id=48261 for further details", - e); + exceptions.add(new ImportException( + "Attempted to parse file as an Excel file but failed. " + + "This is probably caused by a corrupt excel file, or due to the file having previously been created or saved by a non-Microsoft application. " + + "Please try opening the file in Microsoft Excel and resaving it, then try re-uploading the file. 
" + + "See https://issues.apache.org/bugzilla/show_bug.cgi?id=48261 for further details", + e + )); + return; } - Sheet sheet = wb.getSheetAt(0); - - int firstRow = sheet.getFirstRowNum(); - int lastRow = sheet.getLastRowNum(); - - List columnNames = new ArrayList(); - Set columnNameSet = new HashSet(); - Map columnRootNameToIndex = new HashMap(); - - int rowsWithData = 0; - Map reconMap = new HashMap(); - - for (int r = firstRow; r <= lastRow; r++) { - org.apache.poi.ss.usermodel.Row row = sheet.getRow(r); - if (row == null) { - continue; - } else if (ignoreLines > 0) { - ignoreLines--; - continue; - } + int[] sheets = JSONUtilities.getIntArray(options, "sheets"); + for (int sheetIndex : sheets) { + final Sheet sheet = wb.getSheetAt(sheetIndex); + final int lastRow = sheet.getLastRowNum(); - short firstCell = row.getFirstCellNum(); - short lastCell = row.getLastCellNum(); - if (firstCell < 0 || firstCell > lastCell) { - continue; - } - - /* - * Still processing header lines - */ - if (headerLines > 0) { - headerLines--; + TableDataReader dataReader = new TableDataReader() { + int nextRow = 0; + Map reconMap = new HashMap(); - for (int c = firstCell; c <= lastCell; c++) { - org.apache.poi.ss.usermodel.Cell cell = row.getCell(c); - if (cell != null) { - Serializable value = extractCell(cell); - String text = value != null ? value.toString() : null; - if (text != null && text.length() > 0) { - while (columnNames.size() < c + 1) { - columnNames.add(null); + @Override + public List getNextRowOfCells() throws IOException { + if (nextRow >= lastRow) { + return null; + } + + List cells = new ArrayList(); + org.apache.poi.ss.usermodel.Row row = sheet.getRow(nextRow++); + if (row != null) { + short lastCell = row.getLastCellNum(); + for (short cellIndex = 0; cellIndex <= lastCell; cellIndex++) { + Cell cell = null; + + org.apache.poi.ss.usermodel.Cell sourceCell = row.getCell(cellIndex); + if (sourceCell != null) { + cell = extractCell(sourceCell, reconMap); } - - String existingName = columnNames.get(c); - String name = (existingName == null) ? 
text : (existingName + " " + text); - - columnNames.set(c, name); + cells.add(cell); } } + return cells; } - - if (headerLines == 0) { - for (int i = 0; i < columnNames.size(); i++) { - String rootName = columnNames.get(i); - if (rootName == null) { - continue; - } - setUnduplicatedColumnName(rootName, columnNames, i, columnNameSet, columnRootNameToIndex); - } - } - - /* - * Processing data rows - */ - } else { - Row newRow = new Row(columnNames.size()); - boolean hasData = false; - - for (int c = firstCell; c <= lastCell; c++) { - org.apache.poi.ss.usermodel.Cell cell = row.getCell(c); - if (cell == null) { - continue; - } - - Cell ourCell = extractCell(cell, reconMap); - if (ourCell != null) { - while (columnNames.size() < c + 1) { - columnNames.add(null); - } - if (columnNames.get(c) == null) { - setUnduplicatedColumnName("Column", columnNames, c, columnNameSet, columnRootNameToIndex); - } - - newRow.setCell(c, ourCell); - hasData = true; - } - } - - if (hasData) { - rowsWithData++; - - if (skip <= 0 || rowsWithData > skip) { - project.rows.add(newRow); - project.columnModel.setMaxCellIndex(newRow.cells.size()); - - if (limit > 0 && project.rows.size() >= limit) { - break; - } - } - } - } - } - - /* - * Create columns - */ - for (int c = 0; c < columnNames.size(); c++) { - String name = columnNames.get(c); - if (name != null) { - Column column = new Column(c, name); - project.columnModel.columns.add(column); - } - } - } - - protected void setUnduplicatedColumnName( - String rootName, List columnNames, int index, Set columnNameSet, Map columnRootNameToIndex) { - if (columnNameSet.contains(rootName)) { - int startIndex = columnRootNameToIndex.containsKey(rootName) ? columnRootNameToIndex.get(rootName) : 2; - while (true) { - String name = rootName + " " + startIndex; - if (columnNameSet.contains(name)) { - startIndex++; - } else { - columnNames.set(index, name); - columnNameSet.add(name); - break; - } - } + }; - columnRootNameToIndex.put(rootName, startIndex + 1); - } else { - columnNames.set(index, rootName); - columnNameSet.add(rootName); + readTable( + project, + metadata, + job, + dataReader, + fileSource + "#" + sheet.getSheetName(), + limit, + options, + exceptions + ); } } - protected Serializable extractCell(org.apache.poi.ss.usermodel.Cell cell) { + static protected Serializable extractCell(org.apache.poi.ss.usermodel.Cell cell) { int cellType = cell.getCellType(); if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_FORMULA) { cellType = cell.getCachedFormulaResultType(); @@ -259,7 +226,7 @@ public class ExcelImporter implements StreamImporter { return value; } - protected Cell extractCell(org.apache.poi.ss.usermodel.Cell cell, Map reconMap) { + static protected Cell extractCell(org.apache.poi.ss.usermodel.Cell cell, Map reconMap) { Serializable value = extractCell(cell); if (value != null) { @@ -312,33 +279,4 @@ public class ExcelImporter implements StreamImporter { return null; } } - - @Override - public boolean canImportData(String contentType, String fileName) { - if (contentType != null) { - contentType = contentType.toLowerCase().trim(); - if ("application/msexcel".equals(contentType) || - "application/x-msexcel".equals(contentType) || - "application/x-ms-excel".equals(contentType) || - "application/vnd.ms-excel".equals(contentType) || - "application/x-excel".equals(contentType) || - "application/xls".equals(contentType)) { - this._xmlBased = false; - return true; - } else if("application/x-xls".equals(contentType)) { - this._xmlBased = true; - return true; - } - } 
else if (fileName != null) { - fileName = fileName.toLowerCase(); - if (fileName.endsWith(".xls")) { - this._xmlBased = false; - return true; - } else if (fileName.endsWith(".xlsx")) { - this._xmlBased = true; - return true; - } - } - return false; - } } diff --git a/main/src/com/google/refine/importers/FixedWidthImporter.java b/main/src/com/google/refine/importers/FixedWidthImporter.java index d413c5583..592efb020 100644 --- a/main/src/com/google/refine/importers/FixedWidthImporter.java +++ b/main/src/com/google/refine/importers/FixedWidthImporter.java @@ -1,179 +1,107 @@ package com.google.refine.importers; +import java.io.File; +import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.LineNumberReader; import java.io.Reader; -import java.io.Serializable; +import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.List; -import java.util.Properties; -import java.util.regex.Pattern; -import javax.servlet.ServletException; - -import org.apache.commons.lang.StringUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.json.JSONArray; +import org.json.JSONObject; import com.google.refine.ProjectMetadata; -import com.google.refine.expr.ExpressionUtils; -import com.google.refine.model.Cell; +import com.google.refine.importing.ImportingJob; +import com.google.refine.importing.ImportingUtilities; import com.google.refine.model.Project; -import com.google.refine.model.Row; +import com.google.refine.util.JSONUtilities; -public class FixedWidthImporter implements ReaderImporter, StreamImporter { //TODO this class is almost an exact copy of TsvCsvImporter. Could we combine the two, or combine common functions into a common abstract supertype? - - final static Logger logger = LoggerFactory.getLogger("FixedWidthImporter"); +public class FixedWidthImporter extends TabularImportingParserBase { + public FixedWidthImporter() { + super(false); + } @Override - public boolean canImportData(String contentType, String fileName) { - if (contentType != null) { - contentType = contentType.toLowerCase().trim(); - - //filter out tree structure data - if("application/json".equals(contentType)|| - "text/json".equals(contentType)|| - "application/xml".equals(contentType) || - "text/xml".equals(contentType) || - "application/rss+xml".equals(contentType) || - "application/atom+xml".equals(contentType) || - "application/rdf+xml".equals(contentType)) //TODO add more tree data types. - return false; - - return - "text/plain".equals(contentType) - || "text/fixed-width".equals(contentType); //FIXME Is text/fixed-width a valid contentType? 
- } - return false; - } - - @Override - public void read(InputStream inputStream, Project project, - ProjectMetadata metadata, Properties options) - throws ImportException { - read(new InputStreamReader(inputStream), project, metadata, options); - } - - @Override - public void read(Reader reader, Project project, ProjectMetadata metadata, - Properties options) throws ImportException { - boolean splitIntoColumns = ImporterUtilities.getBooleanOption("split-into-columns", options, true); - String columnWidths = options.getProperty("fixed-column-widths"); - int ignoreLines = ImporterUtilities.getIntegerOption("ignore", options, -1); - int headerLines = ImporterUtilities.getIntegerOption("header-lines", options, 1); - - int limit = ImporterUtilities.getIntegerOption("limit",options,-1); - int skip = ImporterUtilities.getIntegerOption("skip",options,0); - boolean guessValueType = ImporterUtilities.getBooleanOption("guess-value-type", options, true); - - LineNumberReader lnReader = new LineNumberReader(reader); + public JSONObject createParserUIInitializationData( + ImportingJob job, List fileRecords, String format) { + JSONObject options = super.createParserUIInitializationData(job, fileRecords, format); + JSONArray columnWidths = new JSONArray(); - - read(lnReader, project, columnWidths, - limit, skip, ignoreLines, headerLines, - guessValueType, splitIntoColumns - ); - - } - - /** - * - * @param lnReader - * LineNumberReader used to read file or string contents - * @param project - * The project into which the parsed data will be added - * @param columnWidths - * Expects a comma separated string of integers which indicate the number of characters in each line - * @param limit - * The maximum number of rows of data to import - * @param skip - * The number of initial data rows to skip - * @param ignoreLines - * The number of initial lines within the data source which should be ignored entirely - * @param headerLines - * The number of lines in the data source which describe each column - * @param guessValueType - * Whether the parser should try and guess the type of the value being parsed - * @param splitIntoColumns - * Whether the parser should try and split the data source into columns - * @throws IOException - */ - public void read(LineNumberReader lnReader, Project project, - String sep, int limit, int skip, int ignoreLines, - int headerLines, boolean guessValueType, boolean splitIntoColumns) throws ImportException{ - - int[] columnWidths = null; - - columnWidths = getColumnWidthsFromString( sep ); - - if(columnWidths.length < 2) - splitIntoColumns = false; - - List columnNames = new ArrayList(); - String line = null; - int rowsWithData = 0; - - try { - while ((line = lnReader.readLine()) != null) { - if (ignoreLines > 0) { - ignoreLines--; - continue; - } else if (StringUtils.isBlank(line)) { - continue; - } - - - if (headerLines > 0) { - //column headers - headerLines--; - - ArrayList cells = getCells(line, columnWidths, splitIntoColumns); - - for (int c = 0; c < cells.size(); c++) { - String cell = cells.get(c).trim(); - //add column even if cell is blank - ImporterUtilities.appendColumnName(columnNames, c, cell); - } - } else { - //data - Row row = new Row(columnNames.size()); - - ArrayList cells = getCells(line, columnWidths, splitIntoColumns); - - if( cells != null && cells.size() > 0 ) - rowsWithData++; - - if (skip <=0 || rowsWithData > skip){ - //add parsed data to row - for(String s : cells){ - if (ExpressionUtils.isNonBlankData(s)) { - Serializable value = guessValueType ? 
ImporterUtilities.parseCellValue(s) : s; - row.cells.add(new Cell(value, null)); - }else{ - row.cells.add(null); - } - } - project.rows.add(row); - project.columnModel.setMaxCellIndex(row.cells.size()); - - ImporterUtilities.ensureColumnsInRowExist(columnNames, row); - - if (limit > 0 && project.rows.size() >= limit) { - break; - } - } - } - } - } catch (IOException e) { - throw new ImportException("The fixed width importer could not read the next line", e); + JSONObject firstFileRecord = fileRecords.get(0); + String encoding = ImportingUtilities.getEncoding(firstFileRecord); + String location = JSONUtilities.getString(firstFileRecord, "location", null); + if (location != null) { + File file = new File(job.getRawDataDir(), location); + int[] columnWidthsA = guessColumnWidths(file, encoding); + if (columnWidthsA != null) { + for (int w : columnWidthsA) { + JSONUtilities.append(columnWidths, w); } - - ImporterUtilities.setupColumns(project, columnNames); - + } } + + JSONUtilities.safePut(options, "lineSeparator", "\n"); + JSONUtilities.safePut(options, "headerLines", 0); + JSONUtilities.safePut(options, "columnWidths", columnWidths); + JSONUtilities.safePut(options, "guessCellValueTypes", true); + + return options; + } + @Override + public void parseOneFile( + Project project, + ProjectMetadata metadata, + ImportingJob job, + String fileSource, + Reader reader, + int limit, + JSONObject options, + List exceptions + ) { + // String lineSeparator = JSONUtilities.getString(options, "lineSeparator", "\n"); + final int[] columnWidths = JSONUtilities.getIntArray(options, "columnWidths"); + + final List columnNames; + if (options.has("columnNames")) { + columnNames = new ArrayList(); + String[] strings = JSONUtilities.getStringArray(options, "columnNames"); + for (String s : strings) { + columnNames.add(s); + } + JSONUtilities.safePut(options, "headerLines", 1); + } else { + columnNames = null; + } + + final LineNumberReader lnReader = new LineNumberReader(reader); + + TableDataReader dataReader = new TableDataReader() { + boolean usedColumnNames = false; + + @Override + public List getNextRowOfCells() throws IOException { + if (columnNames != null && !usedColumnNames) { + usedColumnNames = true; + return columnNames; + } else { + String line = lnReader.readLine(); + if (line == null) { + return null; + } else { + return getCells(line, columnWidths); + } + } + } + }; + + readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions); + } + /** * Splits the line into columns * @param line @@ -181,60 +109,108 @@ public class FixedWidthImporter implements ReaderImporter, StreamImporter { //TO * @param splitIntoColumns * @return */ - private ArrayList getCells(String line, int[] widths, boolean splitIntoColumns) { - ArrayList cells = new ArrayList(); - if(splitIntoColumns){ - int columnStartCursor = 0; - int columnEndCursor = 0; - for(int width : widths){ - if(columnStartCursor >= line.length()){ - cells.add(null); //FIXME is adding a null cell (to represent no data) OK? - continue; - } - - columnEndCursor = columnStartCursor + width; - - if(columnEndCursor > line.length()) - columnEndCursor = line.length(); - if(columnEndCursor <= columnStartCursor){ - cells.add(null); //FIXME is adding a null cell (to represent no data, or a zero width column) OK? 
- continue; - } - - cells.add(line.substring(columnStartCursor, columnEndCursor)); - - columnStartCursor = columnEndCursor; + static private ArrayList getCells(String line, int[] widths) { + ArrayList cells = new ArrayList(); + + int columnStartCursor = 0; + int columnEndCursor = 0; + for (int width : widths) { + if (columnStartCursor >= line.length()) { + cells.add(null); //FIXME is adding a null cell (to represent no data) OK? + continue; } - }else{ - cells.add(line); + + columnEndCursor = columnStartCursor + width; + + if (columnEndCursor > line.length()) { + columnEndCursor = line.length(); + } + if (columnEndCursor <= columnStartCursor) { + cells.add(null); //FIXME is adding a null cell (to represent no data, or a zero width column) OK? + continue; + } + + cells.add(line.substring(columnStartCursor, columnEndCursor)); + + columnStartCursor = columnEndCursor; + } + + // Residual text + if (columnStartCursor < line.length()) { + cells.add(line.substring(columnStartCursor)); } return cells; } - - /** - * Converts the expected string of comma separated integers into an array of integers. - * Also performs a basic sanity check on the provided data. - * - * @param sep - * A comma separated string of integers. e.g. 4,2,5,22,19 - * @return - * @throws ServletException - */ - public int[] getColumnWidthsFromString(String sep) throws ImportException { - String[] splitSep = Pattern.compile(",").split(sep); - - int[] widths = new int[splitSep.length]; - for(int i = 0; i < splitSep.length; i++){ - try{ - int parsedInt = Integer.parseInt(splitSep[i]); - if( parsedInt < 0 ) - throw new ImportException("A column cannot have a width of less than zero", null); - widths[i] = parsedInt; - }catch(NumberFormatException e){ - throw new ImportException("For a fixed column width import, the column widths must be given as a comma separated string of integers. e.g. 1,3,5,22,19", e); + + static public int[] guessColumnWidths(File file, String encoding) { + try { + InputStream is = new FileInputStream(file); + try { + Reader reader = encoding != null ? 
new InputStreamReader(is, encoding) : new InputStreamReader(is); + LineNumberReader lineNumberReader = new LineNumberReader(reader); + + int[] counts = null; + int totalBytes = 0; + int lineCount = 0; + String s; + while (totalBytes < 64 * 1024 && + lineCount < 100 && + (s = lineNumberReader.readLine()) != null) { + + totalBytes += s.length() + 1; // count the new line character + if (s.length() == 0) { + continue; + } + lineCount++; + + if (counts == null) { + counts = new int[s.length()]; + for (int c = 0; c < counts.length; c++) { + counts[c] = 0; + } + } + + for (int c = 0; c < counts.length && c < s.length(); c++) { + char ch = s.charAt(c); + if (ch == ' ') { + counts[c]++; + } + } + } + + if (counts != null) { + List widths = new ArrayList(); + + int startIndex = 0; + for (int c = 0; c < counts.length; c++) { + int count = counts[c]; + if (count == lineCount && c > startIndex) { + widths.add(c - startIndex + 1); + startIndex = c + 1; + } + } + + for (int i = widths.size() - 1; i > 0; i--) { + if (widths.get(i) == 1) { + widths.remove(i); + widths.set(i - 1, widths.get(i - 1) + 1); + } + } + + int[] widthA = new int[widths.size()]; + for (int i = 0; i < widthA.length; i++) { + widthA[i] = widths.get(i); + } + return widthA; + } + } finally { + is.close(); } + } catch (UnsupportedEncodingException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); } - return widths; + return null; } - } diff --git a/main/src/com/google/refine/importers/ImportException.java b/main/src/com/google/refine/importers/ImportException.java index 0cdbc2124..070518231 100644 --- a/main/src/com/google/refine/importers/ImportException.java +++ b/main/src/com/google/refine/importers/ImportException.java @@ -38,11 +38,9 @@ package com.google.refine.importers; * indicating the underlying cause of the problem. */ public class ImportException extends Exception { - - private static final long serialVersionUID = 7077314805989174181L; - - public ImportException(String message, Throwable cause) { + private static final long serialVersionUID = 7077314805989174181L; + + public ImportException(String message, Throwable cause) { super(message, cause); } - } diff --git a/main/src/com/google/refine/importers/ImporterRegistry.java b/main/src/com/google/refine/importers/ImporterRegistry.java deleted file mode 100644 index eff576fab..000000000 --- a/main/src/com/google/refine/importers/ImporterRegistry.java +++ /dev/null @@ -1,138 +0,0 @@ -/* - -Copyright 2010, Google Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -package com.google.refine.importers; - -import java.net.URL; -import java.util.HashMap; -import java.util.Map; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - - -abstract public class ImporterRegistry { - final static Logger logger = LoggerFactory.getLogger("importer-registry"); - - static final private Map importers = new HashMap(); - - private static final String[][] importerNames = { - {"ExcelImporter", "com.google.refine.importers.ExcelImporter"}, - {"XmlImporter", "com.google.refine.importers.XmlImporter"}, - {"RdfTripleImporter", "com.google.refine.importers.RdfTripleImporter"}, - {"MarcImporter", "com.google.refine.importers.MarcImporter"}, - {"TsvCsvImporter", "com.google.refine.importers.TsvCsvImporter"}, - {"JsonImporter", "com.google.refine.importers.JsonImporter"}, - {"FixedWidthImporter", "com.google.refine.importers.FixedWidthImporter"} - }; - - static { - registerImporters(importerNames); - } - - static public boolean registerImporters(String[][] importers) { - boolean status = true; - for (String[] importer : importerNames) { - String importerName = importer[0]; - String className = importer[1]; - logger.debug("Loading command " + importerName + " class: " + className); - Importer cmd; - try { - // TODO: May need to use the servlet container's class loader here - cmd = (Importer) Class.forName(className).newInstance(); - } catch (InstantiationException e) { - logger.error("Failed to load importer class " + className, e); - status = false; - continue; - } catch (IllegalAccessException e) { - logger.error("Failed to load importer class " + className, e); - status = false; - continue; - } catch (ClassNotFoundException e) { - logger.error("Failed to load importer class " + className, e); - status = false; - continue; - } - status |= registerImporter(importerName, cmd); - } - return status; - } - - /** - * Register a single importer. 
- * - * @param name importer verb for importer - * @param importerObject object implementing the importer - * - * @return true if importer was loaded and registered successfully - */ - static public boolean registerImporter(String name, Importer importerObject) { - if (importers.containsKey(name)) { - return false; - } - importers.put(name, importerObject); - return true; - } - - // Currently only for test purposes - static protected boolean unregisterImporter(String verb) { - return importers.remove(verb) != null; - } - - static public Importer guessImporter(String contentType, String fileName, boolean provideDefault) { - for (Importer i : importers.values()){ - if(i.canImportData(contentType, fileName)){ - return i; - } - } - if (provideDefault) { - return new TsvCsvImporter(); // default - } else { - return null; - } - } - - static public Importer guessImporter(String contentType, String filename) { - return guessImporter(contentType, filename, true); - } - - static public Importer guessUrlImporter(URL url) { - for (Importer importer : importers.values()){ - if (importer instanceof UrlImporter - && ((UrlImporter) importer).canImportData(url)) { - return importer; - } - } - return null; - } -} diff --git a/main/src/com/google/refine/importers/ImporterUtilities.java b/main/src/com/google/refine/importers/ImporterUtilities.java index 71b68a9d1..a95da18c1 100644 --- a/main/src/com/google/refine/importers/ImporterUtilities.java +++ b/main/src/com/google/refine/importers/ImporterUtilities.java @@ -33,15 +33,25 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. package com.google.refine.importers; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.InputStream; import java.io.Serializable; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Properties; +import org.json.JSONObject; + +import com.google.refine.importing.ImportingJob; +import com.google.refine.importing.ImportingUtilities; import com.google.refine.model.Column; +import com.google.refine.model.ModelException; import com.google.refine.model.Project; import com.google.refine.model.Row; +import com.google.refine.util.TrackingInputStream; public class ImporterUtilities { @@ -117,6 +127,33 @@ public class ImporterUtilities { columnNames.add(""); } } + + static public Column getOrAllocateColumn(Project project, List currentFileColumnNames, int index) { + if (index < currentFileColumnNames.size()) { + return project.columnModel.getColumnByName(currentFileColumnNames.get(index)); + } else if (index == currentFileColumnNames.size()) { + String prefix = "Column "; + int i = 1; + while (true) { + String columnName = prefix + i; + if (project.columnModel.getColumnByName(columnName) != null) { + // Already taken name + i++; + } else { + Column column = new Column(project.columnModel.allocateNewCellIndex(), columnName); + try { + project.columnModel.addColumn(project.columnModel.columns.size(), column, false); + } catch (ModelException e) { + // Ignore: shouldn't get in here since we just checked for duplicate names. 
+ } + currentFileColumnNames.add(columnName); + return column; + } + } + } else { + throw new RuntimeException("Unexpected code path"); + } + } static public void setupColumns(Project project, List columnNames) { Map nameToIndex = new HashMap(); @@ -125,9 +162,10 @@ public class ImporterUtilities { if (cell.isEmpty()) { cell = "Column"; } else if (cell.startsWith("\"") && cell.endsWith("\"")) { - cell = cell.substring(1, cell.length() - 1).trim(); //FIXME is trimming quotation marks appropriate? + // FIXME: is trimming quotation marks appropriate? + cell = cell.substring(1, cell.length() - 1).trim(); } - + if (nameToIndex.containsKey(cell)) { int index = nameToIndex.get(cell); nameToIndex.put(cell, index + 1); @@ -136,11 +174,75 @@ public class ImporterUtilities { } else { nameToIndex.put(cell, 2); } - - Column column = new Column(c, cell); - - project.columnModel.columns.add(column); + + columnNames.set(c, cell); + if (project.columnModel.getColumnByName(cell) == null) { + Column column = new Column(project.columnModel.allocateNewCellIndex(), cell); + try { + project.columnModel.addColumn(project.columnModel.columns.size(), column, false); + } catch (ModelException e) { + // Ignore: shouldn't get in here since we just checked for duplicate names. + } + } } } + + static public interface MultiFileReadingProgress { + public void startFile(String fileSource); + public void readingFile(String fileSource, long bytesRead); + public void endFile(String fileSource, long bytesRead); + } + + static public MultiFileReadingProgress createMultiFileReadingProgress( + final ImportingJob job, List fileRecords) { + long totalSize = 0; + for (JSONObject fileRecord : fileRecords) { + File file = ImportingUtilities.getFile(job, fileRecord); + totalSize += file.length(); + } + + final long totalSize2 = totalSize; + return new MultiFileReadingProgress() { + long totalBytesRead = 0; + + void setProgress(String fileSource, long bytesRead) { + ImportingUtilities.setCreatingProjectProgress( + job, + "Reading " + fileSource, + (int) (100 * (totalBytesRead + bytesRead) / totalSize2)); + } + + @Override + public void startFile(String fileSource) { + setProgress(fileSource, 0); + } + @Override + public void readingFile(String fileSource, long bytesRead) { + setProgress(fileSource, bytesRead); + } + + @Override + public void endFile(String fileSource, long bytesRead) { + totalBytesRead += bytesRead; + } + }; + } + + static public InputStream openAndTrackFile( + final String fileSource, + final File file, + final MultiFileReadingProgress progress) throws FileNotFoundException { + InputStream inputStream = new FileInputStream(file); + return progress == null ? inputStream : new TrackingInputStream(inputStream) { + @Override + protected long track(long bytesRead) { + long l = super.track(bytesRead); + + progress.readingFile(fileSource, this.bytesRead); + + return l; + } + }; + } } diff --git a/main/src/com/google/refine/importers/ImportingParserBase.java b/main/src/com/google/refine/importers/ImportingParserBase.java new file mode 100644 index 000000000..ccd6e74da --- /dev/null +++ b/main/src/com/google/refine/importers/ImportingParserBase.java @@ -0,0 +1,138 @@ +/* + +Copyright 2011, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +package com.google.refine.importers; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.util.List; + +import org.apache.commons.lang.NotImplementedException; +import org.json.JSONObject; + +import com.google.refine.ProjectMetadata; +import com.google.refine.importers.ImporterUtilities.MultiFileReadingProgress; +import com.google.refine.importing.ImportingJob; +import com.google.refine.importing.ImportingParser; +import com.google.refine.importing.ImportingUtilities; +import com.google.refine.model.Project; + +abstract public class ImportingParserBase implements ImportingParser { + final protected boolean useInputStream; + + protected ImportingParserBase(boolean useInputStream) { + this.useInputStream = useInputStream; + } + + @Override + public void parse(Project project, ProjectMetadata metadata, + final ImportingJob job, List fileRecords, String format, + int limit, JSONObject options, List exceptions) { + MultiFileReadingProgress progress = ImporterUtilities.createMultiFileReadingProgress(job, fileRecords); + for (JSONObject fileRecord : fileRecords) { + if (job.canceled) { + break; + } + + try { + parseOneFile(project, metadata, job, fileRecord, limit, options, exceptions, progress); + } catch (IOException e) { + exceptions.add(e); + } + + if (limit > 0 && project.rows.size() >= limit) { + break; + } + } + } + + public void parseOneFile( + Project project, + ProjectMetadata metadata, + ImportingJob job, + JSONObject fileRecord, + int limit, + JSONObject options, + List exceptions, + final MultiFileReadingProgress progress + ) throws IOException { + final File file = ImportingUtilities.getFile(job, fileRecord); + final String fileSource = ImportingUtilities.getFileSource(fileRecord); + + progress.startFile(fileSource); + try { + InputStream inputStream = ImporterUtilities.openAndTrackFile(fileSource, file, progress); + try { + if (useInputStream) { + parseOneFile(project, metadata, job, fileSource, inputStream, limit, options, exceptions); + } else { + Reader reader = ImportingUtilities.getReaderFromStream(inputStream, fileRecord); + + parseOneFile(project, metadata, job, fileSource, reader, limit, options, exceptions); + } + } finally { + inputStream.close(); + } + } finally { + progress.endFile(fileSource, file.length()); + } + } + + public void parseOneFile( + Project 
project, + ProjectMetadata metadata, + ImportingJob job, + String fileSource, + Reader reader, + int limit, + JSONObject options, + List exceptions + ) { + throw new NotImplementedException(); + } + + public void parseOneFile( + Project project, + ProjectMetadata metadata, + ImportingJob job, + String fileSource, + InputStream inputStream, + int limit, + JSONObject options, + List exceptions + ) { + throw new NotImplementedException(); + } +} diff --git a/main/src/com/google/refine/importers/JsonImporter.java b/main/src/com/google/refine/importers/JsonImporter.java index 87fd6bf35..dca023d2e 100644 --- a/main/src/com/google/refine/importers/JsonImporter.java +++ b/main/src/com/google/refine/importers/JsonImporter.java @@ -33,95 +33,328 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. package com.google.refine.importers; -import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; -import java.io.PushbackInputStream; -import java.util.Properties; +import java.io.Reader; +import java.util.List; +import javax.servlet.ServletException; + +import org.codehaus.jackson.JsonFactory; +import org.codehaus.jackson.JsonParseException; +import org.codehaus.jackson.JsonParser; +import org.codehaus.jackson.JsonToken; +import org.json.JSONArray; +import org.json.JSONObject; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.refine.ProjectMetadata; -import com.google.refine.importers.TreeImportUtilities.ImportColumnGroup; -import com.google.refine.importers.parsers.JSONParser; -import com.google.refine.importers.parsers.TreeParser; +import com.google.refine.importers.tree.ImportColumnGroup; +import com.google.refine.importers.tree.TreeImportingParserBase; +import com.google.refine.importers.tree.TreeReader; +import com.google.refine.importing.ImportingJob; +import com.google.refine.importing.ImportingUtilities; import com.google.refine.model.Project; +import com.google.refine.util.JSONUtilities; -public class JsonImporter implements StreamImporter{ - final static Logger logger = LoggerFactory.getLogger("JsonImporter"); - - public static final int BUFFER_SIZE = 64 * 1024; - - @Override - public void read(InputStream inputStream, Project project, - ProjectMetadata metadata, Properties options) - throws ImportException { - //FIXME the below is a close duplicate of the XmlImporter code. 
- //Should wrap a lot of the below into methods and put them in a common superclass - logger.trace("JsonImporter.read"); - PushbackInputStream pis = new PushbackInputStream(inputStream,BUFFER_SIZE); - - String[] recordPath = null; - { - byte[] buffer = new byte[BUFFER_SIZE]; - int bytes_read = 0; - try {//fill the buffer with data - while (bytes_read < BUFFER_SIZE) { - int c = pis.read(buffer, bytes_read, BUFFER_SIZE - bytes_read); - if (c == -1) break; - bytes_read +=c ; +public class JsonImporter extends TreeImportingParserBase { + public JsonImporter() { + super(false); + } + + static private class PreviewParsingState { + int tokenCount; + } + + final static private int PREVIEW_PARSING_LIMIT = 1000; + + @Override + public JSONObject createParserUIInitializationData( + ImportingJob job, List fileRecords, String format) { + JSONObject options = super.createParserUIInitializationData(job, fileRecords, format); + try { + JSONObject firstFileRecord = fileRecords.get(0); + File file = ImportingUtilities.getFile(job, firstFileRecord); + InputStream is = new FileInputStream(file); + try { + JsonFactory factory = new JsonFactory(); + JsonParser parser = factory.createJsonParser(is); + + PreviewParsingState state = new PreviewParsingState(); + Object rootValue = parseForPreview(parser, state); + if (rootValue != null) { + JSONUtilities.safePut(options, "dom", rootValue); } - pis.unread(buffer, 0, bytes_read); + } finally { + is.close(); + } + } catch (IOException e) { + // Ignore + } + + return options; + } + + final static private Object parseForPreview(JsonParser parser, PreviewParsingState state, JsonToken token) + throws JsonParseException, IOException { + if (token != null) { + switch (token) { + case START_ARRAY: + return parseArrayForPreview(parser, state); + case START_OBJECT: + return parseObjectForPreview(parser, state); + case VALUE_STRING: + return parser.getText(); + case VALUE_NUMBER_INT: + return Integer.valueOf(parser.getIntValue()); + case VALUE_NUMBER_FLOAT: + return Float.valueOf(parser.getFloatValue()); + case VALUE_TRUE: + return Boolean.TRUE; + case VALUE_FALSE: + return Boolean.FALSE; + case VALUE_NULL: + return null; + } + } + return null; + } + + final static private Object parseForPreview(JsonParser parser, PreviewParsingState state) { + try { + JsonToken token = parser.nextToken(); + state.tokenCount++; + return parseForPreview(parser, state, token); + } catch (Exception e) { + return null; + } + } + + final static private JSONObject parseObjectForPreview(JsonParser parser, PreviewParsingState state) { + JSONObject result = new JSONObject(); + loop:while (state.tokenCount < PREVIEW_PARSING_LIMIT) { + try { + JsonToken token = parser.nextToken(); + if (token == null) { + break; + } + state.tokenCount++; + + switch (token) { + case FIELD_NAME: + String fieldName = parser.getText(); + Object fieldValue = parseForPreview(parser, state); + JSONUtilities.safePut(result, fieldName, fieldValue); + break; + case END_OBJECT: + break loop; + default: + break loop; + } + } catch (Exception e) { + break; + } + } + return result; + } + + final static private JSONArray parseArrayForPreview(JsonParser parser, PreviewParsingState state) { + JSONArray result = new JSONArray(); + loop:while (state.tokenCount < PREVIEW_PARSING_LIMIT) { + try { + JsonToken token = parser.nextToken(); + if (token == null) { + break; + } + state.tokenCount++; + + switch (token) { + case END_ARRAY: + break loop; + default: + Object element = parseForPreview(parser, state, token); + JSONUtilities.append(result, 
element); + } + } catch (Exception e) { + break; + } + } + return result; + } + + @Override + public void parseOneFile(Project project, ProjectMetadata metadata, + ImportingJob job, String fileSource, Reader reader, + ImportColumnGroup rootColumnGroup, int limit, JSONObject options, List exceptions) { + + parseOneFile(project, metadata, job, fileSource, + new JSONTreeReader(reader), rootColumnGroup, limit, options, exceptions); + } + + static public class JSONTreeReader implements TreeReader { + final static Logger logger = LoggerFactory.getLogger("JsonParser"); + + JsonFactory factory = new JsonFactory(); + JsonParser parser = null; + + //The following is a workaround for inconsistent Jackson JsonParser + Boolean lastTokenWasAFieldNameAndCurrentTokenIsANewEntity = false; + Boolean thisTokenIsAFieldName = false; + String lastFieldName = null; + //end of workaround + + public JSONTreeReader(Reader reader) { + try { + parser = factory.createJsonParser(reader); + } catch (Exception e) { + e.printStackTrace(); + } + } + + /** + * Does nothing. All Json is treated as elements + */ + @Override + public int getAttributeCount() { + // TODO Auto-generated method stub + return 0; + } + + /** + * Does nothing. All Json is treated as elements + */ + @Override + public String getAttributeLocalName(int index) { + return null; + } + + /** + * Does nothing. All Json is treated as elements + */ + @Override + public String getAttributePrefix(int index) { + // TODO Auto-generated method stub + return null; + } + + /** + * Does nothing. All Json is treated as elements + */ + @Override + public String getAttributeValue(int index) { + // TODO Auto-generated method stub + return null; + } + + @Override + public Token current() throws ServletException { + return this.mapToToken(parser.getCurrentToken()); + } + + @Override + public String getFieldName() throws ServletException{ + try { + String text = parser.getCurrentName(); + + //The following is a workaround for inconsistent Jackson JsonParser + if(text == null){ + if(this.lastTokenWasAFieldNameAndCurrentTokenIsANewEntity) + text = this.lastFieldName; + else + text = "__anonymous__"; + } + //end of workaround + + return text; + } catch (Exception e) { + throw new ServletException(e); + } + } + + /** + * Does nothing. Json does not have prefixes + */ + @Override + public String getPrefix() { + return null; + } + + @Override + public String getFieldValue() throws ServletException { + try { + return parser.getText(); + } catch (Exception e) { + throw new ServletException(e); + } + } + + @Override + public boolean hasNext() throws ServletException { + return true; //FIXME fairly obtuse, is there a better way (advancing, then rewinding?) 
+ } + + @Override + public Token next() throws ServletException { + JsonToken next; + try { + next = parser.nextToken(); + } catch (JsonParseException e) { + throw new ServletException(e); } catch (IOException e) { - throw new ImportException("Read error",e); + throw new ServletException(e); } - - InputStream iStream = new ByteArrayInputStream(buffer, 0, bytes_read); - TreeParser parser = new JSONParser(iStream); - if (options.containsKey("importer-record-tag")) { - try{ - recordPath = XmlImportUtilities.detectPathFromTag( - parser, - options.getProperty("importer-record-tag")); - }catch(Exception e){ - // silent - // e.printStackTrace(); + + if(next == null) + throw new ServletException("No more Json Tokens in stream"); + + //The following is a workaround for inconsistent Jackson JsonParser + if(next == JsonToken.FIELD_NAME){ + try { + this.thisTokenIsAFieldName = true; + this.lastFieldName = parser.getCurrentName(); + } catch (Exception e) { + //silent } - } else { - recordPath = XmlImportUtilities.detectRecordElement(parser); + }else if(next == JsonToken.START_ARRAY || next == JsonToken.START_OBJECT){ + if(this.thisTokenIsAFieldName){ + this.lastTokenWasAFieldNameAndCurrentTokenIsANewEntity = true; + this.thisTokenIsAFieldName = false; + }else{ + this.lastTokenWasAFieldNameAndCurrentTokenIsANewEntity = false; + this.lastFieldName = null; + } + }else{ + this.lastTokenWasAFieldNameAndCurrentTokenIsANewEntity = false; + this.lastFieldName = null; + this.thisTokenIsAFieldName = false; + } + //end of workaround + + return mapToToken(next); + } + + protected Token mapToToken(JsonToken token){ + switch(token){ + case START_ARRAY: return Token.StartEntity; + case END_ARRAY: return Token.EndEntity; + case START_OBJECT: return Token.StartEntity; + case END_OBJECT: return Token.EndEntity; + case VALUE_STRING: return Token.Value; + case FIELD_NAME: return Token.Ignorable; //returned by the getLocalName function() + case VALUE_NUMBER_INT: return Token.Value; + //Json does not have START_DOCUMENT token type (so ignored as default) + //Json does not have END_DOCUMENT token type (so ignored as default) + case VALUE_TRUE : return Token.Value; + case VALUE_NUMBER_FLOAT : return Token.Value; + case VALUE_NULL : return Token.Value; + case VALUE_FALSE : return Token.Value; + case VALUE_EMBEDDED_OBJECT : return Token.Ignorable; + case NOT_AVAILABLE : return Token.Ignorable; + default: return Token.Ignorable; } } - - if (recordPath == null) - return; - ImportColumnGroup rootColumnGroup = new ImportColumnGroup(); - XmlImportUtilities.importTreeData(new JSONParser(pis), project, recordPath, rootColumnGroup); - XmlImportUtilities.createColumnsFromImport(project, rootColumnGroup); - - project.columnModel.update(); - - } - - @Override - public boolean canImportData(String contentType, String fileName) { - if (contentType != null) { - contentType = contentType.toLowerCase().trim(); - - if("application/json".equals(contentType) || - "text/json".equals(contentType)) { - return true; - } - } else if (fileName != null) { - fileName = fileName.toLowerCase(); - if ( - fileName.endsWith(".json") || - fileName.endsWith(".js") - ) { - return true; - } - } - return false; - } - + } } diff --git a/main/src/com/google/refine/importers/LineBasedFormatGuesser.java b/main/src/com/google/refine/importers/LineBasedFormatGuesser.java new file mode 100644 index 000000000..d18d1d595 --- /dev/null +++ b/main/src/com/google/refine/importers/LineBasedFormatGuesser.java @@ -0,0 +1,21 @@ +package com.google.refine.importers; + +import 
java.io.File;
+
+import com.google.refine.importing.FormatGuesser;
+
+public class LineBasedFormatGuesser implements FormatGuesser {
+
+    @Override
+    public String guess(File file, String encoding, String seedFormat) {
+        SeparatorBasedImporter.Separator sep = SeparatorBasedImporter.guessSeparator(file, encoding);
+        if (sep != null) {
+            return "text/line-based/*sv";
+        }
+        int[] widths = FixedWidthImporter.guessColumnWidths(file, encoding);
+        if (widths != null) {
+            return "text/line-based/fixed-width";
+        }
+        return null;
+    }
+}
diff --git a/main/src/com/google/refine/importers/LineBasedImporter.java b/main/src/com/google/refine/importers/LineBasedImporter.java
new file mode 100644
index 000000000..ff7df50c8
--- /dev/null
+++ b/main/src/com/google/refine/importers/LineBasedImporter.java
@@ -0,0 +1,105 @@
+package com.google.refine.importers;
+
+import java.io.IOException;
+import java.io.LineNumberReader;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.json.JSONObject;
+
+import com.google.refine.ProjectMetadata;
+import com.google.refine.importing.ImportingJob;
+import com.google.refine.model.Project;
+import com.google.refine.util.JSONUtilities;
+
+public class LineBasedImporter extends TabularImportingParserBase {
+    public LineBasedImporter() {
+        super(false);
+    }
+
+    @Override
+    public JSONObject createParserUIInitializationData(
+            ImportingJob job, List<JSONObject> fileRecords, String format) {
+        JSONObject options = super.createParserUIInitializationData(job, fileRecords, format);
+
+        JSONUtilities.safePut(options, "lineSeparator", "\n");
+        JSONUtilities.safePut(options, "linesPerRow", 1);
+        JSONUtilities.safePut(options, "headerLines", 0);
+        JSONUtilities.safePut(options, "guessCellValueTypes", true);
+
+        return options;
+    }
+
+    @Override
+    public void parseOneFile(
+        Project project,
+        ProjectMetadata metadata,
+        ImportingJob job,
+        String fileSource,
+        Reader reader,
+        int limit,
+        JSONObject options,
+        List<Exception> exceptions
+    ) {
+        final int linesPerRow = JSONUtilities.getInt(options, "linesPerRow", 1);
+
+        final List<Object> columnNames;
+        if (options.has("columnNames")) {
+            columnNames = new ArrayList<Object>();
+            String[] strings = JSONUtilities.getStringArray(options, "columnNames");
+            for (String s : strings) {
+                columnNames.add(s);
+            }
+            JSONUtilities.safePut(options, "headerLines", 1);
+        } else {
+            columnNames = null;
+            JSONUtilities.safePut(options, "headerLines", 0);
+        }
+
+        final LineNumberReader lnReader = new LineNumberReader(reader);
+
+        try {
+            int skip = JSONUtilities.getInt(options, "ignoreLines", -1);
+            while (skip > 0) {
+                lnReader.readLine();
+                skip--;
+            }
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+        JSONUtilities.safePut(options, "ignoreLines", -1);
+
+        TableDataReader dataReader = new TableDataReader() {
+            boolean usedColumnNames = false;
+
+            @Override
+            public List<Object> getNextRowOfCells() throws IOException {
+                if (columnNames != null && !usedColumnNames) {
+                    usedColumnNames = true;
+                    return columnNames;
+                } else {
+                    List<Object> cells = null;
+                    for (int i = 0; i < linesPerRow; i++) {
+                        String line = lnReader.readLine();
+                        if (i == 0) {
+                            if (line == null) {
+                                return null;
+                            } else {
+                                cells = new ArrayList<Object>(linesPerRow);
+                                cells.add(line);
+                            }
+                        } else if (line != null) {
+                            cells.add(line);
+                        } else {
+                            break;
+                        }
+                    }
+                    return cells;
+                }
+            }
+        };
+
+        readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions);
+    }
+}
diff --git a/main/src/com/google/refine/importers/MarcImporter.java
b/main/src/com/google/refine/importers/MarcImporter.java index 2038082f1..07a4b4122 100644 --- a/main/src/com/google/refine/importers/MarcImporter.java +++ b/main/src/com/google/refine/importers/MarcImporter.java @@ -40,56 +40,44 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; -import java.util.Properties; +import java.util.List; +import org.json.JSONObject; import org.marc4j.MarcPermissiveStreamReader; import org.marc4j.MarcWriter; import org.marc4j.MarcXmlWriter; import org.marc4j.marc.Record; import com.google.refine.ProjectMetadata; +import com.google.refine.importers.tree.ImportColumnGroup; +import com.google.refine.importing.ImportingJob; import com.google.refine.model.Project; -public class MarcImporter implements StreamImporter { - +public class MarcImporter extends XmlImporter { @Override - public void read( - InputStream inputStream, - Project project, - ProjectMetadata metadata, Properties options - ) throws ImportException { - int limit = ImporterUtilities.getIntegerOption("limit",options,-1); - int skip = ImporterUtilities.getIntegerOption("skip",options,0); - + public void parseOneFile(Project project, ProjectMetadata metadata, + ImportingJob job, String fileSource, InputStream inputStream, + ImportColumnGroup rootColumnGroup, int limit, JSONObject options, + List exceptions) { + File tempFile; try { tempFile = File.createTempFile("refine-import-", ".marc.xml"); } catch (IOException e) { - throw new ImportException("Unexpected error creating temp file",e); + exceptions.add(new ImportException("Unexpected error creating temp file", e)); + return; } + try { OutputStream os = new FileOutputStream(tempFile); try { - MarcPermissiveStreamReader reader = new MarcPermissiveStreamReader( - inputStream, - true, - true - ); MarcWriter writer = new MarcXmlWriter(os, true); - - int count = 0; + + MarcPermissiveStreamReader reader = new MarcPermissiveStreamReader( + inputStream, true, true); while (reader.hasNext()) { Record record = reader.next(); - if (skip <= 0) { - if (limit == -1 || count < limit) { - writer.write(record); - count++; - } else { - break; - } - } else { - skip--; - } + writer.write(record); } writer.close(); } finally { @@ -102,7 +90,8 @@ public class MarcImporter implements StreamImporter { InputStream is = new FileInputStream(tempFile); try { - new XmlImporter().read(is, project, metadata, options); + super.parseOneFile(project, metadata, job, fileSource, inputStream, + rootColumnGroup, limit, options, exceptions); } finally { try { is.close(); @@ -111,31 +100,10 @@ public class MarcImporter implements StreamImporter { } } } catch (FileNotFoundException e) { - throw new ImportException("Input file not found", e); + exceptions.add(new ImportException("Input file not found", e)); + return; } finally { tempFile.delete(); } } - - @Override - public boolean canImportData(String contentType, String fileName) { - if (contentType != null) { - contentType = contentType.toLowerCase().trim(); - - if ("application/marc".equals(contentType)) { - return true; - } - } else if (fileName != null) { - fileName = fileName.toLowerCase(); - if ( - fileName.endsWith(".mrc") || - fileName.endsWith(".marc") || - fileName.contains(".mrc.") || - fileName.contains(".marc.") - ) { - return true; - } - } - return false; - } } diff --git a/main/src/com/google/refine/importers/RdfTripleImporter.java b/main/src/com/google/refine/importers/RdfTripleImporter.java index ed8226249..cb0919729 100644 --- 
a/main/src/com/google/refine/importers/RdfTripleImporter.java +++ b/main/src/com/google/refine/importers/RdfTripleImporter.java @@ -33,66 +33,74 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. package com.google.refine.importers; -import java.io.IOException; import java.io.Reader; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Properties; import java.util.Map.Entry; +import org.apache.commons.lang.NotImplementedException; import org.jrdf.JRDFFactory; import org.jrdf.SortedMemoryJRDFFactory; import org.jrdf.collection.MemMapFactory; import org.jrdf.graph.Graph; import org.jrdf.graph.Triple; -import org.jrdf.parser.ParseException; -import org.jrdf.parser.StatementHandlerException; import org.jrdf.parser.line.GraphLineParser; import org.jrdf.parser.line.LineHandler; import org.jrdf.parser.ntriples.NTriplesParserFactory; import org.jrdf.util.ClosableIterable; +import org.json.JSONObject; + import static org.jrdf.graph.AnyObjectNode.ANY_OBJECT_NODE; import static org.jrdf.graph.AnyPredicateNode.ANY_PREDICATE_NODE; import static org.jrdf.graph.AnySubjectNode.ANY_SUBJECT_NODE; import com.google.refine.ProjectMetadata; import com.google.refine.expr.ExpressionUtils; +import com.google.refine.importing.ImportingJob; import com.google.refine.model.Cell; import com.google.refine.model.Column; import com.google.refine.model.ModelException; import com.google.refine.model.Project; import com.google.refine.model.Row; +import com.google.refine.util.JSONUtilities; -public class RdfTripleImporter implements ReaderImporter{ +public class RdfTripleImporter extends ImportingParserBase { private JRDFFactory _jrdfFactory; private NTriplesParserFactory _nTriplesParserFactory; private MemMapFactory _newMapFactory; - public RdfTripleImporter(){ + public RdfTripleImporter() { + super(false); _jrdfFactory = SortedMemoryJRDFFactory.getFactory(); _nTriplesParserFactory = new NTriplesParserFactory(); _newMapFactory = new MemMapFactory(); } - + @Override - public void read(Reader reader, Project project, ProjectMetadata metadata, Properties options) throws ImportException { - String baseUrl = options.getProperty("base-url"); - + public JSONObject createParserUIInitializationData(ImportingJob job, + List fileRecords, String format) { + throw new NotImplementedException(); + } + + @Override + public void parseOneFile(Project project, ProjectMetadata metadata, + ImportingJob job, String fileSource, Reader reader, int limit, + JSONObject options, List exceptions) { + + String baseUrl = JSONUtilities.getString(options, "baseUrl", ""); + Graph graph = _jrdfFactory.getNewGraph(); LineHandler lineHandler = _nTriplesParserFactory.createParser(graph, _newMapFactory); GraphLineParser parser = new GraphLineParser(graph, lineHandler); try { parser.parse(reader, baseUrl); // fills JRDF graph - } catch (IOException e) { - throw new ImportException("i/o error while parsing RDF",e); - } catch (ParseException e) { - throw new ImportException("error parsing RDF",e); - } catch (StatementHandlerException e) { - throw new ImportException("error parsing RDF",e); + } catch (Exception e) { + exceptions.add(e); + return; } - + Map> subjectToRows = new HashMap>(); Column subjectColumn = new Column(0, "subject"); @@ -152,24 +160,4 @@ public class RdfTripleImporter implements ReaderImporter{ triples.iterator().close(); } } - - - @Override - public boolean canImportData(String contentType, String fileName) { - if (contentType != null) { - contentType = 
contentType.toLowerCase().trim(); - - if("application/rdf+xml".equals(contentType)) { - return true; - } - } else if (fileName != null) { - fileName = fileName.toLowerCase(); - if ( - fileName.endsWith(".rdf")) { - return true; - } - } - return false; - } - } diff --git a/main/src/com/google/refine/importers/SeparatorBasedImporter.java b/main/src/com/google/refine/importers/SeparatorBasedImporter.java new file mode 100644 index 000000000..801d38751 --- /dev/null +++ b/main/src/com/google/refine/importers/SeparatorBasedImporter.java @@ -0,0 +1,245 @@ +/* + +Copyright 2010, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +package com.google.refine.importers; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.LineNumberReader; +import java.io.Reader; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.json.JSONObject; + +import au.com.bytecode.opencsv.CSVParser; + +import com.google.refine.ProjectMetadata; +import com.google.refine.importing.ImportingJob; +import com.google.refine.importing.ImportingUtilities; +import com.google.refine.model.Project; +import com.google.refine.util.JSONUtilities; + +public class SeparatorBasedImporter extends TabularImportingParserBase { + public SeparatorBasedImporter() { + super(false); + } + + @Override + public JSONObject createParserUIInitializationData(ImportingJob job, + List fileRecords, String format) { + JSONObject options = super.createParserUIInitializationData(job, fileRecords, format); + + JSONUtilities.safePut(options, "lineSeparator", "\n"); + + String separator = guessSeparator(job, fileRecords); + JSONUtilities.safePut(options, "separator", separator != null ? 
separator : "\t");
+
+        JSONUtilities.safePut(options, "guessCellValueTypes", true);
+        JSONUtilities.safePut(options, "processQuotes", true);
+
+        return options;
+    }
+
+    @Override
+    public void parseOneFile(
+        Project project,
+        ProjectMetadata metadata,
+        ImportingJob job,
+        String fileSource,
+        Reader reader,
+        int limit,
+        JSONObject options,
+        List<Exception> exceptions
+    ) {
+        // String lineSeparator = JSONUtilities.getString(options, "lineSeparator", "\n");
+        String sep = JSONUtilities.getString(options, "separator", "\t");
+        boolean processQuotes = JSONUtilities.getBoolean(options, "processQuotes", true);
+
+        final CSVParser parser = new CSVParser(
+            sep.toCharArray()[0],//HACK changing string to char - won't work for multi-char separators.
+            CSVParser.DEFAULT_QUOTE_CHARACTER,
+            (char) 0, // escape character
+            CSVParser.DEFAULT_STRICT_QUOTES,
+            CSVParser.DEFAULT_IGNORE_LEADING_WHITESPACE,
+            !processQuotes);
+
+        final LineNumberReader lnReader = new LineNumberReader(reader);
+
+        TableDataReader dataReader = new TableDataReader() {
+            long bytesRead = 0;
+
+            @Override
+            public List<Object> getNextRowOfCells() throws IOException {
+                String line = lnReader.readLine();
+                if (line == null) {
+                    return null;
+                } else {
+                    bytesRead += line.length();
+                    return getCells(line, parser, lnReader);
+                }
+            }
+        };
+
+        readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions);
+    }
+
+    static protected ArrayList<Object> getCells(String line, CSVParser parser, LineNumberReader lnReader)
+        throws IOException{
+
+        ArrayList<Object> cells = new ArrayList<Object>();
+        String[] tokens = parser.parseLineMulti(line);
+        for (String s : tokens){
+            cells.add(s);
+        }
+        while (parser.isPending()) {
+            tokens = parser.parseLineMulti(lnReader.readLine());
+            for (String s : tokens) {
+                cells.add(s);
+            }
+        }
+        return cells;
+    }
+
+    static public String guessSeparator(ImportingJob job, List<JSONObject> fileRecords) {
+        for (int i = 0; i < 5 && i < fileRecords.size(); i++) {
+            JSONObject fileRecord = fileRecords.get(i);
+            String encoding = ImportingUtilities.getEncoding(fileRecord);
+            String location = JSONUtilities.getString(fileRecord, "location", null);
+
+            if (location != null) {
+                File file = new File(job.getRawDataDir(), location);
+                Separator separator = guessSeparator(file, encoding);
+                if (separator != null) {
+                    return Character.toString(separator.separator);
+                }
+            }
+        }
+        return null;
+    }
+
+    static public class Separator {
+        char separator;
+        int totalCount = 0;
+        int totalOfSquaredCount = 0;
+        int currentLineCount = 0;
+
+        double averagePerLine;
+        double stddev;
+    }
+
+    static public Separator guessSeparator(File file, String encoding) {
+        try {
+            InputStream is = new FileInputStream(file);
+            try {
+                Reader reader = encoding != null ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
+                LineNumberReader lineNumberReader = new LineNumberReader(reader);
+
+                List<Separator> separators = new ArrayList<Separator>();
+                Map<Character, Separator> separatorMap = new HashMap<Character, Separator>();
+
+                int totalBytes = 0;
+                int lineCount = 0;
+                String s;
+                while (totalBytes < 64 * 1024 &&
+                       lineCount < 100 &&
+                       (s = lineNumberReader.readLine()) != null) {
+
+                    totalBytes += s.length() + 1; // count the new line character
+                    if (s.length() == 0) {
+                        continue;
+                    }
+                    lineCount++;
+
+                    for (int i = 0; i < s.length(); i++) {
+                        char c = s.charAt(i);
+                        if (!Character.isLetterOrDigit(c) &&
+                            !"\"' .-".contains(s.subSequence(i, i + 1))) {
+                            Separator separator = separatorMap.get(c);
+                            if (separator == null) {
+                                separator = new Separator();
+                                separator.separator = c;
+
+                                separatorMap.put(c, separator);
+                                separators.add(separator);
+                            }
+                            separator.currentLineCount++;
+                        }
+                    }
+
+                    for (Separator separator : separators) {
+                        separator.totalCount += separator.currentLineCount;
+                        separator.totalOfSquaredCount += separator.currentLineCount * separator.currentLineCount;
+                        separator.currentLineCount = 0;
+                    }
+                }
+
+                if (separators.size() > 0) {
+                    for (Separator separator : separators) {
+                        separator.averagePerLine = separator.totalCount / (double) lineCount;
+                        separator.stddev = Math.sqrt(
+                            separator.totalOfSquaredCount / (double) lineCount -
+                            separator.averagePerLine * separator.averagePerLine);
+                    }
+
+                    Collections.sort(separators, new Comparator<Separator>() {
+                        @Override
+                        public int compare(Separator sep0, Separator sep1) {
+                            return Double.compare(sep0.stddev, sep1.stddev);
+                        }
+                    });
+                    for (Separator separator : separators) {
+                        if (separator.stddev / separator.averagePerLine < 0.1) {
+                            return separator;
+                        }
+                    }
+                }
+            } finally {
+                is.close();
+            }
+        } catch (UnsupportedEncodingException e) {
+            e.printStackTrace();
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+        return null;
+    }
+}
diff --git a/main/src/com/google/refine/importers/TabularImportingParserBase.java b/main/src/com/google/refine/importers/TabularImportingParserBase.java
new file mode 100644
index 000000000..be4f10189
--- /dev/null
+++ b/main/src/com/google/refine/importers/TabularImportingParserBase.java
@@ -0,0 +1,205 @@
+/*
+
+Copyright 2011, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+    * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +package com.google.refine.importers; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; + +import org.json.JSONObject; + +import com.google.refine.ProjectMetadata; +import com.google.refine.expr.ExpressionUtils; +import com.google.refine.importing.ImportingJob; +import com.google.refine.model.Cell; +import com.google.refine.model.Column; +import com.google.refine.model.ModelException; +import com.google.refine.model.Project; +import com.google.refine.model.Row; +import com.google.refine.util.JSONUtilities; + +abstract public class TabularImportingParserBase extends ImportingParserBase { + static public interface TableDataReader { + public List getNextRowOfCells() throws IOException; + } + + @Override + public JSONObject createParserUIInitializationData(ImportingJob job, + List fileRecords, String format) { + JSONObject options = new JSONObject(); + + JSONUtilities.safePut(options, "ignoreLines", -1); // number of blank lines at the beginning to ignore + JSONUtilities.safePut(options, "headerLines", 1); // number of header lines + + JSONUtilities.safePut(options, "skipDataLines", 0); // number of initial data lines to skip + JSONUtilities.safePut(options, "storeBlankRows", true); + JSONUtilities.safePut(options, "storeBlankCellsAsNulls", true); + + JSONUtilities.safePut(options, "includeFileSources", fileRecords.size() > 1); + + return options; + } + + protected TabularImportingParserBase(boolean useInputStream) { + super(useInputStream); + } + + protected void readTable( + Project project, + ProjectMetadata metadata, + ImportingJob job, + TableDataReader reader, + String fileSource, + int limit, + JSONObject options, + List exceptions + ) { + int ignoreLines = JSONUtilities.getInt(options, "ignoreLines", -1); + int headerLines = JSONUtilities.getInt(options, "headerLines", 1); + int skipDataLines = JSONUtilities.getInt(options, "skipDataLines", 0); + int limit2 = JSONUtilities.getInt(options, "limit", -1); + if (limit > 0) { + if (limit2 > 0) { + limit2 = Math.min(limit, limit2); + } else { + limit2 = limit; + } + } + + boolean guessCellValueTypes = JSONUtilities.getBoolean(options, "guessCellValueTypes", true); + + boolean storeBlankRows = JSONUtilities.getBoolean(options, "storeBlankRows", true); + boolean storeBlankCellsAsNulls = JSONUtilities.getBoolean(options, "storeBlankCellsAsNulls", true); + boolean includeFileSources = JSONUtilities.getBoolean(options, "includeFileSources", false); + + String fileNameColumnName = "File"; + if (includeFileSources) { + if (project.columnModel.getColumnByName(fileNameColumnName) == null) { + try { + project.columnModel.addColumn( + 0, new Column(project.columnModel.allocateNewCellIndex(), fileNameColumnName), false); + } catch (ModelException e) { + // Ignore: We already checked for duplicate name. 
+ } + } + } + + List columnNames = new ArrayList(); + + List cells = null; + int rowsWithData = 0; + + try { + while (!job.canceled && (cells = reader.getNextRowOfCells()) != null) { + if (ignoreLines > 0) { + ignoreLines--; + continue; + } + + if (headerLines > 0) { // header lines + for (int c = 0; c < cells.size(); c++) { + Object cell = cells.get(c); + + String columnName; + if (cell == null) { + // add column even if cell is blank + columnName = ""; + } else if (cell instanceof Cell) { + columnName = ((Cell) cell).value.toString().trim(); + } else { + columnName = cell.toString().trim(); + } + + ImporterUtilities.appendColumnName(columnNames, c, columnName); + } + + headerLines--; + if (headerLines == 0) { + ImporterUtilities.setupColumns(project, columnNames); + } + } else { // data lines + Row row = new Row(columnNames.size()); + + if (storeBlankRows) { + rowsWithData++; + } else if (cells.size() > 0) { + rowsWithData++; + } + + if (skipDataLines <= 0 || rowsWithData > skipDataLines) { + boolean rowHasData = false; + for (int c = 0; c < cells.size(); c++) { + Column column = ImporterUtilities.getOrAllocateColumn(project, columnNames, c); + + Object value = cells.get(c); + if (value != null && value instanceof Cell) { + row.setCell(column.getCellIndex(), (Cell) value); + rowHasData = true; + } else if (ExpressionUtils.isNonBlankData(value)) { + Serializable storedValue; + if (value instanceof String) { + storedValue = guessCellValueTypes ? + ImporterUtilities.parseCellValue((String) value) : (String) value; + } else { + storedValue = ExpressionUtils.wrapStorable(value); + } + + row.setCell(column.getCellIndex(), new Cell(storedValue, null)); + rowHasData = true; + } else if (!storeBlankCellsAsNulls) { + row.setCell(column.getCellIndex(), new Cell("", null)); + } + } + + if (rowHasData || storeBlankRows) { + if (includeFileSources) { + row.setCell( + project.columnModel.getColumnByName(fileNameColumnName).getCellIndex(), + new Cell(fileSource, null)); + } + project.rows.add(row); + } + + if (limit2 > 0 && project.rows.size() >= limit2) { + break; + } + } + } + } + } catch (IOException e) { + exceptions.add(e); + } + } +} diff --git a/main/src/com/google/refine/importers/TextFormatGuesser.java b/main/src/com/google/refine/importers/TextFormatGuesser.java new file mode 100644 index 000000000..1c9551bb0 --- /dev/null +++ b/main/src/com/google/refine/importers/TextFormatGuesser.java @@ -0,0 +1,63 @@ +package com.google.refine.importers; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.io.UnsupportedEncodingException; +import java.nio.CharBuffer; + +import com.google.refine.importing.FormatGuesser; + +public class TextFormatGuesser implements FormatGuesser { + + @Override + public String guess(File file, String encoding, String seedFormat) { + try { + InputStream is = new FileInputStream(file); + try { + Reader reader = encoding != null ? 
new InputStreamReader(is, encoding) : new InputStreamReader(is); + + int totalBytes = 0; + int bytes; + int lineBreaks = 0; + + CharBuffer charBuffer = CharBuffer.allocate(4096); + while (totalBytes < 64 * 1024 && (bytes = reader.read(charBuffer)) > 0) { + lineBreaks += countSubstrings(charBuffer.toString(), "\n"); + + charBuffer.clear(); + totalBytes += bytes; + } + + if (lineBreaks > 3) { + return "text/line-based"; + } + } finally { + is.close(); + } + } catch (UnsupportedEncodingException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + return null; + } + + static public int countSubstrings(String s, String sub) { + int count = 0; + int from = 0; + while (from < s.length()) { + int i = s.indexOf(sub, from); + if (i < 0) { + break; + } else { + from = i + sub.length(); + count++; + } + } + return count; + } +} diff --git a/main/src/com/google/refine/importers/TsvCsvImporter.java b/main/src/com/google/refine/importers/TsvCsvImporter.java deleted file mode 100644 index a7f68ddb5..000000000 --- a/main/src/com/google/refine/importers/TsvCsvImporter.java +++ /dev/null @@ -1,238 +0,0 @@ -/* - -Copyright 2010, Google Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -package com.google.refine.importers; - -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.LineNumberReader; -import java.io.Reader; -import java.io.Serializable; -import java.util.ArrayList; -import java.util.List; -import java.util.Properties; - -import org.apache.commons.lang.StringUtils; - -import au.com.bytecode.opencsv.CSVParser; - -import com.google.refine.ProjectMetadata; -import com.google.refine.expr.ExpressionUtils; -import com.google.refine.model.Cell; -import com.google.refine.model.Project; -import com.google.refine.model.Row; - -public class TsvCsvImporter implements ReaderImporter,StreamImporter { - - @Override - public void read(Reader reader, Project project, ProjectMetadata metadata, Properties options) throws ImportException { - boolean splitIntoColumns = ImporterUtilities.getBooleanOption("split-into-columns", options, true); - - String sep = options.getProperty("separator"); // auto-detect if not present - int ignoreLines = ImporterUtilities.getIntegerOption("ignore", options, -1); - int headerLines = ImporterUtilities.getIntegerOption("header-lines", options, 1); - - int limit = ImporterUtilities.getIntegerOption("limit",options,-1); - int skip = ImporterUtilities.getIntegerOption("skip",options,0); - boolean guessValueType = ImporterUtilities.getBooleanOption("guess-value-type", options, true); - boolean ignoreQuotes = ImporterUtilities.getBooleanOption("ignore-quotes", options, false); - - LineNumberReader lnReader = new LineNumberReader(reader); - - try { - read(lnReader, project, sep, - limit, skip, ignoreLines, headerLines, - guessValueType, splitIntoColumns, ignoreQuotes - ); - } catch (IOException e) { - throw new ImportException("Import failed",e); - } - } - - /** - * - * @param lnReader - * LineNumberReader used to read file or string contents - * @param project - * The project into which the parsed data will be added - * @param sep - * The character used to denote different the break between data points - * @param limit - * The maximum number of rows of data to import - * @param skip - * The number of initial data rows to skip - * @param ignoreLines - * The number of initial lines within the data source which should be ignored entirely - * @param headerLines - * The number of lines in the data source which describe each column - * @param guessValueType - * Whether the parser should try and guess the type of the value being parsed - * @param splitIntoColumns - * Whether the parser should try and split the data source into columns - * @param ignoreQuotes - * Quotation marks are ignored, and all separators and newlines treated as such regardless of whether they are within quoted values - * @throws IOException - */ - public void read(LineNumberReader lnReader, Project project, String sep, int limit, int skip, int ignoreLines, int headerLines, boolean guessValueType, boolean splitIntoColumns, boolean ignoreQuotes ) throws IOException{ - CSVParser parser = (sep != null && sep.length() > 0 && splitIntoColumns) ? - new CSVParser(sep.toCharArray()[0],//HACK changing string to char - won't work for multi-char separators. 
- CSVParser.DEFAULT_QUOTE_CHARACTER, - (char) 0, // escape character - CSVParser.DEFAULT_STRICT_QUOTES, - CSVParser.DEFAULT_IGNORE_LEADING_WHITESPACE, - ignoreQuotes) : null; - List columnNames = new ArrayList(); - String line = null; - int rowsWithData = 0; - - while ((line = lnReader.readLine()) != null) { - if (ignoreLines > 0) { - ignoreLines--; - continue; - } else if (StringUtils.isBlank(line)) { - continue; - } - - //guess separator - if (parser == null) { - int tab = line.indexOf('\t'); - if (tab >= 0) { - parser = new CSVParser('\t', - CSVParser.DEFAULT_QUOTE_CHARACTER, - (char) 0, // escape character - CSVParser.DEFAULT_STRICT_QUOTES, - CSVParser.DEFAULT_IGNORE_LEADING_WHITESPACE, - ignoreQuotes); - } else { - parser = new CSVParser(',', - CSVParser.DEFAULT_QUOTE_CHARACTER, - (char) 0, // escape character - CSVParser.DEFAULT_STRICT_QUOTES, - CSVParser.DEFAULT_IGNORE_LEADING_WHITESPACE, - ignoreQuotes); - } - } - - - if (headerLines > 0) { - //column headers - headerLines--; - - ArrayList cells = getCells(line, parser, lnReader, splitIntoColumns); - - for (int c = 0; c < cells.size(); c++) { - String cell = cells.get(c).trim(); - //add column even if cell is blank - ImporterUtilities.appendColumnName(columnNames, c, cell); - } - } else { - //data - Row row = new Row(columnNames.size()); - - ArrayList cells = getCells(line, parser, lnReader, splitIntoColumns); - - if( cells != null && cells.size() > 0 ) - rowsWithData++; - - if (skip <=0 || rowsWithData > skip){ - //add parsed data to row - for(String s : cells){ - if (ExpressionUtils.isNonBlankData(s)) { - Serializable value = guessValueType ? ImporterUtilities.parseCellValue(s) : s; - row.cells.add(new Cell(value, null)); - }else{ - row.cells.add(null); - } - } - project.rows.add(row); - project.columnModel.setMaxCellIndex(row.cells.size()); - - ImporterUtilities.ensureColumnsInRowExist(columnNames, row); - - if (limit > 0 && project.rows.size() >= limit) { - break; - } - } - } - } - - ImporterUtilities.setupColumns(project, columnNames); - } - - protected ArrayList getCells(String line, CSVParser parser, LineNumberReader lnReader, boolean splitIntoColumns) throws IOException{ - ArrayList cells = new ArrayList(); - if(splitIntoColumns){ - String[] tokens = parser.parseLineMulti(line); - for(String s : tokens){ - cells.add(s); - } - while(parser.isPending()){ - tokens = parser.parseLineMulti(lnReader.readLine()); - for(String s : tokens){ - cells.add(s); - } - } - }else{ - cells.add(line); - } - return cells; - } - - @Override - public void read(InputStream inputStream, Project project, - ProjectMetadata metadata, Properties options) throws ImportException { - read(new InputStreamReader(inputStream), project, metadata, options); - } - - @Override - public boolean canImportData(String contentType, String fileName) { - if (contentType != null) { - contentType = contentType.toLowerCase().trim(); - return - "text/plain".equals(contentType) || - "text/csv".equals(contentType) || - "text/x-csv".equals(contentType) || - "text/tab-separated-value".equals(contentType); - - } else if (fileName != null) { - fileName = fileName.toLowerCase(); - if (fileName.endsWith(".tsv")) { - return true; - }else if (fileName.endsWith(".csv")){ - return true; - } - } - return false; - } -} diff --git a/main/src/com/google/refine/importers/XmlImporter.java b/main/src/com/google/refine/importers/XmlImporter.java index f1a24fb4a..c86638ec3 100644 --- a/main/src/com/google/refine/importers/XmlImporter.java +++ 
b/main/src/com/google/refine/importers/XmlImporter.java @@ -33,99 +33,274 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. package com.google.refine.importers; -import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; -import java.io.PushbackInputStream; -import java.util.Properties; +import java.util.List; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import javax.servlet.ServletException; +import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLStreamConstants; +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.XMLStreamReader; + +import org.json.JSONArray; +import org.json.JSONObject; import com.google.refine.ProjectMetadata; -import com.google.refine.importers.TreeImportUtilities.ImportColumnGroup; -import com.google.refine.importers.parsers.TreeParser; -import com.google.refine.importers.parsers.XmlParser; +import com.google.refine.importers.tree.ImportColumnGroup; +import com.google.refine.importers.tree.TreeImportingParserBase; +import com.google.refine.importers.tree.TreeReader; +import com.google.refine.importing.ImportingJob; +import com.google.refine.importing.ImportingUtilities; import com.google.refine.model.Project; +import com.google.refine.util.JSONUtilities; -public class XmlImporter implements StreamImporter { - - final static Logger logger = LoggerFactory.getLogger("XmlImporter"); - - public static final int BUFFER_SIZE = 64 * 1024; - +public class XmlImporter extends TreeImportingParserBase { + public XmlImporter() { + super(true); + } + + static private class PreviewParsingState { + int tokenCount; + } + + final static private int PREVIEW_PARSING_LIMIT = 1000; + @Override - public void read( - InputStream inputStream, - Project project, - ProjectMetadata metadata, Properties options - ) throws ImportException { - logger.trace("XmlImporter.read"); - PushbackInputStream pis = new PushbackInputStream(inputStream,BUFFER_SIZE); + public JSONObject createParserUIInitializationData( + ImportingJob job, List fileRecords, String format) { + JSONObject options = super.createParserUIInitializationData(job, fileRecords, format); + try { + JSONObject firstFileRecord = fileRecords.get(0); + File file = ImportingUtilities.getFile(job, firstFileRecord); + InputStream is = new FileInputStream(file); + try { + XMLStreamReader parser = createXMLStreamReader(is); + PreviewParsingState state = new PreviewParsingState(); + + while (parser.hasNext() && state.tokenCount < PREVIEW_PARSING_LIMIT) { + int tokenType = parser.next(); + state.tokenCount++; + if (tokenType == XMLStreamConstants.START_ELEMENT) { + JSONObject rootElement = descendElement(parser, state); + if (rootElement != null) { + JSONUtilities.safePut(options, "dom", rootElement); + break; + } + } else { + // ignore everything else + } + } + } finally { + is.close(); + } + } catch (XMLStreamException e) { + // Ignore + } catch (IOException e) { + // Ignore + } - String[] recordPath = null; + return options; + } + + final static private JSONObject descendElement(XMLStreamReader parser, PreviewParsingState state) throws XMLStreamException { + JSONObject result = new JSONObject(); { - byte[] buffer = new byte[BUFFER_SIZE]; - int bytes_read = 0; - try {//fill the buffer with data - while (bytes_read < BUFFER_SIZE) { - int c = pis.read(buffer, bytes_read, BUFFER_SIZE - bytes_read); - if (c == -1) break; - bytes_read +=c ; - } - pis.unread(buffer, 0, bytes_read); - } catch 
(IOException e) { - throw new ImportException("Read error",e); + String name = parser.getLocalName(); + JSONUtilities.safePut(result, "n", name); + + String prefix = parser.getPrefix(); + if (prefix != null) { + JSONUtilities.safePut(result, "p", prefix); } - - InputStream iStream = new ByteArrayInputStream(buffer, 0, bytes_read); - TreeParser parser = new XmlParser(iStream); - if (options.containsKey("importer-record-tag")) { - try{ - recordPath = XmlImportUtilities.detectPathFromTag( - parser, - options.getProperty("importer-record-tag")); - }catch(Exception e){ - // silent - // e.printStackTrace(); + String nsUri = parser.getNamespaceURI(); + if (nsUri != null) { + JSONUtilities.safePut(result, "uri", nsUri); + } + } + + int namespaceCount = parser.getNamespaceCount(); + if (namespaceCount > 0) { + JSONArray namespaces = new JSONArray(); + JSONUtilities.safePut(result, "ns", namespaces); + + for (int i = 0; i < namespaceCount; i++) { + JSONObject namespace = new JSONObject(); + JSONUtilities.append(namespaces, namespace); + JSONUtilities.safePut(namespace, "p", parser.getNamespacePrefix(i)); + JSONUtilities.safePut(namespace, "uri", parser.getNamespaceURI(i)); + } + } + + int attributeCount = parser.getAttributeCount(); + if (attributeCount > 0) { + JSONArray attributes = new JSONArray(); + JSONUtilities.safePut(result, "a", attributes); + + for (int i = 0; i < attributeCount; i++) { + JSONObject attribute = new JSONObject(); + JSONUtilities.append(attributes, attribute); + JSONUtilities.safePut(attribute, "n", parser.getAttributeLocalName(i)); + JSONUtilities.safePut(attribute, "v", parser.getAttributeValue(i)); + String prefix = parser.getAttributePrefix(i); + if (prefix != null) { + JSONUtilities.safePut(attribute, "p", prefix); } + } + } + + JSONArray children = new JSONArray(); + while (parser.hasNext() && state.tokenCount < PREVIEW_PARSING_LIMIT) { + int tokenType = parser.next(); + state.tokenCount++; + if (tokenType == XMLStreamConstants.END_ELEMENT) { + break; + } else if (tokenType == XMLStreamConstants.START_ELEMENT) { + JSONObject childElement = descendElement(parser, state); + if (childElement != null) { + JSONUtilities.append(children, childElement); + } + } else if (tokenType == XMLStreamConstants.CHARACTERS || + tokenType == XMLStreamConstants.CDATA || + tokenType == XMLStreamConstants.SPACE) { + JSONObject childElement = new JSONObject(); + JSONUtilities.safePut(childElement, "t", parser.getText()); + JSONUtilities.append(children, childElement); } else { - recordPath = XmlImportUtilities.detectRecordElement(parser); + // ignore everything else } } - - if (recordPath == null) - return; - - ImportColumnGroup rootColumnGroup = new ImportColumnGroup(); - XmlImportUtilities.importTreeData(new XmlParser(pis), project, recordPath, rootColumnGroup); - XmlImportUtilities.createColumnsFromImport(project, rootColumnGroup); - - project.columnModel.update(); + + if (children.length() > 0) { + JSONUtilities.safePut(result, "c", children); + } + return result; } - + @Override - public boolean canImportData(String contentType, String fileName) { - if (contentType != null) { - contentType = contentType.toLowerCase().trim(); - - if("application/xml".equals(contentType) || - "text/xml".equals(contentType) || - "application/rss+xml".equals(contentType) || - "application/atom+xml".equals(contentType)) { - return true; + public void parseOneFile(Project project, ProjectMetadata metadata, + ImportingJob job, String fileSource, InputStream inputStream, + ImportColumnGroup rootColumnGroup, int 
limit, JSONObject options, + List exceptions) { + + try { + parseOneFile(project, metadata, job, fileSource, + new XmlParser(inputStream), rootColumnGroup, limit, options, exceptions); + } catch (XMLStreamException e) { + exceptions.add(e); + } + } + + static public class XmlParser implements TreeReader { + final protected XMLStreamReader parser; + + public XmlParser(InputStream inputStream) throws XMLStreamException { + parser = createXMLStreamReader(inputStream); + } + + @Override + public Token next() throws ServletException { + try { + if (!parser.hasNext()) { + throw new ServletException("End of XML stream"); + } + } catch (XMLStreamException e) { + throw new ServletException(e); } - } else if (fileName != null) { - fileName = fileName.toLowerCase(); - if ( - fileName.endsWith(".xml") || - fileName.endsWith(".atom") || - fileName.endsWith(".rss") - ) { - return true; + + int currentToken = -1; + try { + currentToken = parser.next(); + } catch (XMLStreamException e) { + throw new ServletException(e); + } + + return mapToToken(currentToken); + } + + protected Token mapToToken(int token) throws ServletException { + switch(token){ + case XMLStreamConstants.START_ELEMENT: return Token.StartEntity; + case XMLStreamConstants.END_ELEMENT: return Token.EndEntity; + case XMLStreamConstants.CHARACTERS: return Token.Value; + case XMLStreamConstants.START_DOCUMENT: return Token.Ignorable; + case XMLStreamConstants.END_DOCUMENT: return Token.Ignorable; + case XMLStreamConstants.SPACE: return Token.Value; + case XMLStreamConstants.PROCESSING_INSTRUCTION: return Token.Ignorable; + case XMLStreamConstants.NOTATION_DECLARATION: return Token.Ignorable; + case XMLStreamConstants.NAMESPACE: return Token.Ignorable; + case XMLStreamConstants.ENTITY_REFERENCE: return Token.Ignorable; + case XMLStreamConstants.DTD: return Token.Ignorable; + case XMLStreamConstants.COMMENT: return Token.Ignorable; + case XMLStreamConstants.CDATA: return Token.Ignorable; + case XMLStreamConstants.ATTRIBUTE: return Token.Ignorable; + default: + return Token.Ignorable; } } - return false; + + @Override + public Token current() throws ServletException{ + return this.mapToToken(parser.getEventType()); + } + + @Override + public boolean hasNext() throws ServletException{ + try { + return parser.hasNext(); + } catch (XMLStreamException e) { + throw new ServletException(e); + } + } + + @Override + public String getFieldName() throws ServletException{ + try{ + return parser.getLocalName(); + }catch(IllegalStateException e){ + return null; + } + } + + @Override + public String getPrefix(){ + return parser.getPrefix(); + } + + @Override + public String getFieldValue(){ + return parser.getText(); + } + + @Override + public int getAttributeCount(){ + return parser.getAttributeCount(); + } + + @Override + public String getAttributeValue(int index){ + return parser.getAttributeValue(index); + } + + @Override + public String getAttributePrefix(int index){ + return parser.getAttributePrefix(index); + } + + @Override + public String getAttributeLocalName(int index){ + return parser.getAttributeLocalName(index); + } } - + + final static private XMLStreamReader createXMLStreamReader(InputStream inputStream) throws XMLStreamException { + XMLInputFactory factory = XMLInputFactory.newInstance(); + factory.setProperty(XMLInputFactory.IS_COALESCING, true); + factory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, true); + + return factory.createXMLStreamReader(inputStream); + } + } diff --git 
a/main/src/com/google/refine/importers/parsers/JSONParser.java b/main/src/com/google/refine/importers/parsers/JSONParser.java deleted file mode 100644 index 11e506950..000000000 --- a/main/src/com/google/refine/importers/parsers/JSONParser.java +++ /dev/null @@ -1,210 +0,0 @@ -/* - -Copyright 2010, Google Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -package com.google.refine.importers.parsers; - -import java.io.IOException; -import java.io.InputStream; - -import javax.servlet.ServletException; -import org.codehaus.jackson.JsonFactory; -import org.codehaus.jackson.JsonParseException; -import org.codehaus.jackson.JsonParser; -import org.codehaus.jackson.JsonToken; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class JSONParser implements TreeParser{ - final static Logger logger = LoggerFactory.getLogger("JsonParser"); - - JsonFactory factory = new JsonFactory(); - JsonParser parser = null; - - //The following is a workaround for inconsistent Jackson JsonParser - Boolean lastTokenWasAFieldNameAndCurrentTokenIsANewEntity = false; - Boolean thisTokenIsAFieldName = false; - String lastFieldName = null; - //end of workaround - - public JSONParser(InputStream inputStream){ - try { - parser = factory.createJsonParser(inputStream); - } catch (Exception e) { - e.printStackTrace(); - } - } - - /** - * Does nothing. All Json is treated as elements - */ - @Override - public int getAttributeCount() { - // TODO Auto-generated method stub - return 0; - } - - /** - * Does nothing. All Json is treated as elements - */ - @Override - public String getAttributeLocalName(int index) { - return null; - } - - /** - * Does nothing. All Json is treated as elements - */ - @Override - public String getAttributePrefix(int index) { - // TODO Auto-generated method stub - return null; - } - - /** - * Does nothing. 
All Json is treated as elements - */ - @Override - public String getAttributeValue(int index) { - // TODO Auto-generated method stub - return null; - } - - @Override - public TreeParserToken getEventType() throws ServletException { - return this.mapToTreeParserToken(parser.getCurrentToken()); - } - - @Override - public String getLocalName() throws ServletException{ - try { - String text = parser.getCurrentName(); - - //The following is a workaround for inconsistent Jackson JsonParser - if(text == null){ - if(this.lastTokenWasAFieldNameAndCurrentTokenIsANewEntity) - text = this.lastFieldName; - else - text = "__anonymous__"; - } - //end of workaround - - return text; - } catch (Exception e) { - throw new ServletException(e); - } - } - - /** - * Does nothing. Json does not have prefixes - */ - @Override - public String getPrefix() { - return null; - } - - @Override - public String getText() throws ServletException { - try { - return parser.getText(); - } catch (Exception e) { - throw new ServletException(e); - } - } - - @Override - public boolean hasNext() throws ServletException { - return true; //FIXME fairly obtuse, is there a better way (advancing, then rewinding?) - } - - @Override - public TreeParserToken next() throws ServletException { - JsonToken next; - try { - next = parser.nextToken(); - } catch (JsonParseException e) { - throw new ServletException(e); - } catch (IOException e) { - throw new ServletException(e); - } - - if(next == null) - throw new ServletException("No more Json Tokens in stream"); - - //The following is a workaround for inconsistent Jackson JsonParser - if(next == JsonToken.FIELD_NAME){ - try { - this.thisTokenIsAFieldName = true; - this.lastFieldName = parser.getCurrentName(); - } catch (Exception e) { - //silent - } - }else if(next == JsonToken.START_ARRAY || next == JsonToken.START_OBJECT){ - if(this.thisTokenIsAFieldName){ - this.lastTokenWasAFieldNameAndCurrentTokenIsANewEntity = true; - this.thisTokenIsAFieldName = false; - }else{ - this.lastTokenWasAFieldNameAndCurrentTokenIsANewEntity = false; - this.lastFieldName = null; - } - }else{ - this.lastTokenWasAFieldNameAndCurrentTokenIsANewEntity = false; - this.lastFieldName = null; - this.thisTokenIsAFieldName = false; - } - //end of workaround - - return mapToTreeParserToken(next); - } - - protected TreeParserToken mapToTreeParserToken(JsonToken token){ - switch(token){ - case START_ARRAY: return TreeParserToken.StartEntity; - case END_ARRAY: return TreeParserToken.EndEntity; - case START_OBJECT: return TreeParserToken.StartEntity; - case END_OBJECT: return TreeParserToken.EndEntity; - case VALUE_STRING: return TreeParserToken.Value; - case FIELD_NAME: return TreeParserToken.Ignorable; //returned by the getLocalName function() - case VALUE_NUMBER_INT: return TreeParserToken.Value; - //Json does not have START_DOCUMENT token type (so ignored as default) - //Json does not have END_DOCUMENT token type (so ignored as default) - case VALUE_TRUE : return TreeParserToken.Value; - case VALUE_NUMBER_FLOAT : return TreeParserToken.Value; - case VALUE_NULL : return TreeParserToken.Value; - case VALUE_FALSE : return TreeParserToken.Value; - case VALUE_EMBEDDED_OBJECT : return TreeParserToken.Ignorable; - case NOT_AVAILABLE : return TreeParserToken.Ignorable; - default: return TreeParserToken.Ignorable; - } - } - -} diff --git a/main/src/com/google/refine/importers/parsers/NonSplitRowParser.java b/main/src/com/google/refine/importers/parsers/NonSplitRowParser.java deleted file mode 100644 index cac717c23..000000000 --- 
a/main/src/com/google/refine/importers/parsers/NonSplitRowParser.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - -Copyright 2010, Google Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -package com.google.refine.importers.parsers; - -import java.io.LineNumberReader; -import java.io.Serializable; -import java.util.ArrayList; -import java.util.List; - -import com.google.refine.importers.ImporterUtilities; -import com.google.refine.model.Cell; -import com.google.refine.model.Row; - -public class NonSplitRowParser extends RowParser { - - public List split(String line, LineNumberReader lineReader) { - List results = new ArrayList(1); - - results.add(line.trim()); - - return results; - } - - public boolean parseRow(Row row, String line, boolean guessValueType, LineNumberReader lineReader) { - if (line.trim().isEmpty()) { - return false; - } else { - Serializable value = guessValueType ? ImporterUtilities.parseCellValue(line) : line; - if (value != null) { - row.cells.add(new Cell(value, null)); - return true; - } else { - row.cells.add(null); - return false; - } - } - } - -} diff --git a/main/src/com/google/refine/importers/parsers/SeparatorRowParser.java b/main/src/com/google/refine/importers/parsers/SeparatorRowParser.java deleted file mode 100644 index 2b82d52e9..000000000 --- a/main/src/com/google/refine/importers/parsers/SeparatorRowParser.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - -Copyright 2010, Google Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -package com.google.refine.importers.parsers; - -import java.io.LineNumberReader; -import java.io.Serializable; -import java.util.ArrayList; -import java.util.List; - -import org.apache.commons.lang.StringUtils; - -import com.google.refine.expr.ExpressionUtils; -import com.google.refine.importers.ImporterUtilities; -import com.google.refine.model.Cell; -import com.google.refine.model.Row; - -public class SeparatorRowParser extends RowParser { - - String sep; - - public SeparatorRowParser(String sep) { - this.sep = sep; - } - - public List split(String line, LineNumberReader lineReader) { - String[] cells = StringUtils.splitPreserveAllTokens(line, sep); - - List results = new ArrayList(); - for (int c = 0; c < cells.length; c++) { - results.add(cells[c]); - } - - return results; - } - - public boolean parseRow(Row row, String line, boolean guessValueType, LineNumberReader lineReader) { - boolean hasData = false; - - String[] cells = StringUtils.splitPreserveAllTokens(line, sep); - for (int c = 0; c < cells.length; c++) { - String text = cells[c]; - - Serializable value = guessValueType ? ImporterUtilities.parseCellValue(text) : text; - if (ExpressionUtils.isNonBlankData(value)) { - row.cells.add(new Cell(value, null)); - hasData = true; - } else { - row.cells.add(null); - } - } - return hasData; - } - -} diff --git a/main/src/com/google/refine/importers/parsers/XmlParser.java b/main/src/com/google/refine/importers/parsers/XmlParser.java deleted file mode 100644 index 7ab12a8ae..000000000 --- a/main/src/com/google/refine/importers/parsers/XmlParser.java +++ /dev/null @@ -1,160 +0,0 @@ -/* - -Copyright 2010, Google Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -package com.google.refine.importers.parsers; - -import java.io.InputStream; - -import javax.servlet.ServletException; -import javax.xml.stream.FactoryConfigurationError; -import javax.xml.stream.XMLInputFactory; -import javax.xml.stream.XMLStreamConstants; -import javax.xml.stream.XMLStreamException; -import javax.xml.stream.XMLStreamReader; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class XmlParser implements TreeParser{ - final static Logger logger = LoggerFactory.getLogger("XmlParser"); - - XMLStreamReader parser = null; - - public XmlParser(InputStream inputStream){ - try { - XMLInputFactory factory = XMLInputFactory.newInstance(); - factory.setProperty(XMLInputFactory.IS_COALESCING, true); - factory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, true); - parser = factory.createXMLStreamReader(inputStream); - } catch (XMLStreamException e) { - // silent - // e.printStackTrace(); - } catch (FactoryConfigurationError e) { - // silent - // e.printStackTrace(); - } - } - - @Override - public TreeParserToken next() throws ServletException{ - try { - if(!parser.hasNext()) - throw new ServletException("End of XML stream"); - } catch (XMLStreamException e) { - throw new ServletException(e); - } - - int currentToken = -1; - try { - currentToken = parser.next(); - } catch (XMLStreamException e) { - throw new ServletException(e); - } - - return mapToTreeParserToken(currentToken); - } - - protected TreeParserToken mapToTreeParserToken(int token) throws ServletException { - switch(token){ - case XMLStreamConstants.START_ELEMENT: return TreeParserToken.StartEntity; - case XMLStreamConstants.END_ELEMENT: return TreeParserToken.EndEntity; - case XMLStreamConstants.CHARACTERS: return TreeParserToken.Value; - case XMLStreamConstants.START_DOCUMENT: return TreeParserToken.Ignorable; - case XMLStreamConstants.END_DOCUMENT: return TreeParserToken.Ignorable; - case XMLStreamConstants.SPACE: return TreeParserToken.Value; - case XMLStreamConstants.PROCESSING_INSTRUCTION: return TreeParserToken.Ignorable; - case XMLStreamConstants.NOTATION_DECLARATION: return TreeParserToken.Ignorable; - case XMLStreamConstants.NAMESPACE: return TreeParserToken.Ignorable; - case XMLStreamConstants.ENTITY_REFERENCE: return TreeParserToken.Ignorable; - case XMLStreamConstants.DTD: return TreeParserToken.Ignorable; - case XMLStreamConstants.COMMENT: return TreeParserToken.Ignorable; - case XMLStreamConstants.CDATA: return TreeParserToken.Ignorable; - case XMLStreamConstants.ATTRIBUTE: return TreeParserToken.Ignorable; - default: - return TreeParserToken.Ignorable; - } - } - - @Override - public TreeParserToken getEventType() throws ServletException{ - return this.mapToTreeParserToken(parser.getEventType()); - } - - @Override - public boolean hasNext() throws ServletException{ - try { - return parser.hasNext(); - } catch (XMLStreamException e) { - throw new ServletException(e); - } - } - - @Override - public String getLocalName() throws ServletException{ - try{ - return 
parser.getLocalName(); - }catch(IllegalStateException e){ - return null; - } - } - - @Override - public String getPrefix(){ - return parser.getPrefix(); - } - - @Override - public String getText(){ - return parser.getText(); - } - - @Override - public int getAttributeCount(){ - return parser.getAttributeCount(); - } - - @Override - public String getAttributeValue(int index){ - return parser.getAttributeValue(index); - } - - @Override - public String getAttributePrefix(int index){ - return parser.getAttributePrefix(index); - } - - @Override - public String getAttributeLocalName(int index){ - return parser.getAttributeLocalName(index); - } -} diff --git a/main/src/com/google/refine/importers/tree/ImportColumn.java b/main/src/com/google/refine/importers/tree/ImportColumn.java new file mode 100644 index 000000000..ec07a7ca4 --- /dev/null +++ b/main/src/com/google/refine/importers/tree/ImportColumn.java @@ -0,0 +1,23 @@ +package com.google.refine.importers.tree; + + +/** + * A column is used to describe a branch-terminating element in a tree structure + * + */ +public class ImportColumn extends ImportVertical { + public int cellIndex; + public int nextRowIndex; + public boolean blankOnFirstRow; + + public ImportColumn() {} + + public ImportColumn(String name) { //required for testing + super.name = name; + } + + @Override + void tabulate() { + // already done the tabulation elsewhere + } +} \ No newline at end of file diff --git a/main/src/com/google/refine/importers/tree/ImportColumnGroup.java b/main/src/com/google/refine/importers/tree/ImportColumnGroup.java new file mode 100644 index 000000000..f25c5ba05 --- /dev/null +++ b/main/src/com/google/refine/importers/tree/ImportColumnGroup.java @@ -0,0 +1,33 @@ +package com.google.refine.importers.tree; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.lang.StringUtils; + +/** + * A column group describes a branch in tree structured data + */ +public class ImportColumnGroup extends ImportVertical { + public Map<String, ImportColumnGroup> subgroups = new HashMap<String, ImportColumnGroup>(); + public Map<String, ImportColumn> columns = new HashMap<String, ImportColumn>(); + public int nextRowIndex; + + @Override + void tabulate() { + for (ImportColumn c : columns.values()) { + c.tabulate(); + nonBlankCount = Math.max(nonBlankCount, c.nonBlankCount); + } + for (ImportColumnGroup g : subgroups.values()) { + g.tabulate(); + nonBlankCount = Math.max(nonBlankCount, g.nonBlankCount); + } + } + + public String toString() { + return String.format("name=%s, columns={%s}, subgroups={{%s}}", + name,StringUtils.join(columns.keySet(), ','), + StringUtils.join(subgroups.keySet(),',')); + } +} \ No newline at end of file diff --git a/main/src/com/google/refine/importers/tree/ImportRecord.java b/main/src/com/google/refine/importers/tree/ImportRecord.java new file mode 100644 index 000000000..78e8a6468 --- /dev/null +++ b/main/src/com/google/refine/importers/tree/ImportRecord.java @@ -0,0 +1,14 @@ +package com.google.refine.importers.tree; + +import java.util.LinkedList; +import java.util.List; + +import com.google.refine.model.Cell; + +/** + * A record describes a data element in a tree-structure + * + */ +public class ImportRecord { + public List<List<Cell>> rows = new LinkedList<List<Cell>>(); +} \ No newline at end of file diff --git a/main/src/com/google/refine/importers/tree/ImportVertical.java b/main/src/com/google/refine/importers/tree/ImportVertical.java new file mode 100644 index 000000000..ac16ec36c --- /dev/null +++ b/main/src/com/google/refine/importers/tree/ImportVertical.java @@ -0,0 +1,8 @@ +package com.google.refine.importers.tree; +
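ImportColumn, ImportColumnGroup and ImportRecord above, together with the small ImportVertical base class that follows, are the in-memory model the tree importer fills in: one group per level of nesting, one column per leaf, and tabulate() rolling non-blank counts up the tree so the most populated branches can be told apart. A rough illustration of that roll-up is sketched below; the element names and counts are invented, and the sketch sits in the com.google.refine.importers.tree package because tabulate() has default access.

    package com.google.refine.importers.tree;

    public class TabulateSketch {
        public static void main(String[] args) {
            // books -> book -> {title, author}; names and counts are invented
            ImportColumnGroup root = new ImportColumnGroup();
            ImportColumnGroup book = new ImportColumnGroup();
            root.subgroups.put("book", book);

            ImportColumn title = new ImportColumn("book - title");
            title.nonBlankCount = 12;
            ImportColumn author = new ImportColumn("book - author");
            author.nonBlankCount = 9;
            book.columns.put("title", title);
            book.columns.put("author", author);

            // Each group takes the maximum non-blank count of its columns and subgroups,
            // so after tabulating, both book and root report 12.
            root.tabulate();
            System.out.println(root.nonBlankCount);
        }
    }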
+abstract class ImportVertical { + public String name = ""; + public int nonBlankCount; + + abstract void tabulate(); +} \ No newline at end of file diff --git a/main/src/com/google/refine/importers/tree/RecordElementCandidate.java b/main/src/com/google/refine/importers/tree/RecordElementCandidate.java new file mode 100644 index 000000000..3be1b62c9 --- /dev/null +++ b/main/src/com/google/refine/importers/tree/RecordElementCandidate.java @@ -0,0 +1,16 @@ +package com.google.refine.importers.tree; + +import java.util.Arrays; + +/** + * An element which holds sub-elements we + * shall import as records + */ +class RecordElementCandidate { + String[] path; + int count; + + public String toString() { + return Arrays.toString(path); + } +} \ No newline at end of file diff --git a/main/src/com/google/refine/importers/TreeImportUtilities.java b/main/src/com/google/refine/importers/tree/TreeImportUtilities.java similarity index 72% rename from main/src/com/google/refine/importers/TreeImportUtilities.java rename to main/src/com/google/refine/importers/tree/TreeImportUtilities.java index 83cc2725e..38496519b 100644 --- a/main/src/com/google/refine/importers/TreeImportUtilities.java +++ b/main/src/com/google/refine/importers/tree/TreeImportUtilities.java @@ -31,22 +31,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -package com.google.refine.importers; +package com.google.refine.importers.tree; import java.io.Serializable; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.Comparator; -import java.util.HashMap; -import java.util.LinkedList; import java.util.List; -import java.util.Map; -import org.apache.commons.lang.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.refine.importers.ImporterUtilities; import com.google.refine.model.Cell; import com.google.refine.model.Column; import com.google.refine.model.Project; @@ -54,83 +50,6 @@ import com.google.refine.model.Project; public abstract class TreeImportUtilities { final static Logger logger = LoggerFactory.getLogger("TreeImportUtilities"); - /** - * An element which holds sub-elements we - * shall import as records - */ - static protected class RecordElementCandidate { - String[] path; - int count; - - public String toString() { - return Arrays.toString(path); - } - } - - - static protected abstract class ImportVertical { - public String name = ""; - public int nonBlankCount; - - abstract void tabulate(); - } - - /** - * A column group describes a branch in tree structured data - */ - static public class ImportColumnGroup extends ImportVertical { - public Map subgroups = new HashMap(); - public Map columns = new HashMap(); - public int nextRowIndex; - - @Override - void tabulate() { - for (ImportColumn c : columns.values()) { - c.tabulate(); - nonBlankCount = Math.max(nonBlankCount, c.nonBlankCount); - } - for (ImportColumnGroup g : subgroups.values()) { - g.tabulate(); - nonBlankCount = Math.max(nonBlankCount, g.nonBlankCount); - } - } - - public String toString() { - return String.format("name=%s, columns={%s}, subgroups={{%s}}", - name,StringUtils.join(columns.keySet(), ','), - StringUtils.join(subgroups.keySet(),',')); - } - } - - /** - * A column is used to describe a branch-terminating element in a tree structure - * - */ - static public class ImportColumn extends ImportVertical { - public int cellIndex; - public int nextRowIndex; - public boolean blankOnFirstRow; - - public ImportColumn() {} - - public ImportColumn(String 
name) { //required for testing - super.name = name; - } - - @Override - void tabulate() { - // already done the tabulation elsewhere - } - } - - /** - * A record describes a data element in a tree-structure - * - */ - static public class ImportRecord { - public List> rows = new LinkedList>(); - } - static protected void sortRecordElementCandidates(List list) { Collections.sort(list, new Comparator() { public int compare(RecordElementCandidate o1, RecordElementCandidate o2) { diff --git a/main/src/com/google/refine/importers/tree/TreeImportingParserBase.java b/main/src/com/google/refine/importers/tree/TreeImportingParserBase.java new file mode 100644 index 000000000..3bbdb0f1b --- /dev/null +++ b/main/src/com/google/refine/importers/tree/TreeImportingParserBase.java @@ -0,0 +1,169 @@ +/* + +Copyright 2011, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +package com.google.refine.importers.tree; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.util.List; + +import org.apache.commons.lang.NotImplementedException; +import org.json.JSONObject; + +import com.google.refine.ProjectMetadata; +import com.google.refine.importers.ImporterUtilities; +import com.google.refine.importers.ImporterUtilities.MultiFileReadingProgress; +import com.google.refine.importing.ImportingJob; +import com.google.refine.importing.ImportingParser; +import com.google.refine.importing.ImportingUtilities; +import com.google.refine.model.Project; +import com.google.refine.util.JSONUtilities; + +abstract public class TreeImportingParserBase implements ImportingParser { + final protected boolean useInputStream; + + protected TreeImportingParserBase(boolean useInputStream) { + this.useInputStream = useInputStream; + } + + @Override + public JSONObject createParserUIInitializationData(ImportingJob job, + List fileRecords, String format) { + JSONObject options = new JSONObject(); + return options; + } + + @Override + public void parse(Project project, ProjectMetadata metadata, + ImportingJob job, List fileRecords, String format, + int limit, JSONObject options, List exceptions) { + + MultiFileReadingProgress progress = ImporterUtilities.createMultiFileReadingProgress(job, fileRecords); + ImportColumnGroup rootColumnGroup = new ImportColumnGroup(); + + for (JSONObject fileRecord : fileRecords) { + try { + parseOneFile(project, metadata, job, fileRecord, rootColumnGroup, limit, options, exceptions, progress); + } catch (IOException e) { + exceptions.add(e); + } + + if (limit > 0 && project.rows.size() >= limit) { + break; + } + } + + XmlImportUtilities.createColumnsFromImport(project, rootColumnGroup); + project.columnModel.update(); + } + + public void parseOneFile( + Project project, + ProjectMetadata metadata, + ImportingJob job, + JSONObject fileRecord, + ImportColumnGroup rootColumnGroup, + int limit, + JSONObject options, + List exceptions, + final MultiFileReadingProgress progress + ) throws IOException { + final File file = ImportingUtilities.getFile(job, fileRecord); + final String fileSource = ImportingUtilities.getFileSource(fileRecord); + + progress.startFile(fileSource); + try { + InputStream inputStream = ImporterUtilities.openAndTrackFile(fileSource, file, progress); + try { + if (useInputStream) { + parseOneFile(project, metadata, job, fileSource, inputStream, + rootColumnGroup, limit, options, exceptions); + } else { + Reader reader = ImportingUtilities.getFileReader(file, fileRecord); + parseOneFile(project, metadata, job, fileSource, reader, + rootColumnGroup, limit, options, exceptions); + } + } finally { + inputStream.close(); + } + } finally { + progress.endFile(fileSource, file.length()); + } + } + + public void parseOneFile( + Project project, + ProjectMetadata metadata, + ImportingJob job, + String fileSource, + Reader reader, + ImportColumnGroup rootColumnGroup, + int limit, + JSONObject options, + List exceptions + ) { + throw new NotImplementedException(); + } + + public void parseOneFile( + Project project, + ProjectMetadata metadata, + ImportingJob job, + String fileSource, + InputStream inputStream, + ImportColumnGroup rootColumnGroup, + int limit, + JSONObject options, + List exceptions + ) { + throw new NotImplementedException(); + } + + protected void parseOneFile( + Project project, + ProjectMetadata metadata, + ImportingJob job, + String fileSource, + TreeReader 
treeParser, + ImportColumnGroup rootColumnGroup, + int limit, + JSONObject options, + List exceptions + ) { + String[] recordPath = JSONUtilities.getStringArray(options, "recordPath"); + + XmlImportUtilities.importTreeData(treeParser, project, recordPath, rootColumnGroup, limit); + } +} diff --git a/main/src/com/google/refine/importers/parsers/TreeParser.java b/main/src/com/google/refine/importers/tree/TreeReader.java similarity index 73% rename from main/src/com/google/refine/importers/parsers/TreeParser.java rename to main/src/com/google/refine/importers/tree/TreeReader.java index 628f04985..732f89558 100644 --- a/main/src/com/google/refine/importers/parsers/TreeParser.java +++ b/main/src/com/google/refine/importers/tree/TreeReader.java @@ -31,17 +31,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -package com.google.refine.importers.parsers; +package com.google.refine.importers.tree; -import javax.servlet.ServletException; +public interface TreeReader { + public enum Token { + Ignorable, + StartEntity, + EndEntity, + Value + //append additional tokens only if necessary (most should be just mapped to Value or Ignorable) + } -public interface TreeParser { - public TreeParserToken next() throws ServletException; - public TreeParserToken getEventType() throws ServletException; //aka getCurrentToken - public boolean hasNext() throws ServletException; - public String getLocalName() throws ServletException; //aka getFieldName + public Token current() throws Exception; //aka getCurrentToken + + public boolean hasNext() throws Exception; + public Token next() throws Exception; + + public String getFieldName() throws Exception; //aka getFieldName public String getPrefix(); - public String getText() throws ServletException; + public String getFieldValue() throws Exception; + public int getAttributeCount(); public String getAttributeValue(int index); public String getAttributePrefix(int index); diff --git a/main/src/com/google/refine/importers/XmlImportUtilities.java b/main/src/com/google/refine/importers/tree/XmlImportUtilities.java similarity index 77% rename from main/src/com/google/refine/importers/XmlImportUtilities.java rename to main/src/com/google/refine/importers/tree/XmlImportUtilities.java index 33f24b15f..9832cabc4 100644 --- a/main/src/com/google/refine/importers/XmlImportUtilities.java +++ b/main/src/com/google/refine/importers/tree/XmlImportUtilities.java @@ -31,7 +31,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -package com.google.refine.importers; +package com.google.refine.importers.tree; import java.util.ArrayList; import java.util.HashMap; @@ -40,13 +40,10 @@ import java.util.List; import java.util.Map; import java.util.Map.Entry; -import javax.servlet.ServletException; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.refine.importers.parsers.TreeParser; -import com.google.refine.importers.parsers.TreeParserToken; +import com.google.refine.importers.tree.TreeReader.Token; import com.google.refine.model.Cell; import com.google.refine.model.Project; import com.google.refine.model.Row; @@ -54,11 +51,11 @@ import com.google.refine.model.Row; public class XmlImportUtilities extends TreeImportUtilities { final static Logger logger = LoggerFactory.getLogger("XmlImportUtilities"); - static public String[] detectPathFromTag(TreeParser parser, String tag) { + static public String[] detectPathFromTag(TreeReader parser, String tag) { try { while (parser.hasNext()) { - TreeParserToken eventType = parser.next(); - if (eventType == TreeParserToken.StartEntity) {//XMLStreamConstants.START_ELEMENT) { + Token eventType = parser.next(); + if (eventType == Token.StartEntity) {//XMLStreamConstants.START_ELEMENT) { List path = detectRecordElement(parser, tag); if (path != null) { String[] path2 = new String[path.size()]; @@ -90,14 +87,14 @@ public class XmlImportUtilities extends TreeImportUtilities { * null if the the tag is not found. * @throws ServletException */ - static protected List detectRecordElement(TreeParser parser, String tag) throws ServletException { + static protected List detectRecordElement(TreeReader parser, String tag) throws Exception { try{ - if(parser.getEventType() == TreeParserToken.Ignorable)//XMLStreamConstants.START_DOCUMENT) + if(parser.current() == Token.Ignorable)//XMLStreamConstants.START_DOCUMENT) parser.next(); - String localName = parser.getLocalName(); + String localName = parser.getFieldName(); String fullName = composeName(parser.getPrefix(), localName); - if (tag.equals(parser.getLocalName()) || tag.equals(fullName)) { + if (tag.equals(parser.getFieldName()) || tag.equals(fullName)) { List path = new LinkedList(); path.add(localName); @@ -105,10 +102,10 @@ public class XmlImportUtilities extends TreeImportUtilities { } while (parser.hasNext()) { - TreeParserToken eventType = parser.next(); - if (eventType == TreeParserToken.EndEntity) {//XMLStreamConstants.END_ELEMENT) { + Token eventType = parser.next(); + if (eventType == Token.EndEntity) {//XMLStreamConstants.END_ELEMENT) { break; - } else if (eventType == TreeParserToken.StartEntity) {//XMLStreamConstants.START_ELEMENT) { + } else if (eventType == Token.StartEntity) {//XMLStreamConstants.START_ELEMENT) { List path = detectRecordElement(parser, tag); if (path != null) { path.add(0, localName); @@ -116,7 +113,7 @@ public class XmlImportUtilities extends TreeImportUtilities { } } } - }catch(ServletException e){ + } catch (Exception e) { // silent // e.printStackTrace(); } @@ -136,18 +133,18 @@ public class XmlImportUtilities extends TreeImportUtilities { * The path to the most numerous of the possible candidates. 
* null if no candidates were found (less than 6 recurrences) */ - static public String[] detectRecordElement(TreeParser parser) { + static public String[] detectRecordElement(TreeReader parser) { logger.trace("detectRecordElement(inputStream)"); List candidates = new ArrayList(); try { while (parser.hasNext()) { - TreeParserToken eventType = parser.next(); - if (eventType == TreeParserToken.StartEntity) { + Token eventType = parser.next(); + if (eventType == Token.StartEntity) { RecordElementCandidate candidate = detectRecordElement( parser, - new String[] { parser.getLocalName() }); + new String[] { parser.getFieldName() }); if (candidate != null) { candidates.add(candidate); @@ -168,8 +165,8 @@ public class XmlImportUtilities extends TreeImportUtilities { return null; } - static protected RecordElementCandidate detectRecordElement(TreeParser parser, String[] path) { - logger.trace("detectRecordElement(TreeParser, String[])"); + static protected RecordElementCandidate detectRecordElement(TreeReader parser, String[] path) { + logger.trace("detectRecordElement(TreeReader, String[])"); List descendantCandidates = new ArrayList(); Map immediateChildCandidateMap = new HashMap(); @@ -178,21 +175,21 @@ public class XmlImportUtilities extends TreeImportUtilities { try { while (parser.hasNext()) { - TreeParserToken eventType = parser.next(); - if (eventType == TreeParserToken.EndEntity ) { + Token eventType = parser.next(); + if (eventType == Token.EndEntity ) { break; - } else if (eventType == TreeParserToken.Value) { + } else if (eventType == Token.Value) { try{ - if (parser.getText().trim().length() > 0) { + if (parser.getFieldValue().trim().length() > 0) { textNodeCount++; } }catch(Exception e){ //silent } - } else if (eventType == TreeParserToken.StartEntity) { + } else if (eventType == Token.StartEntity) { childElementNodeCount++; - String tagName = parser.getLocalName(); + String tagName = parser.getFieldName(); immediateChildCandidateMap.put( tagName, @@ -261,17 +258,18 @@ public class XmlImportUtilities extends TreeImportUtilities { static public void importTreeData( - TreeParser parser, + TreeReader parser, Project project, String[] recordPath, - ImportColumnGroup rootColumnGroup + ImportColumnGroup rootColumnGroup, + int limit ) { - logger.trace("importTreeData(TreeParser, Project, String[], ImportColumnGroup)"); + logger.trace("importTreeData(TreeReader, Project, String[], ImportColumnGroup)"); try { - while (parser.hasNext()) { - TreeParserToken eventType = parser.next(); - if (eventType == TreeParserToken.StartEntity) { - findRecord(project, parser, recordPath, 0, rootColumnGroup); + while (parser.hasNext() && (limit <= 0 || project.rows.size() < limit)) { + Token eventType = parser.next(); + if (eventType == Token.StartEntity) { + findRecord(project, parser, recordPath, 0, rootColumnGroup, limit); } } } catch (Exception e) { @@ -292,26 +290,30 @@ public class XmlImportUtilities extends TreeImportUtilities { */ static protected void findRecord( Project project, - TreeParser parser, + TreeReader parser, String[] recordPath, int pathIndex, - ImportColumnGroup rootColumnGroup - ) throws ServletException { - logger.trace("findRecord(Project, TreeParser, String[], int, ImportColumnGroup"); + ImportColumnGroup rootColumnGroup, + int limit + ) throws Exception { + logger.trace("findRecord(Project, TreeReader, String[], int, ImportColumnGroup"); - if(parser.getEventType() == TreeParserToken.Ignorable){//XMLStreamConstants.START_DOCUMENT){ + if(parser.current() == 
Token.Ignorable){//XMLStreamConstants.START_DOCUMENT){ logger.warn("Cannot use findRecord method for START_DOCUMENT event"); return; } - String tagName = parser.getLocalName(); - if (tagName.equals(recordPath[pathIndex])) { + String recordPathSegment = recordPath[pathIndex]; + + String localName = parser.getFieldName(); + String fullName = composeName(parser.getPrefix(), localName); + if (recordPathSegment.equals(localName) || recordPathSegment.equals(fullName)) { if (pathIndex < recordPath.length - 1) { - while (parser.hasNext()) { - TreeParserToken eventType = parser.next(); - if (eventType == TreeParserToken.StartEntity) { - findRecord(project, parser, recordPath, pathIndex + 1, rootColumnGroup); - } else if (eventType == TreeParserToken.EndEntity ) { + while (parser.hasNext() && (limit <= 0 || project.rows.size() < limit)) { + Token eventType = parser.next(); + if (eventType == Token.StartEntity) { + findRecord(project, parser, recordPath, pathIndex + 1, rootColumnGroup, limit); + } else if (eventType == Token.EndEntity ) { break; } } @@ -323,12 +325,12 @@ public class XmlImportUtilities extends TreeImportUtilities { } } - static protected void skip(TreeParser parser) throws ServletException { + static protected void skip(TreeReader parser) throws Exception { while (parser.hasNext()) { - TreeParserToken eventType = parser.next(); - if (eventType == TreeParserToken.StartEntity) {//XMLStreamConstants.START_ELEMENT) { + Token eventType = parser.next(); + if (eventType == Token.StartEntity) {//XMLStreamConstants.START_ELEMENT) { skip(parser); - } else if (eventType == TreeParserToken.EndEntity) { //XMLStreamConstants.END_ELEMENT) { + } else if (eventType == Token.EndEntity) { //XMLStreamConstants.END_ELEMENT) { return; } } @@ -344,10 +346,10 @@ public class XmlImportUtilities extends TreeImportUtilities { */ static protected void processRecord( Project project, - TreeParser parser, + TreeReader parser, ImportColumnGroup rootColumnGroup - ) throws ServletException { - logger.trace("processRecord(Project,TreeParser,ImportColumnGroup)"); + ) throws Exception { + logger.trace("processRecord(Project,TreeReader,ImportColumnGroup)"); ImportRecord record = new ImportRecord(); processSubRecord(project, parser, rootColumnGroup, record); @@ -382,19 +384,19 @@ public class XmlImportUtilities extends TreeImportUtilities { */ static protected void processSubRecord( Project project, - TreeParser parser, + TreeReader parser, ImportColumnGroup columnGroup, ImportRecord record - ) throws ServletException { - logger.trace("processSubRecord(Project,TreeParser,ImportColumnGroup,ImportRecord)"); + ) throws Exception { + logger.trace("processSubRecord(Project,TreeReader,ImportColumnGroup,ImportRecord)"); - if(parser.getEventType() == TreeParserToken.Ignorable) + if(parser.current() == Token.Ignorable) return; ImportColumnGroup thisColumnGroup = getColumnGroup( project, columnGroup, - composeName(parser.getPrefix(), parser.getLocalName())); + composeName(parser.getPrefix(), parser.getFieldName())); thisColumnGroup.nextRowIndex = Math.max(thisColumnGroup.nextRowIndex, columnGroup.nextRowIndex); @@ -413,8 +415,8 @@ public class XmlImportUtilities extends TreeImportUtilities { } while (parser.hasNext()) { - TreeParserToken eventType = parser.next(); - if (eventType == TreeParserToken.StartEntity) { + Token eventType = parser.next(); + if (eventType == Token.StartEntity) { processSubRecord( project, parser, @@ -422,9 +424,9 @@ public class XmlImportUtilities extends TreeImportUtilities { record ); } else if 
(//eventType == XMLStreamConstants.CDATA || - eventType == TreeParserToken.Value) { //XMLStreamConstants.CHARACTERS) { - String text = parser.getText(); - String colName = parser.getLocalName(); + eventType == Token.Value) { //XMLStreamConstants.CHARACTERS) { + String text = parser.getFieldValue(); + String colName = parser.getFieldName(); if(text != null){ text = text.trim(); if (text.length() > 0) { @@ -437,7 +439,7 @@ public class XmlImportUtilities extends TreeImportUtilities { ); } } - } else if (eventType == TreeParserToken.EndEntity) { + } else if (eventType == Token.EndEntity) { break; } } @@ -451,8 +453,4 @@ public class XmlImportUtilities extends TreeImportUtilities { } thisColumnGroup.nextRowIndex = nextRowIndex; } - - - - } diff --git a/main/src/com/google/refine/importing/DefaultImportingController.java b/main/src/com/google/refine/importing/DefaultImportingController.java new file mode 100644 index 000000000..7a0f6614c --- /dev/null +++ b/main/src/com/google/refine/importing/DefaultImportingController.java @@ -0,0 +1,264 @@ +/* + +Copyright 2011, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
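XmlImportUtilities keeps the two-pass shape of the old importer: one pass over the data to find (or confirm) the repeating record element, then a second pass to turn each record into rows under an ImportColumnGroup tree. Because a stream-backed TreeReader can only be consumed once, each pass needs its own reader. The sketch below wires those calls up by hand; XmlImporter.XmlParser is the reader added earlier in this diff, and the file name and record tag are made up.

    package com.google.refine.importers.tree;

    import java.io.FileInputStream;

    import com.google.refine.importers.XmlImporter.XmlParser;
    import com.google.refine.model.Project;

    public class TwoPassImportSketch {
        public static void main(String[] args) throws Exception {
            // Pass 1: locate the repeating record element by tag name.
            String[] recordPath;
            FileInputStream detect = new FileInputStream("books.xml"); // illustrative input
            try {
                recordPath = XmlImportUtilities.detectPathFromTag(new XmlParser(detect), "book");
            } finally {
                detect.close();
            }
            if (recordPath == null) {
                return; // tag not found anywhere in the document
            }

            // Pass 2: re-open the data and build rows under a fresh column-group tree.
            Project project = new Project();
            ImportColumnGroup rootColumnGroup = new ImportColumnGroup();
            FileInputStream data = new FileInputStream("books.xml");
            try {
                // limit <= 0 means "no row limit" in importTreeData()
                XmlImportUtilities.importTreeData(new XmlParser(data), project, recordPath, rootColumnGroup, -1);
            } finally {
                data.close();
            }

            XmlImportUtilities.createColumnsFromImport(project, rootColumnGroup);
            project.columnModel.update();
        }
    }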
+ +*/ + +package com.google.refine.importing; + +import java.io.IOException; +import java.io.Writer; +import java.util.LinkedList; +import java.util.List; +import java.util.Properties; + +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.json.JSONArray; +import org.json.JSONException; +import org.json.JSONObject; +import org.json.JSONWriter; + +import com.google.refine.RefineServlet; +import com.google.refine.commands.HttpUtilities; +import com.google.refine.importing.ImportingManager.Format; +import com.google.refine.util.JSONUtilities; +import com.google.refine.util.ParsingUtilities; + +public class DefaultImportingController implements ImportingController { + + protected RefineServlet servlet; + + @Override + public void init(RefineServlet servlet) { + this.servlet = servlet; + } + + @Override + public void doGet(HttpServletRequest request, HttpServletResponse response) + throws ServletException, IOException { + // TODO Auto-generated method stub + } + + @Override + public void doPost(HttpServletRequest request, HttpServletResponse response) + throws ServletException, IOException { + + /* + * The uploaded file is in the POST body as a "file part". If + * we call request.getParameter() then the POST body will get + * read and we won't have a chance to parse the body ourselves. + * This is why we have to parse the URL for parameters ourselves. + */ + Properties parameters = ParsingUtilities.parseUrlParameters(request); + String subCommand = parameters.getProperty("subCommand"); + if ("load-raw-data".equals(subCommand)) { + doLoadRawData(request, response, parameters); + } else if ("update-file-selection".equals(subCommand)) { + doUpdateFileSelection(request, response, parameters); + } else if ("initialize-parser-ui".equals(subCommand)) { + doInitializeParserUI(request, response, parameters); + } else if ("update-format-and-options".equals(subCommand)) { + doUpdateFormatAndOptions(request, response, parameters); + } else if ("create-project".equals(subCommand)) { + doCreateProject(request, response, parameters); + } else { + HttpUtilities.respond(response, "error", "No such sub command"); + } + } + + private void doLoadRawData(HttpServletRequest request, HttpServletResponse response, Properties parameters) + throws ServletException, IOException { + + long jobID = Long.parseLong(parameters.getProperty("jobID")); + ImportingJob job = ImportingManager.getJob(jobID); + if (job == null) { + HttpUtilities.respond(response, "error", "No such import job"); + return; + } + + try { + final JSONObject config = getConfig(job); + if (!("new".equals(config.getString("state")))) { + HttpUtilities.respond(response, "error", "Job already started; cannot load more data"); + return; + } + + ImportingUtilities.loadDataAndPrepareJob( + request, response, parameters, job, config); + } catch (JSONException e) { + throw new ServletException(e); + } + } + + private void doUpdateFileSelection(HttpServletRequest request, HttpServletResponse response, Properties parameters) + throws ServletException, IOException { + + long jobID = Long.parseLong(parameters.getProperty("jobID")); + ImportingJob job = ImportingManager.getJob(jobID); + if (job == null) { + HttpUtilities.respond(response, "error", "No such import job"); + return; + } + + try { + JSONObject config = getConfig(job); + if (!("ready".equals(config.getString("state")))) { + HttpUtilities.respond(response, "error", "Job not ready"); + return; + } + + JSONArray 
fileSelectionArray = ParsingUtilities.evaluateJsonStringToArray( + request.getParameter("fileSelection")); + + ImportingUtilities.updateJobWithNewFileSelection(job, fileSelectionArray); + + replyWithJobData(request, response, job); + } catch (JSONException e) { + throw new ServletException(e); + } + } + + private void doUpdateFormatAndOptions(HttpServletRequest request, HttpServletResponse response, Properties parameters) + throws ServletException, IOException { + + long jobID = Long.parseLong(parameters.getProperty("jobID")); + ImportingJob job = ImportingManager.getJob(jobID); + if (job == null) { + HttpUtilities.respond(response, "error", "No such import job"); + return; + } + + try { + JSONObject config = getConfig(job); + if (!("ready".equals(config.getString("state")))) { + HttpUtilities.respond(response, "error", "Job not ready"); + return; + } + + String format = request.getParameter("format"); + JSONObject optionObj = ParsingUtilities.evaluateJsonStringToObject( + request.getParameter("options")); + + List exceptions = new LinkedList(); + + ImportingUtilities.previewParse(job, format, optionObj, exceptions); + + HttpUtilities.respond(response, "ok", "done"); + } catch (JSONException e) { + throw new ServletException(e); + } + } + + private void doInitializeParserUI(HttpServletRequest request, HttpServletResponse response, Properties parameters) + throws ServletException, IOException { + + long jobID = Long.parseLong(parameters.getProperty("jobID")); + ImportingJob job = ImportingManager.getJob(jobID); + if (job == null) { + HttpUtilities.respond(response, "error", "No such import job"); + return; + } + + String format = request.getParameter("format"); + Format formatRecord = ImportingManager.formatToRecord.get(format); + if (formatRecord != null && formatRecord.parser != null) { + JSONObject options = formatRecord.parser.createParserUIInitializationData( + job, ImportingUtilities.getSelectedFileRecords(job), format); + JSONObject result = new JSONObject(); + JSONUtilities.safePut(result, "status", "ok"); + JSONUtilities.safePut(result, "options", options); + + HttpUtilities.respond(response, result.toString()); + } else { + HttpUtilities.respond(response, "error", "Unrecognized format or format has no parser"); + } + } + + private void doCreateProject(HttpServletRequest request, HttpServletResponse response, Properties parameters) + throws ServletException, IOException { + + long jobID = Long.parseLong(parameters.getProperty("jobID")); + ImportingJob job = ImportingManager.getJob(jobID); + if (job == null) { + HttpUtilities.respond(response, "error", "No such import job"); + return; + } + + try { + JSONObject config = getConfig(job); + if (!("ready".equals(config.getString("state")))) { + HttpUtilities.respond(response, "error", "Job not ready"); + return; + } + + String format = request.getParameter("format"); + JSONObject optionObj = ParsingUtilities.evaluateJsonStringToObject( + request.getParameter("options")); + + List exceptions = new LinkedList(); + + ImportingUtilities.createProject(job, format, optionObj, exceptions); + + HttpUtilities.respond(response, "ok", "done"); + } catch (JSONException e) { + throw new ServletException(e); + } + } + + private JSONObject getConfig(ImportingJob job) { + if (job.config == null) { + job.config = new JSONObject(); + JSONUtilities.safePut(job.config, "state", "new"); + JSONUtilities.safePut(job.config, "hasData", false); + } + return job.config; + } + + private void replyWithJobData(HttpServletRequest request, HttpServletResponse 
response, ImportingJob job) + throws ServletException, IOException { + + Writer w = response.getWriter(); + JSONWriter writer = new JSONWriter(w); + try { + writer.object(); + writer.key("code"); writer.value("ok"); + writer.key("job"); job.write(writer, new Properties()); + writer.endObject(); + } catch (JSONException e) { + throw new ServletException(e); + } finally { + w.flush(); + w.close(); + } + } +} diff --git a/main/src/com/google/refine/importers/parsers/TreeParserToken.java b/main/src/com/google/refine/importing/FormatGuesser.java similarity index 83% rename from main/src/com/google/refine/importers/parsers/TreeParserToken.java rename to main/src/com/google/refine/importing/FormatGuesser.java index e71afd30a..52d1dd8ba 100644 --- a/main/src/com/google/refine/importers/parsers/TreeParserToken.java +++ b/main/src/com/google/refine/importing/FormatGuesser.java @@ -1,43 +1,40 @@ -/* - -Copyright 2010, Google Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -package com.google.refine.importers.parsers; - - -public enum TreeParserToken { - Ignorable, - StartEntity, - EndEntity, - Value - //append additional tokens only if necessary (most should be just mapped to Value or Ignorable) -} +/* + +Copyright 2011, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +package com.google.refine.importing; + +import java.io.File; + +public interface FormatGuesser { + public String guess(File file, String encoding, String seedFormat); +} diff --git a/main/src/com/google/refine/importers/Importer.java b/main/src/com/google/refine/importing/ImportingController.java similarity index 80% rename from main/src/com/google/refine/importers/Importer.java rename to main/src/com/google/refine/importing/ImportingController.java index 3e5001582..68e2817c9 100644 --- a/main/src/com/google/refine/importers/Importer.java +++ b/main/src/com/google/refine/importing/ImportingController.java @@ -1,6 +1,6 @@ /* -Copyright 2010, Google Inc. +Copyright 2011, Google Inc. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -31,17 +31,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -package com.google.refine.importers; +package com.google.refine.importing; +import com.google.refine.HttpResponder; -public interface Importer { - - /** - * Determine whether importer can handle given contentType and filename. - * - * @param contentType - * @param fileName - * @return true if the importer can handle this - */ - public boolean canImportData(String contentType, String fileName); +public interface ImportingController extends HttpResponder { } diff --git a/main/src/com/google/refine/importing/ImportingJob.java b/main/src/com/google/refine/importing/ImportingJob.java new file mode 100644 index 000000000..8ab30c57a --- /dev/null +++ b/main/src/com/google/refine/importing/ImportingJob.java @@ -0,0 +1,106 @@ +/* + +Copyright 2011, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
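FormatGuesser is the hook the importing pipeline can use to sharpen a coarse format guess by peeking at the file itself; guessers are registered per seed format through ImportingManager.registerFormatGuesser further down. The sketch below is one possible implementation, not something this change prescribes: the choice of peeking at the first non-whitespace character and the returned format ids are assumptions.

    package com.google.refine.importing;

    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;

    /**
     * Sketch of a FormatGuesser: if a file guessed as generic "text"
     * starts with '<', refine the guess to "text/xml"; otherwise keep the seed format.
     */
    public class TextFormatGuesserSketch implements FormatGuesser {
        @Override
        public String guess(File file, String encoding, String seedFormat) {
            try {
                BufferedReader reader = new BufferedReader(new InputStreamReader(
                    new FileInputStream(file), encoding != null ? encoding : "UTF-8"));
                try {
                    int c;
                    while ((c = reader.read()) != -1) {
                        if (!Character.isWhitespace(c)) {
                            return c == '<' ? "text/xml" : seedFormat;
                        }
                    }
                } finally {
                    reader.close();
                }
            } catch (IOException e) {
                // fall through and keep the seed format
            }
            return seedFormat;
        }
    }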
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +package com.google.refine.importing; + +import java.io.File; +import java.io.IOException; +import java.util.Properties; + +import org.apache.commons.io.FileUtils; +import org.json.JSONException; +import org.json.JSONObject; +import org.json.JSONWriter; + +import com.google.refine.Jsonizable; +import com.google.refine.ProjectMetadata; +import com.google.refine.model.Project; + + +public class ImportingJob implements Jsonizable { + final public long id; + final public File dir; // Temporary directory where the data about this job is stored + + public long lastTouched; + public JSONObject config = null; + + public Project project; + public ProjectMetadata metadata; + public boolean canceled; + + public ImportingJob(long id, File dir) { + this.id = id; + this.dir = dir; + + dir.mkdirs(); + } + + public void touch() { + lastTouched = System.currentTimeMillis(); + } + + public void prepareNewProject() { + if (project != null) { + project.dispose(); + } + project = new Project(); + metadata = new ProjectMetadata(); + } + + public void dispose() { + if (project != null) { + project.dispose(); + project = null; + } + metadata = null; + + try { + FileUtils.deleteDirectory(dir); + } catch (IOException e) { + } + } + + public File getRawDataDir() { + File dir2 = new File(dir, "raw-data"); + dir2.mkdirs(); + return dir2; + } + + @Override + public void write(JSONWriter writer, Properties options) + throws JSONException { + writer.object(); + writer.key("config"); writer.value(config); + writer.endObject(); + } +} diff --git a/main/src/com/google/refine/importing/ImportingManager.java b/main/src/com/google/refine/importing/ImportingManager.java new file mode 100644 index 000000000..335261521 --- /dev/null +++ b/main/src/com/google/refine/importing/ImportingManager.java @@ -0,0 +1,257 @@ +/* + +Copyright 2011, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+package com.google.refine.importing;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+
+import org.apache.commons.io.FileUtils;
+import org.json.JSONException;
+import org.json.JSONWriter;
+
+import com.google.refine.RefineServlet;
+
+import edu.mit.simile.butterfly.ButterflyModule;
+
+public class ImportingManager {
+    static public class Format {
+        final public String id;
+        final public String label;
+        final public boolean download;
+        final public String uiClass;
+        final public ImportingParser parser;
+
+        private Format(
+            String id,
+            String label,
+            boolean download,
+            String uiClass,
+            ImportingParser parser
+        ) {
+            this.id = id;
+            this.label = label;
+            this.download = download;
+            this.uiClass = uiClass;
+            this.parser = parser;
+        }
+    }
+
+    static private RefineServlet servlet;
+    static private File importDir;
+    final static private Map<Long, ImportingJob> jobs = new HashMap<Long, ImportingJob>();
+
+    // Mapping from format to label, e.g., "text" to "Text files", "text/xml" to "XML files"
+    final static public Map<String, Format> formatToRecord = new HashMap<String, Format>();
+
+    // Mapping from format to guessers
+    final static public Map<String, List<FormatGuesser>> formatToGuessers = new HashMap<String, List<FormatGuesser>>();
+
+    // Mapping from file extension to format, e.g., ".xml" to "text/xml"
+    final static public Map<String, String> extensionToFormat = new HashMap<String, String>();
+
+    // Mapping from mime type to format, e.g., "application/json" to "text/json"
+    final static public Map<String, String> mimeTypeToFormat = new HashMap<String, String>();
+
+    // URL rewriters
+    final static public Set<UrlRewriter> urlRewriters = new HashSet<UrlRewriter>();
+
+    // Mapping from controller name to controller
+    final static public Map<String, ImportingController> controllers = new HashMap<String, ImportingController>();
+
+    static public void initialize(RefineServlet servlet) {
+        ImportingManager.servlet = servlet;
+    }
+
+    static public void registerFormat(String format, String label) {
+        registerFormat(format, label, null, null);
+    }
+
+    static public void registerFormat(String format, String label, String uiClass, ImportingParser parser) {
+        formatToRecord.put(format, new Format(format, label, true, uiClass, parser));
+    }
+
+    static public void registerFormat(
+        String format, String label, boolean download, String uiClass, ImportingParser parser) {
+        formatToRecord.put(format, new Format(format, label, download, uiClass, parser));
+    }
+
+    static public void registerFormatGuesser(String format, FormatGuesser guesser) {
+        List<FormatGuesser> guessers = formatToGuessers.get(format);
+        if (guessers == null) {
+            guessers = new LinkedList<FormatGuesser>();
+            formatToGuessers.put(format, guessers);
+        }
+        guessers.add(0, guesser); // prepend so that newer guessers take priority
+    }
+
+    static public void registerExtension(String extension, String format) {
+        extensionToFormat.put(extension.startsWith(".") ? extension : ("."
+ extension), format); + } + + static public void registerMimeType(String mimeType, String format) { + mimeTypeToFormat.put(mimeType, format); + } + + static public void registerUrlRewriter(UrlRewriter urlRewriter) { + urlRewriters.add(urlRewriter); + } + + static public void registerController(ButterflyModule module, String name, ImportingController controller) { + String key = module.getName() + "/" + name; + controllers.put(key, controller); + + controller.init(servlet); + } + + static public File getImportDir() { + if (importDir == null) { + File tempDir = servlet.getTempDir(); + importDir = tempDir == null ? new File(".import-temp") : new File(tempDir, "import"); + + if (importDir.exists()) { + try { + // start fresh + FileUtils.deleteDirectory(importDir); + } catch (IOException e) { + } + } + importDir.mkdirs(); + } + return importDir; + } + + static public ImportingJob createJob() { + long id = System.currentTimeMillis() + (long) (Math.random() * 1000000); + File jobDir = new File(getImportDir(), Long.toString(id)); + + ImportingJob job = new ImportingJob(id, jobDir); + jobs.put(id, job); + + return job; + } + + static public ImportingJob getJob(long id) { + return jobs.get(id); + } + + static public void disposeJob(long id) { + ImportingJob job = getJob(id); + if (job != null) { + job.dispose(); + jobs.remove(id); + } + } + + static public void writeConfiguration(JSONWriter writer, Properties options) throws JSONException { + writer.object(); + + writer.key("formats"); + writer.object(); + for (String format : formatToRecord.keySet()) { + Format record = formatToRecord.get(format); + + writer.key(format); + writer.object(); + writer.key("id"); writer.value(record.id); + writer.key("label"); writer.value(record.label); + writer.key("download"); writer.value(record.download); + writer.key("uiClass"); writer.value(record.uiClass); + writer.endObject(); + } + writer.endObject(); + + writer.key("mimeTypeToFormat"); + writer.object(); + for (String mimeType : mimeTypeToFormat.keySet()) { + writer.key(mimeType); + writer.value(mimeTypeToFormat.get(mimeType)); + } + writer.endObject(); + + writer.key("extensionToFormat"); + writer.object(); + for (String extension : extensionToFormat.keySet()) { + writer.key(extension); + writer.value(extensionToFormat.get(extension)); + } + writer.endObject(); + + writer.endObject(); + } + + static public String getFormatFromFileName(String fileName) { + int start = 0; + while (true) { + int dot = fileName.indexOf('.', start); + if (dot < 0) { + break; + } + + String extension = fileName.substring(dot); + String format = extensionToFormat.get(extension); + if (format != null) { + return format; + } else { + start = dot + 1; + } + } + return null; + } + + static public String getFormatFromMimeType(String mimeType) { + return mimeTypeToFormat.get(mimeType); + } + + static public String getFormat(String fileName, String mimeType) { + String fileNameFormat = getFormatFromFileName(fileName); + String mimeTypeFormat = mimeType == null ? 
null : getFormatFromMimeType(mimeType);
+        if (mimeTypeFormat == null) {
+            return fileNameFormat;
+        } else if (fileNameFormat == null) {
+            return mimeTypeFormat;
+        } else if (fileNameFormat.startsWith(mimeTypeFormat)) {
+            // file name-based format is more specific
+            return fileNameFormat;
+        } else {
+            return mimeTypeFormat;
+        }
+    }
+}
diff --git a/main/src/com/google/refine/importers/ReaderImporter.java b/main/src/com/google/refine/importing/ImportingParser.java
similarity index 57%
rename from main/src/com/google/refine/importers/ReaderImporter.java
rename to main/src/com/google/refine/importing/ImportingParser.java
index fdcc0d1b1..79eae4633 100644
--- a/main/src/com/google/refine/importers/ReaderImporter.java
+++ b/main/src/com/google/refine/importing/ImportingParser.java
@@ -1,6 +1,6 @@
 /*
 
-Copyright 2010, Google Inc.
+Copyright 2011, Google Inc.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -31,33 +31,51 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 */
 
-package com.google.refine.importers;
+package com.google.refine.importing;
 
-import java.io.Reader;
-import java.util.Properties;
+import java.util.List;
+
+import org.json.JSONObject;
 
 import com.google.refine.ProjectMetadata;
 import com.google.refine.model.Project;
 
-/**
- * Interface for importers which take a Reader as input.
- */
-public interface ReaderImporter extends Importer {
-    
+public interface ImportingParser {
     /**
-     * Read data from a input reader into project.
+     * Create data sufficient for the parser UI on the client side to do its work.
+     * For example, an XML parser UI would need to know some sample elements so it
+     * can let the user pick the path to the record elements.
      * 
-     * @param reader
-     *            reader to import data from. It is assumed to be positioned at
-     *            the correct point and ready to go.
-     * @param project
-     *            project which will contain data
-     * @param metadata
-     *            metadata of new project
-     * @param options
-     *            set of properties with import options
-     * @throws ImportException
+     * @param job
+     * @param fileRecords
+     * @param format
+     * @return JSONObject options
      */
-    public void read(Reader reader, Project project, ProjectMetadata metadata, Properties options)
-        throws ImportException;
+    public JSONObject createParserUIInitializationData(
+        ImportingJob job,
+        List<JSONObject> fileRecords,
+        String format
+    );
+    
+    /**
+     * 
+     * @param project
+     * @param metadata
+     * @param fileRecords
+     * @param format
+     * @param limit maximum number of rows to create
+     * @param options custom options put together by the UI corresponding to this parser,
+     *            which the parser should understand
+     * @param exceptions
+     */
+    public void parse(
+        Project project,
+        ProjectMetadata metadata,
+        ImportingJob job,
+        List<JSONObject> fileRecords,
+        String format,
+        int limit,
+        JSONObject options,
+        List<Exception> exceptions
+    );
 }
diff --git a/main/src/com/google/refine/importing/ImportingUtilities.java b/main/src/com/google/refine/importing/ImportingUtilities.java
new file mode 100644
index 000000000..e657f3416
--- /dev/null
+++ b/main/src/com/google/refine/importing/ImportingUtilities.java
@@ -0,0 +1,895 @@
+/*
+
+Copyright 2011, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +package com.google.refine.importing; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.io.UnsupportedEncodingException; +import java.net.URL; +import java.net.URLConnection; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.zip.GZIPInputStream; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; + +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.apache.commons.fileupload.FileItem; +import org.apache.commons.fileupload.FileUploadException; +import org.apache.commons.fileupload.ProgressListener; +import org.apache.commons.fileupload.disk.DiskFileItemFactory; +import org.apache.commons.fileupload.servlet.ServletFileUpload; +import org.apache.commons.fileupload.util.Streams; +import org.apache.commons.io.FileCleaningTracker; +import org.apache.tools.bzip2.CBZip2InputStream; +import org.apache.tools.tar.TarEntry; +import org.apache.tools.tar.TarInputStream; +import org.json.JSONArray; +import org.json.JSONObject; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.refine.ProjectManager; +import com.google.refine.ProjectMetadata; +import com.google.refine.importing.ImportingManager.Format; +import com.google.refine.importing.UrlRewriter.Result; +import com.google.refine.model.Project; +import com.google.refine.util.JSONUtilities; +import com.ibm.icu.text.NumberFormat; + +public class ImportingUtilities { + final static protected Logger logger = LoggerFactory.getLogger("importing-utilities"); + + static public interface Progress { + public void setProgress(String message, int percent); + public boolean isCanceled(); + } + + static public void loadDataAndPrepareJob( + HttpServletRequest request, + HttpServletResponse response, + Properties parameters, + final ImportingJob job, + JSONObject config) throws IOException, ServletException { + + JSONObject retrievalRecord = new JSONObject(); + JSONUtilities.safePut(config, "retrievalRecord", 
retrievalRecord); + JSONUtilities.safePut(config, "state", "loading-raw-data"); + + final JSONObject progress = new JSONObject(); + JSONUtilities.safePut(config, "progress", progress); + try { + ImportingUtilities.retrieveContentFromPostRequest( + request, + parameters, + job.getRawDataDir(), + retrievalRecord, + new Progress() { + @Override + public void setProgress(String message, int percent) { + if (message != null) { + JSONUtilities.safePut(progress, "message", message); + } + JSONUtilities.safePut(progress, "percent", percent); + } + public boolean isCanceled() { + return job.canceled; + } + } + ); + } catch (FileUploadException e) { + JSONUtilities.safePut(config, "state", "error"); + JSONUtilities.safePut(config, "error", "Error uploading data"); + + throw new ServletException(e); + } + + JSONArray fileSelectionIndexes = new JSONArray(); + JSONUtilities.safePut(config, "fileSelection", fileSelectionIndexes); + + String bestFormat = ImportingUtilities.autoSelectFiles(job, retrievalRecord, fileSelectionIndexes); + bestFormat = ImportingUtilities.guessBetterFormat(job, bestFormat); + + JSONArray rankedFormats = new JSONArray(); + JSONUtilities.safePut(config, "rankedFormats", rankedFormats); + ImportingUtilities.rankFormats(job, bestFormat, rankedFormats); + + JSONUtilities.safePut(config, "state", "ready"); + JSONUtilities.safePut(config, "hasData", true); + config.remove("progress"); + } + + static public void updateJobWithNewFileSelection(ImportingJob job, JSONArray fileSelectionArray) { + JSONUtilities.safePut(job.config, "fileSelection", fileSelectionArray); + + String bestFormat = ImportingUtilities.getCommonFormatForSelectedFiles(job, fileSelectionArray); + bestFormat = ImportingUtilities.guessBetterFormat(job, bestFormat); + + JSONArray rankedFormats = new JSONArray(); + JSONUtilities.safePut(job.config, "rankedFormats", rankedFormats); + ImportingUtilities.rankFormats(job, bestFormat, rankedFormats); + } + + static public void retrieveContentFromPostRequest( + HttpServletRequest request, + Properties parameters, + File rawDataDir, + JSONObject retrievalRecord, + final Progress progress + ) throws FileUploadException, IOException { + JSONArray fileRecords = new JSONArray(); + JSONUtilities.safePut(retrievalRecord, "files", fileRecords); + + int clipboardCount = 0; + int uploadCount = 0; + int downloadCount = 0; + int archiveCount = 0; + + // This tracks the total progress, which involves uploading data from the client + // as well as downloading data from URLs. + final SavingUpdate update = new SavingUpdate() { + @Override + public void savedMore() { + progress.setProgress(null, calculateProgressPercent(totalExpectedSize, totalRetrievedSize)); + } + @Override + public boolean isCanceled() { + return progress.isCanceled(); + } + }; + + DiskFileItemFactory fileItemFactory = new DiskFileItemFactory(); + fileItemFactory.setFileCleaningTracker(new FileCleaningTracker()); + + ServletFileUpload upload = new ServletFileUpload(fileItemFactory); + upload.setProgressListener(new ProgressListener() { + boolean setContentLength = false; + long lastBytesRead = 0; + + @Override + public void update(long bytesRead, long contentLength, int itemCount) { + if (!setContentLength) { + // Only try to set the content length if we really know it. 
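+                    // commons-fileupload passes -1 for contentLength when the total request size is unknown,
+                    // so the expected size is only added to the progress total once a real value is seen.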
+ if (contentLength >= 0) { + update.totalExpectedSize += contentLength; + setContentLength = true; + } + } + if (setContentLength) { + update.totalRetrievedSize += (bytesRead - lastBytesRead); + lastBytesRead = bytesRead; + + update.savedMore(); + } + } + }); + + progress.setProgress("Uploading data ...", -1); + for (Object obj : upload.parseRequest(request)) { + if (progress.isCanceled()) { + break; + } + + FileItem fileItem = (FileItem) obj; + InputStream stream = fileItem.getInputStream(); + + String name = fileItem.getFieldName().toLowerCase(); + if (fileItem.isFormField()) { + if (name.equals("clipboard")) { + File file = allocateFile(rawDataDir, "clipboard.txt"); + + JSONObject fileRecord = new JSONObject(); + JSONUtilities.safePut(fileRecord, "origin", "clipboard"); + JSONUtilities.safePut(fileRecord, "declaredEncoding", request.getCharacterEncoding()); + JSONUtilities.safePut(fileRecord, "declaredMimeType", (String) null); + JSONUtilities.safePut(fileRecord, "format", "text"); + JSONUtilities.safePut(fileRecord, "fileName", "(clipboard)"); + JSONUtilities.safePut(fileRecord, "location", getRelativePath(file, rawDataDir)); + + progress.setProgress("Uploading pasted clipboard text", + calculateProgressPercent(update.totalExpectedSize, update.totalRetrievedSize)); + + JSONUtilities.safePut(fileRecord, "size", saveStreamToFile(stream, file, null)); + + clipboardCount++; + + JSONUtilities.append(fileRecords, fileRecord); + } else if (name.equals("download")) { + String urlString = Streams.asString(stream); + URL url = new URL(urlString); + + JSONObject fileRecord = new JSONObject(); + JSONUtilities.safePut(fileRecord, "origin", "download"); + JSONUtilities.safePut(fileRecord, "url", urlString); + + for (UrlRewriter rewriter : ImportingManager.urlRewriters) { + Result result = rewriter.rewrite(urlString); + if (result != null) { + urlString = result.rewrittenUrl; + url = new URL(urlString); + + JSONUtilities.safePut(fileRecord, "url", urlString); + JSONUtilities.safePut(fileRecord, "format", result.format); + if (!result.download) { + downloadCount++; + JSONUtilities.append(fileRecords, fileRecord); + continue; + } + } + } + + URLConnection urlConnection = url.openConnection(); + InputStream stream2 = urlConnection.getInputStream(); + try { + String fileName = url.getFile(); + File file = allocateFile(rawDataDir, fileName); + + int contentLength = urlConnection.getContentLength(); + if (contentLength >= 0) { + update.totalExpectedSize += contentLength; + } + + JSONUtilities.safePut(fileRecord, "declaredEncoding", urlConnection.getContentEncoding()); + JSONUtilities.safePut(fileRecord, "declaredMimeType", urlConnection.getContentType()); + JSONUtilities.safePut(fileRecord, "fileName", fileName); + JSONUtilities.safePut(fileRecord, "location", getRelativePath(file, rawDataDir)); + + progress.setProgress("Downloading " + urlString, + calculateProgressPercent(update.totalExpectedSize, update.totalRetrievedSize)); + + long actualLength = saveStreamToFile(stream, file, update); + JSONUtilities.safePut(fileRecord, "size", actualLength); + if (contentLength >= 0) { + update.totalExpectedSize += (actualLength - contentLength); + } else { + update.totalExpectedSize += actualLength; + } + progress.setProgress("Saving " + urlString + " locally", + calculateProgressPercent(update.totalExpectedSize, update.totalRetrievedSize)); + + if (postProcessRetrievedFile(file, fileRecord, fileRecords, progress)) { + archiveCount++; + } + + downloadCount++; + } finally { + stream2.close(); + } + } + + } else { 
// is file content + String fileName = fileItem.getName(); + if (fileName.length() > 0) { + long fileSize = fileItem.getSize(); + + File file = allocateFile(rawDataDir, fileName); + + JSONObject fileRecord = new JSONObject(); + JSONUtilities.safePut(fileRecord, "origin", "upload"); + JSONUtilities.safePut(fileRecord, "declaredEncoding", request.getCharacterEncoding()); + JSONUtilities.safePut(fileRecord, "declaredMimeType", fileItem.getContentType()); + JSONUtilities.safePut(fileRecord, "fileName", fileName); + JSONUtilities.safePut(fileRecord, "location", getRelativePath(file, rawDataDir)); + + progress.setProgress( + "Saving file " + fileName + " locally (" + formatBytes(fileSize) + " bytes)", + calculateProgressPercent(update.totalExpectedSize, update.totalRetrievedSize)); + + JSONUtilities.safePut(fileRecord, "size", saveStreamToFile(stream, file, null)); + if (postProcessRetrievedFile(file, fileRecord, fileRecords, progress)) { + archiveCount++; + } + + uploadCount++; + } + } + } + + JSONUtilities.safePut(retrievalRecord, "uploadCount", uploadCount); + JSONUtilities.safePut(retrievalRecord, "downloadCount", downloadCount); + JSONUtilities.safePut(retrievalRecord, "clipboardCount", clipboardCount); + JSONUtilities.safePut(retrievalRecord, "archiveCount", archiveCount); + } + + static public String getRelativePath(File file, File dir) { + String location = file.getAbsolutePath().substring(dir.getAbsolutePath().length()); + return (location.startsWith(File.separator)) ? location.substring(1) : location; + } + + static public File allocateFile(File dir, String name) { + File file = new File(dir, name); + + int dot = name.indexOf('.'); + String prefix = dot < 0 ? name : name.substring(0, dot); + String suffix = dot < 0 ? "" : name.substring(dot); + int index = 2; + while (file.exists()) { + file = new File(dir, prefix + "-" + index++ + suffix); + } + + file.getParentFile().mkdirs(); + + return file; + } + + static public Reader getFileReader(ImportingJob job, JSONObject fileRecord) + throws FileNotFoundException { + + return getFileReader(getFile(job, JSONUtilities.getString(fileRecord, "location", "")), fileRecord); + } + + static public Reader getFileReader(File file, JSONObject fileRecord) throws FileNotFoundException { + return getReaderFromStream(new FileInputStream(file), fileRecord); + } + + static public Reader getReaderFromStream(InputStream inputStream, JSONObject fileRecord) { + String encoding = getEncoding(fileRecord); + if (encoding != null) { + try { + return new InputStreamReader(inputStream, encoding); + } catch (UnsupportedEncodingException e) { + // Ignore and fall through + } + } + return new InputStreamReader(inputStream); + } + + static public File getFile(ImportingJob job, JSONObject fileRecord) { + return getFile(job, JSONUtilities.getString(fileRecord, "location", "")); + } + + static public File getFile(ImportingJob job, String location) { + return new File(job.getRawDataDir(), location); + } + + static public String getFileSource(JSONObject fileRecord) { + return JSONUtilities.getString( + fileRecord, + "url", + JSONUtilities.getString(fileRecord, "fileName", "unknown") + ); + } + + static private abstract class SavingUpdate { + public long totalExpectedSize = 0; + public long totalRetrievedSize = 0; + + abstract public void savedMore(); + abstract public boolean isCanceled(); + } + static public long saveStreamToFile(InputStream stream, File file, SavingUpdate update) throws IOException { + long length = 0; + FileOutputStream fos = new FileOutputStream(file); + 
try { + byte[] bytes = new byte[4096]; + int c; + while ((update == null || !update.isCanceled()) && (c = stream.read(bytes)) > 0) { + fos.write(bytes, 0, c); + length += c; + + if (update != null) { + update.totalRetrievedSize += c; + update.savedMore(); + } + } + return length; + } finally { + fos.close(); + } + } + + static public boolean postProcessRetrievedFile( + File file, JSONObject fileRecord, JSONArray fileRecords, final Progress progress) { + + String mimeType = JSONUtilities.getString(fileRecord, "declaredMimeType", null); + File rawDataDir = file.getParentFile(); + + InputStream archiveIS = tryOpenAsArchive(file, mimeType); + if (archiveIS != null) { + try { + if (explodeArchive(rawDataDir, archiveIS, fileRecord, fileRecords, progress)) { + file.delete(); + return true; + } + } finally { + try { + archiveIS.close(); + } catch (IOException e) { + // TODO: what to do? + } + } + } + + InputStream uncompressedIS = tryOpenAsCompressedFile(file, mimeType); + if (uncompressedIS != null) { + try { + File file2 = uncompressFile(rawDataDir, uncompressedIS, fileRecord, progress); + + file.delete(); + file = file2; + } catch (IOException e) { + // TODO: what to do? + e.printStackTrace(); + } finally { + try { + archiveIS.close(); + } catch (IOException e) { + // TODO: what to do? + } + } + } + + postProcessSingleRetrievedFile(file, fileRecord); + JSONUtilities.append(fileRecords, fileRecord); + + return false; + } + + static public void postProcessSingleRetrievedFile(File file, JSONObject fileRecord) { + if (!fileRecord.has("format")) { + JSONUtilities.safePut(fileRecord, "format", + ImportingManager.getFormat( + file.getName(), + JSONUtilities.getString(fileRecord, "declaredMimeType", null))); + } + } + + static public InputStream tryOpenAsArchive(File file, String mimeType) { + String fileName = file.getName(); + try { + if (fileName.endsWith(".tar.gz") || fileName.endsWith(".tgz")) { + return new TarInputStream(new GZIPInputStream(new FileInputStream(file))); + } else if (fileName.endsWith(".tar.bz2")) { + return new TarInputStream(new CBZip2InputStream(new FileInputStream(file))); + } else if (fileName.endsWith(".tar")) { + return new TarInputStream(new FileInputStream(file)); + } else if (fileName.endsWith(".zip")) { + return new ZipInputStream(new FileInputStream(file)); + } + } catch (IOException e) { + } + return null; + } + + static public boolean explodeArchive( + File rawDataDir, + InputStream archiveIS, + JSONObject archiveFileRecord, + JSONArray fileRecords, + final Progress progress + ) { + if (archiveIS instanceof TarInputStream) { + TarInputStream tis = (TarInputStream) archiveIS; + try { + TarEntry te; + while (!progress.isCanceled() && (te = tis.getNextEntry()) != null) { + if (!te.isDirectory()) { + String fileName2 = te.getName(); + File file2 = allocateFile(rawDataDir, fileName2); + + progress.setProgress("Extracting " + fileName2, -1); + + JSONObject fileRecord2 = new JSONObject(); + JSONUtilities.safePut(fileRecord2, "origin", JSONUtilities.getString(archiveFileRecord, "origin", null)); + JSONUtilities.safePut(fileRecord2, "declaredEncoding", (String) null); + JSONUtilities.safePut(fileRecord2, "declaredMimeType", (String) null); + JSONUtilities.safePut(fileRecord2, "fileName", fileName2); + JSONUtilities.safePut(fileRecord2, "archiveFileName", JSONUtilities.getString(archiveFileRecord, "fileName", null)); + JSONUtilities.safePut(fileRecord2, "location", getRelativePath(file2, rawDataDir)); + + JSONUtilities.safePut(fileRecord2, "size", saveStreamToFile(tis, file2, 
null)); + postProcessSingleRetrievedFile(file2, fileRecord2); + + JSONUtilities.append(fileRecords, fileRecord2); + } + } + } catch (IOException e) { + // TODO: what to do? + e.printStackTrace(); + } + return true; + } else if (archiveIS instanceof ZipInputStream) { + ZipInputStream zis = (ZipInputStream) archiveIS; + try { + ZipEntry ze; + while (!progress.isCanceled() && (ze = zis.getNextEntry()) != null) { + if (!ze.isDirectory()) { + String fileName2 = ze.getName(); + File file2 = allocateFile(rawDataDir, fileName2); + + progress.setProgress("Extracting " + fileName2, -1); + + JSONObject fileRecord2 = new JSONObject(); + JSONUtilities.safePut(fileRecord2, "origin", JSONUtilities.getString(archiveFileRecord, "origin", null)); + JSONUtilities.safePut(fileRecord2, "declaredEncoding", (String) null); + JSONUtilities.safePut(fileRecord2, "declaredMimeType", (String) null); + JSONUtilities.safePut(fileRecord2, "fileName", fileName2); + JSONUtilities.safePut(fileRecord2, "archiveFileName", JSONUtilities.getString(archiveFileRecord, "fileName", null)); + JSONUtilities.safePut(fileRecord2, "location", getRelativePath(file2, rawDataDir)); + + JSONUtilities.safePut(fileRecord2, "size", saveStreamToFile(zis, file2, null)); + postProcessSingleRetrievedFile(file2, fileRecord2); + + JSONUtilities.append(fileRecords, fileRecord2); + } + } + } catch (IOException e) { + // TODO: what to do? + e.printStackTrace(); + } + return true; + } + return false; + } + + static public InputStream tryOpenAsCompressedFile(File file, String mimeType) { + String fileName = file.getName(); + try { + if (fileName.endsWith(".gz")) { + return new GZIPInputStream(new FileInputStream(file)); + } else if (fileName.endsWith(".bz2")) { + return new CBZip2InputStream(new FileInputStream(file)); + } + } catch (IOException e) { + } + return null; + } + + static public File uncompressFile( + File rawDataDir, + InputStream uncompressedIS, + JSONObject fileRecord, + final Progress progress + ) throws IOException { + String fileName = JSONUtilities.getString(fileRecord, "fileName", "unknown"); + File file2 = allocateFile(rawDataDir, fileName); + + progress.setProgress("Uncompressing " + fileName, -1); + + saveStreamToFile(uncompressedIS, file2, null); + + JSONUtilities.safePut(fileRecord, "declaredEncoding", (String) null); + JSONUtilities.safePut(fileRecord, "declaredMimeType", (String) null); + JSONUtilities.safePut(fileRecord, "location", getRelativePath(file2, rawDataDir)); + + return file2; + } + + static private int calculateProgressPercent(long totalExpectedSize, long totalRetrievedSize) { + return totalExpectedSize == 0 ? 
-1 : (int) (totalRetrievedSize * 100 / totalExpectedSize);
+    }
+
+    static private String formatBytes(long bytes) {
+        return NumberFormat.getIntegerInstance().format(bytes);
+    }
+
+    static public String getEncoding(JSONObject fileRecord) {
+        String encoding = JSONUtilities.getString(fileRecord, "encoding", null);
+        if (encoding == null) {
+            encoding = JSONUtilities.getString(fileRecord, "declaredEncoding", null);
+        }
+        return encoding;
+    }
+
+    static public String autoSelectFiles(ImportingJob job, JSONObject retrievalRecord, JSONArray fileSelectionIndexes) {
+        final Map<String, Integer> formatToCount = new HashMap<String, Integer>();
+        List<String> formats = new ArrayList<String>();
+
+        JSONArray fileRecords = JSONUtilities.getArray(retrievalRecord, "files");
+        int count = fileRecords.length();
+        for (int i = 0; i < count; i++) {
+            JSONObject fileRecord = JSONUtilities.getObjectElement(fileRecords, i);
+            String format = JSONUtilities.getString(fileRecord, "format", null);
+            if (format != null) {
+                if (formatToCount.containsKey(format)) {
+                    formatToCount.put(format, formatToCount.get(format) + 1);
+                } else {
+                    formatToCount.put(format, 1);
+                    formats.add(format);
+                }
+            }
+        }
+        Collections.sort(formats, new Comparator<String>() {
+            @Override
+            public int compare(String o1, String o2) {
+                return formatToCount.get(o2) - formatToCount.get(o1);
+            }
+        });
+
+        String bestFormat = formats.size() > 0 ? formats.get(0) : null;
+        if (JSONUtilities.getInt(retrievalRecord, "archiveCount", 0) == 0) {
+            // If there's no archive, then select everything
+            for (int i = 0; i < count; i++) {
+                JSONUtilities.append(fileSelectionIndexes, i);
+            }
+        } else {
+            // Otherwise, select files matching the best format
+            for (int i = 0; i < count; i++) {
+                JSONObject fileRecord = JSONUtilities.getObjectElement(fileRecords, i);
+                String format = JSONUtilities.getString(fileRecord, "format", null);
+                if (format != null && format.equals(bestFormat)) {
+                    JSONUtilities.append(fileSelectionIndexes, i);
+                }
+            }
+        }
+        return bestFormat;
+    }
+
+    static public String getCommonFormatForSelectedFiles(ImportingJob job, JSONArray fileSelectionIndexes) {
+        JSONObject retrievalRecord = JSONUtilities.getObject(job.config, "retrievalRecord");
+
+        final Map<String, Integer> formatToCount = new HashMap<String, Integer>();
+        List<String> formats = new ArrayList<String>();
+
+        JSONArray fileRecords = JSONUtilities.getArray(retrievalRecord, "files");
+        int count = fileSelectionIndexes.length();
+        for (int i = 0; i < count; i++) {
+            int index = JSONUtilities.getIntElement(fileSelectionIndexes, i, -1);
+            if (index >= 0 && index < fileRecords.length()) {
+                JSONObject fileRecord = JSONUtilities.getObjectElement(fileRecords, index);
+                String format = JSONUtilities.getString(fileRecord, "format", null);
+                if (format != null) {
+                    if (formatToCount.containsKey(format)) {
+                        formatToCount.put(format, formatToCount.get(format) + 1);
+                    } else {
+                        formatToCount.put(format, 1);
+                        formats.add(format);
+                    }
+                }
+            }
+        }
+        Collections.sort(formats, new Comparator<String>() {
+            @Override
+            public int compare(String o1, String o2) {
+                return formatToCount.get(o2) - formatToCount.get(o1);
+            }
+        });
+
+        return formats.size() > 0 ? formats.get(0) : null;
+    }
+
+    static String guessBetterFormat(ImportingJob job, String bestFormat) {
+        JSONObject retrievalRecord = JSONUtilities.getObject(job.config, "retrievalRecord");
+        return retrievalRecord != null ?
guessBetterFormat(job, retrievalRecord, bestFormat) : bestFormat;
+    }
+
+    static String guessBetterFormat(ImportingJob job, JSONObject retrievalRecord, String bestFormat) {
+        JSONArray fileRecords = JSONUtilities.getArray(retrievalRecord, "files");
+        return fileRecords != null ? guessBetterFormat(job, fileRecords, bestFormat) : bestFormat;
+    }
+
+    static String guessBetterFormat(ImportingJob job, JSONArray fileRecords, String bestFormat) {
+        if (bestFormat != null && fileRecords != null && fileRecords.length() > 0) {
+            JSONObject firstFileRecord = JSONUtilities.getObjectElement(fileRecords, 0);
+            String encoding = getEncoding(firstFileRecord);
+            String location = JSONUtilities.getString(firstFileRecord, "location", null);
+
+            if (location != null) {
+                File file = new File(job.getRawDataDir(), location);
+
+                while (true) {
+                    String betterFormat = null;
+
+                    List<FormatGuesser> guessers = ImportingManager.formatToGuessers.get(bestFormat);
+                    if (guessers != null) {
+                        for (FormatGuesser guesser : guessers) {
+                            betterFormat = guesser.guess(file, encoding, bestFormat);
+                            if (betterFormat != null) {
+                                break;
+                            }
+                        }
+                    }
+
+                    if (betterFormat != null && !betterFormat.equals(bestFormat)) {
+                        bestFormat = betterFormat;
+                    } else {
+                        break;
+                    }
+                }
+            }
+        }
+        return bestFormat;
+    }
+
+    static void rankFormats(ImportingJob job, final String bestFormat, JSONArray rankedFormats) {
+        final Map<String, String[]> formatToSegments = new HashMap<String, String[]>();
+
+        boolean download = bestFormat == null ? true :
+            ImportingManager.formatToRecord.get(bestFormat).download;
+
+        List<String> formats = new ArrayList<String>(ImportingManager.formatToRecord.keySet().size());
+        for (String format : ImportingManager.formatToRecord.keySet()) {
+            Format record = ImportingManager.formatToRecord.get(format);
+            if (record.uiClass != null && record.parser != null && record.download == download) {
+                formats.add(format);
+                formatToSegments.put(format, format.split("/"));
+            }
+        }
+
+        if (bestFormat == null) {
+            Collections.sort(formats);
+        } else {
+            Collections.sort(formats, new Comparator<String>() {
+                @Override
+                public int compare(String format1, String format2) {
+                    if (format1.equals(bestFormat)) {
+                        return -1;
+                    } else if (format2.equals(bestFormat)) {
+                        return 1;
+                    } else {
+                        return compareBySegments(format1, format2);
+                    }
+                }
+
+                int compareBySegments(String format1, String format2) {
+                    int c = commonSegments(format2) - commonSegments(format1);
+                    return c != 0 ?
c : format1.compareTo(format2); + } + + int commonSegments(String format) { + String[] bestSegments = formatToSegments.get(bestFormat); + String[] segments = formatToSegments.get(format); + if (bestSegments == null || segments == null) { + return 0; + } else { + int i; + for (i = 0; i < bestSegments.length && i < segments.length; i++) { + if (!bestSegments[i].equals(segments[i])) { + break; + } + } + return i; + } + } + }); + } + + for (String format : formats) { + JSONUtilities.append(rankedFormats, format); + } + } + + static public List getSelectedFileRecords(ImportingJob job) { + List results = new ArrayList(); + + JSONObject retrievalRecord = JSONUtilities.getObject(job.config, "retrievalRecord"); + if (retrievalRecord != null) { + JSONArray fileRecordArray = JSONUtilities.getArray(retrievalRecord, "files"); + if (fileRecordArray != null) { + JSONArray fileSelectionArray = JSONUtilities.getArray(job.config, "fileSelection"); + if (fileSelectionArray != null) { + for (int i = 0; i < fileSelectionArray.length(); i++) { + int index = JSONUtilities.getIntElement(fileSelectionArray, i, -1); + if (index >= 0 && index < fileRecordArray.length()) { + results.add(JSONUtilities.getObjectElement(fileRecordArray, index)); + } + } + } + } + } + return results; + } + + static public void previewParse(ImportingJob job, String format, JSONObject optionObj, List exceptions) { + Format record = ImportingManager.formatToRecord.get(format); + if (record == null || record.parser == null) { + // TODO: what to do? + return; + } + + job.prepareNewProject(); + + record.parser.parse( + job.project, + job.metadata, + job, + getSelectedFileRecords(job), + format, + 100, + optionObj, + exceptions + ); + + job.project.update(); // update all internal models, indexes, caches, etc. + } + + static public long createProject( + final ImportingJob job, + final String format, + final JSONObject optionObj, + final List exceptions) { + final Format record = ImportingManager.formatToRecord.get(format); + if (record == null || record.parser == null) { + // TODO: what to do? + return -1; + } + + JSONUtilities.safePut(job.config, "state", "creating-project"); + + final Project project = new Project(); + new Thread() { + public void run() { + ProjectMetadata pm = new ProjectMetadata(); + pm.setName(JSONUtilities.getString(optionObj, "projectName", "Untitled")); + pm.setEncoding(JSONUtilities.getString(optionObj, "encoding", "UTF-8")); + + record.parser.parse( + project, + pm, + job, + getSelectedFileRecords(job), + format, + -1, + optionObj, + exceptions + ); + + if (!job.canceled) { + project.update(); // update all internal models, indexes, caches, etc. 
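+                        // Only a parse that ran to completion without cancellation gets registered
+                        // with the workspace and announced to the client below.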
+ + ProjectManager.singleton.registerProject(project, pm); + + JSONUtilities.safePut(job.config, "projectID", project.id); + JSONUtilities.safePut(job.config, "state", "created-project"); + } + } + }.start(); + + return project.id; + } + + static public void setCreatingProjectProgress(ImportingJob job, String message, int percent) { + JSONObject progress = JSONUtilities.getObject(job.config, "progress"); + if (progress == null) { + progress = new JSONObject(); + JSONUtilities.safePut(job.config, "progress", progress); + } + JSONUtilities.safePut(progress, "message", message); + JSONUtilities.safePut(progress, "percent", percent); + } +} diff --git a/main/webapp/modules/core/scripts/import.js b/main/src/com/google/refine/importing/UrlRewriter.java similarity index 82% rename from main/webapp/modules/core/scripts/import.js rename to main/src/com/google/refine/importing/UrlRewriter.java index d7b760a33..4e6015488 100644 --- a/main/webapp/modules/core/scripts/import.js +++ b/main/src/com/google/refine/importing/UrlRewriter.java @@ -31,23 +31,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -var theImportJob = {}; -var ui = {}; +package com.google.refine.importing; -var Refine = { -}; - -function resize() { - var header = $("#header"); - - var leftPanelWidth = 300; - var width = $(window).width(); - var top = $("#header").outerHeight(); - var height = $(window).height() - top; +public interface UrlRewriter { + static public class Result { + public String rewrittenUrl; + public String format; + public boolean download; + } + public Result rewrite(String url); } - -function onLoad() { - $(window).bind("resize", resize); -} -$(onLoad); \ No newline at end of file diff --git a/main/src/com/google/refine/model/ColumnModel.java b/main/src/com/google/refine/model/ColumnModel.java index b79a0028e..11e2de7b1 100644 --- a/main/src/com/google/refine/model/ColumnModel.java +++ b/main/src/com/google/refine/model/ColumnModel.java @@ -110,28 +110,28 @@ public class ColumnModel implements Jsonizable { } synchronized public void addColumn(int index, Column column, boolean avoidNameCollision) throws ModelException { - String baseName = column.getName(); - - if (_nameToColumn.containsKey(baseName)) { - if (!avoidNameCollision) { - throw new ModelException("Duplicated column name"); - } - } - - String name = baseName; - int i = 1; - while (true) { - if (_nameToColumn.containsKey(name)) { - i++; - name = baseName + i; - } else { - break; - } - } - - column.setName(name); - columns.add(index < 0 ? columns.size() : index, column); - _nameToColumn.put(name, column); // so the next call can check + String baseName = column.getName(); + + if (_nameToColumn.containsKey(baseName)) { + if (!avoidNameCollision) { + throw new ModelException("Duplicated column name"); + } + } + + String name = baseName; + int i = 1; + while (true) { + if (_nameToColumn.containsKey(name)) { + i++; + name = baseName + i; + } else { + break; + } + } + + column.setName(name); + columns.add(index < 0 ? 
columns.size() : index, column); + _nameToColumn.put(name, column); // so the next call can check } synchronized public Column getColumnByName(String name) { diff --git a/main/src/com/google/refine/model/meta/FileUploadImportSource.java b/main/src/com/google/refine/model/meta/FileUploadImportSource.java deleted file mode 100644 index d8c1e9137..000000000 --- a/main/src/com/google/refine/model/meta/FileUploadImportSource.java +++ /dev/null @@ -1,62 +0,0 @@ -package com.google.refine.model.meta; - -import java.io.File; -import java.io.InputStream; -import java.util.Date; -import java.util.Properties; - -import javax.servlet.http.HttpServletRequest; - -import org.apache.commons.fileupload.FileItemIterator; -import org.apache.commons.fileupload.FileItemStream; -import org.apache.commons.fileupload.servlet.ServletFileUpload; -import org.json.JSONException; -import org.json.JSONObject; -import org.json.JSONWriter; - -import com.google.refine.commands.importing.ImportJob; - -public class FileUploadImportSource extends ImportSource { - public String originalFileName; - - @Override - protected void customWrite(JSONWriter writer, Properties options) - throws JSONException { - writer.key("originalFileName"); writer.value(originalFileName); - } - - @Override - protected void customReconstruct(JSONObject obj) throws JSONException { - if (obj.has("originalFileName")) { - originalFileName = obj.getString("originalFileName"); - } - } - - @Override - public void retrieveContent(HttpServletRequest request, Properties options, ImportJob job) throws Exception { - ServletFileUpload upload = new ServletFileUpload(); - FileItemIterator iter = upload.getItemIterator(request); - while (iter.hasNext()) { - FileItemStream item = iter.next(); - if (!item.isFormField()) { - String fileName = item.getName(); - if (fileName.length() > 0) { - InputStream stream = item.openStream(); - try { - File file = new File(job.dir, "data"); - - this.accessTime = new Date(); - this.contentType = item.getContentType(); - this.encoding = request.getCharacterEncoding(); - this.originalFileName = fileName; - this.size = saveStreamToFileOrDir( - item.openStream(), file, this.contentType, fileName, job, request.getContentLength()); - this.isArchive = file.isDirectory(); - } finally { - stream.close(); - } - } - } - } - } -} \ No newline at end of file diff --git a/main/src/com/google/refine/model/meta/ImportConfig.java b/main/src/com/google/refine/model/meta/ImportConfig.java deleted file mode 100644 index dc2ae87d2..000000000 --- a/main/src/com/google/refine/model/meta/ImportConfig.java +++ /dev/null @@ -1,5 +0,0 @@ -package com.google.refine.model.meta; - -public class ImportConfig { - -} diff --git a/main/src/com/google/refine/model/meta/ImportSource.java b/main/src/com/google/refine/model/meta/ImportSource.java deleted file mode 100644 index 2f695e8d8..000000000 --- a/main/src/com/google/refine/model/meta/ImportSource.java +++ /dev/null @@ -1,167 +0,0 @@ -package com.google.refine.model.meta; - -import java.io.BufferedOutputStream; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.util.Date; -import java.util.Properties; -import java.util.zip.GZIPInputStream; -import java.util.zip.ZipEntry; -import java.util.zip.ZipInputStream; - -import javax.servlet.http.HttpServletRequest; - -import org.apache.tools.bzip2.CBZip2InputStream; -import org.apache.tools.tar.TarEntry; -import org.apache.tools.tar.TarInputStream; -import org.json.JSONException; -import 
org.json.JSONObject; -import org.json.JSONWriter; - -import com.google.refine.Jsonizable; -import com.google.refine.commands.importing.ImportJob; -import com.google.refine.commands.importing.ImportManager; -import com.google.refine.util.ParsingUtilities; - -abstract public class ImportSource implements Jsonizable { - public Date accessTime; - public long size; - public boolean isArchive = false; - - public String contentType; - public String encoding; - - @Override - public void write(JSONWriter writer, Properties options) - throws JSONException { - writer.object(); - writer.key("type"); writer.value(ImportManager.getImportSourceClassName(this.getClass())); - writer.key("accessTime"); writer.value(ParsingUtilities.dateToString(accessTime)); - writer.key("size"); writer.value(size); - writer.key("isArchive"); writer.value(isArchive); - writer.key("contentType"); writer.value(contentType); - writer.key("encoding"); writer.value(encoding); - writer.endObject(); - } - - public void reconstruct(JSONObject obj) throws JSONException { - if (obj.has("accessTime")) { - accessTime = ParsingUtilities.stringToDate(obj.getString("accessTime")); - } - if (obj.has("size")) { - size = obj.getLong("size"); - } - if (obj.has("isArchive")) { - isArchive = obj.getBoolean("isArchive"); - } - if (obj.has("contentType")) { - contentType = obj.getString("contentType"); - } - if (obj.has("encoding")) { - encoding = obj.getString("encoding"); - } - customReconstruct(obj); - } - - abstract public void retrieveContent(HttpServletRequest request, Properties options, ImportJob job) - throws Exception; - - abstract protected void customWrite(JSONWriter writer, Properties options) throws JSONException; - abstract protected void customReconstruct(JSONObject obj) throws JSONException; - - static protected long saveStreamToFileOrDir( - InputStream is, - File file, - String contentType, - String fileNameOrUrl, - ImportJob job, - long expectedSize - ) throws IOException { - InputStream archiveIS = null; - if (fileNameOrUrl != null) { - try { - if (fileNameOrUrl.endsWith(".tar.gz") || - fileNameOrUrl.endsWith(".tar.gz.gz") || - fileNameOrUrl.endsWith(".tgz")) { - archiveIS = new TarInputStream(new GZIPInputStream(is)); - } else if (fileNameOrUrl.endsWith(".tar.bz2")) { - archiveIS = new TarInputStream(new CBZip2InputStream(is)); - } else if (fileNameOrUrl.endsWith(".tar")) { - archiveIS = new TarInputStream(is); - } else if (fileNameOrUrl.endsWith(".zip")) { - archiveIS = new ZipInputStream(is); - } - } catch (IOException e) { - archiveIS = null; - } - } - - job.bytesSaved = 0; - if (archiveIS == null) { - saveStreamToFile(is, file, job, true, expectedSize); - } else { - job.retrievingProgress = -1; - - // NOTE(SM): unfortunately, java.io does not provide any generalized class for - // archive-like input streams so while both TarInputStream and ZipInputStream - // behave precisely the same, there is no polymorphic behavior so we have - // to treat each instance explicitly... 
one of those times you wish you had - // closures - - if (archiveIS instanceof TarInputStream) { - TarInputStream tis = (TarInputStream) archiveIS; - TarEntry te; - while ((te = tis.getNextEntry()) != null) { - if (!te.isDirectory()) { - saveStreamToFile(tis, new File(file, te.getName()), job, false, 0); - } - } - } else if (archiveIS instanceof ZipInputStream) { - ZipInputStream zis = (ZipInputStream) archiveIS; - ZipEntry ze; - long compressedSize = 0; - while ((ze = zis.getNextEntry()) != null) { - if (!ze.isDirectory()) { - saveStreamToFile(zis, new File(file, ze.getName()), job, false, 0); - - compressedSize += ze.getCompressedSize(); // this might be negative if not known - if (compressedSize > 0) { - job.retrievingProgress = (int) (compressedSize * 100 / expectedSize); - } - } - } - } - } - return job.bytesSaved; - } - - static private void saveStreamToFile( - InputStream is, - File file, - ImportJob job, - boolean updateProgress, - long expectedSize - ) throws IOException { - byte data[] = new byte[4096]; - - file.getParentFile().mkdirs(); - - FileOutputStream fos = new FileOutputStream(file); - BufferedOutputStream bos = new BufferedOutputStream(fos, data.length); - - int count; - while ((count = is.read(data, 0, data.length)) != -1) { - bos.write(data, 0, count); - - job.bytesSaved += count; - if (updateProgress) { - job.retrievingProgress = (int) (job.bytesSaved * 100 / expectedSize); - } - } - - bos.flush(); - bos.close(); - } -} diff --git a/main/src/com/google/refine/model/meta/TextImportSource.java b/main/src/com/google/refine/model/meta/TextImportSource.java deleted file mode 100644 index ddfdf1e89..000000000 --- a/main/src/com/google/refine/model/meta/TextImportSource.java +++ /dev/null @@ -1,28 +0,0 @@ -package com.google.refine.model.meta; - -import java.util.Properties; - -import javax.servlet.http.HttpServletRequest; - -import org.json.JSONException; -import org.json.JSONObject; -import org.json.JSONWriter; - -import com.google.refine.commands.importing.ImportJob; - -public class TextImportSource extends ImportSource { - @Override - protected void customWrite(JSONWriter writer, Properties options) - throws JSONException { - } - - @Override - protected void customReconstruct(JSONObject obj) throws JSONException { - } - - @Override - public void retrieveContent(HttpServletRequest request, Properties options, ImportJob job) throws Exception { - // TODO Auto-generated method stub - - } -} \ No newline at end of file diff --git a/main/src/com/google/refine/model/meta/WebImportSource.java b/main/src/com/google/refine/model/meta/WebImportSource.java deleted file mode 100644 index b40748c75..000000000 --- a/main/src/com/google/refine/model/meta/WebImportSource.java +++ /dev/null @@ -1,34 +0,0 @@ -package com.google.refine.model.meta; - -import java.util.Properties; - -import javax.servlet.http.HttpServletRequest; - -import org.json.JSONException; -import org.json.JSONObject; -import org.json.JSONWriter; - -import com.google.refine.commands.importing.ImportJob; - -public class WebImportSource extends ImportSource { - public String url; - - @Override - protected void customWrite(JSONWriter writer, Properties options) - throws JSONException { - writer.key("url"); writer.value(url); - } - - @Override - protected void customReconstruct(JSONObject obj) throws JSONException { - if (obj.has("url")) { - url = obj.getString("url"); - } - } - - @Override - public void retrieveContent(HttpServletRequest request, Properties options, ImportJob job) throws Exception { - // TODO 
Auto-generated method stub - - } -} \ No newline at end of file diff --git a/main/src/com/google/refine/util/JSONUtilities.java b/main/src/com/google/refine/util/JSONUtilities.java index c08d804dc..0e9757564 100644 --- a/main/src/com/google/refine/util/JSONUtilities.java +++ b/main/src/com/google/refine/util/JSONUtilities.java @@ -35,8 +35,10 @@ package com.google.refine.util; import java.util.ArrayList; import java.util.Calendar; +import java.util.Collection; import java.util.Date; import java.util.List; +import java.util.Map; import org.json.JSONArray; import org.json.JSONException; @@ -44,6 +46,14 @@ import org.json.JSONObject; import org.json.JSONWriter; public class JSONUtilities { + static public JSONObject getObject(JSONObject obj, String key) { + try { + return obj.getJSONObject(key); + } catch (JSONException e) { + return null; + } + } + static public String getString(JSONObject obj, String key, String def) { try { return obj.getString(key); @@ -94,6 +104,14 @@ public class JSONUtilities { } } + static public JSONArray getArray(JSONObject obj, String key) { + try { + return obj.getJSONArray(key); + } catch (JSONException e) { + return null; + } + } + static public int[] getIntArray(JSONObject obj, String key) { try { JSONArray a = obj.getJSONArray(key); @@ -144,24 +162,161 @@ public class JSONUtilities { writer.endArray(); } + static public void writeStringArray(JSONWriter writer, String[] strings) throws JSONException { + writer.array(); + for (String s : strings) { + writer.value(s); + } + writer.endArray(); + } + static public void putField(JSONObject obj, String key, Object value) throws JSONException { - if (value instanceof Integer) { - obj.put(key, ((Integer) value).intValue()); - } else if (value instanceof Long) { - obj.put(key, ((Long) value).intValue()); - } else if (value instanceof Number) { - obj.put(key, ((Double) value).doubleValue()); - } else if (value instanceof Boolean) { - obj.put(key, (Boolean) value); - } else if (value instanceof Date) { - obj.put(key, ParsingUtilities.dateToString((Date) value)); - } else if (value instanceof Calendar) { - obj.put(key, ParsingUtilities.dateToString(((Calendar) value).getTime())); - } else if (value instanceof String) { - obj.put(key, (String) value); - } else { - obj.put(key, value.toString()); - } + if (value instanceof Integer) { + obj.put(key, ((Integer) value).intValue()); + } else if (value instanceof Long) { + obj.put(key, ((Long) value).intValue()); + } else if (value instanceof Number) { + obj.put(key, ((Double) value).doubleValue()); + } else if (value instanceof Boolean) { + obj.put(key, (Boolean) value); + } else if (value instanceof Date) { + obj.put(key, ParsingUtilities.dateToString((Date) value)); + } else if (value instanceof Calendar) { + obj.put(key, ParsingUtilities.dateToString(((Calendar) value).getTime())); + } else if (value instanceof String) { + obj.put(key, (String) value); + } else { + obj.put(key, value.toString()); + } + } + + static public JSONObject getObjectElement(JSONArray a, int i) { + try { + return a.getJSONObject(i); + } catch (JSONException e) { + return null; + } + } + + static public int getIntElement(JSONArray a, int i, int def) { + try { + return a.getInt(i); + } catch (JSONException e) { + return def; + } + } + + static public void append(JSONArray a, JSONObject element) { + try { + a.put(a.length(), element); + } catch (JSONException e) { + } + } + + static public void append(JSONArray a, Object element) { + try { + a.put(a.length(), element); + } catch (JSONException e) { + } + 
} + + static public void append(JSONArray a, int element) { + try { + a.put(a.length(), element); + } catch (JSONException e) { + } + } + + static public void append(JSONArray a, long element) { + try { + a.put(a.length(), element); + } catch (JSONException e) { + } + } + + static public void append(JSONArray a, double element) { + try { + a.put(a.length(), element); + } catch (JSONException e) { + } + } + + static public void append(JSONArray a, boolean element) { + try { + a.put(a.length(), element); + } catch (JSONException e) { + } + } + + static public void append(JSONArray a, String element) { + try { + a.put(a.length(), element); + } catch (JSONException e) { + } + } + + static public void safePut(JSONObject obj, String key, int value) { + try { + obj.put(key, value); + } catch (JSONException e) { + // Ignore: the JSONObject is just too happy about throwing exceptions. + } + } + + static public void safePut(JSONObject obj, String key, long value) { + try { + obj.put(key, value); + } catch (JSONException e) { + // Ignore: the JSONObject is just too happy about throwing exceptions. + } + } + + static public void safePut(JSONObject obj, String key, double value) { + try { + obj.put(key, value); + } catch (JSONException e) { + // Ignore: the JSONObject is just too happy about throwing exceptions. + } + } + + static public void safePut(JSONObject obj, String key, boolean value) { + try { + obj.put(key, value); + } catch (JSONException e) { + // Ignore: the JSONObject is just too happy about throwing exceptions. + } + } + + static public void safePut(JSONObject obj, String key, String value) { + try { + obj.put(key, value); + } catch (JSONException e) { + // Ignore: the JSONObject is just too happy about throwing exceptions. + } + } + + static public void safePut(JSONObject obj, String key, Collection value) { + try { + obj.put(key, value); + } catch (JSONException e) { + // Ignore: the JSONObject is just too happy about throwing exceptions. + } + } + + static public void safePut(JSONObject obj, String key, Map value) { + try { + obj.put(key, value); + } catch (JSONException e) { + // Ignore: the JSONObject is just too happy about throwing exceptions. + } + } + + static public void safePut(JSONObject obj, String key, Object value) { + try { + obj.put(key, value); + } catch (JSONException e) { + // Ignore: the JSONObject is just too happy about throwing exceptions. 
+ } } static public Object[] toArray(JSONArray a) throws JSONException { diff --git a/main/src/com/google/refine/util/TrackingInputStream.java b/main/src/com/google/refine/util/TrackingInputStream.java new file mode 100644 index 000000000..13393b67a --- /dev/null +++ b/main/src/com/google/refine/util/TrackingInputStream.java @@ -0,0 +1,64 @@ +package com.google.refine.util; + +import java.io.IOException; +import java.io.InputStream; + +public class TrackingInputStream extends InputStream { + final private InputStream is; + protected long bytesRead; + + public TrackingInputStream(InputStream is) { + this.is = is; + } + + public long getBytesRead() { + return bytesRead; + } + + @Override + public int read() throws IOException { + return (int) track(is.read()); + } + + @Override + public int read(byte[] b) throws IOException { + return (int) track(is.read(b)); + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + return (int) track(is.read(b, off, len)); + } + + @Override + public long skip(long n) throws IOException { + return track(is.skip(n)); + } + + @Override + public void mark(int readlimit) { + is.mark(readlimit); + } + + @Override + public void reset() throws IOException { + is.reset(); + } + + @Override + public boolean markSupported() { + return is.markSupported(); + } + + @Override + public void close() throws IOException { + is.close(); + } + + protected long track(long bytesRead) { + if (bytesRead > 0) { + this.bytesRead += bytesRead; + } + return bytesRead; + } +} diff --git a/main/tests/server/src/com/google/refine/tests/RefineTest.java b/main/tests/server/src/com/google/refine/tests/RefineTest.java index 6613f7622..f75ff422e 100644 --- a/main/tests/server/src/com/google/refine/tests/RefineTest.java +++ b/main/tests/server/src/com/google/refine/tests/RefineTest.java @@ -33,6 +33,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
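The JSONUtilities additions above (getObject, getArray, getObjectElement, getIntElement, the append overloads, writeStringArray and the safePut family) all exist for the same reason: org.json throws a checked JSONException from nearly every accessor, and the importing code wants to build and read option objects without a try/catch at every call site. A minimal sketch of how the new helpers combine, assuming nothing beyond the methods shown in this file (the class and option names are made up for illustration, not part of the patch):

    import org.json.JSONArray;
    import org.json.JSONObject;

    import com.google.refine.util.JSONUtilities;

    public class JSONUtilitiesSketch {
        public static void main(String[] args) {
            // Build an options object without catching JSONException at each call.
            JSONObject options = new JSONObject();
            JSONUtilities.safePut(options, "headerLines", 1);
            JSONUtilities.safePut(options, "separator", ",");

            JSONArray widths = new JSONArray();
            JSONUtilities.append(widths, 6);
            JSONUtilities.append(widths, 9);
            JSONUtilities.append(widths, 5);
            JSONUtilities.safePut(options, "columnWidths", widths);

            // Read values back; the getters return a default (or null) instead of throwing.
            int headerLines = JSONUtilities.getInt(options, "headerLines", 0);
            JSONArray readBack = JSONUtilities.getArray(options, "columnWidths");
            int firstWidth = JSONUtilities.getIntElement(readBack, 0, -1);
            System.out.println(headerLines + " header line(s), first column width " + firstWidth);
        }
    }

TrackingInputStream, the other new utility above, simply counts the bytes read through a wrapped stream and exposes the running total via getBytesRead(), e.g. so a caller can report progress while importing. A small usage sketch, again illustrative rather than part of the patch:

    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.io.InputStream;

    import com.google.refine.util.TrackingInputStream;

    public class TrackingInputStreamSketch {
        public static void main(String[] args) throws IOException {
            InputStream raw = new ByteArrayInputStream("hello importer".getBytes("UTF-8"));
            TrackingInputStream tracked = new TrackingInputStream(raw);

            byte[] buffer = new byte[4];
            while (tracked.read(buffer) != -1) {
                // A progress reporter could poll getBytesRead() between reads.
                System.out.println(tracked.getBytesRead() + " bytes read so far");
            }
            tracked.close();
        }
    }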
package com.google.refine.tests; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import org.json.JSONArray; +import org.json.JSONException; +import org.json.JSONObject; import org.slf4j.Logger; import org.testng.Assert; import org.testng.annotations.BeforeSuite; @@ -41,6 +48,7 @@ import com.google.refine.model.Cell; import com.google.refine.model.Column; import com.google.refine.model.Project; import com.google.refine.model.Row; +import com.google.refine.util.JSONUtilities; public class RefineTest { @@ -82,4 +90,41 @@ public class RefineTest { logger.info(sb.toString()); } } + + //----helpers---- + + static public void whenGetBooleanOption(String name, JSONObject options, Boolean def){ + when(options.has(name)).thenReturn(true); + when(JSONUtilities.getBoolean(options, name, def)).thenReturn(def); + } + + static public void whenGetIntegerOption(String name, JSONObject options, int def){ + when(options.has(name)).thenReturn(true); + when(JSONUtilities.getInt(options, name, def)).thenReturn(def); + } + + static public void whenGetStringOption(String name, JSONObject options, String def){ + when(options.has(name)).thenReturn(true); + when(JSONUtilities.getString(options, name, def)).thenReturn(def); + } + + static public void whenGetObjectOption(String name, JSONObject options, JSONObject def){ + when(options.has(name)).thenReturn(true); + when(JSONUtilities.getObject(options, name)).thenReturn(def); + } + + static public void whenGetArrayOption(String name, JSONObject options, JSONArray def){ + when(options.has(name)).thenReturn(true); + when(JSONUtilities.getArray(options, name)).thenReturn(def); + } + + static public void verifyGetOption(String name, JSONObject options){ + verify(options, times(1)).has(name); + try { + verify(options, times(1)).get(name); + } catch (JSONException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } } diff --git a/main/tests/server/src/com/google/refine/tests/importers/FixedWidthImporterTests.java b/main/tests/server/src/com/google/refine/tests/importers/FixedWidthImporterTests.java index 879b3c94d..3fc4dc4c3 100644 --- a/main/tests/server/src/com/google/refine/tests/importers/FixedWidthImporterTests.java +++ b/main/tests/server/src/com/google/refine/tests/importers/FixedWidthImporterTests.java @@ -1,14 +1,12 @@ package com.google.refine.tests.importers; -import static org.mockito.Mockito.mock; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; -import static org.mockito.Mockito.when; import java.io.StringReader; -import java.util.Properties; +import org.json.JSONArray; import org.slf4j.LoggerFactory; import org.testng.Assert; import org.testng.annotations.AfterMethod; @@ -16,13 +14,10 @@ import org.testng.annotations.BeforeMethod; import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; -import com.google.refine.ProjectMetadata; import com.google.refine.importers.FixedWidthImporter; -import com.google.refine.importers.ImportException; -import com.google.refine.model.Project; -import com.google.refine.tests.RefineTest; +import com.google.refine.util.JSONUtilities; -public class FixedWidthImporterTests extends RefineTest { +public class FixedWidthImporterTests extends ImporterTest { @BeforeTest public void init() { logger = LoggerFactory.getLogger(this.getClass()); @@ -30,65 +25,44 @@ public class FixedWidthImporterTests extends RefineTest { //constants String SAMPLE_ROW = "NDB_NoShrt_DescWater"; - String 
SAMPLE_ROW_WIDTHS = "6,9,5"; //System Under Test FixedWidthImporter SUT = null; - //mock dependencies - Project project = null; - Properties properties = null; - - @BeforeMethod public void SetUp(){ + super.SetUp(); SUT = new FixedWidthImporter(); - project = new Project(); //FIXME - should we try and use mock(Project.class); - seems unnecessary complexity - properties = mock(Properties.class); } @AfterMethod public void TearDown(){ SUT = null; - project = null; - properties = null; - } - - //TODO a lot of these tests are very similar to the TsvCsvImporterTests. It might be possible to overlap them - - @Test - public void canParseSeparator(){ - int[] i = null; - try { - i = SUT.getColumnWidthsFromString("1,2,3"); - } catch (ImportException e) { - Assert.fail(e.getMessage()); - } - - Assert.assertNotNull(i); - Assert.assertEquals(i[0], 1); - Assert.assertEquals(i[1], 2); - Assert.assertEquals(i[2], 3); + super.TearDown(); } //---------------------read tests------------------------ @Test public void readFixedWidth(){ StringReader reader = new StringReader(SAMPLE_ROW + "\nTooShort"); - - when(properties.getProperty("fixed-column-widths")).thenReturn(SAMPLE_ROW_WIDTHS); - whenGetIntegerOption("ignore",properties,0); - whenGetIntegerOption("header-lines",properties,0); - whenGetIntegerOption("limit",properties,-1); - whenGetIntegerOption("skip",properties,0); - + + JSONArray columnWidths = new JSONArray(); + JSONUtilities.append(columnWidths, 6); + JSONUtilities.append(columnWidths, 9); + JSONUtilities.append(columnWidths, 5); + + whenGetArrayOption("columnWidths", options, columnWidths); + whenGetIntegerOption("ignoreLines", options, 0); + whenGetIntegerOption("headerLines", options, 0); + whenGetIntegerOption("skipDataLines", options, 0); + whenGetIntegerOption("limit", options, -1); + try { - SUT.read(reader, project, new ProjectMetadata(), properties); + parseOneFile(SUT, reader); } catch (Exception e) { Assert.fail(e.getMessage()); } - - + Assert.assertEquals(project.rows.size(), 2); Assert.assertEquals(project.rows.get(0).cells.size(), 3); Assert.assertEquals((String)project.rows.get(0).cells.get(0).value, "NDB_No"); @@ -98,28 +72,11 @@ public class FixedWidthImporterTests extends RefineTest { Assert.assertEquals((String)project.rows.get(1).cells.get(0).value, "TooSho"); Assert.assertEquals((String)project.rows.get(1).cells.get(1).value, "rt"); Assert.assertNull(project.rows.get(1).cells.get(2)); - - verify(properties, times(1)).getProperty("fixed-column-widths"); - verifyGetOption("ignore",properties); - verifyGetOption("header-lines",properties); - verifyGetOption("limit",properties); - verifyGetOption("skip",properties); - } - - //----helpers---- - - public void whenGetBooleanOption(String name, Properties properties, Boolean def){ - when(properties.containsKey(name)).thenReturn(true); - when(properties.getProperty(name)).thenReturn(Boolean.toString(def)); - } - - public void whenGetIntegerOption(String name, Properties properties, int def){ - when(properties.containsKey(name)).thenReturn(true); - when(properties.getProperty(name)).thenReturn(Integer.toString(def)); - } - - public void verifyGetOption(String name, Properties properties){ - verify(properties, times(1)).containsKey(name); - verify(properties, times(1)).getProperty(name); + + JSONUtilities.getIntArray(verify(options, times(1)), "columnWidths"); + verifyGetOption("ignore", options); + verifyGetOption("header-lines", options); + verifyGetOption("limit", options); + verifyGetOption("skip", options); } } diff --git 
a/main/tests/server/src/com/google/refine/tests/importers/ImporterTest.java b/main/tests/server/src/com/google/refine/tests/importers/ImporterTest.java new file mode 100644 index 000000000..aa5c0d9ff --- /dev/null +++ b/main/tests/server/src/com/google/refine/tests/importers/ImporterTest.java @@ -0,0 +1,109 @@ +package com.google.refine.tests.importers; + +import static org.mockito.Mockito.mock; + +import java.io.InputStream; +import java.io.Reader; +import java.util.ArrayList; + +import org.json.JSONObject; + +import com.google.refine.ProjectMetadata; +import com.google.refine.importers.ImportingParserBase; +import com.google.refine.importers.tree.ImportColumnGroup; +import com.google.refine.importers.tree.TreeImportingParserBase; +import com.google.refine.importers.tree.XmlImportUtilities; +import com.google.refine.importing.ImportingJob; +import com.google.refine.importing.ImportingManager; +import com.google.refine.model.Project; +import com.google.refine.tests.RefineTest; + +abstract class ImporterTest extends RefineTest { + //mock dependencies + protected Project project; + protected ProjectMetadata metadata; + protected ImportingJob job; + + protected JSONObject options; + + public void SetUp(){ + //FIXME - should we try and use mock(Project.class); - seems unnecessary complexity + project = new Project(); + metadata = new ProjectMetadata(); + job = ImportingManager.createJob(); + + options = mock(JSONObject.class); + } + + public void TearDown(){ + project = null; + metadata = null; + + ImportingManager.disposeJob(job.id); + job = null; + + options = null; + } + + protected void parseOneFile(ImportingParserBase parser, Reader reader) { + parser.parseOneFile( + project, + metadata, + job, + "file-source", + reader, + -1, + options, + new ArrayList() + ); + project.update(); + } + + protected void parseOneFile(ImportingParserBase parser, InputStream inputStream) { + parser.parseOneFile( + project, + metadata, + job, + "file-source", + inputStream, + -1, + options, + new ArrayList() + ); + project.update(); + } + + protected void parseOneFile(TreeImportingParserBase parser, Reader reader) { + ImportColumnGroup rootColumnGroup = new ImportColumnGroup(); + parser.parseOneFile( + project, + metadata, + job, + "file-source", + reader, + rootColumnGroup, + -1, + options, + new ArrayList() + ); + XmlImportUtilities.createColumnsFromImport(project, rootColumnGroup); + project.columnModel.update(); + } + + protected void parseOneFile(TreeImportingParserBase parser, InputStream inputStream) { + ImportColumnGroup rootColumnGroup = new ImportColumnGroup(); + parser.parseOneFile( + project, + metadata, + job, + "file-source", + inputStream, + rootColumnGroup, + -1, + options, + new ArrayList() + ); + XmlImportUtilities.createColumnsFromImport(project, rootColumnGroup); + project.columnModel.update(); + } +} diff --git a/main/tests/server/src/com/google/refine/tests/importers/JsonImporterTests.java b/main/tests/server/src/com/google/refine/tests/importers/JsonImporterTests.java index 86e67d062..bc72399a9 100644 --- a/main/tests/server/src/com/google/refine/tests/importers/JsonImporterTests.java +++ b/main/tests/server/src/com/google/refine/tests/importers/JsonImporterTests.java @@ -33,12 +33,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
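The new ImporterTest base class above centralizes the plumbing that each importer test used to build by hand: a real Project and ProjectMetadata, an ImportingJob from ImportingManager, a mocked JSONObject of options, and parseOneFile() overloads that drive either an ImportingParserBase or a TreeImportingParserBase and then update the project. The skeleton below shows how a test plugs into that harness; it is an illustrative sketch, not part of this change, and it reuses the stubs, sample row and column widths from FixedWidthImporterTests above, so the single-row assertion simply mirrors the first row of that test.

    package com.google.refine.tests.importers;

    import java.io.StringReader;

    import org.json.JSONArray;
    import org.testng.Assert;
    import org.testng.annotations.AfterMethod;
    import org.testng.annotations.BeforeMethod;
    import org.testng.annotations.Test;

    import com.google.refine.importers.FixedWidthImporter;
    import com.google.refine.util.JSONUtilities;

    public class MinimalImporterTestSketch extends ImporterTest {
        FixedWidthImporter SUT;  // hypothetical example class; mirrors FixedWidthImporterTests

        @BeforeMethod
        public void SetUp() {
            super.SetUp();       // creates project, metadata, job and the mocked options
            SUT = new FixedWidthImporter();
        }

        @AfterMethod
        public void TearDown() {
            SUT = null;
            super.TearDown();    // disposes the importing job again
        }

        @Test
        public void parsesOneFixedWidthRow() {
            JSONArray columnWidths = new JSONArray();
            JSONUtilities.append(columnWidths, 6);
            JSONUtilities.append(columnWidths, 9);
            JSONUtilities.append(columnWidths, 5);

            whenGetArrayOption("columnWidths", options, columnWidths);
            whenGetIntegerOption("ignoreLines", options, 0);
            whenGetIntegerOption("headerLines", options, 0);
            whenGetIntegerOption("skipDataLines", options, 0);
            whenGetIntegerOption("limit", options, -1);

            try {
                parseOneFile(SUT, new StringReader("NDB_NoShrt_DescWater"));
            } catch (Exception e) {
                Assert.fail(e.getMessage());
            }

            Assert.assertEquals(project.rows.size(), 1);
            Assert.assertEquals(project.rows.get(0).cells.size(), 3);
        }
    }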
package com.google.refine.tests.importers; -import static org.mockito.Mockito.mock; - import java.io.ByteArrayInputStream; import java.io.IOException; +import java.io.StringReader; import java.io.UnsupportedEncodingException; -import java.util.Properties; import org.slf4j.LoggerFactory; import org.testng.Assert; @@ -47,15 +45,12 @@ import org.testng.annotations.BeforeMethod; import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; -import com.google.refine.ProjectMetadata; import com.google.refine.importers.JsonImporter; -import com.google.refine.importers.parsers.JSONParser; -import com.google.refine.importers.parsers.TreeParserToken; -import com.google.refine.model.Project; +import com.google.refine.importers.JsonImporter.JSONTreeReader; +import com.google.refine.importers.tree.TreeReader.Token; import com.google.refine.model.Row; -import com.google.refine.tests.RefineTest; -public class JsonImporterTests extends RefineTest { +public class JsonImporterTests extends ImporterTest { @BeforeTest public void init() { logger = LoggerFactory.getLogger(this.getClass()); @@ -63,28 +58,29 @@ public class JsonImporterTests extends RefineTest { //dependencies - Project project = null; - Properties options = null; ByteArrayInputStream inputStream = null; //System Under Test JsonImporter SUT = null; - @BeforeMethod public void SetUp(){ + super.SetUp(); SUT = new JsonImporter(); - project = new Project(); - options = mock(Properties.class); } @AfterMethod - public void TearDown() throws IOException{ + public void TearDown() { SUT = null; - project = null; - options = null; - if (inputStream != null) inputStream.close(); - inputStream = null; + if (inputStream != null) { + try { + inputStream.close(); + } catch (IOException e) { + // Ignore + } + inputStream = null; + } + super.TearDown(); } @Test @@ -181,8 +177,8 @@ public class JsonImporterTests extends RefineTest { String sampleJson2 = "{\"field\":{}}"; String sampleJson3 = "{\"field\":[{},{}]}"; - JSONParser parser = new JSONParser(new ByteArrayInputStream( sampleJson.getBytes( "UTF-8" ) )); - TreeParserToken token = TreeParserToken.Ignorable; + JSONTreeReader parser = new JSONTreeReader(new StringReader(sampleJson)); + Token token = Token.Ignorable; int i = 0; try{ while(token != null){ @@ -191,8 +187,8 @@ public class JsonImporterTests extends RefineTest { break; i++; if(i == 3){ - Assert.assertEquals(TreeParserToken.Value, token); - Assert.assertEquals("field", parser.getLocalName()); + Assert.assertEquals(Token.Value, token); + Assert.assertEquals("field", parser.getFieldName()); } } }catch(Exception e){ @@ -200,8 +196,8 @@ public class JsonImporterTests extends RefineTest { } - parser = new JSONParser(new ByteArrayInputStream( sampleJson2.getBytes( "UTF-8" ) ) ); - token = TreeParserToken.Ignorable; + parser = new JSONTreeReader(new StringReader(sampleJson2)); + token = Token.Ignorable; i = 0; try{ while(token != null){ @@ -210,16 +206,16 @@ public class JsonImporterTests extends RefineTest { break; i++; if(i == 3){ - Assert.assertEquals(TreeParserToken.StartEntity, token); - Assert.assertEquals(parser.getLocalName(), "field"); + Assert.assertEquals(Token.StartEntity, token); + Assert.assertEquals(parser.getFieldName(), "field"); } } }catch(Exception e){ //silent } - parser = new JSONParser(new ByteArrayInputStream( sampleJson3.getBytes( "UTF-8" ) ) ); - token = TreeParserToken.Ignorable; + parser = new JSONTreeReader(new StringReader(sampleJson3)); + token = Token.Ignorable; i = 0; try{ while(token != null){ @@ -228,16 
+224,16 @@ public class JsonImporterTests extends RefineTest { break; i++; if(i == 3){ - Assert.assertEquals(token, TreeParserToken.StartEntity); - Assert.assertEquals(parser.getLocalName(), "field"); + Assert.assertEquals(token, Token.StartEntity); + Assert.assertEquals(parser.getFieldName(), "field"); } if(i == 4){ - Assert.assertEquals(token, TreeParserToken.StartEntity); - Assert.assertEquals(parser.getLocalName(), "__anonymous__"); + Assert.assertEquals(token, Token.StartEntity); + Assert.assertEquals(parser.getFieldName(), "__anonymous__"); } if(i == 6){ - Assert.assertEquals(token, TreeParserToken.StartEntity); - Assert.assertEquals(parser.getLocalName(), "__anonymous__"); + Assert.assertEquals(token, Token.StartEntity); + Assert.assertEquals(parser.getFieldName(), "__anonymous__"); } } }catch(Exception e){ @@ -352,7 +348,7 @@ public class JsonImporterTests extends RefineTest { } try { - SUT.read(inputStream, project, new ProjectMetadata(), options); + parseOneFile(SUT, inputStream); } catch (Exception e) { Assert.fail(); } diff --git a/main/tests/server/src/com/google/refine/tests/importers/RdfTripleImporterTests.java b/main/tests/server/src/com/google/refine/tests/importers/RdfTripleImporterTests.java index f9e61ecf5..a371f681d 100644 --- a/main/tests/server/src/com/google/refine/tests/importers/RdfTripleImporterTests.java +++ b/main/tests/server/src/com/google/refine/tests/importers/RdfTripleImporterTests.java @@ -34,7 +34,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. package com.google.refine.tests.importers; import java.io.StringReader; -import java.util.Properties; import org.slf4j.LoggerFactory; import org.testng.Assert; @@ -42,13 +41,10 @@ import org.testng.annotations.BeforeMethod; import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; -import com.google.refine.ProjectMetadata; import com.google.refine.importers.RdfTripleImporter; -import com.google.refine.model.Project; -import com.google.refine.tests.RefineTest; +import com.google.refine.util.JSONUtilities; - -public class RdfTripleImporterTests extends RefineTest { +public class RdfTripleImporterTests extends ImporterTest { @BeforeTest public void init() { @@ -58,15 +54,12 @@ public class RdfTripleImporterTests extends RefineTest { //System Under Test RdfTripleImporter SUT = null; - Project project = null; - Properties options = null; @BeforeMethod public void SetUp(){ + super.SetUp(); SUT = new RdfTripleImporter(); - project = new Project(); - options = new Properties(); - options.put("base-url", "http://rdf.freebase.com"); + JSONUtilities.safePut(options, "base-url", "http://rdf.freebase.com"); } @Test(enabled=false) @@ -75,8 +68,7 @@ public class RdfTripleImporterTests extends RefineTest { StringReader reader = new StringReader(sampleRdf); try { - SUT.read(reader, project, new ProjectMetadata(), options); - project.update(); + parseOneFile(SUT, reader); } catch (Exception e) { Assert.fail(); } @@ -98,8 +90,7 @@ public class RdfTripleImporterTests extends RefineTest { StringReader reader = new StringReader(sampleRdf); try { - SUT.read(reader, project, new ProjectMetadata(), options); - project.update(); + parseOneFile(SUT, reader); } catch (Exception e) { Assert.fail(); } @@ -140,8 +131,7 @@ public class RdfTripleImporterTests extends RefineTest { StringReader reader = new StringReader(sampleRdf); try { - SUT.read(reader, project, new ProjectMetadata(), options); - project.update(); + parseOneFile(SUT, reader); } catch (Exception e) { Assert.fail(); } @@ -175,8 +165,7 
@@ public class RdfTripleImporterTests extends RefineTest { StringReader reader = new StringReader(sampleRdf); try { - SUT.read(reader, project, new ProjectMetadata(), options); - project.update(); + parseOneFile(SUT, reader); } catch (Exception e) { Assert.fail(); } diff --git a/main/tests/server/src/com/google/refine/tests/importers/TsvCsvImporterTests.java b/main/tests/server/src/com/google/refine/tests/importers/TsvCsvImporterTests.java index 2ce938a3c..8dd0d4ced 100644 --- a/main/tests/server/src/com/google/refine/tests/importers/TsvCsvImporterTests.java +++ b/main/tests/server/src/com/google/refine/tests/importers/TsvCsvImporterTests.java @@ -33,15 +33,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. package com.google.refine.tests.importers; -import static org.mockito.Mockito.mock; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; -import java.io.IOException; -import java.io.LineNumberReader; import java.io.StringReader; -import java.util.Properties; import org.slf4j.LoggerFactory; import org.testng.Assert; @@ -51,12 +47,10 @@ import org.testng.annotations.BeforeTest; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import com.google.refine.ProjectMetadata; -import com.google.refine.importers.TsvCsvImporter; -import com.google.refine.model.Project; -import com.google.refine.tests.RefineTest; +import com.google.refine.importers.SeparatorBasedImporter; +import com.google.refine.util.JSONUtilities; -public class TsvCsvImporterTests extends RefineTest { +public class TsvCsvImporterTests extends ImporterTest { @BeforeTest public void init() { @@ -67,25 +61,18 @@ public class TsvCsvImporterTests extends RefineTest { String SAMPLE_ROW = "NDB_No,Shrt_Desc,Water"; //System Under Test - TsvCsvImporter SUT = null; - - //mock dependencies - Project project = null; - Properties properties = null; - + SeparatorBasedImporter SUT = null; @BeforeMethod - public void SetUp(){ - SUT = new TsvCsvImporter(); - project = new Project(); //FIXME - should we try and use mock(Project.class); - seems unnecessary complexity - properties = mock(Properties.class); + public void SetUp() { + super.SetUp(); + SUT = new SeparatorBasedImporter(); } @AfterMethod public void TearDown(){ SUT = null; - project = null; - properties = null; + super.TearDown(); } @Test(dataProvider = "CSV-TSV-AutoDetermine") @@ -94,11 +81,10 @@ public class TsvCsvImporterTests extends RefineTest { String inputSeparator = sep == "\t" ? "\t" : ","; String input = "col1" + inputSeparator + "col2" + inputSeparator + "col3"; - LineNumberReader lnReader = new LineNumberReader(new StringReader(input)); - try { - SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true, false); - } catch (IOException e) { + prepareOptions(sep, -1, 0, 0, 1, false, true, false); + parseOneFile(SUT, new StringReader(input)); + } catch (Exception e) { Assert.fail(); } Assert.assertEquals(project.columnModel.columns.size(), 3); @@ -113,11 +99,10 @@ public class TsvCsvImporterTests extends RefineTest { String inputSeparator = sep == "\t" ? 
"\t" : ","; String input = "value1" + inputSeparator + "value2" + inputSeparator + "value3"; - LineNumberReader lnReader = new LineNumberReader(new StringReader(input)); - try { - SUT.read(lnReader, project, sep, -1, 0, 0, 0, false, false, false); - } catch (IOException e) { + prepareOptions(sep, -1, 0, 0, 0, false, false, false); + parseOneFile(SUT, new StringReader(input)); + } catch (Exception e) { Assert.fail(); } Assert.assertEquals(project.columnModel.columns.size(), 1); @@ -135,10 +120,10 @@ public class TsvCsvImporterTests extends RefineTest { "data1" + inputSeparator + "data2" + inputSeparator + "data3"; - LineNumberReader lnReader = new LineNumberReader(new StringReader(input)); try { - SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true, false); - } catch (IOException e) { + prepareOptions(sep, -1, 0, 0, 1, false, true, false); + parseOneFile(SUT, new StringReader(input)); + } catch (Exception e) { Assert.fail(); } @@ -160,13 +145,12 @@ public class TsvCsvImporterTests extends RefineTest { String input = "col1" + inputSeparator + "col2" + inputSeparator + "col3\n" + "data1" + inputSeparator + "234" + inputSeparator + "data3"; - LineNumberReader lnReader = new LineNumberReader(new StringReader(input)); try { - SUT.read(lnReader, project, sep, -1, 0, 0, 1, true, true, false); - } catch (IOException e) { + prepareOptions(sep, -1, 0, 0, 1, true, true, false); + parseOneFile(SUT, new StringReader(input)); + } catch (Exception e) { Assert.fail(); } - Assert.assertEquals(project.columnModel.columns.size(), 3); Assert.assertEquals(project.columnModel.columns.get(0).getName(), "col1"); Assert.assertEquals(project.columnModel.columns.get(1).getName(), "col2"); @@ -185,13 +169,12 @@ public class TsvCsvImporterTests extends RefineTest { String inputSeparator = sep == "\t" ? "\t" : ","; String input = "data1" + inputSeparator + "data2" + inputSeparator + "data3"; - LineNumberReader lnReader = new LineNumberReader(new StringReader(input)); try { - SUT.read(lnReader, project, sep, -1, 0, 0, 0, false, true, false); - } catch (IOException e) { + prepareOptions(sep, -1, 0, 0, 0, false, true, false); + parseOneFile(SUT, new StringReader(input)); + } catch (Exception e) { Assert.fail(); } - Assert.assertEquals(project.columnModel.columns.size(), 3); Assert.assertEquals(project.columnModel.columns.get(0).getName(), "Column"); Assert.assertEquals(project.columnModel.columns.get(1).getName(), "Column2"); @@ -209,13 +192,12 @@ public class TsvCsvImporterTests extends RefineTest { String inputSeparator = sep == "\t" ? "\t" : ","; String input = " data1 " + inputSeparator + " 3.4 " + inputSeparator + " data3 "; - LineNumberReader lnReader = new LineNumberReader(new StringReader(input)); try { - SUT.read(lnReader, project, sep, -1, 0, 0, 0, false, true, false); - } catch (IOException e) { + prepareOptions(sep, -1, 0, 0, 0, false, true, false); + parseOneFile(SUT, new StringReader(input)); + } catch (Exception e) { Assert.fail(); } - Assert.assertEquals(project.columnModel.columns.size(), 3); Assert.assertEquals(project.rows.size(), 1); Assert.assertEquals(project.rows.get(0).cells.size(), 3); @@ -230,13 +212,12 @@ public class TsvCsvImporterTests extends RefineTest { String inputSeparator = sep == "\t" ? 
"\t" : ","; String input = " data1" + inputSeparator + " 12" + inputSeparator + " data3"; - LineNumberReader lnReader = new LineNumberReader(new StringReader(input)); try { - SUT.read(lnReader, project, sep, -1, 0, 0, 0, true, true, false); - } catch (IOException e) { + prepareOptions(sep, -1, 0, 0, 0, true, true, false); + parseOneFile(SUT, new StringReader(input)); + } catch (Exception e) { Assert.fail(); } - Assert.assertEquals(project.columnModel.columns.size(), 3); Assert.assertEquals(project.rows.size(), 1); Assert.assertEquals(project.rows.get(0).cells.size(), 3); @@ -251,13 +232,12 @@ public class TsvCsvImporterTests extends RefineTest { String inputSeparator = sep == "\t" ? "\t" : ","; String input = " data1" + inputSeparator + inputSeparator + " data3"; - LineNumberReader lnReader = new LineNumberReader(new StringReader(input)); try { - SUT.read(lnReader, project, sep, -1, 0, 0, 0, true, true, false); - } catch (IOException e) { + prepareOptions(sep, -1, 0, 0, 0, true, true, false); + parseOneFile(SUT, new StringReader(input)); + } catch (Exception e) { Assert.fail(); } - Assert.assertEquals(project.columnModel.columns.size(), 3); Assert.assertEquals(project.rows.size(), 1); Assert.assertEquals(project.rows.get(0).cells.size(), 3); @@ -274,13 +254,12 @@ public class TsvCsvImporterTests extends RefineTest { "sub1" + inputSeparator + "sub2" + inputSeparator + "sub3\n" + "data1" + inputSeparator + "data2" + inputSeparator + "data3"; - LineNumberReader lnReader = new LineNumberReader(new StringReader(input)); try { - SUT.read(lnReader, project, sep, -1, 0, 0, 2, false, true, false); - } catch (IOException e) { + prepareOptions(sep, -1, 0, 0, 2, false, true, false); + parseOneFile(SUT, new StringReader(input)); + } catch (Exception e) { Assert.fail(); } - Assert.assertEquals(project.columnModel.columns.size(), 3); Assert.assertEquals(project.columnModel.columns.get(0).getName(), "col1 sub1"); Assert.assertEquals(project.columnModel.columns.get(1).getName(), "col2 sub2"); @@ -299,13 +278,12 @@ public class TsvCsvImporterTests extends RefineTest { String input = "col1" + inputSeparator + "col2" + inputSeparator + "col3\n" + "data1" + inputSeparator + "data2" + inputSeparator + "data3" + inputSeparator + "data4" + inputSeparator + "data5" + inputSeparator + "data6"; - LineNumberReader lnReader = new LineNumberReader(new StringReader(input)); try { - SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true, false); - } catch (IOException e) { + prepareOptions(sep, -1, 0, 0, 1, false, true, false); + parseOneFile(SUT, new StringReader(input)); + } catch (Exception e) { Assert.fail(); } - Assert.assertEquals(project.columnModel.columns.size(), 6); Assert.assertEquals(project.columnModel.columns.get(0).getName(), "col1"); Assert.assertEquals(project.columnModel.columns.get(1).getName(), "col2"); @@ -330,13 +308,12 @@ public class TsvCsvImporterTests extends RefineTest { String input = "col1" + inputSeparator + "col2" + inputSeparator + "col3\n" + "\"\"\"To Be\"\" is often followed by \"\"or not To Be\"\"\"" + inputSeparator + "data2"; - LineNumberReader lnReader = new LineNumberReader(new StringReader(input)); try { - SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true, false); - } catch (IOException e) { + prepareOptions(sep, -1, 0, 0, 1, false, true, false); + parseOneFile(SUT, new StringReader(input)); + } catch (Exception e) { Assert.fail(); } - Assert.assertEquals(project.columnModel.columns.size(), 3); Assert.assertEquals(project.columnModel.columns.get(0).getName(), "col1"); 
Assert.assertEquals(project.columnModel.columns.get(1).getName(), "col2"); @@ -355,13 +332,12 @@ public class TsvCsvImporterTests extends RefineTest { "col1" + inputSeparator + "col2" + inputSeparator + "col3\n" + "data1" + inputSeparator + "data2" + inputSeparator + "data3"; - LineNumberReader lnReader = new LineNumberReader(new StringReader(input)); try { - SUT.read(lnReader, project, sep, -1, 0, 1, 1, false, true, false); - } catch (IOException e) { + prepareOptions(sep, -1, 0, 1, 1, false, true, false); + parseOneFile(SUT, new StringReader(input)); + } catch (Exception e) { Assert.fail(); } - Assert.assertEquals(project.columnModel.columns.size(), 3); Assert.assertEquals(project.columnModel.columns.get(0).getName(), "col1"); Assert.assertEquals(project.columnModel.columns.get(1).getName(), "col2"); @@ -381,13 +357,12 @@ public class TsvCsvImporterTests extends RefineTest { "skip1\n" + "data1" + inputSeparator + "data2" + inputSeparator + "data3"; - LineNumberReader lnReader = new LineNumberReader(new StringReader(input)); try { - SUT.read(lnReader, project, sep, -1, 1, 0, 1, false, true, false); - } catch (IOException e) { + prepareOptions(sep, -1, 1, 0, 1, false, true, false); + parseOneFile(SUT, new StringReader(input)); + } catch (Exception e) { Assert.fail(); } - Assert.assertEquals(project.columnModel.columns.size(), 3); Assert.assertEquals(project.columnModel.columns.get(0).getName(), "col1"); Assert.assertEquals(project.columnModel.columns.get(1).getName(), "col2"); @@ -411,13 +386,12 @@ public class TsvCsvImporterTests extends RefineTest { "skip1\n" + "data1" + inputSeparator + "data2" + inputSeparator + "data3"; - LineNumberReader lnReader = new LineNumberReader(new StringReader(input)); try { - SUT.read(lnReader, project, sep, -1, 1, 3, 2, false, true, false); - } catch (IOException e) { + prepareOptions(sep, -1, 1, 3, 2, false, true, false); + parseOneFile(SUT, new StringReader(input)); + } catch (Exception e) { Assert.fail(); } - Assert.assertEquals(project.columnModel.columns.size(), 3); Assert.assertEquals(project.columnModel.columns.get(0).getName(), "col1 sub1"); Assert.assertEquals(project.columnModel.columns.get(1).getName(), "col2 sub2"); @@ -444,10 +418,10 @@ public class TsvCsvImporterTests extends RefineTest { "data-row2-cell1" + inputSeparator + "data-row2-cell2" + inputSeparator + "\n" + //missing last data point of this row on purpose "data-row3-cell1" + inputSeparator + "data-row3-cell2" + inputSeparator + "data-row1-cell3"; - LineNumberReader lnReader = new LineNumberReader(new StringReader(input)); try { - SUT.read(lnReader, project, sep, 2, 2, 3, 2, false, true, false); - } catch (IOException e) { + prepareOptions(sep, 2, 2, 3, 2, false, true, false); + parseOneFile(SUT, new StringReader(input)); + } catch (Exception e) { Assert.fail(); } Assert.assertEquals(project.columnModel.columns.size(), 3); @@ -471,13 +445,12 @@ public class TsvCsvImporterTests extends RefineTest { String inputSeparator = sep == "\t" ? 
"\t" : ","; String input = "data1" + inputSeparator + "data2\"" + inputSeparator + "data3" + inputSeparator + "data4"; - LineNumberReader lnReader = new LineNumberReader(new StringReader(input)); try { - SUT.read(lnReader, project, sep, -1, 0, 0, 0, false, true, true); - } catch (IOException e) { + prepareOptions(sep, -1, 0, 0, 0, false, true, true); + parseOneFile(SUT, new StringReader(input)); + } catch (Exception e) { Assert.fail(); } - Assert.assertEquals(project.columnModel.columns.size(), 4); Assert.assertEquals(project.rows.size(), 1); Assert.assertEquals(project.rows.get(0).cells.size(), 4); @@ -493,13 +466,12 @@ public class TsvCsvImporterTests extends RefineTest { String input = "col1" + inputSeparator + "col2" + inputSeparator + "col3\n" + "\"\"\"To\n Be\"\" is often followed by \"\"or not To\n Be\"\"\"" + inputSeparator + "data2"; - LineNumberReader lnReader = new LineNumberReader(new StringReader(input)); try { - SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true, false); - } catch (IOException e) { + prepareOptions(sep, -1, 0, 0, 1, false, true, false); + parseOneFile(SUT, new StringReader(input)); + } catch (Exception e) { Assert.fail(); } - Assert.assertEquals(project.columnModel.columns.size(), 3); Assert.assertEquals(project.columnModel.columns.get(0).getName(), "col1"); Assert.assertEquals(project.columnModel.columns.get(1).getName(), "col2"); @@ -517,13 +489,12 @@ public class TsvCsvImporterTests extends RefineTest { String input = "col1" + inputSeparator + "col2" + inputSeparator + "col3\n" + "\"A line with many \n\n\n\n\n empty lines\"" + inputSeparator + "data2"; - LineNumberReader lnReader = new LineNumberReader(new StringReader(input)); try { - SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true, false); - } catch (IOException e) { + prepareOptions(sep, -1, 0, 0, 1, false, true, false); + parseOneFile(SUT, new StringReader(input)); + } catch (Exception e) { Assert.fail(); } - Assert.assertEquals(project.columnModel.columns.size(), 3); Assert.assertEquals(project.columnModel.columns.get(0).getName(), "col1"); Assert.assertEquals(project.columnModel.columns.get(1).getName(), "col2"); @@ -536,35 +507,34 @@ public class TsvCsvImporterTests extends RefineTest { //---------------------read tests------------------------ @Test - public void readCsvWithProperties(){ + public void readCsvWithProperties() { StringReader reader = new StringReader(SAMPLE_ROW); - - when(properties.getProperty("separator")).thenReturn(","); - whenGetIntegerOption("ignore",properties,0); - whenGetIntegerOption("header-lines",properties,0); - whenGetIntegerOption("limit",properties,-1); - whenGetIntegerOption("skip",properties,0); - whenGetIntegerOption("ignore-quotes",properties,0); - + + when(JSONUtilities.getString(options, "separator", null)).thenReturn(","); + whenGetIntegerOption("ignore", options, 0); + whenGetIntegerOption("header-lines", options, 0); + whenGetIntegerOption("limit", options, -1); + whenGetIntegerOption("skip", options, 0); + whenGetIntegerOption("ignore-quotes", options, 0); + try { - SUT.read(reader, project, new ProjectMetadata(), properties); + parseOneFile(SUT, reader); } catch (Exception e) { Assert.fail(); } - - + Assert.assertEquals(project.rows.size(), 1); Assert.assertEquals(project.rows.get(0).cells.size(), 3); Assert.assertEquals((String)project.rows.get(0).cells.get(0).value, "NDB_No"); Assert.assertEquals((String)project.rows.get(0).cells.get(1).value, "Shrt_Desc"); Assert.assertEquals((String)project.rows.get(0).cells.get(2).value, "Water"); - 
verify(properties, times(1)).getProperty("separator"); - verifyGetOption("ignore",properties); - verifyGetOption("header-lines",properties); - verifyGetOption("limit",properties); - verifyGetOption("skip",properties); - verifyGetOption("ignore-quotes",properties); + JSONUtilities.getString(verify(options, times(1)), "separator", null); + verifyGetOption("ignore", options); + verifyGetOption("header-lines", options); + verifyGetOption("limit", options); + verifyGetOption("skip", options); + verifyGetOption("ignore-quotes", options); } @Test @@ -572,20 +542,19 @@ public class TsvCsvImporterTests extends RefineTest { String input = "data1,data2\",data3,data4"; StringReader reader = new StringReader(input); - when(properties.getProperty("separator")).thenReturn(","); - whenGetIntegerOption("ignore",properties,0); - whenGetIntegerOption("header-lines",properties,0); - whenGetIntegerOption("limit",properties,-1); - whenGetIntegerOption("skip",properties,0); - whenGetBooleanOption("ignore-quotes",properties,true); - + when(JSONUtilities.getString(options, "separator", null)).thenReturn(","); + whenGetIntegerOption("ignore", options, 0); + whenGetIntegerOption("header-lines", options, 0); + whenGetIntegerOption("limit", options, -1); + whenGetIntegerOption("skip", options, 0); + whenGetBooleanOption("ignore-quotes", options, true); + try { - SUT.read(reader, project, new ProjectMetadata(), properties); + parseOneFile(SUT, reader); } catch (Exception e) { Assert.fail(); } - - + Assert.assertEquals(project.rows.size(), 1); Assert.assertEquals(project.rows.get(0).cells.size(), 4); Assert.assertEquals((String)project.rows.get(0).cells.get(0).value, "data1"); @@ -593,12 +562,12 @@ public class TsvCsvImporterTests extends RefineTest { Assert.assertEquals((String)project.rows.get(0).cells.get(2).value, "data3"); Assert.assertEquals((String)project.rows.get(0).cells.get(3).value, "data4"); - verify(properties, times(1)).getProperty("separator"); - verifyGetOption("ignore",properties); - verifyGetOption("header-lines",properties); - verifyGetOption("limit",properties); - verifyGetOption("skip",properties); - verifyGetOption("ignore-quotes",properties); + JSONUtilities.getString(verify(options, times(1)), "separator", null); + verifyGetOption("ignore", options); + verifyGetOption("header-lines", options); + verifyGetOption("limit", options); + verifyGetOption("skip", options); + verifyGetOption("ignore-quotes", options); } //--helpers-- @@ -611,20 +580,17 @@ public class TsvCsvImporterTests extends RefineTest { {","},{"\t"},{null} }; } - - public void whenGetBooleanOption(String name, Properties properties, Boolean def){ - when(properties.containsKey(name)).thenReturn(true); - when(properties.getProperty(name)).thenReturn(Boolean.toString(def)); + + private void prepareOptions( + String sep, int limit, int skip, int ignoreLines, + int headerLines, boolean guessValueType, boolean splitIntoColumns, boolean ignoreQuotes) { + JSONUtilities.safePut(options, "separator", sep); + JSONUtilities.safePut(options, "limit", limit); + JSONUtilities.safePut(options, "skipDataLines", skip); + JSONUtilities.safePut(options, "ignoreLines", ignoreLines); + JSONUtilities.safePut(options, "headerLines", headerLines); + JSONUtilities.safePut(options, "guessCellValueTypes", guessValueType); + JSONUtilities.safePut(options, "splitIntoColumns", splitIntoColumns); + JSONUtilities.safePut(options, "processQuotes", !ignoreQuotes); } - - public void whenGetIntegerOption(String name, Properties properties, int def){ - 
when(properties.containsKey(name)).thenReturn(true); - when(properties.getProperty(name)).thenReturn(Integer.toString(def)); - } - - public void verifyGetOption(String name, Properties properties){ - verify(properties, times(1)).containsKey(name); - verify(properties, times(1)).getProperty(name); - } - } diff --git a/main/tests/server/src/com/google/refine/tests/importers/XmlImportUtilitiesStub.java b/main/tests/server/src/com/google/refine/tests/importers/XmlImportUtilitiesStub.java index 7f8d60b3b..c0efeae93 100644 --- a/main/tests/server/src/com/google/refine/tests/importers/XmlImportUtilitiesStub.java +++ b/main/tests/server/src/com/google/refine/tests/importers/XmlImportUtilitiesStub.java @@ -35,27 +35,27 @@ package com.google.refine.tests.importers; import java.util.List; -import javax.servlet.ServletException; - -import com.google.refine.importers.XmlImportUtilities; -import com.google.refine.importers.parsers.TreeParser; +import com.google.refine.importers.tree.ImportColumnGroup; +import com.google.refine.importers.tree.ImportRecord; +import com.google.refine.importers.tree.TreeReader; +import com.google.refine.importers.tree.XmlImportUtilities; import com.google.refine.model.Project; public class XmlImportUtilitiesStub extends XmlImportUtilities { - public List detectRecordElementWrapper(TreeParser parser, String tag) throws ServletException{ + public List detectRecordElementWrapper(TreeReader parser, String tag) throws Exception{ return super.detectRecordElement(parser, tag); } - public void ProcessSubRecordWrapper(Project project, TreeParser parser, ImportColumnGroup columnGroup, ImportRecord record) throws ServletException{ + public void ProcessSubRecordWrapper(Project project, TreeReader parser, ImportColumnGroup columnGroup, ImportRecord record) throws Exception{ super.processSubRecord(project, parser, columnGroup, record); } - public void findRecordWrapper(Project project, TreeParser parser, String[] recordPath, int pathIndex, ImportColumnGroup rootColumnGroup) throws ServletException{ - super.findRecord(project, parser, recordPath, pathIndex, rootColumnGroup); + public void findRecordWrapper(Project project, TreeReader parser, String[] recordPath, int pathIndex, ImportColumnGroup rootColumnGroup) throws Exception{ + super.findRecord(project, parser, recordPath, pathIndex, rootColumnGroup, -1); } - public void processRecordWrapper(Project project, TreeParser parser, ImportColumnGroup rootColumnGroup) throws ServletException{ + public void processRecordWrapper(Project project, TreeReader parser, ImportColumnGroup rootColumnGroup) throws Exception{ super.processRecord(project, parser, rootColumnGroup); } diff --git a/main/tests/server/src/com/google/refine/tests/importers/XmlImportUtilitiesTests.java b/main/tests/server/src/com/google/refine/tests/importers/XmlImportUtilitiesTests.java index bb3292a22..c7c70cea5 100644 --- a/main/tests/server/src/com/google/refine/tests/importers/XmlImportUtilitiesTests.java +++ b/main/tests/server/src/com/google/refine/tests/importers/XmlImportUtilitiesTests.java @@ -35,11 +35,12 @@ package com.google.refine.tests.importers; import java.io.ByteArrayInputStream; import java.io.IOException; +import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.List; -import javax.servlet.ServletException; +import javax.xml.stream.XMLStreamException; import org.slf4j.LoggerFactory; import org.testng.Assert; @@ -48,13 +49,12 @@ import org.testng.annotations.BeforeMethod; import 
org.testng.annotations.BeforeTest; import org.testng.annotations.Test; -import com.google.refine.importers.TreeImportUtilities.ImportColumn; -import com.google.refine.importers.TreeImportUtilities.ImportColumnGroup; -import com.google.refine.importers.TreeImportUtilities.ImportRecord; -import com.google.refine.importers.parsers.JSONParser; -import com.google.refine.importers.parsers.TreeParser; -import com.google.refine.importers.parsers.TreeParserToken; -import com.google.refine.importers.parsers.XmlParser; +import com.google.refine.importers.JsonImporter.JSONTreeReader; +import com.google.refine.importers.XmlImporter.XmlParser; +import com.google.refine.importers.tree.ImportColumn; +import com.google.refine.importers.tree.ImportColumnGroup; +import com.google.refine.importers.tree.ImportRecord; +import com.google.refine.importers.tree.TreeReader; import com.google.refine.model.Project; import com.google.refine.model.Row; import com.google.refine.tests.RefineTest; @@ -69,7 +69,7 @@ public class XmlImportUtilitiesTests extends RefineTest { //dependencies Project project; - TreeParser parser; + TreeReader parser; ImportColumnGroup columnGroup; ImportRecord record; ByteArrayInputStream inputStream; @@ -134,7 +134,7 @@ public class XmlImportUtilitiesTests extends RefineTest { List response = new ArrayList(); try { response = SUT.detectRecordElementWrapper(parser, tag); - } catch (ServletException e) { + } catch (Exception e) { Assert.fail(e.getMessage()); } Assert.assertNotNull(response); @@ -152,7 +152,7 @@ public class XmlImportUtilitiesTests extends RefineTest { List response = new ArrayList(); try { response = SUT.detectRecordElementWrapper(parser, tag); - } catch (ServletException e) { + } catch (Exception e) { Assert.fail(e.getMessage()); } Assert.assertNotNull(response); @@ -171,7 +171,7 @@ public class XmlImportUtilitiesTests extends RefineTest { List response = new ArrayList(); try { response = SUT.detectRecordElementWrapper(parser, tag); - } catch (ServletException e) { + } catch (Exception e) { Assert.fail(e.getMessage()); } Assert.assertNull(response); @@ -181,7 +181,7 @@ public class XmlImportUtilitiesTests extends RefineTest { public void detectRecordElementRegressionXmlTest(){ loadSampleXml(); - String[] path = XmlImportUtilitiesStub.detectRecordElement(new XmlParser(inputStream)); + String[] path = XmlImportUtilitiesStub.detectRecordElement(createXmlParser()); Assert.assertNotNull(path); Assert.assertEquals(path.length, 2); Assert.assertEquals(path[0], "library"); @@ -192,7 +192,8 @@ public class XmlImportUtilitiesTests extends RefineTest { public void detectRecordElementRegressionJsonTest(){ loadSampleJson(); - String[] path = XmlImportUtilitiesStub.detectRecordElement(new JSONParser(inputStream)); + String[] path = XmlImportUtilitiesStub.detectRecordElement( + new JSONTreeReader(new InputStreamReader(inputStream))); Assert.assertNotNull(path); Assert.assertEquals(path.length, 2); Assert.assertEquals(path[0], "__anonymous__"); @@ -204,7 +205,7 @@ public class XmlImportUtilitiesTests extends RefineTest { loadSampleXml(); String[] recordPath = new String[]{"library","book"}; - XmlImportUtilitiesStub.importTreeData(new XmlParser(inputStream), project, recordPath, columnGroup ); + XmlImportUtilitiesStub.importTreeData(createXmlParser(), project, recordPath, columnGroup, -1); log(project); assertProjectCreated(project, 0, 6); @@ -224,7 +225,7 @@ public class XmlImportUtilitiesTests extends RefineTest { loadData(XmlImporterTests.getSampleWithVaryingStructure()); String[] recordPath 
= new String[]{"library", "book"}; - XmlImportUtilitiesStub.importTreeData(new XmlParser(inputStream), project, recordPath, columnGroup); + XmlImportUtilitiesStub.importTreeData(createXmlParser(), project, recordPath, columnGroup, -1); log(project); assertProjectCreated(project, 0, 6); @@ -278,7 +279,7 @@ public class XmlImportUtilitiesTests extends RefineTest { try { SUT.findRecordWrapper(project, parser, recordPath, pathIndex, columnGroup); - } catch (ServletException e) { + } catch (Exception e) { Assert.fail(); } @@ -297,7 +298,7 @@ public class XmlImportUtilitiesTests extends RefineTest { try { SUT.processRecordWrapper(project, parser, columnGroup); - } catch (ServletException e) { + } catch (Exception e) { Assert.fail(); } log(project); @@ -318,7 +319,7 @@ public class XmlImportUtilitiesTests extends RefineTest { try { SUT.processRecordWrapper(project, parser, columnGroup); - } catch (ServletException e) { + } catch (Exception e) { Assert.fail(); } log(project); @@ -343,7 +344,7 @@ public class XmlImportUtilitiesTests extends RefineTest { try { SUT.processRecordWrapper(project, parser, columnGroup); - } catch (ServletException e) { + } catch (Exception e) { Assert.fail(); } log(project); @@ -367,7 +368,7 @@ public class XmlImportUtilitiesTests extends RefineTest { try { SUT.ProcessSubRecordWrapper(project, parser, columnGroup, record); - } catch (ServletException e) { + } catch (Exception e) { Assert.fail(); } log(project); @@ -429,18 +430,24 @@ public class XmlImportUtilitiesTests extends RefineTest { public void ParserSkip(){ try { - if(parser.getEventType() == TreeParserToken.Ignorable){ + if (parser.current() == TreeReader.Token.Ignorable){ parser.next(); //move parser forward once e.g. skip the START_DOCUMENT parser event } - } catch (ServletException e1) { + } catch (Exception e1) { Assert.fail(); } } - public void createXmlParser(){ - parser = new XmlParser(inputStream); + public TreeReader createXmlParser(){ + try { + parser = new XmlParser(inputStream); + return parser; + } catch (XMLStreamException e) { + return null; + } } - public void createJsonParser(){ - parser = new JSONParser(inputStream); + public TreeReader createJsonParser(){ + parser = new JSONTreeReader(new InputStreamReader(inputStream)); + return parser; } } diff --git a/main/tests/server/src/com/google/refine/tests/importers/XmlImporterTests.java b/main/tests/server/src/com/google/refine/tests/importers/XmlImporterTests.java index ddaf153b7..077559305 100644 --- a/main/tests/server/src/com/google/refine/tests/importers/XmlImporterTests.java +++ b/main/tests/server/src/com/google/refine/tests/importers/XmlImporterTests.java @@ -33,12 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
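The recurring import changes in these tests reflect one rename: the old TreeParser / TreeParserToken pair and the standalone JSONParser / XmlParser classes from importers.parsers are now TreeReader with its nested Token enum, JsonImporter.JSONTreeReader and XmlImporter.XmlParser, and getLocalName() has become getFieldName(). A small sketch of walking a JSON document with the new reader, using only calls exercised by JsonImporterTests and XmlImportUtilitiesTests above (the sample string and class name are made up):

    import java.io.StringReader;

    import com.google.refine.importers.JsonImporter.JSONTreeReader;
    import com.google.refine.importers.tree.TreeReader.Token;

    public class TreeReaderSketch {
        public static void main(String[] args) {
            JSONTreeReader reader = new JSONTreeReader(new StringReader("{\"field\":[{},{}]}"));
            try {
                Token token = reader.next();        // next() returns null once the document is exhausted
                while (token != null) {
                    if (token == Token.StartEntity || token == Token.Value) {
                        // getFieldName() reports the JSON key, or __anonymous__ inside arrays
                        System.out.println(token + " : " + reader.getFieldName());
                    }
                    token = reader.next();
                }
            } catch (Exception e) {
                // the tests above treat parse problems as plain failures
                e.printStackTrace();
            }
        }
    }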
package com.google.refine.tests.importers; -import static org.mockito.Mockito.mock; - import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.UnsupportedEncodingException; -import java.util.Properties; import org.slf4j.LoggerFactory; import org.testng.Assert; @@ -47,14 +44,11 @@ import org.testng.annotations.BeforeMethod; import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; -import com.google.refine.ProjectMetadata; import com.google.refine.importers.XmlImporter; -import com.google.refine.model.Project; import com.google.refine.model.Row; -import com.google.refine.tests.RefineTest; -public class XmlImporterTests extends RefineTest { +public class XmlImporterTests extends ImporterTest { @BeforeTest public void init() { @@ -62,28 +56,29 @@ public class XmlImporterTests extends RefineTest { } //dependencies - Project project = null; - Properties options = null; ByteArrayInputStream inputStream = null; //System Under Test XmlImporter SUT = null; - - + @BeforeMethod public void SetUp(){ + super.SetUp(); SUT = new XmlImporter(); - project = new Project(); - options = mock(Properties.class); } @AfterMethod - public void TearDown() throws IOException{ + public void TearDown() { SUT = null; - project = null; - options = null; - if (inputStream != null) inputStream.close(); - inputStream = null; + if (inputStream != null) { + try { + inputStream.close(); + } catch (IOException e) { + // Ignore + } + inputStream = null; + } + super.TearDown(); } @Test @@ -309,11 +304,9 @@ public class XmlImporterTests extends RefineTest { } try { - SUT.read(inputStream, project, new ProjectMetadata(), options); + parseOneFile(SUT, inputStream); } catch (Exception e) { Assert.fail(); } } - - } diff --git a/main/webapp/modules/core/MOD-INF/controller.js b/main/webapp/modules/core/MOD-INF/controller.js index f52ae7b0f..35d15a3d9 100644 --- a/main/webapp/modules/core/MOD-INF/controller.js +++ b/main/webapp/modules/core/MOD-INF/controller.js @@ -50,9 +50,10 @@ function registerCommands() { RS.registerCommand(module, "get-version", new Packages.com.google.refine.commands.GetVersionCommand()); - RS.registerCommand(module, "create-import-job", new Packages.com.google.refine.commands.importing.CreateImportJobCommand()); - RS.registerCommand(module, "retrieve-import-content", new Packages.com.google.refine.commands.importing.RetrieveImportContentCommand()); - RS.registerCommand(module, "get-import-job-status", new Packages.com.google.refine.commands.importing.GetImportJobStatusCommand()); + RS.registerCommand(module, "get-importing-configuration", new Packages.com.google.refine.commands.importing.GetImportingConfigurationCommand()); + RS.registerCommand(module, "create-importing-job", new Packages.com.google.refine.commands.importing.CreateImportingJobCommand()); + RS.registerCommand(module, "get-importing-job-status", new Packages.com.google.refine.commands.importing.GetImportingJobStatusCommand()); + RS.registerCommand(module, "importing-controller", new Packages.com.google.refine.commands.importing.ImportingControllerCommand()); RS.registerCommand(module, "create-project-from-upload", new Packages.com.google.refine.commands.project.CreateProjectCommand()); RS.registerCommand(module, "import-project", new Packages.com.google.refine.commands.project.ImportProjectCommand()); @@ -120,12 +121,9 @@ function registerCommands() { RS.registerCommand(module, "get-expression-language-info", new Packages.com.google.refine.commands.expr.GetExpressionLanguageInfoCommand()); 
RS.registerCommand(module, "get-expression-history", new Packages.com.google.refine.commands.expr.GetExpressionHistoryCommand()); - RS.registerCommand(module, "get-starred-expressions", new Packages.com.google.refine.commands.expr.GetStarredExpressionsCommand()); - RS.registerCommand(module, "toggle-starred-expression", new Packages.com.google.refine.commands.expr.ToggleStarredExpressionCommand()); RS.registerCommand(module, "log-expression", new Packages.com.google.refine.commands.expr.LogExpressionCommand()); RS.registerCommand(module, "preview-expression", new Packages.com.google.refine.commands.expr.PreviewExpressionCommand()); - RS.registerCommand(module, "get-preference", new Packages.com.google.refine.commands.GetPreferenceCommand()); RS.registerCommand(module, "get-all-preferences", new Packages.com.google.refine.commands.GetAllPreferencesCommand()); RS.registerCommand(module, "set-preference", new Packages.com.google.refine.commands.SetPreferenceCommand()); @@ -168,11 +166,98 @@ function registerOperations() { OR.registerOperation(module, "recon-copy-across-columns", Packages.com.google.refine.operations.recon.ReconCopyAcrossColumnsOperation); } -function registerImportSourceClasses() { - var RM = Packages.com.google.refine.commands.importing.ImportManager; - RM.registerImportSourceClass("file-upload", Packages.com.google.refine.model.meta.FileUploadImportSource); - RM.registerImportSourceClass("text", Packages.com.google.refine.model.meta.TextImportSource); - RM.registerImportSourceClass("web", Packages.com.google.refine.model.meta.WebImportSource); +function registerImporting() { + var IM = Packages.com.google.refine.importing.ImportingManager; + + /* + * Formats and their UI class names and parsers: + * - UI class names are used on the client-side in Javascript to instantiate code that lets the user + * configure the parser's options. + * - Parsers are server-side code that do the actual parsing. Because they have access to the raw files, + * they also generate defaults for the client-side UIs to initialize. 
+ */ + + IM.registerFormat("text", "Text files"); // generic format, no parser to handle it + IM.registerFormat("text/line-based", "Line-based text files", "LineBasedParserUI", + new Packages.com.google.refine.importers.LineBasedImporter()); + IM.registerFormat("text/line-based/*sv", "CSV / TSV / separator-based files", "SeparatorBasedParserUI", + new Packages.com.google.refine.importers.SeparatorBasedImporter()); + IM.registerFormat("text/line-based/fixed-width", "Fixed-width field text files", "FixedWidthParserUI", + new Packages.com.google.refine.importers.FixedWidthImporter()); + + IM.registerFormat("text/xml", "XML files", "XmlParserUI", new Packages.com.google.refine.importers.XmlImporter()); + IM.registerFormat("text/xml/xlsx", "Excel (.xlsx) files", "ExcelParserUI", new Packages.com.google.refine.importers.ExcelImporter()); + IM.registerFormat("text/xml/rdf", "RDF/XML files", "RdfParserUI", new Packages.com.google.refine.importers.RdfTripleImporter()); + IM.registerFormat("text/json", "JSON files", "JsonParserUI", new Packages.com.google.refine.importers.JsonImporter()); + IM.registerFormat("text/marc", "MARC files"); + + IM.registerFormat("binary", "Binary files"); // generic format, no parser to handle it + IM.registerFormat("binary/xls", "Excel files", "ExcelParserUI", new Packages.com.google.refine.importers.ExcelImporter()); + + IM.registerFormat("service", "Services"); // generic format, no parser to handle it + + /* + * Extension to format mappings + */ + IM.registerExtension(".txt", "text/line-based"); + IM.registerExtension(".csv", "text/line-based/*sv"); + IM.registerExtension(".tsv", "text/line-based/*sv"); + + IM.registerExtension(".xml", "text/xml"); + IM.registerExtension(".rdf", "text/xml/rdf"); + + IM.registerExtension(".json", "text/json"); + IM.registerExtension(".js", "text/json"); + + IM.registerExtension(".xls", "binary/xls"); + IM.registerExtension(".xlsx", "text/xml/xlsx"); + + IM.registerExtension(".marc", "text/marc"); + IM.registerExtension(".mrc", "text/marc"); + + /* + * Mime type to format mappings + */ + IM.registerMimeType("text/plain", "text/line-based"); + IM.registerMimeType("text/csv", "text/line-based/*sv"); + IM.registerMimeType("text/x-csv", "text/line-based/*sv"); + IM.registerMimeType("text/tab-separated-value", "text/line-based/*sv"); + + IM.registerMimeType("text/fixed-width", "text/line-based/fixed-width"); + + IM.registerMimeType("application/msexcel", "binary/xls"); + IM.registerMimeType("application/x-msexcel", "binary/xls"); + IM.registerMimeType("application/x-ms-excel", "binary/xls"); + IM.registerMimeType("application/vnd.ms-excel", "binary/xls"); + IM.registerMimeType("application/x-excel", "binary/xls"); + IM.registerMimeType("application/xls", "binary/xls"); + IM.registerMimeType("application/x-xls", "text/xml/xlsx"); + + IM.registerMimeType("application/json", "text/json"); + IM.registerMimeType("text/json", "text/json"); + + IM.registerMimeType("application/rdf+xml", "text/xml/rdf"); + + IM.registerMimeType("application/marc", "text/marc"); + + /* + * Format guessers: these take a format derived from extensions or mime-types, + * look at the actual files' content, and try to guess a better format. + */ + IM.registerFormatGuesser("text", new Packages.com.google.refine.importers.TextFormatGuesser()); + IM.registerFormatGuesser("text/line-based", new Packages.com.google.refine.importers.LineBasedFormatGuesser()); + + /* + * Controllers: these implement high-level UI flows for importing data. 
For example, the default + * controller lets the user specify one or more source files, either local or remote or on the clipboard, + * lets the user select which files to actually import in case any of the original file is an archive + * containing several files, and then lets the user configure parsing options. + */ + IM.registerController( + module, + "default-importing-controller", + new Packages.com.google.refine.importing.DefaultImportingController() + ); } /* @@ -183,7 +268,7 @@ function init() { registerCommands(); registerOperations(); - registerImportSourceClasses(); + registerImporting(); var RC = Packages.com.google.refine.model.recon.ReconConfig; RC.registerReconConfig(module, "standard-service", Packages.com.google.refine.model.recon.StandardReconConfig); @@ -193,12 +278,36 @@ function init() { module, [ "externals/jquery-1.4.2.min.js", + "externals/jquery.cookie.js", + "externals/jquery.eventstack-0.3.js", "externals/jquery-ui/jquery-ui-1.8.custom.min.js", "externals/date.js", + + "scripts/util/misc.js", + "scripts/util/url.js", "scripts/util/string.js", + "scripts/util/ajax.js", + "scripts/util/menu.js", + "scripts/util/dialog.js", "scripts/util/dom.js", + "scripts/index.js", - "scripts/index/import-sources.js" + "scripts/index/create-project-ui.js", + "scripts/index/open-project-ui.js", + "scripts/index/import-project-ui.js", + + "scripts/index/default-importing-controller/controller.js", + "scripts/index/default-importing-controller/file-selection-panel.js", + "scripts/index/default-importing-controller/parsing-panel.js", + + "scripts/index/default-importing-sources/sources.js", + "scripts/index/parser-interfaces/preview-table.js", + "scripts/index/parser-interfaces/separator-based-parser-ui.js", + "scripts/index/parser-interfaces/line-based-parser-ui.js", + "scripts/index/parser-interfaces/fixed-width-parser-ui.js", + "scripts/index/parser-interfaces/excel-parser-ui.js", + "scripts/index/parser-interfaces/xml-parser-ui.js", + "scripts/index/parser-interfaces/json-parser-ui.js" ] ); @@ -210,32 +319,20 @@ function init() { "styles/jquery-ui-overrides.less", "styles/common.less", "styles/pure.css", - "styles/index.less" - ] - ); - - ClientSideResourceManager.addPaths( - "import/scripts", - module, - [ - "externals/jquery-1.4.2.min.js", - "externals/jquery-ui/jquery-ui-1.8.custom.min.js", - "externals/date.js", - "scripts/util/string.js", - "scripts/util/dom.js", - "scripts/import.js" - ] - ); - - ClientSideResourceManager.addPaths( - "import/styles", - module, - [ - "externals/jquery-ui/css/ui-lightness/jquery-ui-1.8.custom.css", - "styles/jquery-ui-overrides.less", - "styles/common.less", - "styles/pure.css", - "styles/import.less" + "styles/index.less", + "styles/index/create-project-ui.less", + "styles/index/open-project-ui.less", + "styles/index/import-project-ui.less", + + "styles/index/default-importing-controller.less", + "styles/index/default-importing-file-selection-panel.less", + "styles/index/default-importing-parsing-panel.less", + + "styles/index/default-importing-sources.less", + "styles/views/data-table-view.less", // for the preview table's styles + "styles/index/fixed-width-parser-ui.less", + "styles/index/xml-parser-ui.less", + "styles/index/json-parser-ui.less" ] ); diff --git a/main/webapp/modules/core/index.vt b/main/webapp/modules/core/index.vt index a140681a3..5ad498c85 100644 --- a/main/webapp/modules/core/index.vt +++ b/main/webapp/modules/core/index.vt @@ -41,182 +41,33 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
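For reference, registerImporting() above is also the pattern a third-party extension follows to plug an additional format into the new importing framework from its own MOD-INF/controller.js. The sketch below uses a made-up YAML format; the package, importer, and parser-UI names are illustrative only and are not classes added by this patch.

// Sketch only: registering a hypothetical format from an extension's MOD-INF/controller.js.
// "YamlImporter" and "YamlParserUI" are illustrative names, not part of this patch.
function init() {
    var IM = Packages.com.google.refine.importing.ImportingManager;

    // format id, label, client-side parser UI class name, server-side parser
    IM.registerFormat("text/yaml", "YAML files", "YamlParserUI",
        new Packages.com.example.refine.yaml.YamlImporter());

    // route matching file extensions and mime types to the new format
    IM.registerExtension(".yaml", "text/yaml");
    IM.registerExtension(".yml", "text/yaml");
    IM.registerMimeType("application/x-yaml", "text/yaml");
}

Registered this way, the format should be offered through the same client-side flow as the built-in ones, provided the extension also injects its parser UI script alongside the ClientSideResourceManager.addPaths calls shown above.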
$styleInjection
-    #if($params.new && $params.new == "1")
-        #set($newStyle = "")
-        #set($oldStyle = "display: none; ")
-    #else
-        #set($oldStyle = "")
-        #set($newStyle = "display: none; ")
-    #end
-
- -
- Google Refine -

A power tool for working with messy data.

-
-
- - -
-

Open a Project

-
- -
-
-

Create a New Project

-
- - - - - - - -
-
Import data from
-
-

What kinds of data files can I import?

-
TSV, CSV, *SV, Excel (.xls and .xlsx), JSON, XML, RDF as XML, and
-                  Google Spreadsheets are all supported. Support for other formats can
-                  be added with Refine extensions.
-
-
- -
-
- - - - - - - - -
-
-
- -
- -
- -
- - - -
- -
-

Create a New Project

-

- or Import an Existing Project -

-
- - - - - - - - - - - - - - - - -
-
-
-

Advanced Options

-
- Limit load to: -
- rows (blank for all) -
-
-
- Ignore: -
- initial non-blank lines -
-
-
- Skip: -
- initial data rows -
-
-
-
- When parsing text files: -
- - Split into columns -
-
- Column separator: -
- (leave blank to auto-detect) -
-
- - Auto-detect value types
- (numbers, dates, etc) -
-
- Header lines:
- (use 0 if your data has no header) -
-
- - Ignore quotation marks -
-
-
-
-
- - -
+ + + + +
+
+ + diff --git a/main/webapp/modules/core/scripts/index.js b/main/webapp/modules/core/scripts/index.js index 0539c3720..a8f06a49f 100644 --- a/main/webapp/modules/core/scripts/index.js +++ b/main/webapp/modules/core/scripts/index.js @@ -31,424 +31,144 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -function onClickUploadFileButton(evt) { - var projectName = $("#project-name-input")[0].value; - var dataURL = $.trim($("#project-url-input")[0].value); - if (! $.trim(projectName).length) { - window.alert("You must specify a project name."); - - } else if ($("#project-file-input")[0].files.length === 0 && ! dataURL.length) { - window.alert("You must specify a data file to upload or a URL to retrieve."); - - } else { - $("#file-upload-form").attr("action", - "/command/core/create-project-from-upload?" + [ - "url=" + escape(dataURL), - "split-into-columns=" + $("#split-into-columns-input")[0].checked, - "separator=" + $("#separator-input")[0].value, - "ignore=" + $("#ignore-input")[0].value, - "header-lines=" + $("#header-lines-input")[0].value, - "skip=" + $("#skip-input")[0].value, - "limit=" + $("#limit-input")[0].value, - "guess-value-type=" + $("#guess-value-type-input")[0].checked, - "ignore-quotes=" + $("#ignore-quotes-input")[0].checked - ].join("&")); - - return true; - } - - evt.preventDefault(); - return false; -} - -function formatDate(d) { - var d = new Date(d); - var last_year = Date.today().add({ years: -1 }); - var last_month = Date.today().add({ months: -1 }); - var last_week = Date.today().add({ days: -7 }); - var today = Date.today(); - var tomorrow = Date.today().add({ days: 1 }); - - if (d > today) { - return "today " + d.toString("h:mm tt"); - } else if (d.between(last_week, today)) { - var diff = Math.floor(today.getDayOfYear() - d.getDayOfYear()); - return (diff <= 1) ? ("yesterday " + d.toString("h:mm tt")) : (diff + " days ago"); - } else if (d.between(last_month, today)) { - var diff = Math.floor((today.getDayOfYear() - d.getDayOfYear()) / 7); - if (diff < 0) {diff += 52;} - return (diff == 1) ? "a week ago" : diff.toFixed(0) + " weeks ago" ; - } else if (d.between(last_year, today)) { - var diff = Math.floor(today.getMonth() - d.getMonth()); - if (diff < 0) {diff += 12;} - return (diff == 1) ? "a month ago" : diff + " months ago"; - } else { - var diff = Math.floor(today.getYear() - d.getYear()); - return (diff == 1) ? 
"a year ago" : diff + " years ago"; - } -} - -function isThereNewRelease() { - var thisRevision = GoogleRefineVersion.revision; - - var revision_pattern = /r([0-9]+)/; - - if (!revision_pattern.test(thisRevision)) { // probably "trunk" - return false; - } - - var latestRevision = GoogleRefineReleases.releases[0].revision; - - var thisRev = parseInt(revision_pattern.exec(thisRevision)[1],10); - var latestRev = parseInt(revision_pattern.exec(GoogleRefineReleases.releases[0].revision)[1],10); - - return latestRev > thisRev; -} - -function fetchProjects() { - $.getJSON( - "/command/core/get-all-project-metadata", - null, - function(data) { - renderProjects(data); - }, - "json" - ); -} - -function renderProjects(data) { - var projects = []; - for (var n in data.projects) { - if (data.projects.hasOwnProperty(n)) { - var project = data.projects[n]; - project.id = n; - project.date = Date.parseExact(project.modified, "yyyy-MM-ddTHH:mm:ssZ"); - projects.push(project); - } - } - projects.sort(function(a, b) { return b.date.getTime() - a.date.getTime(); }); - - var container = $("#projects-container").empty(); - if (!projects.length) { - $("#no-project-message").clone().show().appendTo(container); - } else { - var table = $( - '' + - '' + - '' + - '' + - '' + - '
NameLast modified
' - ).appendTo(container)[0]; - - var renderProject = function(project) { - var tr = table.insertRow(table.rows.length); - tr.className = "project"; - - var nameLink = $('') - .addClass("list-table-itemname") - .text(project.name) - .attr("href", "/project?project=" + project.id) - .appendTo(tr.insertCell(tr.cells.length)); - - var renameLink = $('') - .text("rename") - .addClass("secondary") - .attr("href", "javascript:{}") - .css("visibility", "hidden") - .click(function() { - var name = window.prompt("New project name:", project.name); - if (name == null) { - return; - } - - name = $.trim(name); - if (project.name == name || name.length == 0) { - return; - } - - $.ajax({ - type: "POST", - url: "/command/core/rename-project", - data: { "project" : project.id, "name" : name }, - dataType: "json", - success: function (data) { - if (data && typeof data.code != 'undefined' && data.code == "ok") { - nameLink.text(name); - } else { - alert("Failed to rename project: " + data.message); - } - } - }); - }).appendTo(tr.insertCell(tr.cells.length)); - - var deleteLink = $('') - .addClass("delete-project") - .attr("title","Delete this project") - .attr("href","") - .css("visibility", "hidden") - .html("") - .click(function() { - if (window.confirm("Are you sure you want to delete project \"" + project.name + "\"?")) { - $.ajax({ - type: "POST", - url: "/command/core/delete-project", - data: { "project" : project.id }, - dataType: "json", - success: function (data) { - if (data && typeof data.code != 'undefined' && data.code == "ok") { - fetchProjects(); - } - } - }); - } - return false; - }).appendTo(tr.insertCell(tr.cells.length)); - - - $('
') - .html(formatDate(project.date)) - .addClass("last-modified") - .attr("title", project.date.toString()) - .appendTo(tr.insertCell(tr.cells.length)); - - $(tr).mouseenter(function() { - renameLink.css("visibility", "visible"); - deleteLink.css("visibility", "visible"); - }).mouseleave(function() { - renameLink.css("visibility", "hidden"); - deleteLink.css("visibility", "hidden"); - }); - }; - - for (var i = 0; i < projects.length; i++) { - renderProject(projects[i]); - } - } -} - -function showHide(toHide, toShow) { - $("#" + toHide).hide(); - $("#" + toShow).show(); -} - -function openWorkspaceDir() { - $.ajax({ - type: "POST", - url: "/command/core/open-workspace-dir", - dataType: "json", - success: function (data) { - if (data.code != "ok" && "message" in data) { - alert(data.message); - } - } - }); -} - var GoogleRefineVersion; -function showVersion() { - $.getJSON( + +var Refine = { + actionAreas: [] +}; + +Refine.selectActionArea = function(id) { + $('.action-area-tab').removeClass('selected'); + $('.action-area-tab-body').css('visibility', 'hidden').css('z-index', '100'); + + for (var i = 0; i < Refine.actionAreas.length; i++) { + var actionArea = Refine.actionAreas[i]; + if (id == actionArea.id) { + actionArea.tabElmt.addClass('selected'); + actionArea.bodyElmt.css('visibility', 'visible').css('z-index', '110');; + } + } +}; + +$(function() { + var isThereNewRelease = function() { + var thisRevision = GoogleRefineVersion.revision; + + var revision_pattern = /r([0-9]+)/; + + if (!revision_pattern.test(thisRevision)) { // probably "trunk" + return false; + } + + var latestRevision = GoogleRefineReleases.releases[0].revision; + + var thisRev = parseInt(revision_pattern.exec(thisRevision)[1],10); + var latestRev = parseInt(revision_pattern.exec(GoogleRefineReleases.releases[0].revision)[1],10); + + return latestRev > thisRev; + }; + + var showVersion = function() { + $.getJSON( "/command/core/get-version", null, function(data) { GoogleRefineVersion = data; - + $("#google-refine-version").text("Version " + GoogleRefineVersion.full_version); - + var script = $('') - .attr("src", "http://google-refine.googlecode.com/svn/support/releases.js") - .attr("type", "text/javascript") - .appendTo(document.body); + .attr("src", "http://google-refine.googlecode.com/svn/support/releases.js") + .attr("type", "text/javascript") + .appendTo(document.body); var poll = function() { if ("releases" in window) { if (isThereNewRelease()) { var container = $('
') - .appendTo(document.body); + .appendTo(document.body) var notification = $('
') - .text('New version! ') - .appendTo(container); + .text('New version! ') + .appendTo(container) $('') - .addClass('notification-action') - .attr("href", releases.homepage) - .text('Download ' + releases.releases[0].description + ' now.') - .appendTo(notification); + .addClass('notification-action') + .attr("href", releases.homepage) + .text('Download ' + releases.releases[0].description + ' now.') + .appendTo(notification); } } else { window.setTimeout(poll, 1000); } }; - window.setTimeout(poll, 1000); + window.setTimeout(poll, 1000); } - ); -} + ); + }; -function renderImportPanel() { - var headerContainer = $('#import-panel-tab-headers'); - var bodyContainer = $('#import-panel-tab-bodies'); - - var selectImportSourceTab = function(importSource) { - $('.import-panel-tab-body').hide(); - $('.import-panel-tab-header').removeClass('selected'); - - importSource._divBody.show(); - importSource._divHeader.addClass('selected'); - importSource._ui.focus(); - }; - - var createImportSourceTab = function(importSource) { - importSource._divBody = $('
') - .addClass('import-panel-tab-body') - .appendTo(bodyContainer) - .hide(); - - importSource._divHeader = $('
') - .addClass('import-panel-tab-header') - .text(importSource.label) - .appendTo(headerContainer) - .click(function() { selectImportSourceTab(importSource); }); - - importSource._ui = new importSource.ui(importSource._divBody); - }; - - for (var i= 0; i < ImportSources.length; i++) { - createImportSourceTab(ImportSources[i]); - } - selectImportSourceTab(ImportSources[0]); -} + var resize = function() { + var leftPanelWidth = 150; + // px + var width = $(window).width(); + var height = $(window).height(); + var headerHeight = $('#header').outerHeight(); + var panelHeight = height - headerHeight; -function startImportJob(importSource, form, progressMessage) { - $.post( - "/command/core/create-import-job", - null, - function(data) { - var jobID = data.jobID; - - form.attr("method", "post") - .attr("enctype", "multipart/form-data") - .attr("accept-charset", "UTF-8") - .attr("target", "import-iframe") - .attr("action", "/command/core/retrieve-import-content?" + $.param({ - "jobID" : jobID, - "source" : importSource - })); + $('.main-layout-panel') + .css("top", headerHeight + "px") + .css("bottom", "0px") + .css("height", panelHeight + "px") + .css("visibility", "visible"); - form[0].submit(); - - var start = new Date(); - var timerID = window.setInterval(function() { pollImportJob(start, jobID, timerID); }, 1000); - initializeImportProgressPanel(progressMessage, jobID, timerID); - }, - "json" - ); -} + $('#left-panel') + .css("left", "0px") + .css("width", leftPanelWidth + "px"); + var leftPanelBodyHPaddings = 10; + // px + var leftPanelBodyVPaddings = 0; + // px + $('#left-panel-body') + .css("margin-left", leftPanelBodyHPaddings + "px") + .css("margin-top", leftPanelBodyVPaddings + "px") + .css("width", ($('#left-panel').width() - leftPanelBodyHPaddings) + "px") + .css("height", ($('#left-panel').height() - leftPanelBodyVPaddings) + "px"); -function initializeImportProgressPanel(progressMessage, jobID, timerID) { - $('#import-progress-message').text(progressMessage); - $('#import-progress-bar-body').css("width", "0%"); - $('#import-progress-message-left').text('Starting'); - $('#import-progress-message-center').empty(); - $('#import-progress-message-right').empty(); - - $('#import-panel').hide(); - $('#import-progress-panel').show(); - - $('#import-progress-cancel-button').unbind().click(function() { - $('#import-panel').show(); - $('#import-progress-panel').hide(); - - // stop the iframe - $('#import-iframe')[0].contentWindow.stop(); - - // stop the timed polling - window.clearInterval(timerID); - - // explicitly cancel the import job - $.post("/command/core/cancel-import-job?" + $.param({ "jobID" : jobID })); - }); -} + $('#right-panel') + .css("left", leftPanelWidth + "px") + .css("width", (width - leftPanelWidth) + "px"); -function bytesToString(b) { - if (b >= 1024 * 1024) { - return Math.round(b / (1024 * 1024)) + " MB"; - } else if (b >= 1024) { - return Math.round(b / 1024) + " KB"; - } else { - return b + " bytes"; - } -} - -function pollImportJob(start, jobID, timerID) { - $.post( - "/command/core/get-import-job-status?" 
+ $.param({ "jobID" : jobID }), - null, - function(data) { - if (data.code == "error") { - showImportJobError(data.message); - window.clearInterval(timerID); - } else if (data.state == "error") { - showImportJobError(data.message, data.stack); - window.clearInterval(timerID); - } else if (data.state == "retrieving") { - if (data.progress < 0) { - $('#import-progress-message-left').text(bytesToString(data.bytesSaved) + " saved"); - } else { - $('#import-progress-bar-body').css("width", data.progress + "%"); - $('#import-progress-message-left').text(data.progress + "% saved"); - } - } else if (data.state == "ready") { - window.clearInterval(timerID); - - // Just so if the user clicks Back the progress panel won't be showing if the DOM is cached. - $('#import-progress-panel').hide(); - $('#import-panel').show(); - - window.location = "/import?" + $.param({ "jobID" : jobID }); - } - }, - "json" - ); -} - -function showImportJobError(message, stack) { - $('#import-error-message').text(message); - $('#import-error-stack').text(stack || 'No technical details.'); - - $('#import-progress-panel').hide(); - $('#import-error-panel').show(); - - $('#import-error-ok-button').unbind().click(function() { - $('#import-error-panel').hide(); - $('#import-panel').show(); - }); -} - -function onLoad() { - renderImportPanel(); - - fetchProjects(); - - $("#project-file-input").change(function() { - if ($("#project-name-input")[0].value.length == 0) { - var fileName = this.files[0].fileName; - if (fileName) { - $("#project-name-input")[0].value = fileName.replace(/\.\w+/, "").replace(/[_-]/g, " "); - } - $("#project-name-input").focus().select(); - } - }).keypress(function(evt) { - if (evt.keyCode == 13) { - onClickUploadFileButton(); - } - }); - - $("#upload-file-button").click(onClickUploadFileButton); - $("#more-options-link").click(function() { - $("#more-options-controls").hide(); - $("#more-options").show(); - }); + var rightPanelBodyHPaddings = 5; + // px + var rightPanelBodyVPaddings = 5; + // px + $('#right-panel-body') + .css("margin-left", rightPanelBodyHPaddings + "px") + .css("margin-top", rightPanelBodyVPaddings + "px") + .css("width", ($('#right-panel').width() - rightPanelBodyHPaddings) + "px") + .css("height", ($('#right-panel').height() - rightPanelBodyVPaddings) + "px"); + }; + $(window).bind("resize", resize); + window.setTimeout(resize, 50); // for Chrome, give the window some time to layout first + var renderActionArea = function(actionArea) { + actionArea.bodyElmt = $('
') + .addClass('action-area-tab-body') + .appendTo('#right-panel-body'); + + actionArea.tabElmt = $('
  • ') + .addClass('action-area-tab') + .text(actionArea.label) + .appendTo($('#action-area-tabs')) + .click(function() { + Refine.selectActionArea(actionArea.id); + }); + + actionArea.ui = new actionArea.uiClass(actionArea.bodyElmt); + }; + + for (var i = 0; i < Refine.actionAreas.length; i++) { + renderActionArea(Refine.actionAreas[i]); + } + Refine.selectActionArea('create-project'); + showVersion(); -} - -$(onLoad); +}); diff --git a/main/webapp/modules/core/scripts/index/create-project-ui-source-selection.html b/main/webapp/modules/core/scripts/index/create-project-ui-source-selection.html new file mode 100644 index 000000000..48bff8f37 --- /dev/null +++ b/main/webapp/modules/core/scripts/index/create-project-ui-source-selection.html @@ -0,0 +1,17 @@ +
    + + + + + + + +
    +

    Create a project by importing data. What kinds of data files can I import?

    +
TSV, CSV, *SV, Excel (.xls and .xlsx), JSON, XML, RDF as XML, and
+          Google Spreadsheets are all supported. Support for other formats can
+          be added with Refine extensions.
+
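On the client side, each UI class name passed to ImportingManager.registerFormat() is expected to resolve to a parser UI. The exact lookup is not visible in this excerpt, but default-importing-controller/controller.js declares an empty Refine.DefaultImportingController.parserUIs map and later calls confirmReadyToCreateProject() and getOptions() on the active parser UI, so a minimal one plausibly looks like the sketch below (all names hypothetical):

// Hypothetical sketch of a minimal client-side parser UI.
// The constructor arguments are assumed; only getOptions() and
// confirmReadyToCreateProject() are actually called in the code shown in this patch.
Refine.YamlParserUI = function(controller, jobID, job, format, config) {
    this._controller = controller;
    this._config = config;
};

Refine.YamlParserUI.prototype.confirmReadyToCreateProject = function() {
    return true; // nothing to validate in this sketch
};

Refine.YamlParserUI.prototype.getOptions = function() {
    return {}; // parser options sent back with update-format-and-options / create-project
};

// Registration into the parserUIs map is inferred from the empty map declared
// in controller.js; the key should match the UI class name given to registerFormat().
Refine.DefaultImportingController.parserUIs.YamlParserUI = Refine.YamlParserUI;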
    +
    +
    Get data from
    +
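The "Get data from" tabs above are driven by Refine.DefaultImportingController.sources: the controller's constructor instantiates each entry's uiClass with itself, and CreateProjectUI.addSourceSelectionUI() then calls attachUI() and focus() on the resulting object. A custom source tab could therefore be contributed roughly as sketched below; the names are illustrative, and the built-in entries live in default-importing-sources/sources.js, which is not part of this excerpt.

// Sketch of a hypothetical custom "Get data from" tab.
Refine.PastedTextSourceUI = function(controller) {
    this._controller = controller;
};

// Called with the tab's body div by CreateProjectUI.addSourceSelectionUI().
Refine.PastedTextSourceUI.prototype.attachUI = function(bodyDiv) {
    this._textarea = $('<textarea></textarea>').appendTo(bodyDiv);
};

// Called when the tab is selected.
Refine.PastedTextSourceUI.prototype.focus = function() {
    this._textarea.focus();
};

// Must run before Refine.DefaultImportingController is constructed,
// i.e. from a script injected into the index page.
Refine.DefaultImportingController.sources.push({
    "id": "pasted-text",
    "label": "Pasted Text",
    "uiClass": Refine.PastedTextSourceUI
});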
    diff --git a/main/webapp/modules/core/scripts/index/create-project-ui.js b/main/webapp/modules/core/scripts/index/create-project-ui.js new file mode 100644 index 000000000..67cc85bf2 --- /dev/null +++ b/main/webapp/modules/core/scripts/index/create-project-ui.js @@ -0,0 +1,145 @@ +/* + +Copyright 2011, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +Refine.CreateProjectUI = function(elmt) { + var self = this; + + this._elmt = elmt; + this._sourceSelectionUIs = []; + this._customPanels = []; + this._controllers = []; + + $.post( + "/command/core/get-importing-configuration", + null, + function(data) { + Refine.importingConfig = data.config; + self._initializeUI(); + }, + "json" + ); +}; + +Refine.CreateProjectUI.controllers = []; + +Refine.CreateProjectUI.prototype._initializeUI = function() { + this._sourceSelectionElmt = + $(DOM.loadHTML("core", "scripts/index/create-project-ui-source-selection.html")).appendTo(this._elmt); + + this._sourceSelectionElmts = DOM.bind(this._sourceSelectionElmt); + + for (var i = 0; i < Refine.CreateProjectUI.controllers.length; i++) { + this._controllers.push(new Refine.CreateProjectUI.controllers[i](this)); + } +}; + +Refine.CreateProjectUI.prototype.addSourceSelectionUI = function(sourceSelectionUI) { + var self = this; + + var headerContainer = $('#create-project-ui-source-selection-tabs'); + var bodyContainer = $('#create-project-ui-source-selection-tab-bodies'); + + sourceSelectionUI._divBody = $('
    ') + .addClass('create-project-ui-source-selection-tab-body') + .appendTo(bodyContainer) + .hide(); + + sourceSelectionUI._divHeader = $('
    ') + .addClass('create-project-ui-source-selection-tab') + .text(sourceSelectionUI.label) + .appendTo(headerContainer) + .click(function() { self.selectImportSource(sourceSelectionUI.id); }); + + sourceSelectionUI.ui.attachUI(sourceSelectionUI._divBody); + + this._sourceSelectionUIs.push(sourceSelectionUI); + + if (this._sourceSelectionUIs.length == 1) { + self.selectImportSource(sourceSelectionUI.id); + } +}; + +Refine.CreateProjectUI.prototype.selectImportSource = function(id) { + for (var i = 0; i < this._sourceSelectionUIs.length; i++) { + var sourceSelectionUI = this._sourceSelectionUIs[i]; + if (sourceSelectionUI.id == id) { + $('.create-project-ui-source-selection-tab-body').hide(); + $('.create-project-ui-source-selection-tab').removeClass('selected'); + + sourceSelectionUI._divBody.show(); + sourceSelectionUI._divHeader.addClass('selected'); + + sourceSelectionUI.ui.focus(); + + break; + } + } +}; + +Refine.CreateProjectUI.prototype.addCustomPanel = function() { + var div = $('
    ') + .addClass('create-project-ui-panel') + .appendTo(this._elmt); + + var innerDiv = $('
    ') + .addClass('relative-frame') + .appendTo(div); + + this._customPanels.push(div); + + return innerDiv; +}; + +Refine.CreateProjectUI.prototype.showCustomPanel = function(div) { + var parent = div.parent(); + for (var i = 0; i < this._customPanels.length; i++) { + var panel = this._customPanels[i]; + if (panel[0] === parent[0]) { + $('.create-project-ui-panel').css('visibility', 'hidden'); + this._sourceSelectionElmt.css('visibility', 'hidden'); + panel.css('visibility', 'visible'); + break; + } + } +}; + +Refine.CreateProjectUI.prototype.showSourceSelectionPanel = function() { + $('.create-project-ui-panel').css('visibility', 'hidden'); + this._sourceSelectionElmt.css('visibility', 'visible'); +}; + +Refine.actionAreas.push({ + id: "create-project", + label: "Create Project", + uiClass: Refine.CreateProjectUI +}); diff --git a/main/webapp/modules/core/scripts/index/default-importing-controller/controller.js b/main/webapp/modules/core/scripts/index/default-importing-controller/controller.js new file mode 100644 index 000000000..386bfffbb --- /dev/null +++ b/main/webapp/modules/core/scripts/index/default-importing-controller/controller.js @@ -0,0 +1,395 @@ +/* + +Copyright 2011, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +Refine.DefaultImportingController = function(createProjectUI) { + this._createProjectUI = createProjectUI; + + this._progressPanel = createProjectUI.addCustomPanel(); + this._progressPanel.html(DOM.loadHTML("core", "scripts/index/default-importing-controller/progress-panel.html")); + + this._errorPanel = createProjectUI.addCustomPanel(); + this._errorPanel.html(DOM.loadHTML("core", "scripts/index/default-importing-controller/error-panel.html")); + + this._fileSelectionPanel = createProjectUI.addCustomPanel(); + this._parsingPanel = createProjectUI.addCustomPanel(); + + for (var i = 0; i < Refine.DefaultImportingController.sources.length; i++) { + var sourceSelectionUI = Refine.DefaultImportingController.sources[i]; + sourceSelectionUI.ui = new sourceSelectionUI.uiClass(this); + + createProjectUI.addSourceSelectionUI(sourceSelectionUI); + } +}; +Refine.CreateProjectUI.controllers.push(Refine.DefaultImportingController); + +Refine.DefaultImportingController.sources = []; +Refine.DefaultImportingController.parserUIs = {}; + +Refine.DefaultImportingController.prototype._startOver = function() { + this._disposeFileSelectionPanel(); + this._disposeFileSelectionPanel(); + + delete this._fileSelectionPanelElmts; + delete this._parsingPanelElmts; + + delete this._jobID; + delete this._job; + delete this._extensions; + + delete this._format; + delete this._parserOptions; + delete this._projectName; + + this._createProjectUI.showSourceSelectionPanel(); +}; + +Refine.DefaultImportingController.prototype.startImportJob = function(form, progressMessage, callback) { + var self = this; + $.post( + "/command/core/create-importing-job", + null, + function(data) { + var jobID = self._jobID = data.jobID; + + form.attr("method", "post") + .attr("enctype", "multipart/form-data") + .attr("accept-charset", "UTF-8") + .attr("target", "default-importing-iframe") + .attr("action", "/command/core/importing-controller?" + $.param({ + "controller": "core/default-importing-controller", + "jobID": jobID, + "subCommand": "load-raw-data" + })); + form[0].submit(); + + var start = new Date(); + var timerID = window.setInterval( + function() { + self._pollImportJob( + start, jobID, timerID, + function(job) { + return job.config.hasData; + }, + function(jobID, job) { + self._job = job; + self._onImportJobReady(); + if (callback) { + callback(jobID, job); + } + } + ); + }, + 1000 + ); + self._initializeImportProgressPanel(progressMessage, function() { + // stop the iframe + $('#default-importing-iframe')[0].contentWindow.stop(); + + // stop the timed polling + window.clearInterval(timerID); + + // explicitly cancel the import job + $.post("/command/core/cancel-importing-job?" 
+ $.param({ "jobID": jobID })); + + self._createProjectUI.showSourceSelectionPanel(); + }); + }, + "json" + ); +}; + +Refine.DefaultImportingController.prototype._initializeImportProgressPanel = function(progressMessage, onCancel) { + var self = this; + + this._createProjectUI.showCustomPanel(this._progressPanel); + + $('#default-importing-progress-message').text(progressMessage); + $('#default-importing-progress-bar-body').css("width", "0%"); + $('#default-importing-progress-message-left').text('Starting'); + $('#default-importing-progress-message-center').empty(); + $('#default-importing-progress-message-right').empty(); + $('#default-importing-progress-timing').empty(); + + $('#default-importing-progress-cancel-button').unbind().click(onCancel); +}; + +Refine.DefaultImportingController.prototype._pollImportJob = function(start, jobID, timerID, checkDone, callback) { + var self = this; + $.post( + "/command/core/get-importing-job-status?" + $.param({ "jobID": jobID }), + null, + function(data) { + if (!(data)) { + self._showImportJobError("Unknown error"); + window.clearInterval(timerID); + return; + } else if (data.code == "error" || !("job" in data)) { + self._showImportJobError(data.message || "Unknown error"); + window.clearInterval(timerID); + return; + } + + var job = data.job; + if (checkDone(job)) { + $('#default-importing-progress-message').text('Done.'); + + window.clearInterval(timerID); + if (callback) { + callback(jobID, job); + } + } else { + var progress = job.config.progress; + if (progress.percent > 0) { + var secondsSpent = (new Date().getTime() - start.getTime()) / 1000; + var secondsRemaining = (100 / progress.percent) * secondsSpent - secondsSpent; + + $('#default-importing-progress-bar-body') + .removeClass('indefinite') + .css("width", progress.percent + "%"); + + if (secondsRemaining > 1) { + if (secondsRemaining > 60) { + $('#default-importing-progress-timing').text( + Math.ceil(secondsRemaining / 60) + " minutes remaining"); + } else { + $('#default-importing-progress-timing').text( + Math.ceil(secondsRemaining) + " seconds remaining"); + } + } else { + $('#default-importing-progress-timing').text('almost done ...'); + } + } else { + $('#default-importing-progress-bar-body').addClass('indefinite'); + $('#default-importing-progress-timing').empty(); + } + $('#default-importing-progress-message').text(progress.message); + } + }, + "json" + ); +}; + +Refine.DefaultImportingController.prototype._showImportJobError = function(message, stack) { + var self = this; + + $('#default-importing-error-message').text(message); + $('#default-importing-error-stack').text(stack || 'No technical details.'); + + this._createProjectUI.showCustomPanel(this._errorPanel); + $('#default-importing-error-ok-button').unbind().click(function() { + self._createProjectUI.showSourceSelectionPanel(); + }); +}; + +Refine.DefaultImportingController.prototype._onImportJobReady = function() { + this._prepareData(); + if (this._job.config.retrievalRecord.files.length > 1) { + this._showFileSelectionPanel(); + } else { + this._showParsingPanel(false); + } +}; + +Refine.DefaultImportingController.prototype._prepareData = function() { + var extensionMap = {}; + var extensionList = []; + + var files = this._job.config.retrievalRecord.files; + var fileSelection = this._job.config.fileSelection; + for (var i = 0; i < files.length; i++) { + var file = files[i]; + file.selected = false; + + var slash = file.fileName.lastIndexOf('/'); + var dot = file.fileName.lastIndexOf('.'); + if (dot > slash + 1) { + 
var extension = file.fileName.substring(dot); + if (extension in extensionMap) { + extensionMap[extension].count++; + } else { + extensionMap[extension] = { extension: extension, count: 1 }; + extensionList.push(extensionMap[extension]); + } + } + } + for (var i = 0; i < fileSelection.length; i++) { + files[fileSelection[i]].selected = true; + } + + extensionList.sort(function(a, b) { + return b.count - a.count; + }); + this._extensions = extensionList; +}; + +Refine.DefaultImportingController.prototype._ensureFormatParserUIHasInitializationData = function(format, onDone) { + if (!(format in this._parserOptions)) { + var self = this; + var dismissBusy = DialogSystem.showBusy("Inspecting selected files ..."); + $.post( + "/command/core/importing-controller?" + $.param({ + "controller": "core/default-importing-controller", + "jobID": this._jobID, + "subCommand": "initialize-parser-ui", + "format": format + }), + null, + function(data) { + dismissBusy(); + + if (data.options) { + self._parserOptions[format] = data.options; + onDone(); + } + }, + "json" + ); + } else { + onDone(); + } +}; + +Refine.DefaultImportingController.prototype.updateFormatAndOptions = function(options, callback) { + var self = this; + $.post( + "/command/core/importing-controller?" + $.param({ + "controller": "core/default-importing-controller", + "jobID": this._jobID, + "subCommand": "update-format-and-options" + }), + { + "format" : this._format, + "options" : JSON.stringify(options) + }, + callback, + "json" + ); +}; + +Refine.DefaultImportingController.prototype.getPreviewData = function(callback, numRows) { + var self = this; + var result = {}; + + $.post( + "/command/core/get-models?" + $.param({ "importingJobID" : this._jobID }), + null, + function(data) { + for (var n in data) { + if (data.hasOwnProperty(n)) { + result[n] = data[n]; + } + } + + $.post( + "/command/core/get-rows?" + $.param({ + "importingJobID" : self._jobID, + "start" : 0, + "limit" : numRows || 100 // More than we parse for preview anyway + }), + null, + function(data) { + // Un-pool objects + for (var r = 0; r < data.rows.length; r++) { + var row = data.rows[r]; + for (var c = 0; c < row.cells.length; c++) { + var cell = row.cells[c]; + if ((cell) && ("r" in cell)) { + cell.r = data.pool.recons[cell.r]; + } + } + } + + result.rowModel = data; + callback(result); + }, + "jsonp" + ); + }, + "json" + ); +}; + +Refine.DefaultImportingController.prototype._createProject = function() { + if ((this._formatParserUI) && this._formatParserUI.confirmReadyToCreateProject()) { + var projectName = $.trim(this._parsingPanelElmts.projectNameInput[0].value); + if (projectName.length == 0) { + window.alert("Please name the project."); + this._parsingPanelElmts.focus(); + return; + } + + var self = this; + var options = this._formatParserUI.getOptions(); + options.projectName = projectName; + $.post( + "/command/core/importing-controller?" 
+ $.param({ + "controller": "core/default-importing-controller", + "jobID": this._jobID, + "subCommand": "create-project" + }), + { + "format" : this._format, + "options" : JSON.stringify(options) + }, + function() { + var start = new Date(); + var timerID = window.setInterval( + function() { + self._pollImportJob( + start, + self._jobID, + timerID, + function(job) { + return "projectID" in job.config; + }, + function(jobID, job) { + document.location = "project?project=" + job.config.projectID; + } + ); + }, + 1000 + ); + self._initializeImportProgressPanel("Creating project ...", function() { + // stop the timed polling + window.clearInterval(timerID); + + // explicitly cancel the import job + $.post("/command/core/cancel-importing-job?" + $.param({ "jobID": jobID })); + + self._createProjectUI.showSourceSelectionPanel(); + }); + }, + "json" + ); + } +}; diff --git a/main/webapp/modules/core/scripts/index/default-importing-controller/error-panel.html b/main/webapp/modules/core/scripts/index/default-importing-controller/error-panel.html new file mode 100644 index 000000000..e6453f067 --- /dev/null +++ b/main/webapp/modules/core/scripts/index/default-importing-controller/error-panel.html @@ -0,0 +1,5 @@ +
    + + + +
    \ No newline at end of file diff --git a/main/webapp/modules/core/scripts/index/default-importing-controller/file-selection-panel.html b/main/webapp/modules/core/scripts/index/default-importing-controller/file-selection-panel.html new file mode 100644 index 000000000..248f9b354 --- /dev/null +++ b/main/webapp/modules/core/scripts/index/default-importing-controller/file-selection-panel.html @@ -0,0 +1,36 @@ +
    + + + +
    Select Files to Import
    + +
    +
    + + + + + + + + +
+          There are several files available to import.
+          Please select the desired ones.
+
    + +
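The checkboxes in this panel toggle the selected flag that _prepareData() sets on each entry of job.config.retrievalRecord.files, while the server-side fileSelection field is a list of indexes into that same array. Turning the flags back into the index list is a small loop; a minimal sketch follows (how _commitFileSelection() actually posts it back to the importing controller is outside this excerpt):

// Sketch: derive the fileSelection index list from the per-file "selected" flags.
var getSelectedFileIndexes = function(job) {
    var files = job.config.retrievalRecord.files;
    var selection = [];
    for (var i = 0; i < files.length; i++) {
        if (files[i].selected) {
            selection.push(i);
        }
    }
    return selection;
};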

    Select by Extension

    +
    + +

    Select by Regex on File Names

    +
    + + + + + + +
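The handler behind "Select by Regex on File Names" lives in file-selection-panel.js, beyond what is shown here, but the effect it needs on the model is simple. A sketch, reusing the same files array and selected flags:

// Sketch: drive the selection flags from a regular expression over file names.
var selectByFileNameRegex = function(files, pattern) {
    var regex = new RegExp(pattern);
    for (var i = 0; i < files.length; i++) {
        files[i].selected = regex.test(files[i].fileName);
    }
    // The checkboxes and the selection summary would still need to be refreshed.
};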
    +
    + +
    \ No newline at end of file diff --git a/main/webapp/modules/core/scripts/index/default-importing-controller/file-selection-panel.js b/main/webapp/modules/core/scripts/index/default-importing-controller/file-selection-panel.js new file mode 100644 index 000000000..be083b0af --- /dev/null +++ b/main/webapp/modules/core/scripts/index/default-importing-controller/file-selection-panel.js @@ -0,0 +1,314 @@ +/* + +Copyright 2011, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +Refine.DefaultImportingController.prototype._showFileSelectionPanel = function() { + var self = this; + + this._prepareFileSelectionPanel(); + + this._fileSelectionPanelElmts.nextButton.click(function() { + self._commitFileSelection(); + }); + this._renderFileSelectionPanel(); + this._createProjectUI.showCustomPanel(this._fileSelectionPanel); +}; + +Refine.DefaultImportingController.prototype._disposeFileSelectionPanel = function() { + if (this._fileSelectionPanelResizer) { + $(window).unbind("resize", this._fileSelectionPanelResizer); + } + this._fileSelectionPanel.unbind().empty(); +}; + +Refine.DefaultImportingController.prototype._prepareFileSelectionPanel = function() { + var self = this; + + this._fileSelectionPanel.unbind().empty().html( + DOM.loadHTML("core", "scripts/index/default-importing-controller/file-selection-panel.html")); + + this._fileSelectionPanelElmts = DOM.bind(this._fileSelectionPanel); + this._fileSelectionPanelElmts.startOverButton.click(function() { + self._startOver(); + }); + + this._fileSelectionPanelResizer = function() { + var elmts = self._fileSelectionPanelElmts; + var width = self._fileSelectionPanel.width(); + var height = self._fileSelectionPanel.height(); + var headerHeight = elmts.wizardHeader.outerHeight(true); + var controlPanelWidth = 350; + + elmts.controlPanel + .css("left", "0px") + .css("top", headerHeight + "px") + .css("width", (controlPanelWidth - DOM.getHPaddings(elmts.controlPanel)) + "px") + .css("height", (height - headerHeight - DOM.getVPaddings(elmts.controlPanel)) + "px"); + + elmts.filePanel + .css("left", controlPanelWidth + "px") + .css("top", headerHeight + "px") + .css("width", (width - controlPanelWidth - DOM.getHPaddings(elmts.filePanel)) + "px") + .css("height", (height - headerHeight - DOM.getVPaddings(elmts.filePanel)) + "px"); + }; + + $(window).resize(this._fileSelectionPanelResizer); + this._fileSelectionPanelResizer(); +}; + +Refine.DefaultImportingController.prototype._renderFileSelectionPanel = function() { + this._renderFileSelectionPanelFileTable(); + this._renderFileSelectionPanelControlPanel(); +}; + +Refine.DefaultImportingController.prototype._renderFileSelectionPanelFileTable = function() { + var self = this; + + this._fileSelectionPanelElmts.filePanel.empty(); + + var fileTable = $('
    NameMime-typeFormatSize
    ') + .appendTo(this._fileSelectionPanelElmts.filePanel)[0]; + + var files = this._job.config.retrievalRecord.files; + var renderFile = function(fileRecord, index) { + var tr = fileTable.insertRow(fileTable.rows.length); + $(tr).addClass(index % 2 == 0 ? 'even' : 'odd'); + + var tdSelect = $('').appendTo(tr); + var checkbox = $('') + .attr("type", "checkbox") + .attr("index", index) + .appendTo(tdSelect) + .click(function() { + files[index].selected = this.checked; + self._updateFileSelectionSummary(); + }); + if (fileRecord.selected) { + checkbox.attr("checked", "checked"); + } + + $('').text(fileRecord.fileName).addClass("default-importing-file-selection-filename").appendTo(tr); + $('').text(fileRecord.declaredMimeType || fileRecord.mimeType || "unknown").appendTo(tr); + $('').text(fileRecord.format || "unknown").appendTo(tr); + $('').text(fileRecord.size + " bytes").appendTo(tr); + }; + + for (var i = 0; i < files.length; i++) { + renderFile(files[i], i); + } +}; + +Refine.DefaultImportingController.prototype._renderFileSelectionPanelControlPanel = function() { + var self = this; + var files = this._job.config.retrievalRecord.files; + + this._fileSelectionPanelElmts.extensionContainer.empty(); + this._fileSelectionPanelElmts.selectAllButton.unbind().click(function(evt) { + for (var i = 0; i < files.length; i++) { + files[i].selected = true; + } + self._fileSelectionPanelElmts.filePanel.find("input").attr("checked", "checked"); + self._updateFileSelectionSummary(); + }); + this._fileSelectionPanelElmts.unselectAllButton.unbind().click(function(evt) { + for (var i = 0; i < files.length; i++) { + files[i].selected = false; + } + self._fileSelectionPanelElmts.filePanel.find("input").removeAttr("checked"); + self._updateFileSelectionSummary(); + }); + + var table = $('
    ') + .appendTo(this._fileSelectionPanelElmts.extensionContainer)[0]; + + var renderExtension = function(extension) { + var tr = table.insertRow(table.rows.length); + $('').text(extension.extension).appendTo(tr); + $('').text(extension.count + (extension.count > 1 ? " files" : " file")).appendTo(tr); + $('