From 16dda46a61519dfd15ec29b16f8e5f493d69357e Mon Sep 17 00:00:00 2001 From: David Huynh Date: Fri, 5 Feb 2010 19:19:38 +0000 Subject: [PATCH] Refactored importers, adding support for Excel files. git-svn-id: http://google-refine.googlecode.com/svn/trunk@47 7d457c2a-affb-35e4-300a-418c747d4874 --- pom.xml | 9 +- .../metaweb/gridworks/GridworksServlet.java | 4 +- .../com/metaweb/gridworks/ProjectManager.java | 6 +- .../commands/edit/CreateProjectCommand.java | 206 ++++++++++++++++++ .../edit/CreateProjectFromUploadCommand.java | 163 -------------- .../gridworks/importers/ExcelImporter.java | 150 +++++++++++++ .../metaweb/gridworks/importers/Importer.java | 14 ++ .../importers/ImporterUtilities.java | 84 +++++++ .../gridworks/importers/TsvCsvImporter.java | 82 +++++++ .../gridworks/util/ParsingUtilities.java | 11 +- 10 files changed, 557 insertions(+), 172 deletions(-) create mode 100644 src/main/java/com/metaweb/gridworks/commands/edit/CreateProjectCommand.java delete mode 100644 src/main/java/com/metaweb/gridworks/commands/edit/CreateProjectFromUploadCommand.java create mode 100644 src/main/java/com/metaweb/gridworks/importers/ExcelImporter.java create mode 100644 src/main/java/com/metaweb/gridworks/importers/Importer.java create mode 100644 src/main/java/com/metaweb/gridworks/importers/ImporterUtilities.java create mode 100644 src/main/java/com/metaweb/gridworks/importers/TsvCsvImporter.java diff --git a/pom.xml b/pom.xml index 968650185..c4bfcdacc 100644 --- a/pom.xml +++ b/pom.xml @@ -62,7 +62,14 @@ commons-lang commons-lang 2.1 - + + + + + poi + poi + 2.5.1-final-20040804 + \ No newline at end of file diff --git a/src/main/java/com/metaweb/gridworks/GridworksServlet.java b/src/main/java/com/metaweb/gridworks/GridworksServlet.java index 4b6de9885..d7a071e10 100644 --- a/src/main/java/com/metaweb/gridworks/GridworksServlet.java +++ b/src/main/java/com/metaweb/gridworks/GridworksServlet.java @@ -16,7 +16,7 @@ import org.json.JSONTokener; import com.metaweb.gridworks.commands.Command; import com.metaweb.gridworks.commands.edit.AddColumnCommand; -import com.metaweb.gridworks.commands.edit.CreateProjectFromUploadCommand; +import com.metaweb.gridworks.commands.edit.CreateProjectCommand; import com.metaweb.gridworks.commands.edit.DoTextTransformCommand; import com.metaweb.gridworks.commands.edit.JoinMultiValueCellsCommand; import com.metaweb.gridworks.commands.edit.RemoveColumnCommand; @@ -42,7 +42,7 @@ public class GridworksServlet extends HttpServlet { static protected Map _commands = new HashMap(); static { - _commands.put("create-project-from-upload", new CreateProjectFromUploadCommand()); + _commands.put("create-project-from-upload", new CreateProjectCommand()); _commands.put("export-rows", new ExportRowsCommand()); _commands.put("get-project-metadata", new GetProjectMetadataCommand()); diff --git a/src/main/java/com/metaweb/gridworks/ProjectManager.java b/src/main/java/com/metaweb/gridworks/ProjectManager.java index 210903e46..2c5dcc03a 100644 --- a/src/main/java/com/metaweb/gridworks/ProjectManager.java +++ b/src/main/java/com/metaweb/gridworks/ProjectManager.java @@ -86,13 +86,9 @@ public class ProjectManager implements Serializable { return _dir; } - public Project createProject(ProjectMetadata projectMetadata) { - Project project = new Project(); - + public void registerProject(Project project, ProjectMetadata projectMetadata) { _projects.put(project.id, project); _projectsMetadata.put(project.id, projectMetadata); - - return project; } public ProjectMetadata getProjectMetadata(long id) { diff --git a/src/main/java/com/metaweb/gridworks/commands/edit/CreateProjectCommand.java b/src/main/java/com/metaweb/gridworks/commands/edit/CreateProjectCommand.java new file mode 100644 index 000000000..bb27f08a7 --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/commands/edit/CreateProjectCommand.java @@ -0,0 +1,206 @@ +package com.metaweb.gridworks.commands.edit; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.io.StringReader; +import java.net.URL; +import java.net.URLConnection; +import java.util.Properties; + +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import com.metaweb.gridworks.ProjectManager; +import com.metaweb.gridworks.ProjectMetadata; +import com.metaweb.gridworks.commands.Command; +import com.metaweb.gridworks.importers.ExcelImporter; +import com.metaweb.gridworks.importers.Importer; +import com.metaweb.gridworks.importers.TsvCsvImporter; +import com.metaweb.gridworks.model.Project; +import com.metaweb.gridworks.util.ParsingUtilities; +import com.oreilly.servlet.multipart.FilePart; +import com.oreilly.servlet.multipart.MultipartParser; +import com.oreilly.servlet.multipart.ParamPart; +import com.oreilly.servlet.multipart.Part; + +public class CreateProjectCommand extends Command { + @Override + public void doPost(HttpServletRequest request, HttpServletResponse response) + throws ServletException, IOException { + + try { + Properties options = parseUrlParameters(request); + Project project = new Project(); + + internalImport(request, project, options); + + ProjectMetadata pm = new ProjectMetadata(); + pm.setName(options.getProperty("project-name")); + pm.setPassword(options.getProperty("project-password")); + ProjectManager.singleton.registerProject(project, pm); + + project.columnModel.update(); + + redirect(response, "/project.html?project=" + project.id); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + protected Properties parseUrlParameters(HttpServletRequest request) { + Properties options = new Properties(); + + String query = request.getQueryString(); + if (query != null) { + if (query.startsWith("?")) { + query = query.substring(1); + } + + String[] pairs = query.split("&"); + for (String pairString : pairs) { + int equal = pairString.indexOf('='); + String name = equal >= 0 ? pairString.substring(0, equal) : ""; + String value = equal >= 0 ? ParsingUtilities.decode(pairString.substring(equal + 1)) : ""; + + options.put(name, value); + } + } + return options; + } + + protected void internalImport( + HttpServletRequest request, + Project project, + Properties options + ) throws Exception { + MultipartParser parser = null; + try { + parser = new MultipartParser(request, 20 * 1024 * 1024); + } catch (Exception e) { + // silent + } + + if (parser != null) { + Part part = null; + while ((part = parser.readNextPart()) != null) { + + if (part.isFile()) { + FilePart filePart = (FilePart) part; + Importer importer = guessImporter( + options, null, filePart.getFileName()); + + if (importer.takesReader()) { + Reader reader = new InputStreamReader(filePart.getInputStream()); + try { + importer.read(reader, project, options); + } finally { + reader.close(); + } + } else { + InputStream inputStream = filePart.getInputStream(); + try { + importer.read(inputStream, project, options); + } finally { + inputStream.close(); + } + } + } else if (part.isParam()) { + ParamPart paramPart = (ParamPart) part; + String paramName = paramPart.getName(); + if (paramName.equals("raw-text")) { + StringReader reader = new StringReader(paramPart.getStringValue()); + try { + new TsvCsvImporter().read(reader, project, options); + } finally { + reader.close(); + } + } else if (paramName.equals("url")) { + String url = paramPart.getStringValue(); + if (url.length() > 0) { + internalImportURL(request, project, options, url); + } + } else { + options.put(paramName, paramPart.getStringValue()); + } + } + } + } + } + + protected void internalImportURL( + HttpServletRequest request, + Project project, + Properties options, + String urlString + ) throws Exception { + URL url = new URL(urlString); + URLConnection connection = null; + + try { + connection = url.openConnection(); + connection.setConnectTimeout(5000); + connection.connect(); + } catch (Exception e) { + throw new Exception("Cannot connect to " + urlString, e); + } + + InputStream inputStream = null; + try { + inputStream = connection.getInputStream(); + } catch (Exception e) { + throw new Exception("Cannot retrieve content from " + url, e); + } + + try { + Importer importer = guessImporter( + options, + connection.getContentType(), + url.getPath() + ); + + if (importer.takesReader()) { + String encoding = connection.getContentEncoding(); + + Reader reader = new InputStreamReader( + inputStream, (encoding == null) ? "ISO-8859-1" : encoding); + + importer.read(reader, project, options); + } else { + importer.read(inputStream, project, options); + } + } finally { + inputStream.close(); + } + } + + protected Importer guessImporter( + Properties options, String contentType, String fileName) { + + if (contentType != null) { + contentType = contentType.toLowerCase().trim(); + + if ("application/msexcel".equals(contentType) || + "application/x-msexcel".equals(contentType) || + "application/x-ms-excel".equals(contentType) || + "application/vnd.ms-excel".equals(contentType) || + "application/x-excel".equals(contentType) || + "application/xls".equals(contentType) || + "application/x-xls".equals(contentType)) { + + return new ExcelImporter(); + } + } else if (fileName != null) { + fileName = fileName.toLowerCase(); + if (fileName.endsWith(".xls")) { // Note: we can't handle .xlsx yet + return new ExcelImporter(); + } + } + + return new TsvCsvImporter(); + } + +} diff --git a/src/main/java/com/metaweb/gridworks/commands/edit/CreateProjectFromUploadCommand.java b/src/main/java/com/metaweb/gridworks/commands/edit/CreateProjectFromUploadCommand.java deleted file mode 100644 index 8be1c24af..000000000 --- a/src/main/java/com/metaweb/gridworks/commands/edit/CreateProjectFromUploadCommand.java +++ /dev/null @@ -1,163 +0,0 @@ -package com.metaweb.gridworks.commands.edit; - -import java.io.IOException; -import java.util.Properties; - -import javax.servlet.ServletException; -import javax.servlet.http.HttpServletRequest; -import javax.servlet.http.HttpServletResponse; - -import com.metaweb.gridworks.ProjectManager; -import com.metaweb.gridworks.ProjectMetadata; -import com.metaweb.gridworks.commands.Command; -import com.metaweb.gridworks.model.Cell; -import com.metaweb.gridworks.model.Column; -import com.metaweb.gridworks.model.Project; -import com.metaweb.gridworks.model.Row; - -public class CreateProjectFromUploadCommand extends Command { - @Override - public void doPost(HttpServletRequest request, HttpServletResponse response) - throws ServletException, IOException { - - Properties properties = new Properties(); - String content = readFileUpload(request, properties); - - ProjectMetadata pm = new ProjectMetadata(); - pm.setName(properties.getProperty("project-name")); - pm.setPassword(properties.getProperty("project-password")); - - Project project = ProjectManager.singleton.createProject(pm); - - int start = 0; - String sep = null; - String line = null; - boolean first = true; - int cellCount = 1; - - while (start < content.length()) { - int newline = content.indexOf('\n', start); - if (newline < 0) { - line = content.substring(start); - start = content.length(); - } else { - line = content.substring(start, newline); - start = newline + 1; - } - - if (sep == null) { - int tab = line.indexOf('\t'); - if (tab >= 0) { - sep = "\t"; - } else { - sep = ","; - } - } - - if (first) { - String[] cells = line.split(sep); - - first = false; - for (int c = 0; c < cells.length; c++) { - String cell = cells[c]; - if (cell.startsWith("\"") && cell.endsWith("\"")) { - cell = cell.substring(1, cell.length() - 1); - } - - Column column = new Column(c, cell); - - project.columnModel.columns.add(column); - } - - cellCount = cells.length; - } else { - Row row = new Row(cellCount); - - if ((sep.charAt(0) == ',') ? parseCSVIntoRow(row, line) : parseTSVIntoRow(row, line)) { - project.rows.add(row); - project.columnModel.setMaxCellIndex(Math.max(project.columnModel.getMaxCellIndex(), row.cells.size())); - } - } - } - - project.columnModel.update(); - - redirect(response, "/project.html?project=" + project.id); - } - - static protected boolean parseTSVIntoRow(Row row, String line) { - boolean hasData = false; - - String[] cells = line.split("\t"); - for (int c = 0; c < cells.length; c++) { - String text = cells[c]; - - Cell cell = new Cell(parseCellValue(text), null); - - row.cells.add(cell); - - if (text.length() > 0) { - hasData = true; - } - } - return hasData; - } - - static protected boolean parseCSVIntoRow(Row row, String line) { - boolean hasData = false; - - int start = 0; - while (start < line.length()) { - String text = null; - - if (line.charAt(start) == '"') { - int next = line.indexOf('"', start + 1); - if (next < 0) { - text = line.substring(start); - start = line.length(); - } else { - text = line.substring(start, next + 1); - start = next + 2; - } - } else { - int next = line.indexOf(',', start); - if (next < 0) { - text = line.substring(start); - start = line.length(); - } else { - text = line.substring(start, next); - start = next + 1; - } - } - - Cell cell = new Cell(parseCellValue(text), null); - - row.cells.add(cell); - - if (text.length() > 0) { - hasData = true; - } - } - - return hasData; - } - - static public Object parseCellValue(String text) { - if (text.length() > 0) { - if (text.length() > 1 && text.startsWith("\"") && text.endsWith("\"")) { - return text.substring(1, text.length() - 1); - } - - try { - return Long.parseLong(text); - } catch (NumberFormatException e) { - } - - try { - return Double.parseDouble(text); - } catch (NumberFormatException e) { - } - } - return text; - } -} diff --git a/src/main/java/com/metaweb/gridworks/importers/ExcelImporter.java b/src/main/java/com/metaweb/gridworks/importers/ExcelImporter.java new file mode 100644 index 000000000..dd1bd1aa7 --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/importers/ExcelImporter.java @@ -0,0 +1,150 @@ +package com.metaweb.gridworks.importers; + +import java.io.InputStream; +import java.io.Reader; +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; + +import org.apache.commons.lang.NotImplementedException; +import org.apache.poi.hssf.usermodel.HSSFCell; +import org.apache.poi.hssf.usermodel.HSSFRow; +import org.apache.poi.hssf.usermodel.HSSFSheet; +import org.apache.poi.hssf.usermodel.HSSFWorkbook; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; + +import com.metaweb.gridworks.model.Cell; +import com.metaweb.gridworks.model.Column; +import com.metaweb.gridworks.model.Project; +import com.metaweb.gridworks.model.Row; + +public class ExcelImporter implements Importer { + + @Override + public boolean takesReader() { + return false; + } + + @Override + public void read(Reader reader, Project project, Properties options) + throws Exception { + + throw new NotImplementedException(); + } + + @Override + public void read(InputStream inputStream, Project project, + Properties options) throws Exception { + + POIFSFileSystem fs = new POIFSFileSystem(inputStream); + HSSFWorkbook wb = new HSSFWorkbook(fs); + HSSFSheet sheet = wb.getSheetAt(0); + + int firstRow = sheet.getFirstRowNum(); + int lastRow = sheet.getLastRowNum(); + int r = firstRow; + + List nonBlankIndices = null; + List nonBlankHeaderStrings = null; + + /* + * Find the header row + */ + for (; r <= lastRow; r++) { + HSSFRow row = sheet.getRow(r); + if (row == null) { + continue; + } + + short firstCell = row.getFirstCellNum(); + short lastCell = row.getLastCellNum(); + if (firstCell >= 0 && firstCell <= lastCell) { + nonBlankIndices = new ArrayList(lastCell - firstCell + 1); + nonBlankHeaderStrings = new ArrayList(lastCell - firstCell + 1); + + for (short c = firstCell; c <= lastCell; c++) { + HSSFCell cell = row.getCell(c); + if (cell != null) { + String text = cell.getStringCellValue().trim(); + if (text.length() > 0) { + nonBlankIndices.add((int) c); + nonBlankHeaderStrings.add(text); + } + } + } + + if (nonBlankIndices.size() > 0) { + r++; + break; + } + } + } + + if (nonBlankIndices == null || nonBlankIndices.size() == 0) { + return; + } + + /* + * Create columns + */ + for (int c = 0; c < nonBlankIndices.size(); c++) { + Column column = new Column(c, nonBlankHeaderStrings.get(c)); + project.columnModel.columns.add(column); + } + + /* + * Now process the data rows + */ + for (; r <= lastRow; r++) { + HSSFRow row = sheet.getRow(r); + if (row == null) { + continue; + } + + short firstCell = row.getFirstCellNum(); + short lastCell = row.getLastCellNum(); + if (firstCell >= 0 && firstCell <= lastCell) { + Row newRow = new Row(nonBlankIndices.size()); + boolean hasData = false; + + for (short c = 0; c < nonBlankIndices.size(); c++) { + if (c < firstCell || c > lastCell) { + continue; + } + + HSSFCell cell = row.getCell(c); + if (cell == null) { + continue; + } + + int cellType = cell.getCellType(); + if (cellType == HSSFCell.CELL_TYPE_ERROR || + cellType == HSSFCell.CELL_TYPE_BLANK) { + continue; + } + + Object value = null; + if (cellType == HSSFCell.CELL_TYPE_BOOLEAN) { + value = cell.getBooleanCellValue(); + } else if (cellType == HSSFCell.CELL_TYPE_NUMERIC) { + value = cell.getNumericCellValue(); + } else { + String text = cell.getStringCellValue().trim(); + if (text.length() > 0) { + value = text; + } + } + + if (value != null) { + newRow.setCell(c, new Cell(value, null)); + hasData = true; + } + } + + if (hasData) { + project.rows.add(newRow); + } + } + } + } +} diff --git a/src/main/java/com/metaweb/gridworks/importers/Importer.java b/src/main/java/com/metaweb/gridworks/importers/Importer.java new file mode 100644 index 000000000..9ecb3fa96 --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/importers/Importer.java @@ -0,0 +1,14 @@ +package com.metaweb.gridworks.importers; + +import java.io.InputStream; +import java.io.Reader; +import java.util.Properties; + +import com.metaweb.gridworks.model.Project; + +public interface Importer { + public boolean takesReader(); + + public void read(Reader reader, Project project, Properties options) throws Exception; + public void read(InputStream inputStream, Project project, Properties options) throws Exception; +} diff --git a/src/main/java/com/metaweb/gridworks/importers/ImporterUtilities.java b/src/main/java/com/metaweb/gridworks/importers/ImporterUtilities.java new file mode 100644 index 000000000..3e8e8a16e --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/importers/ImporterUtilities.java @@ -0,0 +1,84 @@ +package com.metaweb.gridworks.importers; + +import com.metaweb.gridworks.model.Cell; +import com.metaweb.gridworks.model.Row; + +public class ImporterUtilities { + + static public Object parseCellValue(String text) { + if (text.length() > 0) { + if (text.length() > 1 && text.startsWith("\"") && text.endsWith("\"")) { + return text.substring(1, text.length() - 1); + } + + try { + return Long.parseLong(text); + } catch (NumberFormatException e) { + } + + try { + return Double.parseDouble(text); + } catch (NumberFormatException e) { + } + } + return text; + } + + static public boolean parseCSVIntoRow(Row row, String line) { + boolean hasData = false; + + int start = 0; + while (start < line.length()) { + String text = null; + + if (line.charAt(start) == '"') { + int next = line.indexOf('"', start + 1); + if (next < 0) { + text = line.substring(start); + start = line.length(); + } else { + text = line.substring(start, next + 1); + start = next + 2; + } + } else { + int next = line.indexOf(',', start); + if (next < 0) { + text = line.substring(start); + start = line.length(); + } else { + text = line.substring(start, next); + start = next + 1; + } + } + + Cell cell = new Cell(parseCellValue(text), null); + + row.cells.add(cell); + + if (text.length() > 0) { + hasData = true; + } + } + + return hasData; + } + + static public boolean parseTSVIntoRow(Row row, String line) { + boolean hasData = false; + + String[] cells = line.split("\t"); + for (int c = 0; c < cells.length; c++) { + String text = cells[c]; + + Cell cell = new Cell(parseCellValue(text), null); + + row.cells.add(cell); + + if (text.length() > 0) { + hasData = true; + } + } + return hasData; + } + +} diff --git a/src/main/java/com/metaweb/gridworks/importers/TsvCsvImporter.java b/src/main/java/com/metaweb/gridworks/importers/TsvCsvImporter.java new file mode 100644 index 000000000..0f43e9a39 --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/importers/TsvCsvImporter.java @@ -0,0 +1,82 @@ +package com.metaweb.gridworks.importers; + +import java.io.InputStream; +import java.io.LineNumberReader; +import java.io.Reader; +import java.util.Properties; + +import org.apache.commons.lang.NotImplementedException; + +import com.metaweb.gridworks.model.Column; +import com.metaweb.gridworks.model.Project; +import com.metaweb.gridworks.model.Row; + +public class TsvCsvImporter implements Importer { + + @Override + public void read(Reader reader, Project project, Properties options) + throws Exception { + + LineNumberReader lnReader = new LineNumberReader(reader); + try { + String sep = null; // auto-detect TSV or CSV + String line = null; + boolean first = true; + int cellCount = 1; + + while ((line = lnReader.readLine()) != null) { + if (line.trim().isEmpty()) { + continue; + } + + if (sep == null) { + int tab = line.indexOf('\t'); + if (tab >= 0) { + sep = "\t"; + } else { + sep = ","; + } + } + + if (first) { + String[] cells = line.split(sep); + + first = false; + for (int c = 0; c < cells.length; c++) { + String cell = cells[c]; + if (cell.startsWith("\"") && cell.endsWith("\"")) { + cell = cell.substring(1, cell.length() - 1); + } + + Column column = new Column(c, cell); + + project.columnModel.columns.add(column); + } + + cellCount = cells.length; + } else { + Row row = new Row(cellCount); + + if ((sep.charAt(0) == ',') ? ImporterUtilities.parseCSVIntoRow(row, line) : ImporterUtilities.parseTSVIntoRow(row, line)) { + project.rows.add(row); + project.columnModel.setMaxCellIndex(Math.max(project.columnModel.getMaxCellIndex(), row.cells.size())); + } + } + } + } finally { + lnReader.close(); + } + } + + @Override + public void read(InputStream inputStream, Project project, + Properties options) throws Exception { + + throw new NotImplementedException(); + } + + @Override + public boolean takesReader() { + return true; + } +} diff --git a/src/main/java/com/metaweb/gridworks/util/ParsingUtilities.java b/src/main/java/com/metaweb/gridworks/util/ParsingUtilities.java index 9504447b8..844f422f9 100644 --- a/src/main/java/com/metaweb/gridworks/util/ParsingUtilities.java +++ b/src/main/java/com/metaweb/gridworks/util/ParsingUtilities.java @@ -6,6 +6,7 @@ import java.io.InputStreamReader; import java.io.Reader; import java.io.UnsupportedEncodingException; +import org.apache.commons.codec.DecoderException; import org.apache.commons.codec.net.URLCodec; import org.json.JSONArray; import org.json.JSONException; @@ -55,5 +56,13 @@ public class ParsingUtilities { return s; // should not happen } } - + static public String decode(String s) { + try { + return codec.decode(s, "UTF-8"); + } catch (UnsupportedEncodingException e) { + return s; // should not happen + } catch (DecoderException e) { + return s; // should not happen + } + } }