New importer for Open Document Format (ODF) spreadsheet files (.ods)

git-svn-id: http://google-refine.googlecode.com/svn/trunk@2323 7d457c2a-affb-35e4-300a-418c747d4874
2011-10-11 20:27:40 +00:00 · 2011-10-11 20:27:40 +00:00 · ca17e1ef0a
commit ca17e1ef0a
parent 5bde74275b
9 changed files with 278 additions and 1 deletions
--- a/.classpath
+++ b/.classpath
@ -10,7 +10,7 @@
 	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
 	<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/ant-tools-1.8.0.jar"/>
 	<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/arithcode-1.1.jar"/>
-	<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/butterfly-trunk.jar"/>
+	<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/butterfly-trunk.jar" sourcepath="main/webapp/WEB-INF/lib-src/butterfly-trunk-sources.jar"/>
 	<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/clojure-1.1.0.jar"/>
 	<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/commons-codec-1.5.jar"/>
 	<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/commons-collections-3.2.1.jar"/>
@ -65,5 +65,10 @@
 	<classpathentry kind="lib" path="extensions/gdata/module/MOD-INF/lib/gdata-client-meta-1.0.jar"/>
 	<classpathentry kind="lib" path="extensions/gdata/module/MOD-INF/lib/gdata-spreadsheet-meta-3.0.jar"/>
 	<classpathentry kind="lib" path="extensions/gdata/module/MOD-INF/lib/mail.jar"/>
+	<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/odfdom-java-0.8.7.jar" sourcepath="main/webapp/WEB-INF/lib-src/odfdom-java-0.8.7-sources.jar"/>
+	<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/resolver.jar"/>
+	<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/serializer.jar"/>
+	<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/xercesImpl-2.11.jar"/>
+	<classpathentry kind="lib" path="main/webapp/WEB-INF/lib/xml-apis.jar"/>
 	<classpathentry kind="output" path="build"/>
 </classpath>
--- a/main/src/com/google/refine/importers/OpenOfficeImporter.java
+++ b/main/src/com/google/refine/importers/OpenOfficeImporter.java
@ -0,0 +1,266 @@
+/*
+
+Copyright 2011, Thomas F. Morris
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+    * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,           
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY           
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+package com.google.refine.importers;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.json.JSONArray;
+import org.json.JSONObject;
+import org.odftoolkit.odfdom.doc.OdfDocument;
+import org.odftoolkit.odfdom.doc.table.OdfTable;
+import org.odftoolkit.odfdom.doc.table.OdfTableCell;
+import org.odftoolkit.odfdom.doc.table.OdfTableRow;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.refine.ProjectMetadata;
+import com.google.refine.importing.ImportingJob;
+import com.google.refine.importing.ImportingUtilities;
+import com.google.refine.model.Cell;
+import com.google.refine.model.Project;
+import com.google.refine.model.Recon;
+import com.google.refine.model.ReconCandidate;
+import com.google.refine.model.Recon.Judgment;
+import com.google.refine.util.JSONUtilities;
+
+
+/**
+ * Importer for Open Document Format (ODF) spreadsheet files (.ods), reading
+ * the document through the ODFDOM {@link OdfDocument} API and feeding each
+ * selected table to {@link TabularImportingParserBase#readTable}.
+ */
+public class OpenOfficeImporter extends TabularImportingParserBase {
+    final static Logger logger = LoggerFactory.getLogger("open office");
+
+    public OpenOfficeImporter() {
+        super(true);
+    }
+
+    /**
+     * Extends the default parser UI options with a "sheetRecords" array that
+     * holds one record per table of the first file in {@code fileRecords}.
+     * <p>
+     * Each record carries the table "name" and its "rows" count. The first
+     * table with at least one row is flagged "selected" = true, every table
+     * after it is flagged false, and empty tables preceding it get no
+     * "selected" entry at all.
+     * <p>
+     * Failures while opening or reading the document are logged and do not
+     * propagate; the options gathered up to that point are returned.
+     *
+     * @param job         importing job the file belongs to
+     * @param fileRecords records of the uploaded files; only the first is inspected
+     * @param format      format identifier, passed through to the superclass
+     * @return the options object produced by the superclass plus "sheetRecords"
+     */
+    @Override
+    public JSONObject createParserUIInitializationData(
+            ImportingJob job, List<JSONObject> fileRecords, String format) {
+        JSONObject options = super.createParserUIInitializationData(job, fileRecords, format);
+
+        JSONArray sheetRecords = new JSONArray();
+        JSONUtilities.safePut(options, "sheetRecords", sheetRecords);
+
+        // Both resources are declared outside the try so that the finally
+        // block can release whichever of them was successfully opened.
+        InputStream is = null;
+        OdfDocument odfDoc = null;
+        try {
+            JSONObject firstFileRecord = fileRecords.get(0);
+            File file = ImportingUtilities.getFile(job, firstFileRecord);
+            is = new FileInputStream(file);
+            odfDoc = OdfDocument.loadDocument(is);
+
+            boolean hasData = false;
+            for (OdfTable sheet : odfDoc.getTableList()) {
+                int rows = sheet.getRowCount();
+
+                JSONObject sheetRecord = new JSONObject();
+                JSONUtilities.safePut(sheetRecord, "name", sheet.getTableName());
+                JSONUtilities.safePut(sheetRecord, "rows", rows);
+                if (hasData) {
+                    JSONUtilities.safePut(sheetRecord, "selected", false);
+                } else if (rows > 0) {
+                    // Only the first non-empty table is preselected.
+                    JSONUtilities.safePut(sheetRecord, "selected", true);
+                    hasData = true;
+                }
+                JSONUtilities.append(sheetRecords, sheetRecord);
+            }
+        } catch (FileNotFoundException e) {
+            logger.info("File not found",e);
+        } catch (Exception e) {
+            // ODF throws *VERY* wide exceptions
+            logger.info("Error reading ODF spreadsheet",e);
+        } finally {
+            if (odfDoc != null) {
+                odfDoc.close();
+            }
+            if (is != null) {
+                // The stream was previously leaked; close it explicitly.
+                try {
+                    is.close();
+                } catch (IOException e) {
+                    logger.info("Error closing ODF spreadsheet file", e);
+                }
+            }
+        }
+        return options;
+    }
+
+    /**
+     * Loads an ODF document from {@code inputStream} and imports every table
+     * whose index is listed in the "sheets" option.
+     * <p>
+     * If the document cannot be loaded, the exception is appended to
+     * {@code exceptions} and nothing is imported. The document is closed once
+     * all requested tables have been read (or reading has failed).
+     *
+     * @param project     project receiving the imported rows
+     * @param metadata    project metadata, passed through to {@code readTable}
+     * @param job         importing job, passed through to {@code readTable}
+     * @param fileSource  name of the source file; the table name is appended after '#'
+     * @param inputStream stream holding the ODF document
+     * @param limit       row limit, passed through to {@code readTable}
+     * @param options     parser options; "sheets" lists the table indexes to import
+     * @param exceptions  collector for errors encountered during the import
+     */
+    @Override
+    public void parseOneFile(
+            Project project,
+            ProjectMetadata metadata,
+            ImportingJob job,
+            String fileSource,
+            InputStream inputStream,
+            int limit,
+            JSONObject options,
+            List<Exception> exceptions
+    ) {
+        OdfDocument odfDoc;
+        try {
+            odfDoc = OdfDocument.loadDocument(inputStream);
+        } catch (Exception e) { // Ugh! could they throw any wider exception?
+            exceptions.add(e);
+            return;
+        }
+
+        try {
+            List<OdfTable> tables = odfDoc.getTableList();
+
+            int[] sheets = JSONUtilities.getIntArray(options, "sheets");
+            for (int sheetIndex : sheets) {
+                final OdfTable table = tables.get(sheetIndex);
+                final int rowCount = table.getRowCount();
+
+                TableDataReader dataReader = new TableDataReader() {
+                    int nextRow = 0;
+                    Map<String, Recon> reconMap = new HashMap<String, Recon>();
+
+                    @Override
+                    public List<Object> getNextRowOfCells() throws IOException {
+                        // rowCount is a count, so valid indexes are 0..rowCount-1.
+                        // Asking ODFDOM for index == rowCount would grow the
+                        // table and yield a phantom trailing row.
+                        if (nextRow >= rowCount) {
+                            return null;
+                        }
+
+                        List<Object> cells = new ArrayList<Object>();
+                        OdfTableRow row = table.getRowByIndex(nextRow++);
+                        if (row != null) {
+                            // Same reasoning as for rows: stop before cellCount.
+                            int cellCount = row.getCellCount();
+                            for (int cellIndex = 0; cellIndex < cellCount; cellIndex++) {
+                                Cell cell = null;
+
+                                OdfTableCell sourceCell = row.getCellByIndex(cellIndex);
+                                if (sourceCell != null) {
+                                    cell = extractCell(sourceCell, reconMap);
+                                }
+                                cells.add(cell);
+                            }
+                        }
+                        return cells;
+                    }
+                };
+
+                TabularImportingParserBase.readTable(
+                        project,
+                        metadata,
+                        job,
+                        dataReader,
+                        fileSource + "#" + table.getTableName(),
+                        limit,
+                        options,
+                        exceptions
+                );
+            }
+        } finally {
+            // Release the document even if reading a table fails.
+            odfDoc.close();
+        }
+    }
+
+    /**
+     * Converts the content of an ODF cell into a serializable Java value,
+     * chosen by the cell's declared value type. Cells whose type is not one
+     * of boolean, float, date, currency, percentage or string are logged and
+     * fall back to their display text.
+     *
+     * @param cell the ODF cell to read
+     * @return the typed value of the cell
+     */
+    static protected Serializable extractCell(OdfTableCell cell) {
+        // TODO: how can we tell if a cell contains an error?
+
+        Serializable value = null;
+        // "boolean", "currency", "date", "float", "percentage", "string" or "time"
+        String cellType = cell.getValueType();
+        if ("boolean".equals(cellType)) {
+            value = cell.getBooleanValue();
+        } else if ("float".equals(cellType)) {
+            value = cell.getDoubleValue();
+        } else if ("date".equals(cellType)) {
+            value = cell.getDateValue();
+        } else if ("currency".equals(cellType)) {
+            value = cell.getCurrencyValue();
+        } else if ("percentage".equals(cellType)) {
+            value = cell.getPercentageValue();
+        } else if ("string".equals(cellType)) {
+            value = cell.getStringValue();
+        } else {
+            logger.info("Unexpected cell type " + cellType);
+            value = cell.getDisplayText();
+        }
+        return value;
+    }
+
+    /**
+     * Builds a Refine {@link Cell} from an ODF cell, or returns {@code null}
+     * when the ODF cell yields no value.
+     * <p>
+     * The reconciliation branch turns a hyperlink containing
+     * "freebase.com/view" into a matched {@link Recon}, sharing one Recon per
+     * id through {@code reconMap}. Hyperlink extraction is still a TODO: the
+     * link is currently always the empty string, so the resulting cell never
+     * carries a Recon.
+     *
+     * @param cell     the ODF cell to convert
+     * @param reconMap Recon objects already created during this import, keyed by id
+     * @return the converted cell, or {@code null} if the ODF cell has no value
+     */
+    static protected Cell extractCell(OdfTableCell cell, Map<String, Recon> reconMap) {
+        Serializable value = extractCell(cell);
+        if (value == null) {
+            return null;
+        }
+
+        Recon recon = null;
+
+        String hyperlink = ""; // TODO: cell.getHyperlink();
+        if (hyperlink != null) {
+            String url = hyperlink; // TODO: hyperlink.getAddress();
+
+            if (url.startsWith("http://") ||
+                    url.startsWith("https://")) {
+
+                final String sig = "freebase.com/view";
+
+                int i = url.indexOf(sig);
+                if (i > 0) {
+                    // The id is whatever follows the signature, minus any
+                    // query string or fragment.
+                    String id = url.substring(i + sig.length());
+
+                    int q = id.indexOf('?');
+                    if (q > 0) {
+                        id = id.substring(0, q);
+                    }
+                    int h = id.indexOf('#');
+                    if (h > 0) {
+                        id = id.substring(0, h);
+                    }
+
+                    if (reconMap.containsKey(id)) {
+                        recon = reconMap.get(id);
+                        recon.judgmentBatchSize++;
+                    } else {
+                        recon = new Recon(0, null, null);
+                        recon.service = "import";
+                        recon.match = new ReconCandidate(id, value.toString(), new String[0], 100);
+                        recon.matchRank = 0;
+                        recon.judgment = Judgment.Matched;
+                        recon.judgmentAction = "auto";
+                        recon.judgmentBatchSize = 1;
+                        recon.addCandidate(recon.match);
+
+                        reconMap.put(id, recon);
+                    }
+                }
+            }
+        }
+        return new Cell(value, recon);
+    }
+
+}
--- a/main/webapp/WEB-INF/lib-src/odfdom-java-0.8.7-sources.jar
+++ b/main/webapp/WEB-INF/lib-src/odfdom-java-0.8.7-sources.jar
--- a/main/webapp/WEB-INF/lib/odfdom-java-0.8.7.jar
+++ b/main/webapp/WEB-INF/lib/odfdom-java-0.8.7.jar
--- a/main/webapp/WEB-INF/lib/resolver.jar
+++ b/main/webapp/WEB-INF/lib/resolver.jar
--- a/main/webapp/WEB-INF/lib/serializer.jar
+++ b/main/webapp/WEB-INF/lib/serializer.jar
--- a/main/webapp/WEB-INF/lib/xercesImpl-2.11.jar
+++ b/main/webapp/WEB-INF/lib/xercesImpl-2.11.jar
--- a/main/webapp/WEB-INF/lib/xml-apis.jar
+++ b/main/webapp/WEB-INF/lib/xml-apis.jar
--- a/main/webapp/modules/core/MOD-INF/controller.js
+++ b/main/webapp/modules/core/MOD-INF/controller.js
@ -197,6 +197,8 @@ function registerImporting() {

  IM.registerFormat("text/xml", "XML files", "XmlParserUI", new Packages.com.google.refine.importers.XmlImporter());
  IM.registerFormat("text/xml/xlsx", "Excel (.xlsx) files", "ExcelParserUI", new Packages.com.google.refine.importers.ExcelImporter());
+  // TODO: Can we get away with just reusing Excel importer UI?
+  IM.registerFormat("text/xml/ods", "Open Document Format spreadsheets (.ods)", "ExcelParserUI", new Packages.com.google.refine.importers.OpenOfficeImporter());
  IM.registerFormat("text/xml/rdf", "RDF/XML files", "RdfParserUI", new Packages.com.google.refine.importers.RdfTripleImporter());
  IM.registerFormat("text/json", "JSON files", "JsonParserUI", new Packages.com.google.refine.importers.JsonImporter());
  IM.registerFormat("text/marc", "MARC files");
@ -221,6 +223,8 @@ function registerImporting() {

  IM.registerExtension(".xls", "binary/xls");
  IM.registerExtension(".xlsx", "text/xml/xlsx");
+
+  IM.registerExtension(".ods", "text/xml/ods");
  
  IM.registerExtension(".n3", "text/rdf+n3");

@ -246,6 +250,8 @@ function registerImporting() {
  IM.registerMimeType("application/x-excel", "binary/xls");
  IM.registerMimeType("application/xls", "binary/xls");
  IM.registerMimeType("application/x-xls", "text/xml/xlsx");
+  
+  IM.registerMimeType("application/vnd.oasis.opendocument.spreadsheet","text/xml/ods");

  IM.registerMimeType("application/json", "text/json");
  IM.registerMimeType("text/json", "text/json");