diff --git a/.classpath b/.classpath index 05a4eb25f..89118c7af 100644 --- a/.classpath +++ b/.classpath @@ -10,7 +10,7 @@ - + @@ -65,5 +65,10 @@ + + + + + diff --git a/main/src/com/google/refine/importers/OpenOfficeImporter.java b/main/src/com/google/refine/importers/OpenOfficeImporter.java new file mode 100644 index 000000000..af9da416a --- /dev/null +++ b/main/src/com/google/refine/importers/OpenOfficeImporter.java @@ -0,0 +1,266 @@ +/* + +Copyright 2011, Thomas F. Morris +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +package com.google.refine.importers; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.json.JSONArray; +import org.json.JSONObject; +import org.odftoolkit.odfdom.doc.OdfDocument; +import org.odftoolkit.odfdom.doc.table.OdfTable; +import org.odftoolkit.odfdom.doc.table.OdfTableCell; +import org.odftoolkit.odfdom.doc.table.OdfTableRow; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.refine.ProjectMetadata; +import com.google.refine.importing.ImportingJob; +import com.google.refine.importing.ImportingUtilities; +import com.google.refine.model.Cell; +import com.google.refine.model.Project; +import com.google.refine.model.Recon; +import com.google.refine.model.ReconCandidate; +import com.google.refine.model.Recon.Judgment; +import com.google.refine.util.JSONUtilities; + + +public class OpenOfficeImporter extends TabularImportingParserBase { + final static Logger logger = LoggerFactory.getLogger("open office"); + + public OpenOfficeImporter() { + super(true); + } + + + @Override + public JSONObject createParserUIInitializationData( + ImportingJob job, List fileRecords, String format) { + JSONObject options = super.createParserUIInitializationData(job, fileRecords, format); + + JSONArray sheetRecords = new JSONArray(); + JSONUtilities.safePut(options, "sheetRecords", sheetRecords); + OdfDocument odfDoc = null; + try { + JSONObject firstFileRecord = fileRecords.get(0); + File file = ImportingUtilities.getFile(job, firstFileRecord); + InputStream is = new FileInputStream(file); + odfDoc = OdfDocument.loadDocument(is); + List tables = odfDoc.getTableList(); + int sheetCount = tables.size(); + + boolean hasData = false; + for (int i = 0; i < sheetCount; i++) { + OdfTable sheet = tables.get(i); + int rows = sheet.getRowCount(); + + JSONObject sheetRecord = new JSONObject(); + JSONUtilities.safePut(sheetRecord, "name", sheet.getTableName()); + JSONUtilities.safePut(sheetRecord, "rows", rows); + if (hasData) { + JSONUtilities.safePut(sheetRecord, "selected", false); + } else if (rows > 0) { + JSONUtilities.safePut(sheetRecord, "selected", true); + hasData = true; + } + JSONUtilities.append(sheetRecords, sheetRecord); + } + } catch (FileNotFoundException e) { + logger.info("File not found",e); + } catch (Exception e) { + // ODF throws *VERY* wide exceptions + logger.info("Error reading ODF spreadsheet",e); + } finally { + if (odfDoc != null) { + odfDoc.close(); + } + } + return options; + } + + + @Override + public void parseOneFile( + Project project, + ProjectMetadata metadata, + ImportingJob job, + String fileSource, + InputStream inputStream, + int limit, + JSONObject options, + List exceptions + ) { + OdfDocument odfDoc; + try { + odfDoc = OdfDocument.loadDocument(inputStream); + } catch (Exception e) { // Ugh! could they throw any wider exception? + exceptions.add(e); + return; + } + + List tables = odfDoc.getTableList(); + + int[] sheets = JSONUtilities.getIntArray(options, "sheets"); + for (int sheetIndex : sheets) { + final OdfTable table = tables.get(sheetIndex); + final int lastRow = table.getRowCount(); + + TableDataReader dataReader = new TableDataReader() { + int nextRow = 0; + Map reconMap = new HashMap(); + + @Override + public List getNextRowOfCells() throws IOException { + if (nextRow > lastRow) { + return null; + } + + List cells = new ArrayList(); + OdfTableRow row = table.getRowByIndex(nextRow++); + if (row != null) { + int lastCell = row.getCellCount(); + for (int cellIndex = 0; cellIndex <= lastCell; cellIndex++) { + Cell cell = null; + + OdfTableCell sourceCell = row.getCellByIndex(cellIndex); + if (sourceCell != null) { + cell = extractCell(sourceCell, reconMap); + } + cells.add(cell); + } + } + return cells; + } + }; + + TabularImportingParserBase.readTable( + project, + metadata, + job, + dataReader, + fileSource + "#" + table.getTableName(), + limit, + options, + exceptions + ); + } + } + + static protected Serializable extractCell(OdfTableCell cell) { + // TODO: how can we tell if a cell contains an error? + String formula = cell.getFormula(); + + Serializable value = null; + // "boolean", "currency", "date", "float", "percentage", "string" or "time" + String cellType = cell.getValueType(); + if ("boolean".equals(cellType)) { + value = cell.getBooleanValue(); + } else if ("float".equals(cellType)) { + value = cell.getDoubleValue(); + } else if ("date".equals(cellType)) { + value = cell.getDateValue(); + } else if ("currency".equals(cellType)) { + value = cell.getCurrencyValue(); + } else if ("percentage".equals(cellType)) { + value = cell.getPercentageValue(); + } else if ("string".equals(cellType)) { + value = cell.getStringValue(); + } else { + logger.info("Unexpected cell type " + cellType); + value = cell.getDisplayText(); + } + return value; + } + + static protected Cell extractCell(OdfTableCell cell, Map reconMap) { + Serializable value = extractCell(cell); + + if (value != null) { + Recon recon = null; + + String hyperlink = ""; // TODO: cell.getHyperlink(); + if (hyperlink != null) { + String url = hyperlink; // TODO: hyperlink.getAddress(); + + if (url.startsWith("http://") || + url.startsWith("https://")) { + + final String sig = "freebase.com/view"; + + int i = url.indexOf(sig); + if (i > 0) { + String id = url.substring(i + sig.length()); + + int q = id.indexOf('?'); + if (q > 0) { + id = id.substring(0, q); + } + int h = id.indexOf('#'); + if (h > 0) { + id = id.substring(0, h); + } + + if (reconMap.containsKey(id)) { + recon = reconMap.get(id); + recon.judgmentBatchSize++; + } else { + recon = new Recon(0, null, null); + recon.service = "import"; + recon.match = new ReconCandidate(id, value.toString(), new String[0], 100); + recon.matchRank = 0; + recon.judgment = Judgment.Matched; + recon.judgmentAction = "auto"; + recon.judgmentBatchSize = 1; + recon.addCandidate(recon.match); + + reconMap.put(id, recon); + } + } + } + } + return new Cell(value, recon); + } else { + return null; + } + } + +} \ No newline at end of file diff --git a/main/webapp/WEB-INF/lib-src/odfdom-java-0.8.7-sources.jar b/main/webapp/WEB-INF/lib-src/odfdom-java-0.8.7-sources.jar new file mode 100644 index 000000000..785925d3f Binary files /dev/null and b/main/webapp/WEB-INF/lib-src/odfdom-java-0.8.7-sources.jar differ diff --git a/main/webapp/WEB-INF/lib/odfdom-java-0.8.7.jar b/main/webapp/WEB-INF/lib/odfdom-java-0.8.7.jar new file mode 100644 index 000000000..60c770796 Binary files /dev/null and b/main/webapp/WEB-INF/lib/odfdom-java-0.8.7.jar differ diff --git a/main/webapp/WEB-INF/lib/resolver.jar b/main/webapp/WEB-INF/lib/resolver.jar new file mode 100644 index 000000000..e535bdc07 Binary files /dev/null and b/main/webapp/WEB-INF/lib/resolver.jar differ diff --git a/main/webapp/WEB-INF/lib/serializer.jar b/main/webapp/WEB-INF/lib/serializer.jar new file mode 100644 index 000000000..de9b007b4 Binary files /dev/null and b/main/webapp/WEB-INF/lib/serializer.jar differ diff --git a/main/webapp/WEB-INF/lib/xercesImpl-2.11.jar b/main/webapp/WEB-INF/lib/xercesImpl-2.11.jar new file mode 100644 index 000000000..0aaa990f3 Binary files /dev/null and b/main/webapp/WEB-INF/lib/xercesImpl-2.11.jar differ diff --git a/main/webapp/WEB-INF/lib/xml-apis.jar b/main/webapp/WEB-INF/lib/xml-apis.jar new file mode 100644 index 000000000..46733464f Binary files /dev/null and b/main/webapp/WEB-INF/lib/xml-apis.jar differ diff --git a/main/webapp/modules/core/MOD-INF/controller.js b/main/webapp/modules/core/MOD-INF/controller.js index 9840f9b5d..145d6902c 100644 --- a/main/webapp/modules/core/MOD-INF/controller.js +++ b/main/webapp/modules/core/MOD-INF/controller.js @@ -197,6 +197,8 @@ function registerImporting() { IM.registerFormat("text/xml", "XML files", "XmlParserUI", new Packages.com.google.refine.importers.XmlImporter()); IM.registerFormat("text/xml/xlsx", "Excel (.xlsx) files", "ExcelParserUI", new Packages.com.google.refine.importers.ExcelImporter()); + // TODO: Can we get away with just reusing Excel importer UI? + IM.registerFormat("text/xml/ods", "Open Document Format spreadsheets (.ods)", "ExcelParserUI", new Packages.com.google.refine.importers.OpenOfficeImporter()); IM.registerFormat("text/xml/rdf", "RDF/XML files", "RdfParserUI", new Packages.com.google.refine.importers.RdfTripleImporter()); IM.registerFormat("text/json", "JSON files", "JsonParserUI", new Packages.com.google.refine.importers.JsonImporter()); IM.registerFormat("text/marc", "MARC files"); @@ -221,6 +223,8 @@ function registerImporting() { IM.registerExtension(".xls", "binary/xls"); IM.registerExtension(".xlsx", "text/xml/xlsx"); + + IM.registerExtension(".ods", "text/xml/ods"); IM.registerExtension(".n3", "text/rdf+n3"); @@ -246,6 +250,8 @@ function registerImporting() { IM.registerMimeType("application/x-excel", "binary/xls"); IM.registerMimeType("application/xls", "binary/xls"); IM.registerMimeType("application/x-xls", "text/xml/xlsx"); + + IM.registerMimeType("application/vnd.oasis.opendocument.spreadsheet","text/xml/ods"); IM.registerMimeType("application/json", "text/json"); IM.registerMimeType("text/json", "text/json");