2010-10-20 22:45:52 +02:00
|
|
|
/*

Copyright 2010, Google Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/
|
|
|
|
|
2010-09-22 19:04:10 +02:00
|
|
|
package com.google.refine.importers;
|
2010-05-05 01:24:48 +02:00
|
|
|
|
2011-08-02 05:34:47 +02:00
|
|
|
import java.io.File;
|
|
|
|
import java.io.FileInputStream;
|
2010-05-05 01:24:48 +02:00
|
|
|
import java.io.IOException;
|
|
|
|
import java.io.InputStream;
|
|
|
|
import java.io.Serializable;
|
|
|
|
import java.util.ArrayList;
|
|
|
|
import java.util.HashMap;
|
|
|
|
import java.util.List;
|
|
|
|
import java.util.Map;
|
|
|
|
|
|
|
|
import org.apache.poi.common.usermodel.Hyperlink;
|
|
|
|
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
|
|
|
|
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
|
|
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
|
|
|
import org.apache.poi.ss.usermodel.Sheet;
|
|
|
|
import org.apache.poi.ss.usermodel.Workbook;
|
|
|
|
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
2011-08-02 05:34:47 +02:00
|
|
|
import org.json.JSONArray;
|
|
|
|
import org.json.JSONObject;
|
2011-08-22 23:47:15 +02:00
|
|
|
import org.slf4j.Logger;
|
|
|
|
import org.slf4j.LoggerFactory;
|
2010-05-05 01:24:48 +02:00
|
|
|
|
2010-09-22 19:04:10 +02:00
|
|
|
import com.google.refine.ProjectMetadata;
|
2011-08-02 05:34:47 +02:00
|
|
|
import com.google.refine.importing.ImportingJob;
|
|
|
|
import com.google.refine.importing.ImportingUtilities;
|
2010-09-22 19:04:10 +02:00
|
|
|
import com.google.refine.model.Cell;
|
|
|
|
import com.google.refine.model.Project;
|
|
|
|
import com.google.refine.model.Recon;
|
|
|
|
import com.google.refine.model.Recon.Judgment;
|
2011-08-02 23:10:22 +02:00
|
|
|
import com.google.refine.model.ReconCandidate;
|
2011-08-02 05:34:47 +02:00
|
|
|
import com.google.refine.util.JSONUtilities;
|
2010-05-05 01:24:48 +02:00
|
|
|
|
2011-08-02 05:34:47 +02:00
|
|
|
public class ExcelImporter extends TabularImportingParserBase {
|
2011-08-22 23:47:15 +02:00
|
|
|
static final Logger logger = LoggerFactory.getLogger(ExcelImporter.class);
|
|
|
|
|
2011-08-02 05:34:47 +02:00
|
|
|
public ExcelImporter() {
|
|
|
|
super(true);
|
|
|
|
}
|
|
|
|
|
2010-08-06 07:04:25 +02:00
|
|
|
@Override
|
2011-08-02 05:34:47 +02:00
|
|
|
public JSONObject createParserUIInitializationData(
|
|
|
|
ImportingJob job, List<JSONObject> fileRecords, String format) {
|
|
|
|
JSONObject options = super.createParserUIInitializationData(job, fileRecords, format);
|
|
|
|
|
|
|
|
boolean xmlBased = "text/xml/xlsx".equals(format);
|
|
|
|
JSONUtilities.safePut(options, "xmlBased", xmlBased);
|
|
|
|
|
|
|
|
JSONArray sheetRecords = new JSONArray();
|
|
|
|
JSONUtilities.safePut(options, "sheetRecords", sheetRecords);
|
|
|
|
try {
|
2011-08-18 19:39:18 +02:00
|
|
|
if (fileRecords.size() > 0) {
|
|
|
|
JSONObject firstFileRecord = fileRecords.get(0);
|
|
|
|
File file = ImportingUtilities.getFile(job, firstFileRecord);
|
|
|
|
InputStream is = new FileInputStream(file);
|
|
|
|
try {
|
|
|
|
Workbook wb = xmlBased ?
|
|
|
|
new XSSFWorkbook(is) :
|
|
|
|
new HSSFWorkbook(new POIFSFileSystem(is));
|
|
|
|
|
|
|
|
int sheetCount = wb.getNumberOfSheets();
|
|
|
|
boolean hasData = false;
|
|
|
|
for (int i = 0; i < sheetCount; i++) {
|
|
|
|
Sheet sheet = wb.getSheetAt(i);
|
|
|
|
int rows = sheet.getLastRowNum() - sheet.getFirstRowNum() + 1;
|
|
|
|
|
|
|
|
JSONObject sheetRecord = new JSONObject();
|
|
|
|
JSONUtilities.safePut(sheetRecord, "name", sheet.getSheetName());
|
|
|
|
JSONUtilities.safePut(sheetRecord, "rows", rows);
|
|
|
|
if (hasData) {
|
|
|
|
JSONUtilities.safePut(sheetRecord, "selected", false);
|
|
|
|
} else if (rows > 1) {
|
|
|
|
JSONUtilities.safePut(sheetRecord, "selected", true);
|
|
|
|
hasData = true;
|
|
|
|
}
|
|
|
|
JSONUtilities.append(sheetRecords, sheetRecord);
|
|
|
|
}
|
|
|
|
} finally {
|
|
|
|
is.close();
|
2011-08-02 05:34:47 +02:00
|
|
|
}
|
2011-08-18 19:39:18 +02:00
|
|
|
}
|
2011-08-02 05:34:47 +02:00
|
|
|
} catch (IOException e) {
|
2011-08-22 23:47:15 +02:00
|
|
|
logger.error("Error generating parser UI initialization data for Excel file", e);
|
2011-08-02 05:34:47 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
return options;
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public void parseOneFile(
|
|
|
|
Project project,
|
|
|
|
ProjectMetadata metadata,
|
|
|
|
ImportingJob job,
|
|
|
|
String fileSource,
|
|
|
|
InputStream inputStream,
|
|
|
|
int limit,
|
|
|
|
JSONObject options,
|
|
|
|
List<Exception> exceptions
|
|
|
|
) {
|
|
|
|
boolean xmlBased = JSONUtilities.getBoolean(options, "xmlBased", false);
|
2010-05-05 01:24:48 +02:00
|
|
|
Workbook wb = null;
|
|
|
|
try {
|
2011-08-02 05:34:47 +02:00
|
|
|
wb = xmlBased ?
|
2010-05-26 15:18:48 +02:00
|
|
|
new XSSFWorkbook(inputStream) :
|
2010-05-05 01:24:48 +02:00
|
|
|
new HSSFWorkbook(new POIFSFileSystem(inputStream));
|
|
|
|
} catch (IOException e) {
|
2011-08-02 05:34:47 +02:00
|
|
|
exceptions.add(new ImportException(
|
2010-11-11 15:25:46 +01:00
|
|
|
"Attempted to parse as an Excel file but failed. " +
|
2010-05-05 01:24:48 +02:00
|
|
|
"Try to use Excel to re-save the file as a different Excel version or as TSV and upload again.",
|
|
|
|
e
|
2011-08-02 05:34:47 +02:00
|
|
|
));
|
|
|
|
return;
|
2010-11-11 15:25:46 +01:00
|
|
|
} catch (ArrayIndexOutOfBoundsException e){
|
2011-08-02 05:34:47 +02:00
|
|
|
exceptions.add(new ImportException(
|
|
|
|
"Attempted to parse file as an Excel file but failed. " +
|
|
|
|
"This is probably caused by a corrupt excel file, or due to the file having previously been created or saved by a non-Microsoft application. " +
|
|
|
|
"Please try opening the file in Microsoft Excel and resaving it, then try re-uploading the file. " +
|
|
|
|
"See https://issues.apache.org/bugzilla/show_bug.cgi?id=48261 for further details",
|
|
|
|
e
|
|
|
|
));
|
|
|
|
return;
|
2010-05-05 01:24:48 +02:00
|
|
|
}
|
2010-08-01 06:22:45 +02:00
|
|
|
|
2011-08-02 05:34:47 +02:00
|
|
|
int[] sheets = JSONUtilities.getIntArray(options, "sheets");
|
|
|
|
for (int sheetIndex : sheets) {
|
|
|
|
final Sheet sheet = wb.getSheetAt(sheetIndex);
|
|
|
|
final int lastRow = sheet.getLastRowNum();
|
2010-08-01 06:22:45 +02:00
|
|
|
|
2011-08-02 05:34:47 +02:00
|
|
|
TableDataReader dataReader = new TableDataReader() {
|
|
|
|
int nextRow = 0;
|
|
|
|
Map<String, Recon> reconMap = new HashMap<String, Recon>();
|
2010-08-01 06:22:45 +02:00
|
|
|
|
2011-08-02 05:34:47 +02:00
|
|
|
@Override
|
|
|
|
public List<Object> getNextRowOfCells() throws IOException {
|
2011-08-06 21:37:23 +02:00
|
|
|
if (nextRow > lastRow) {
|
2011-08-02 05:34:47 +02:00
|
|
|
return null;
|
2010-05-05 01:24:48 +02:00
|
|
|
}
|
2010-08-01 06:22:45 +02:00
|
|
|
|
2011-08-02 05:34:47 +02:00
|
|
|
List<Object> cells = new ArrayList<Object>();
|
|
|
|
org.apache.poi.ss.usermodel.Row row = sheet.getRow(nextRow++);
|
|
|
|
if (row != null) {
|
|
|
|
short lastCell = row.getLastCellNum();
|
2011-12-10 00:50:40 +01:00
|
|
|
for (short cellIndex = 0; cellIndex < lastCell; cellIndex++) {
|
2011-08-02 05:34:47 +02:00
|
|
|
Cell cell = null;
|
|
|
|
|
|
|
|
org.apache.poi.ss.usermodel.Cell sourceCell = row.getCell(cellIndex);
|
|
|
|
if (sourceCell != null) {
|
|
|
|
cell = extractCell(sourceCell, reconMap);
|
|
|
|
}
|
|
|
|
cells.add(cell);
|
2010-05-05 01:24:48 +02:00
|
|
|
}
|
|
|
|
}
|
2011-08-02 05:34:47 +02:00
|
|
|
return cells;
|
2010-05-05 01:24:48 +02:00
|
|
|
}
|
2011-08-02 05:34:47 +02:00
|
|
|
};
|
2010-08-01 06:22:45 +02:00
|
|
|
|
2011-08-11 02:35:01 +02:00
|
|
|
TabularImportingParserBase.readTable(
|
2011-08-02 05:34:47 +02:00
|
|
|
project,
|
|
|
|
metadata,
|
|
|
|
job,
|
|
|
|
dataReader,
|
|
|
|
fileSource + "#" + sheet.getSheetName(),
|
|
|
|
limit,
|
|
|
|
options,
|
|
|
|
exceptions
|
|
|
|
);
|
2010-08-01 06:22:45 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-08-02 05:34:47 +02:00
|
|
|
static protected Serializable extractCell(org.apache.poi.ss.usermodel.Cell cell) {
|
2010-08-01 06:22:45 +02:00
|
|
|
int cellType = cell.getCellType();
|
2010-12-25 22:41:21 +01:00
|
|
|
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_FORMULA) {
|
|
|
|
cellType = cell.getCachedFormulaResultType();
|
|
|
|
}
|
2010-08-01 06:22:45 +02:00
|
|
|
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_ERROR ||
|
|
|
|
cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_BLANK) {
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
|
|
|
|
Serializable value = null;
|
|
|
|
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_BOOLEAN) {
|
|
|
|
value = cell.getBooleanCellValue();
|
|
|
|
} else if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_NUMERIC) {
|
|
|
|
double d = cell.getNumericCellValue();
|
|
|
|
|
|
|
|
if (HSSFDateUtil.isCellDateFormatted(cell)) {
|
|
|
|
value = HSSFDateUtil.getJavaDate(d);
|
2011-12-10 00:50:40 +01:00
|
|
|
// TODO: If we had a time datatype, we could use something like the following
|
|
|
|
// to distinguish times from dates (although Excel doesn't really make the distinction)
|
|
|
|
// Another alternative would be to look for values < 0.60
|
|
|
|
// String format = cell.getCellStyle().getDataFormatString();
|
|
|
|
// if (!format.contains("d") && !format.contains("m") && !format.contains("y") ) {
|
|
|
|
// // It's just a time
|
|
|
|
// }
|
2010-08-01 06:22:45 +02:00
|
|
|
} else {
|
|
|
|
value = d;
|
|
|
|
}
|
|
|
|
} else {
|
2011-10-13 01:45:52 +02:00
|
|
|
String text = cell.getStringCellValue();
|
2010-08-01 06:22:45 +02:00
|
|
|
if (text.length() > 0) {
|
|
|
|
value = text;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-08-06 07:04:25 +02:00
|
|
|
return value;
|
|
|
|
}
|
|
|
|
|
2011-08-02 05:34:47 +02:00
|
|
|
static protected Cell extractCell(org.apache.poi.ss.usermodel.Cell cell, Map<String, Recon> reconMap) {
|
2010-08-06 07:04:25 +02:00
|
|
|
Serializable value = extractCell(cell);
|
|
|
|
|
2010-08-01 06:22:45 +02:00
|
|
|
if (value != null) {
|
|
|
|
Recon recon = null;
|
|
|
|
|
|
|
|
Hyperlink hyperlink = cell.getHyperlink();
|
|
|
|
if (hyperlink != null) {
|
|
|
|
String url = hyperlink.getAddress();
|
|
|
|
|
|
|
|
if (url.startsWith("http://") ||
|
|
|
|
url.startsWith("https://")) {
|
|
|
|
|
|
|
|
final String sig = "freebase.com/view";
|
|
|
|
|
|
|
|
int i = url.indexOf(sig);
|
|
|
|
if (i > 0) {
|
|
|
|
String id = url.substring(i + sig.length());
|
|
|
|
|
|
|
|
int q = id.indexOf('?');
|
|
|
|
if (q > 0) {
|
|
|
|
id = id.substring(0, q);
|
|
|
|
}
|
|
|
|
int h = id.indexOf('#');
|
|
|
|
if (h > 0) {
|
|
|
|
id = id.substring(0, h);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (reconMap.containsKey(id)) {
|
|
|
|
recon = reconMap.get(id);
|
|
|
|
recon.judgmentBatchSize++;
|
|
|
|
} else {
|
|
|
|
recon = new Recon(0, null, null);
|
|
|
|
recon.service = "import";
|
|
|
|
recon.match = new ReconCandidate(id, value.toString(), new String[0], 100);
|
|
|
|
recon.matchRank = 0;
|
|
|
|
recon.judgment = Judgment.Matched;
|
|
|
|
recon.judgmentAction = "auto";
|
|
|
|
recon.judgmentBatchSize = 1;
|
|
|
|
recon.addCandidate(recon.match);
|
|
|
|
|
|
|
|
reconMap.put(id, recon);
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return new Cell(value, recon);
|
|
|
|
} else {
|
|
|
|
return null;
|
|
|
|
}
|
2010-05-05 01:24:48 +02:00
|
|
|
}
|
|
|
|
}
|