2010-02-05 20:19:38 +01:00
|
|
|
package com.metaweb.gridworks.importers;
|
|
|
|
|
2010-03-17 22:43:58 +01:00
|
|
|
import java.io.IOException;
|
2010-02-05 20:19:38 +01:00
|
|
|
import java.io.InputStream;
|
|
|
|
import java.io.Reader;
|
2010-03-04 20:59:31 +01:00
|
|
|
import java.io.Serializable;
|
2010-02-05 20:19:38 +01:00
|
|
|
import java.util.ArrayList;
|
2010-04-06 19:55:36 +02:00
|
|
|
import java.util.HashMap;
|
2010-02-05 20:19:38 +01:00
|
|
|
import java.util.List;
|
2010-04-06 19:55:36 +02:00
|
|
|
import java.util.Map;
|
2010-02-05 20:19:38 +01:00
|
|
|
import java.util.Properties;
|
|
|
|
|
|
|
|
import org.apache.commons.lang.NotImplementedException;
|
2010-03-19 01:32:52 +01:00
|
|
|
import org.apache.poi.common.usermodel.Hyperlink;
|
2010-02-05 20:19:38 +01:00
|
|
|
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
|
|
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
2010-02-09 00:44:33 +01:00
|
|
|
import org.apache.poi.ss.usermodel.Sheet;
|
|
|
|
import org.apache.poi.ss.usermodel.Workbook;
|
|
|
|
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
2010-02-05 20:19:38 +01:00
|
|
|
|
|
|
|
import com.metaweb.gridworks.model.Cell;
|
|
|
|
import com.metaweb.gridworks.model.Column;
|
|
|
|
import com.metaweb.gridworks.model.Project;
|
2010-03-19 01:32:52 +01:00
|
|
|
import com.metaweb.gridworks.model.Recon;
|
|
|
|
import com.metaweb.gridworks.model.ReconCandidate;
|
2010-02-05 20:19:38 +01:00
|
|
|
import com.metaweb.gridworks.model.Row;
|
2010-03-19 01:32:52 +01:00
|
|
|
import com.metaweb.gridworks.model.Recon.Judgment;
|
2010-02-05 20:19:38 +01:00
|
|
|
|
|
|
|
public class ExcelImporter implements Importer {
|
2010-03-03 05:19:58 +01:00
|
|
|
final protected boolean _xmlBased;
|
|
|
|
|
|
|
|
public ExcelImporter(boolean xmlBased) {
|
|
|
|
_xmlBased = xmlBased;
|
|
|
|
}
|
2010-02-05 20:19:38 +01:00
|
|
|
|
2010-03-03 05:19:58 +01:00
|
|
|
public boolean takesReader() {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2010-04-11 23:54:56 +02:00
|
|
|
public void read(Reader reader, Project project, Properties options) throws Exception {
|
2010-03-03 05:19:58 +01:00
|
|
|
throw new NotImplementedException();
|
|
|
|
}
|
2010-02-05 20:19:38 +01:00
|
|
|
|
2010-04-11 23:54:56 +02:00
|
|
|
public void read(InputStream inputStream, Project project, Properties options) throws Exception {
|
2010-04-13 23:23:41 +02:00
|
|
|
int ignoreLines = ImporterUtilities.getIntegerOption("ignore", options, -1);
|
2010-04-11 23:54:56 +02:00
|
|
|
int limit = ImporterUtilities.getIntegerOption("limit",options,-1);
|
|
|
|
int skip = ImporterUtilities.getIntegerOption("skip",options,0);
|
2010-03-03 05:19:58 +01:00
|
|
|
|
2010-03-17 22:43:58 +01:00
|
|
|
Workbook wb = null;
|
|
|
|
try {
|
|
|
|
wb = _xmlBased ?
|
2010-03-03 05:19:58 +01:00
|
|
|
new XSSFWorkbook(inputStream) :
|
|
|
|
new HSSFWorkbook(new POIFSFileSystem(inputStream));
|
2010-03-17 22:43:58 +01:00
|
|
|
} catch (IOException e) {
|
|
|
|
throw new IOException(
|
|
|
|
"Attempted to parse file as Excel file but failed. " +
|
|
|
|
"Try to use Excel to re-save the file as a different Excel version or as TSV and upload again.",
|
|
|
|
e
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
2010-02-09 00:44:33 +01:00
|
|
|
Sheet sheet = wb.getSheetAt(0);
|
2010-02-05 20:19:38 +01:00
|
|
|
|
|
|
|
int firstRow = sheet.getFirstRowNum();
|
|
|
|
int lastRow = sheet.getLastRowNum();
|
|
|
|
int r = firstRow;
|
|
|
|
|
2010-04-13 23:23:41 +02:00
|
|
|
List<Integer> nonBlankIndices = null;
|
2010-03-03 05:19:58 +01:00
|
|
|
List<String> nonBlankHeaderStrings = null;
|
2010-02-05 20:19:38 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Find the header row
|
|
|
|
*/
|
|
|
|
for (; r <= lastRow; r++) {
|
2010-02-09 00:44:33 +01:00
|
|
|
org.apache.poi.ss.usermodel.Row row = sheet.getRow(r);
|
2010-02-05 20:19:38 +01:00
|
|
|
if (row == null) {
|
|
|
|
continue;
|
2010-04-13 23:23:41 +02:00
|
|
|
} else if (ignoreLines > 0) {
|
|
|
|
ignoreLines--;
|
|
|
|
continue;
|
2010-02-05 20:19:38 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
short firstCell = row.getFirstCellNum();
|
|
|
|
short lastCell = row.getLastCellNum();
|
|
|
|
if (firstCell >= 0 && firstCell <= lastCell) {
|
2010-03-03 05:19:58 +01:00
|
|
|
nonBlankIndices = new ArrayList<Integer>(lastCell - firstCell + 1);
|
|
|
|
nonBlankHeaderStrings = new ArrayList<String>(lastCell - firstCell + 1);
|
|
|
|
|
2010-02-08 20:54:09 +01:00
|
|
|
for (int c = firstCell; c <= lastCell; c++) {
|
2010-02-09 00:44:33 +01:00
|
|
|
org.apache.poi.ss.usermodel.Cell cell = row.getCell(c);
|
2010-02-05 20:19:38 +01:00
|
|
|
if (cell != null) {
|
|
|
|
String text = cell.getStringCellValue().trim();
|
|
|
|
if (text.length() > 0) {
|
2010-03-03 05:19:58 +01:00
|
|
|
nonBlankIndices.add((int) c);
|
|
|
|
nonBlankHeaderStrings.add(text);
|
2010-02-05 20:19:38 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nonBlankIndices.size() > 0) {
|
2010-03-03 05:19:58 +01:00
|
|
|
r++;
|
|
|
|
break;
|
2010-02-05 20:19:38 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nonBlankIndices == null || nonBlankIndices.size() == 0) {
|
2010-03-03 05:19:58 +01:00
|
|
|
return;
|
2010-02-05 20:19:38 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create columns
|
|
|
|
*/
|
2010-04-06 19:55:36 +02:00
|
|
|
Map<String, Integer> nameToIndex = new HashMap<String, Integer>();
|
2010-02-05 20:19:38 +01:00
|
|
|
for (int c = 0; c < nonBlankIndices.size(); c++) {
|
2010-04-06 19:55:36 +02:00
|
|
|
String cell = nonBlankHeaderStrings.get(c);
|
|
|
|
if (nameToIndex.containsKey(cell)) {
|
|
|
|
int index = nameToIndex.get(cell);
|
|
|
|
nameToIndex.put(cell, index + 1);
|
|
|
|
|
|
|
|
cell = cell.contains(" ") ? (cell + " " + index) : (cell + index);
|
|
|
|
} else {
|
|
|
|
nameToIndex.put(cell, 2);
|
|
|
|
}
|
|
|
|
|
|
|
|
Column column = new Column(c, cell);
|
2010-03-03 05:19:58 +01:00
|
|
|
project.columnModel.columns.add(column);
|
2010-02-05 20:19:38 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Now process the data rows
|
|
|
|
*/
|
2010-02-24 19:52:54 +01:00
|
|
|
int rowsWithData = 0;
|
2010-02-05 20:19:38 +01:00
|
|
|
for (; r <= lastRow; r++) {
|
2010-02-09 00:44:33 +01:00
|
|
|
org.apache.poi.ss.usermodel.Row row = sheet.getRow(r);
|
2010-02-05 20:19:38 +01:00
|
|
|
if (row == null) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
short firstCell = row.getFirstCellNum();
|
|
|
|
short lastCell = row.getLastCellNum();
|
|
|
|
if (firstCell >= 0 && firstCell <= lastCell) {
|
2010-03-03 05:19:58 +01:00
|
|
|
Row newRow = new Row(nonBlankIndices.size());
|
|
|
|
boolean hasData = false;
|
|
|
|
|
|
|
|
for (int c = 0; c < nonBlankIndices.size(); c++) {
|
|
|
|
if (c < firstCell || c > lastCell) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
org.apache.poi.ss.usermodel.Cell cell = row.getCell(c);
|
|
|
|
if (cell == null) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
int cellType = cell.getCellType();
|
|
|
|
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_ERROR ||
|
|
|
|
cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_BLANK) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_FORMULA) {
|
|
|
|
cellType = cell.getCachedFormulaResultType();
|
|
|
|
}
|
|
|
|
|
2010-03-04 20:59:31 +01:00
|
|
|
Serializable value = null;
|
2010-03-03 05:19:58 +01:00
|
|
|
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_BOOLEAN) {
|
|
|
|
value = cell.getBooleanCellValue();
|
|
|
|
} else if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_NUMERIC) {
|
|
|
|
value = cell.getNumericCellValue();
|
|
|
|
} else {
|
2010-02-05 20:19:38 +01:00
|
|
|
String text = cell.getStringCellValue().trim();
|
|
|
|
if (text.length() > 0) {
|
2010-03-03 05:19:58 +01:00
|
|
|
value = text;
|
2010-02-05 20:19:38 +01:00
|
|
|
}
|
|
|
|
}
|
2010-03-03 05:19:58 +01:00
|
|
|
|
|
|
|
if (value != null) {
|
2010-03-19 01:32:52 +01:00
|
|
|
Recon recon = null;
|
|
|
|
|
|
|
|
Hyperlink hyperlink = cell.getHyperlink();
|
|
|
|
if (hyperlink != null) {
|
|
|
|
String url = hyperlink.getAddress();
|
|
|
|
|
|
|
|
if (url.startsWith("http://") ||
|
|
|
|
url.startsWith("https://")) {
|
|
|
|
|
|
|
|
final String sig = "freebase.com/view";
|
|
|
|
|
|
|
|
int i = url.indexOf(sig);
|
|
|
|
if (i > 0) {
|
|
|
|
String id = url.substring(i + sig.length());
|
|
|
|
|
|
|
|
int q = id.indexOf('?');
|
|
|
|
if (q > 0) {
|
|
|
|
id = id.substring(0, q);
|
|
|
|
}
|
|
|
|
int h = id.indexOf('#');
|
|
|
|
if (h > 0) {
|
|
|
|
id = id.substring(0, h);
|
|
|
|
}
|
|
|
|
|
|
|
|
recon = new Recon();
|
|
|
|
recon.judgment = Judgment.Matched;
|
|
|
|
recon.match = new ReconCandidate(id, "", value.toString(), new String[0], 100);
|
|
|
|
recon.addCandidate(recon.match);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
newRow.setCell(c, new Cell(value, recon));
|
2010-03-03 05:19:58 +01:00
|
|
|
hasData = true;
|
|
|
|
}
|
2010-02-05 20:19:38 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
if (hasData) {
|
2010-02-24 19:52:54 +01:00
|
|
|
rowsWithData++;
|
|
|
|
|
|
|
|
if (skip <= 0 || rowsWithData > skip) {
|
2010-03-03 05:19:58 +01:00
|
|
|
project.rows.add(newRow);
|
2010-03-12 01:23:01 +01:00
|
|
|
project.columnModel.setMaxCellIndex(newRow.cells.size());
|
|
|
|
|
2010-03-03 05:19:58 +01:00
|
|
|
if (limit > 0 && project.rows.size() >= limit) {
|
|
|
|
break;
|
|
|
|
}
|
2010-02-24 19:52:54 +01:00
|
|
|
}
|
2010-02-05 20:19:38 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|