The Excel importer now supports a "header-lines" option (default: 1).

git-svn-id: http://google-refine.googlecode.com/svn/trunk@1125 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
David Huynh 2010-08-01 04:22:45 +00:00
parent 7bb6674e5b
commit 4ad31ffcde

View File

@ -6,9 +6,11 @@ import java.io.Reader;
import java.io.Serializable; import java.io.Serializable;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Properties; import java.util.Properties;
import java.util.Set;
import org.apache.poi.common.usermodel.Hyperlink; import org.apache.poi.common.usermodel.Hyperlink;
import org.apache.poi.hssf.usermodel.HSSFDateUtil; import org.apache.poi.hssf.usermodel.HSSFDateUtil;
@ -39,8 +41,9 @@ public class ExcelImporter implements Importer {
public void read(InputStream inputStream, Project project, Properties options) throws Exception { public void read(InputStream inputStream, Project project, Properties options) throws Exception {
int ignoreLines = ImporterUtilities.getIntegerOption("ignore", options, -1); int ignoreLines = ImporterUtilities.getIntegerOption("ignore", options, -1);
int limit = ImporterUtilities.getIntegerOption("limit",options,-1); int headerLines = ImporterUtilities.getIntegerOption("header-lines", options, 1);
int skip = ImporterUtilities.getIntegerOption("skip",options,0); int limit = ImporterUtilities.getIntegerOption("limit", options, -1);
int skip = ImporterUtilities.getIntegerOption("skip", options, 0);
Workbook wb = null; Workbook wb = null;
try { try {
@ -59,15 +62,15 @@ public class ExcelImporter implements Importer {
int firstRow = sheet.getFirstRowNum(); int firstRow = sheet.getFirstRowNum();
int lastRow = sheet.getLastRowNum(); int lastRow = sheet.getLastRowNum();
int r = firstRow;
List<Integer> nonBlankIndices = null; List<String> columnNames = new ArrayList<String>();
List<String> nonBlankHeaderStrings = null; Set<String> columnNameSet = new HashSet<String>();
Map<String, Integer> columnRootNameToIndex = new HashMap<String, Integer>();
/* int rowsWithData = 0;
* Find the header row Map<String, Recon> reconMap = new HashMap<String, Recon>();
*/
for (; r <= lastRow; r++) { for (int r = firstRow; r <= lastRow; r++) {
org.apache.poi.ss.usermodel.Row row = sheet.getRow(r); org.apache.poi.ss.usermodel.Row row = sheet.getRow(r);
if (row == null) { if (row == null) {
continue; continue;
@ -78,152 +81,66 @@ public class ExcelImporter implements Importer {
short firstCell = row.getFirstCellNum(); short firstCell = row.getFirstCellNum();
short lastCell = row.getLastCellNum(); short lastCell = row.getLastCellNum();
if (firstCell >= 0 && firstCell <= lastCell) { if (firstCell < 0 || firstCell > lastCell) {
nonBlankIndices = new ArrayList<Integer>(lastCell - firstCell + 1); continue;
nonBlankHeaderStrings = new ArrayList<String>(lastCell - firstCell + 1); }
/*
* Still processing header lines
*/
if (headerLines > 0) {
headerLines--;
for (int c = firstCell; c <= lastCell; c++) { for (int c = firstCell; c <= lastCell; c++) {
org.apache.poi.ss.usermodel.Cell cell = row.getCell(c); org.apache.poi.ss.usermodel.Cell cell = row.getCell(c);
if (cell != null) { if (cell != null) {
String text = cell.getStringCellValue().trim(); String text = cell.getStringCellValue().trim();
if (text.length() > 0) { if (text.length() > 0) {
nonBlankIndices.add((int) c); while (columnNames.size() < c + 1) {
nonBlankHeaderStrings.add(text); columnNames.add(null);
}
String existingName = columnNames.get(c);
String name = (existingName == null) ? text : (existingName + " " + text);
columnNames.set(c, name);
} }
} }
} }
if (nonBlankIndices.size() > 0) { if (headerLines == 0) {
r++; for (int i = 0; i < columnNames.size(); i++) {
break; String rootName = columnNames.get(i);
if (rootName == null) {
continue;
}
setUnduplicatedColumnName(rootName, columnNames, i, columnNameSet, columnRootNameToIndex);
}
} }
}
}
if (nonBlankIndices == null || nonBlankIndices.size() == 0) { /*
return; * Processing data rows
} */
/*
* Create columns
*/
Map<String, Integer> nameToIndex = new HashMap<String, Integer>();
for (int c = 0; c < nonBlankIndices.size(); c++) {
String cell = nonBlankHeaderStrings.get(c);
if (nameToIndex.containsKey(cell)) {
int index = nameToIndex.get(cell);
nameToIndex.put(cell, index + 1);
cell = cell.contains(" ") ? (cell + " " + index) : (cell + index);
} else { } else {
nameToIndex.put(cell, 2); Row newRow = new Row(columnNames.size());
}
Column column = new Column(c, cell);
project.columnModel.columns.add(column);
}
/*
* Now process the data rows
*/
int rowsWithData = 0;
Map<String, Recon> reconMap = new HashMap<String, Recon>();
for (; r <= lastRow; r++) {
org.apache.poi.ss.usermodel.Row row = sheet.getRow(r);
if (row == null) {
continue;
}
short firstCell = row.getFirstCellNum();
short lastCell = row.getLastCellNum();
if (firstCell >= 0 && firstCell <= lastCell) {
Row newRow = new Row(nonBlankIndices.size());
boolean hasData = false; boolean hasData = false;
for (int c = 0; c < nonBlankIndices.size(); c++) { for (int c = firstCell; c <= lastCell; c++) {
if (c < firstCell || c > lastCell) {
continue;
}
org.apache.poi.ss.usermodel.Cell cell = row.getCell(c); org.apache.poi.ss.usermodel.Cell cell = row.getCell(c);
if (cell == null) { if (cell == null) {
continue; continue;
} }
int cellType = cell.getCellType(); Cell ourCell = extractCell(cell, reconMap);
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_ERROR || if (ourCell != null) {
cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_BLANK) { while (columnNames.size() < c + 1) {
continue; columnNames.add(null);
}
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_FORMULA) {
cellType = cell.getCachedFormulaResultType();
}
Serializable value = null;
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_BOOLEAN) {
value = cell.getBooleanCellValue();
} else if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_NUMERIC) {
double d = cell.getNumericCellValue();
if (HSSFDateUtil.isCellDateFormatted(cell)) {
value = HSSFDateUtil.getJavaDate(d);
} else {
value = d;
} }
} else { if (columnNames.get(c) == null) {
String text = cell.getStringCellValue().trim(); setUnduplicatedColumnName("Column", columnNames, c, columnNameSet, columnRootNameToIndex);
if (text.length() > 0) {
value = text;
}
}
if (value != null) {
Recon recon = null;
Hyperlink hyperlink = cell.getHyperlink();
if (hyperlink != null) {
String url = hyperlink.getAddress();
if (url.startsWith("http://") ||
url.startsWith("https://")) {
final String sig = "freebase.com/view";
int i = url.indexOf(sig);
if (i > 0) {
String id = url.substring(i + sig.length());
int q = id.indexOf('?');
if (q > 0) {
id = id.substring(0, q);
}
int h = id.indexOf('#');
if (h > 0) {
id = id.substring(0, h);
}
if (reconMap.containsKey(id)) {
recon = reconMap.get(id);
recon.judgmentBatchSize++;
} else {
recon = new Recon(0, null, null);
recon.service = "import";
recon.match = new ReconCandidate(id, value.toString(), new String[0], 100);
recon.matchRank = 0;
recon.judgment = Judgment.Matched;
recon.judgmentAction = "auto";
recon.judgmentBatchSize = 1;
recon.addCandidate(recon.match);
reconMap.put(id, recon);
}
}
}
} }
newRow.setCell(c, new Cell(value, recon)); newRow.setCell(c, ourCell);
hasData = true; hasData = true;
} }
} }
@ -242,6 +159,118 @@ public class ExcelImporter implements Importer {
} }
} }
} }
/*
* Create columns
*/
for (int c = 0; c < columnNames.size(); c++) {
String name = columnNames.get(c);
if (name != null) {
Column column = new Column(c, name);
project.columnModel.columns.add(column);
}
}
}
/**
 * Stores a name at position {@code index} of {@code columnNames} that does not
 * collide with any name already recorded in {@code columnNameSet}.
 *
 * If {@code rootName} is still unused it is taken as-is. Otherwise a numeric
 * suffix is appended ("rootName 2", "rootName 3", ...) until an unused name is
 * found; the suffix to try next time for this root is remembered in
 * {@code columnRootNameToIndex}.
 *
 * @param rootName              the desired column name
 * @param columnNames           list receiving the chosen name at {@code index}
 * @param index                 position in {@code columnNames} to overwrite
 * @param columnNameSet         names already in use; the chosen name is added to it
 * @param columnRootNameToIndex per-root-name record of the next suffix to try
 */
protected void setUnduplicatedColumnName(
        String rootName, List<String> columnNames, int index, Set<String> columnNameSet, Map<String, Integer> columnRootNameToIndex) {
    // Simple case: nobody has claimed the root name yet.
    if (!columnNameSet.contains(rootName)) {
        columnNames.set(index, rootName);
        columnNameSet.add(rootName);
        return;
    }

    // Resume from the suffix remembered for this root, or start numbering at 2.
    int suffix = 2;
    if (columnRootNameToIndex.containsKey(rootName)) {
        suffix = columnRootNameToIndex.get(rootName);
    }

    // Advance until the suffixed name is not taken.
    String candidate = rootName + " " + suffix;
    while (columnNameSet.contains(candidate)) {
        suffix++;
        candidate = rootName + " " + suffix;
    }

    columnNames.set(index, candidate);
    columnNameSet.add(candidate);
    columnRootNameToIndex.put(rootName, suffix + 1);
}
protected Cell extractCell(org.apache.poi.ss.usermodel.Cell cell, Map<String, Recon> reconMap) {
int cellType = cell.getCellType();
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_ERROR ||
cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_BLANK) {
return null;
}
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_FORMULA) {
cellType = cell.getCachedFormulaResultType();
}
Serializable value = null;
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_BOOLEAN) {
value = cell.getBooleanCellValue();
} else if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_NUMERIC) {
double d = cell.getNumericCellValue();
if (HSSFDateUtil.isCellDateFormatted(cell)) {
value = HSSFDateUtil.getJavaDate(d);
} else {
value = d;
}
} else {
String text = cell.getStringCellValue().trim();
if (text.length() > 0) {
value = text;
}
}
if (value != null) {
Recon recon = null;
Hyperlink hyperlink = cell.getHyperlink();
if (hyperlink != null) {
String url = hyperlink.getAddress();
if (url.startsWith("http://") ||
url.startsWith("https://")) {
final String sig = "freebase.com/view";
int i = url.indexOf(sig);
if (i > 0) {
String id = url.substring(i + sig.length());
int q = id.indexOf('?');
if (q > 0) {
id = id.substring(0, q);
}
int h = id.indexOf('#');
if (h > 0) {
id = id.substring(0, h);
}
if (reconMap.containsKey(id)) {
recon = reconMap.get(id);
recon.judgmentBatchSize++;
} else {
recon = new Recon(0, null, null);
recon.service = "import";
recon.match = new ReconCandidate(id, value.toString(), new String[0], 100);
recon.matchRank = 0;
recon.judgment = Judgment.Matched;
recon.judgmentAction = "auto";
recon.judgmentBatchSize = 1;
recon.addCandidate(recon.match);
reconMap.put(id, recon);
}
}
}
}
return new Cell(value, recon);
} else {
return null;
}
} }
public boolean canImportData(String contentType, String fileName) { public boolean canImportData(String contentType, String fileName) {