From 4ad31ffcdeed6195f367f5bf6340270b375836ae Mon Sep 17 00:00:00 2001 From: David Huynh Date: Sun, 1 Aug 2010 04:22:45 +0000 Subject: [PATCH] Excel importer now supports "header lines" parameter. git-svn-id: http://google-refine.googlecode.com/svn/trunk@1125 7d457c2a-affb-35e4-300a-418c747d4874 --- .../gridworks/importers/ExcelImporter.java | 333 ++++++++++-------- 1 file changed, 181 insertions(+), 152 deletions(-) diff --git a/main/src/com/metaweb/gridworks/importers/ExcelImporter.java b/main/src/com/metaweb/gridworks/importers/ExcelImporter.java index 5d9d65279..04e1ceab4 100644 --- a/main/src/com/metaweb/gridworks/importers/ExcelImporter.java +++ b/main/src/com/metaweb/gridworks/importers/ExcelImporter.java @@ -6,9 +6,11 @@ import java.io.Reader; import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Properties; +import java.util.Set; import org.apache.poi.common.usermodel.Hyperlink; import org.apache.poi.hssf.usermodel.HSSFDateUtil; @@ -28,19 +30,20 @@ import com.metaweb.gridworks.model.Recon.Judgment; public class ExcelImporter implements Importer { protected boolean _xmlBased; - + public boolean takesReader() { return false; } - + public void read(Reader reader, Project project, Properties options) throws Exception { throw new UnsupportedOperationException(); } - + public void read(InputStream inputStream, Project project, Properties options) throws Exception { int ignoreLines = ImporterUtilities.getIntegerOption("ignore", options, -1); - int limit = ImporterUtilities.getIntegerOption("limit",options,-1); - int skip = ImporterUtilities.getIntegerOption("skip",options,0); + int headerLines = ImporterUtilities.getIntegerOption("header-lines", options, 1); + int limit = ImporterUtilities.getIntegerOption("limit", options, -1); + int skip = ImporterUtilities.getIntegerOption("skip", options, 0); Workbook wb = null; try { @@ -54,20 +57,20 @@ public class ExcelImporter implements Importer { e ); } - + Sheet sheet = wb.getSheetAt(0); - + int firstRow = sheet.getFirstRowNum(); int lastRow = sheet.getLastRowNum(); - int r = firstRow; - - List nonBlankIndices = null; - List nonBlankHeaderStrings = null; - - /* - * Find the header row - */ - for (; r <= lastRow; r++) { + + List columnNames = new ArrayList(); + Set columnNameSet = new HashSet(); + Map columnRootNameToIndex = new HashMap(); + + int rowsWithData = 0; + Map reconMap = new HashMap(); + + for (int r = firstRow; r <= lastRow; r++) { org.apache.poi.ss.usermodel.Row row = sheet.getRow(r); if (row == null) { continue; @@ -75,166 +78,80 @@ public class ExcelImporter implements Importer { ignoreLines--; continue; } - + short firstCell = row.getFirstCellNum(); short lastCell = row.getLastCellNum(); - if (firstCell >= 0 && firstCell <= lastCell) { - nonBlankIndices = new ArrayList(lastCell - firstCell + 1); - nonBlankHeaderStrings = new ArrayList(lastCell - firstCell + 1); - + if (firstCell < 0 || firstCell > lastCell) { + continue; + } + + /* + * Still processing header lines + */ + if (headerLines > 0) { + headerLines--; + for (int c = firstCell; c <= lastCell; c++) { org.apache.poi.ss.usermodel.Cell cell = row.getCell(c); if (cell != null) { String text = cell.getStringCellValue().trim(); if (text.length() > 0) { - nonBlankIndices.add((int) c); - nonBlankHeaderStrings.add(text); + while (columnNames.size() < c + 1) { + columnNames.add(null); + } + + String existingName = columnNames.get(c); + String name = (existingName == null) ? text : (existingName + " " + text); + + columnNames.set(c, name); } } } - - if (nonBlankIndices.size() > 0) { - r++; - break; - } - } - } - - if (nonBlankIndices == null || nonBlankIndices.size() == 0) { - return; - } - - /* - * Create columns - */ - Map nameToIndex = new HashMap(); - for (int c = 0; c < nonBlankIndices.size(); c++) { - String cell = nonBlankHeaderStrings.get(c); - if (nameToIndex.containsKey(cell)) { - int index = nameToIndex.get(cell); - nameToIndex.put(cell, index + 1); - - cell = cell.contains(" ") ? (cell + " " + index) : (cell + index); - } else { - nameToIndex.put(cell, 2); - } - - Column column = new Column(c, cell); - project.columnModel.columns.add(column); - } - - /* - * Now process the data rows - */ - int rowsWithData = 0; - Map reconMap = new HashMap(); - - for (; r <= lastRow; r++) { - org.apache.poi.ss.usermodel.Row row = sheet.getRow(r); - if (row == null) { - continue; - } - - short firstCell = row.getFirstCellNum(); - short lastCell = row.getLastCellNum(); - if (firstCell >= 0 && firstCell <= lastCell) { - Row newRow = new Row(nonBlankIndices.size()); - boolean hasData = false; - - for (int c = 0; c < nonBlankIndices.size(); c++) { - if (c < firstCell || c > lastCell) { - continue; + + if (headerLines == 0) { + for (int i = 0; i < columnNames.size(); i++) { + String rootName = columnNames.get(i); + if (rootName == null) { + continue; + } + setUnduplicatedColumnName(rootName, columnNames, i, columnNameSet, columnRootNameToIndex); } - + } + + /* + * Processing data rows + */ + } else { + Row newRow = new Row(columnNames.size()); + boolean hasData = false; + + for (int c = firstCell; c <= lastCell; c++) { org.apache.poi.ss.usermodel.Cell cell = row.getCell(c); if (cell == null) { continue; } - - int cellType = cell.getCellType(); - if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_ERROR || - cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_BLANK) { - continue; - } - if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_FORMULA) { - cellType = cell.getCachedFormulaResultType(); - } - - Serializable value = null; - if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_BOOLEAN) { - value = cell.getBooleanCellValue(); - } else if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_NUMERIC) { - double d = cell.getNumericCellValue(); - - if (HSSFDateUtil.isCellDateFormatted(cell)) { - value = HSSFDateUtil.getJavaDate(d); - } else { - value = d; + + Cell ourCell = extractCell(cell, reconMap); + if (ourCell != null) { + while (columnNames.size() < c + 1) { + columnNames.add(null); } - } else { - String text = cell.getStringCellValue().trim(); - if (text.length() > 0) { - value = text; + if (columnNames.get(c) == null) { + setUnduplicatedColumnName("Column", columnNames, c, columnNameSet, columnRootNameToIndex); } - } - - if (value != null) { - Recon recon = null; - - Hyperlink hyperlink = cell.getHyperlink(); - if (hyperlink != null) { - String url = hyperlink.getAddress(); - - if (url.startsWith("http://") || - url.startsWith("https://")) { - - final String sig = "freebase.com/view"; - - int i = url.indexOf(sig); - if (i > 0) { - String id = url.substring(i + sig.length()); - - int q = id.indexOf('?'); - if (q > 0) { - id = id.substring(0, q); - } - int h = id.indexOf('#'); - if (h > 0) { - id = id.substring(0, h); - } - - if (reconMap.containsKey(id)) { - recon = reconMap.get(id); - recon.judgmentBatchSize++; - } else { - recon = new Recon(0, null, null); - recon.service = "import"; - recon.match = new ReconCandidate(id, value.toString(), new String[0], 100); - recon.matchRank = 0; - recon.judgment = Judgment.Matched; - recon.judgmentAction = "auto"; - recon.judgmentBatchSize = 1; - recon.addCandidate(recon.match); - - reconMap.put(id, recon); - } - - } - } - } - - newRow.setCell(c, new Cell(value, recon)); + + newRow.setCell(c, ourCell); hasData = true; } } - + if (hasData) { rowsWithData++; - + if (skip <= 0 || rowsWithData > skip) { project.rows.add(newRow); project.columnModel.setMaxCellIndex(newRow.cells.size()); - + if (limit > 0 && project.rows.size() >= limit) { break; } @@ -242,8 +159,120 @@ public class ExcelImporter implements Importer { } } } + + /* + * Create columns + */ + for (int c = 0; c < columnNames.size(); c++) { + String name = columnNames.get(c); + if (name != null) { + Column column = new Column(c, name); + project.columnModel.columns.add(column); + } + } } - + + protected void setUnduplicatedColumnName( + String rootName, List columnNames, int index, Set columnNameSet, Map columnRootNameToIndex) { + if (columnNameSet.contains(rootName)) { + int startIndex = columnRootNameToIndex.containsKey(rootName) ? columnRootNameToIndex.get(rootName) : 2; + while (true) { + String name = rootName + " " + startIndex; + if (columnNameSet.contains(name)) { + startIndex++; + } else { + columnNames.set(index, name); + columnNameSet.add(name); + break; + } + } + + columnRootNameToIndex.put(rootName, startIndex + 1); + } else { + columnNames.set(index, rootName); + columnNameSet.add(rootName); + } + } + + protected Cell extractCell(org.apache.poi.ss.usermodel.Cell cell, Map reconMap) { + int cellType = cell.getCellType(); + if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_ERROR || + cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_BLANK) { + return null; + } + if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_FORMULA) { + cellType = cell.getCachedFormulaResultType(); + } + + Serializable value = null; + if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_BOOLEAN) { + value = cell.getBooleanCellValue(); + } else if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_NUMERIC) { + double d = cell.getNumericCellValue(); + + if (HSSFDateUtil.isCellDateFormatted(cell)) { + value = HSSFDateUtil.getJavaDate(d); + } else { + value = d; + } + } else { + String text = cell.getStringCellValue().trim(); + if (text.length() > 0) { + value = text; + } + } + + if (value != null) { + Recon recon = null; + + Hyperlink hyperlink = cell.getHyperlink(); + if (hyperlink != null) { + String url = hyperlink.getAddress(); + + if (url.startsWith("http://") || + url.startsWith("https://")) { + + final String sig = "freebase.com/view"; + + int i = url.indexOf(sig); + if (i > 0) { + String id = url.substring(i + sig.length()); + + int q = id.indexOf('?'); + if (q > 0) { + id = id.substring(0, q); + } + int h = id.indexOf('#'); + if (h > 0) { + id = id.substring(0, h); + } + + if (reconMap.containsKey(id)) { + recon = reconMap.get(id); + recon.judgmentBatchSize++; + } else { + recon = new Recon(0, null, null); + recon.service = "import"; + recon.match = new ReconCandidate(id, value.toString(), new String[0], 100); + recon.matchRank = 0; + recon.judgment = Judgment.Matched; + recon.judgmentAction = "auto"; + recon.judgmentBatchSize = 1; + recon.addCandidate(recon.match); + + reconMap.put(id, recon); + } + + } + } + } + + return new Cell(value, recon); + } else { + return null; + } + } + public boolean canImportData(String contentType, String fileName) { if (contentType != null) { contentType = contentType.toLowerCase().trim();