Added options for specifying the number of lines the column headers span, and the number of initial lines to skip entirely without processing.

git-svn-id: http://google-refine.googlecode.com/svn/trunk@468 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
David Huynh 2010-04-13 21:23:41 +00:00
parent da55033df3
commit a1a8758c37
5 changed files with 88 additions and 35 deletions

View File

@ -42,6 +42,7 @@ public class ExcelImporter implements Importer {
}
public void read(InputStream inputStream, Project project, Properties options) throws Exception {
int ignoreLines = ImporterUtilities.getIntegerOption("ignore", options, -1);
int limit = ImporterUtilities.getIntegerOption("limit",options,-1);
int skip = ImporterUtilities.getIntegerOption("skip",options,0);
@ -64,7 +65,7 @@ public class ExcelImporter implements Importer {
int lastRow = sheet.getLastRowNum();
int r = firstRow;
List<Integer> nonBlankIndices = null;
List<Integer> nonBlankIndices = null;
List<String> nonBlankHeaderStrings = null;
/*
@ -74,6 +75,9 @@ public class ExcelImporter implements Importer {
org.apache.poi.ss.usermodel.Row row = sheet.getRow(r);
if (row == null) {
continue;
} else if (ignoreLines > 0) {
ignoreLines--;
continue;
}
short firstCell = row.getFirstCellNum();

View File

@ -1,8 +1,15 @@
package com.metaweb.gridworks.importers;
import java.io.Serializable;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import com.metaweb.gridworks.model.Column;
import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.model.Row;
public class ImporterUtilities {
static public Serializable parseCellValue(String text) {
@ -51,4 +58,53 @@ public class ImporterUtilities {
return value;
}
/**
 * Merges a header fragment into the column name stored at {@code index}.
 *
 * <p>The list is first padded with empty strings so that position {@code index}
 * exists. A fragment that is blank after trimming leaves the stored name
 * untouched; otherwise the trimmed fragment either becomes the name (when the
 * slot was empty) or is appended to the existing name after a single space.
 *
 * @param columnNames mutable list of column names collected so far
 * @param index       zero-based position of the column being named
 * @param name        raw header text for this column; surrounding whitespace is trimmed
 */
static public void appendColumnName(List<String> columnNames, int index, String name) {
    String fragment = name.trim();

    // Grow the list until slot `index` is addressable.
    for (int i = columnNames.size(); i <= index; i++) {
        columnNames.add("");
    }

    if (fragment.isEmpty()) {
        return;
    }

    String existing = columnNames.get(index);
    columnNames.set(index, existing.isEmpty() ? fragment : existing + " " + fragment);
}
/**
 * Pads {@code columnNames} with empty names until it holds at least as many
 * entries as {@code row} has cells. The list is never shortened.
 *
 * @param columnNames mutable list of column names collected so far
 * @param row         row whose cell count sets the minimum number of columns
 */
static public void ensureColumnsInRowExist(List<String> columnNames, Row row) {
    int missing = row.cells.size() - columnNames.size();
    for (int i = 0; i < missing; i++) {
        columnNames.add("");
    }
}
/**
 * Creates one {@link Column} per entry of {@code columnNames} and appends it to
 * {@code project.columnModel.columns}.
 *
 * <p>Each name is trimmed and, when wrapped in a pair of double quotes, unquoted
 * and trimmed again. A name that is empty at that point is replaced by
 * {@code "Column"}. The first occurrence of a name is used as-is; each repeat
 * gets a counter starting at 2, appended after a space when the name contains a
 * space and directly otherwise.
 *
 * @param project     project whose column model receives the new columns
 * @param columnNames column names in cell order; entries may be empty or quoted
 */
static public void setupColumns(Project project, List<String> columnNames) {
    // Maps each name seen so far to the suffix its next duplicate will receive.
    Map<String, Integer> nameToIndex = new HashMap<String, Integer>();

    for (int c = 0; c < columnNames.size(); c++) {
        String cell = columnNames.get(c).trim();

        // The length guard matters: a lone '"' both starts and ends with a quote,
        // and substring(1, 0) would throw StringIndexOutOfBoundsException.
        if (cell.length() >= 2 && cell.startsWith("\"") && cell.endsWith("\"")) {
            cell = cell.substring(1, cell.length() - 1).trim();
        }

        // Apply the fallback after unquoting so that a header of "" (an empty
        // quoted string) also receives the default name instead of staying empty.
        if (cell.isEmpty()) {
            cell = "Column";
        }

        if (nameToIndex.containsKey(cell)) {
            int index = nameToIndex.get(cell);
            nameToIndex.put(cell, index + 1);
            cell = cell.contains(" ") ? (cell + " " + index) : (cell + index);
        } else {
            nameToIndex.put(cell, 2);
        }

        Column column = new Column(c, cell);
        project.columnModel.columns.add(column);
    }
}
}

View File

@ -3,9 +3,8 @@ package com.metaweb.gridworks.importers;
import java.io.InputStream;
import java.io.LineNumberReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import org.apache.commons.lang.NotImplementedException;
@ -14,27 +13,32 @@ import org.apache.commons.lang.StringUtils;
import com.metaweb.gridworks.importers.parsers.CSVRowParser;
import com.metaweb.gridworks.importers.parsers.RowParser;
import com.metaweb.gridworks.importers.parsers.SeparatorRowParser;
import com.metaweb.gridworks.model.Column;
import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.model.Row;
public class TsvCsvImporter implements Importer {
public void read(Reader reader, Project project, Properties options) throws Exception {
String sep = options.getProperty("separator"); // auto-detect if not present
int ignoreLines = ImporterUtilities.getIntegerOption("ignore", options, -1);
int headerLines = ImporterUtilities.getIntegerOption("header-lines", options, 1);
int limit = ImporterUtilities.getIntegerOption("limit",options,-1);
int skip = ImporterUtilities.getIntegerOption("skip",options,0);
boolean guessValueType = ImporterUtilities.getBooleanOption("guess-value-type", options, true);
LineNumberReader lnReader = new LineNumberReader(reader);
String sep = options.getProperty("separator"); // auto-detect if not present
String line = null;
boolean first = true;
int cellCount = 1;
RowParser parser = (sep == null || (sep.length() == 0)) ? null : new SeparatorRowParser(sep);
List<String> columnNames = new ArrayList<String>();
LineNumberReader lnReader = new LineNumberReader(reader);
RowParser parser = (sep == null || (sep.length() == 0)) ? null : new SeparatorRowParser(sep);
String line = null;
int rowsWithData = 0;
while ((line = lnReader.readLine()) != null) {
if (StringUtils.isBlank(line)) {
if (ignoreLines > 0) {
ignoreLines--;
continue;
} else if (StringUtils.isBlank(line)) {
continue;
}
@ -49,34 +53,17 @@ public class TsvCsvImporter implements Importer {
}
}
if (first) {
List<String> cells = parser.split(line);
Map<String, Integer> nameToIndex = new HashMap<String, Integer>();
if (headerLines > 0) {
headerLines--;
first = false;
List<String> cells = parser.split(line);
for (int c = 0; c < cells.size(); c++) {
String cell = cells.get(c).trim();
if (cell.startsWith("\"") && cell.endsWith("\"")) {
cell = cell.substring(1, cell.length() - 1).trim();
}
if (nameToIndex.containsKey(cell)) {
int index = nameToIndex.get(cell);
nameToIndex.put(cell, index + 1);
cell = cell.contains(" ") ? (cell + " " + index) : (cell + index);
} else {
nameToIndex.put(cell, 2);
}
Column column = new Column(c, cell);
project.columnModel.columns.add(column);
ImporterUtilities.appendColumnName(columnNames, c, cell);
}
cellCount = cells.size();
} else {
Row row = new Row(cellCount);
Row row = new Row(columnNames.size());
if (parser.parseRow(row, line, guessValueType)) {
rowsWithData++;
@ -85,6 +72,8 @@ public class TsvCsvImporter implements Importer {
project.rows.add(row);
project.columnModel.setMaxCellIndex(row.cells.size());
ImporterUtilities.ensureColumnsInRowExist(columnNames, row);
if (limit > 0 && project.rows.size() >= limit) {
break;
}
@ -92,6 +81,8 @@ public class TsvCsvImporter implements Importer {
}
}
}
ImporterUtilities.setupColumns(project, columnNames);
}
public void read(InputStream inputStream, Project project, Properties options) throws Exception {

View File

@ -1 +1 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> <html> <head> <title>Freebase Gridworks</title> <link rel="stylesheet" href="/styles/common.css" /> <link rel="stylesheet" href="/styles/index.css" /> <script type="text/javascript" src="externals/jquery-1.4.1.min.js"></script> <script type="text/javascript" src="externals/date.js"></script> <script type="text/javascript" src="scripts/util/string.js"></script> <script type="text/javascript" src="scripts/version.js"></script> <script type="text/javascript" src="scripts/index.js"></script> <script type="text/javascript" src="http://www.freebase.com/labs/gridworks.js"></script> </head> <body> <div id="header"> <a id="logo" href="http://www.freebase.com/" title="Freebase"><img alt="Freebase" src="images/freebase-headerlogo.png" /></a> <div id="path"><span class="app-path-section"><a href="./index.html">Gridworks</a></span></div> </div> <div id="body"> <div id="body-empty"> <table><tr> <td id="body-empty-logo-container"><img src="images/gridworks.png" /> Gridworks</td> <td id="body-empty-create-project-panel-container"></td> </tr></table> </div> <div id="body-nonempty"> <table><tr> <td id="body-nonempty-logo-container"><img src="images/gridworks.png" /> Gridworks</td> <td id="body-nonempty-projects-container"> <div id="projects"></div> </td> <td id="body-nonempty-create-project-panel-container"></td> </tr></table> </div> </div> <div id="footer"> <a href="about.html">About Freebase Gridworks</a> &bull; &copy; 2010 <a href="http://www.metaweb.com/">Metaweb Technologies, Inc.</a> </div> <div id="body-template"> <div id="create-project-panel"> <h1>Upload Data File</h1> <form id="file-upload-form" method="post" enctype="multipart/form-data" action="/command/create-project-from-upload" accept-charset="UTF-8"> <div class="grid-layout layout-tight"><table class="import-project-panel-layout"> <tr><td>Data File:</td> <td><input type="file" id="project-file-input" name="project-file" 
/></td></tr> <tr><td>Project Name:</td> <td><input type="text" size="20" id="project-name-input" name="project-name" /></td></tr> <tr><td>Load up to:</td> <td><input id="limit-input" name="limit" size="5" /> data rows (optional)</td></tr> <tr><td>Skip:</td> <td><input id="skip-input" name="skip" size="5" /> initial data rows (optional)</td></tr> <tr><td>Column separator:</td> <td><input id="separator-input" name="separator" size="2" /> (optional, default to comma or tab)</td></tr> <tr><td>Guess Value Type:</td> <td><input id="guess-value-type-input" name="guess-value-type" type="checkbox" checked="true" /> (try to parse cells' content into numbers, dates, etc.)</td></tr> <tr><td></td><td><input type="submit" value="Create Project" id="upload-file-button" /></td></tr> </table></div> </form> <h1>Import Existing Project</h1> <form id="project-upload-form" method="post" enctype="multipart/form-data" action="/command/import-project" accept-charset="UTF-8"> <table class="import-project-panel-layout"> <tr><td>Project .tar or .tar.gz File:</td><td><input type="file" id="project-tar-file-input" name="project-file" /></td></tr> <tr><td>Re-name Project:</td><td><input type="text" size="20" id="project-name-input" name="project-name" /> (optional)</td></tr> <tr><td></td><td><input type="submit" value="Import Project" id="import-project-button" /></td></tr> </table> </form> </div> </div> </body> </html>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> <html> <head> <title>Freebase Gridworks</title> <link rel="stylesheet" href="/styles/common.css" /> <link rel="stylesheet" href="/styles/index.css" /> <script type="text/javascript" src="externals/jquery-1.4.1.min.js"></script> <script type="text/javascript" src="externals/date.js"></script> <script type="text/javascript" src="scripts/util/string.js"></script> <script type="text/javascript" src="scripts/version.js"></script> <script type="text/javascript" src="scripts/index.js"></script> <script type="text/javascript" src="http://www.freebase.com/labs/gridworks.js"></script> </head> <body> <div id="header"> <a id="logo" href="http://www.freebase.com/" title="Freebase"><img alt="Freebase" src="images/freebase-headerlogo.png" /></a> <div id="path"><span class="app-path-section"><a href="./index.html">Gridworks</a></span></div> </div> <div id="body"> <div id="body-empty"> <table><tr> <td id="body-empty-logo-container"><img src="images/gridworks.png" /> Gridworks</td> <td id="body-empty-create-project-panel-container"></td> </tr></table> </div> <div id="body-nonempty"> <table><tr> <td id="body-nonempty-logo-container"><img src="images/gridworks.png" /> Gridworks</td> <td id="body-nonempty-projects-container"> <div id="projects"></div> </td> <td id="body-nonempty-create-project-panel-container"></td> </tr></table> </div> </div> <div id="footer"> <a href="about.html">About Freebase Gridworks</a> &bull; &copy; 2010 <a href="http://www.metaweb.com/">Metaweb Technologies, Inc.</a> </div> <div id="body-template"> <div id="create-project-panel"> <h1>Upload Data File</h1> <form id="file-upload-form" method="post" enctype="multipart/form-data" action="/command/create-project-from-upload" accept-charset="UTF-8"> <div class="grid-layout layout-tight"><table class="import-project-panel-layout"> <tr><td>Data File:</td> <td><input type="file" id="project-file-input" name="project-file" 
/></td></tr> <tr><td>Project Name:</td> <td><input type="text" size="20" id="project-name-input" name="project-name" /></td></tr> <tr><td>Column separator:</td> <td><input id="separator-input" name="separator" size="2" /> leave blank to guess comma or tab</td></tr> <tr><td>Guess Value Type:</td> <tr><td>Load up to:</td> <tr><td>Load up to:</td> <tr><td>Load up to:</td> <td><input id="limit-input" name="limit" size="5" /> data rows (optional)</td></tr> <tr><td>Load up to:</td> <tr><td>Skip:</td> <tr><td>Load up to:</td> <td><input id="skip-input" name="skip" size="5" /> initial data rows (optional)</td></tr> <tr><td>Load up to:</td> <tr><td>Column separator:</td> <tr><td>Load up to:</td> <td><input id="separator-input" name="separator" size="2" /> (optional, default to comma or tab)</td></tr> <tr><td>Skip:</td> <tr><td>Load up to:</td> <tr><td>Guess Value Type:</td> <tr><td>Load up to:</td> <td><input id="guess-value-type-input" name="guess-value-type" type="checkbox" checked="true" /> (try to parse cells' content into numbers, dates, etc. <td><input id="limit-input" name="limit" size="5" /> data rows (leave blank to load all rows)</td></tr> <tr><td></td><td><input type="submit" value="Create Project" id="upload-file-button" /></td></tr> </table></div> </form> <h1>Import Existing Project</h1> <form id="project-upload-form" method="post" enctype="multipart/form-data" action="/command/import-project" accept-charset="UTF-8"> <table class="import-project-panel-layout"> <tr><td>Project .tar or .tar.gz File:</td><td><input type="file" id="project-tar-file-input" name="project-file" /></td></tr> <tr><td>Re-name Project:</td><td><input type="text" size="20" id="project-name-input" name="project-name" /> (optional)</td></tr> <tr><td></td><td><input type="submit" value="Import Project" id="import-project-button" /></td></tr> </table> </form> </div> </div> </body> </html>

View File

@ -8,9 +8,11 @@ function onClickUploadFileButton(evt) {
} else {
$("#file-upload-form").attr("action",
"/command/create-project-from-upload?" + [
"separator=" + $("#separator-input")[0].value,
"ignore=" + $("#ignore-input")[0].value,
"header-lines=" + $("#header-lines-input")[0].value,
"skip=" + $("#skip-input")[0].value,
"limit=" + $("#limit-input")[0].value,
"separator=" + $("#separator-input")[0].value,
"guess-value-type=" + $("#guess-value-type-input")[0].checked
].join("&"));
}