Added options for specifying the number of lines that the column headers span, and the number of initial lines to skip entirely before processing.
git-svn-id: http://google-refine.googlecode.com/svn/trunk@468 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
da55033df3
commit
a1a8758c37
@ -42,6 +42,7 @@ public class ExcelImporter implements Importer {
|
||||
}
|
||||
|
||||
public void read(InputStream inputStream, Project project, Properties options) throws Exception {
|
||||
int ignoreLines = ImporterUtilities.getIntegerOption("ignore", options, -1);
|
||||
int limit = ImporterUtilities.getIntegerOption("limit",options,-1);
|
||||
int skip = ImporterUtilities.getIntegerOption("skip",options,0);
|
||||
|
||||
@ -64,7 +65,7 @@ public class ExcelImporter implements Importer {
|
||||
int lastRow = sheet.getLastRowNum();
|
||||
int r = firstRow;
|
||||
|
||||
List<Integer> nonBlankIndices = null;
|
||||
List<Integer> nonBlankIndices = null;
|
||||
List<String> nonBlankHeaderStrings = null;
|
||||
|
||||
/*
|
||||
@ -74,6 +75,9 @@ public class ExcelImporter implements Importer {
|
||||
org.apache.poi.ss.usermodel.Row row = sheet.getRow(r);
|
||||
if (row == null) {
|
||||
continue;
|
||||
} else if (ignoreLines > 0) {
|
||||
ignoreLines--;
|
||||
continue;
|
||||
}
|
||||
|
||||
short firstCell = row.getFirstCellNum();
|
||||
|
@ -1,8 +1,15 @@
|
||||
package com.metaweb.gridworks.importers;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
|
||||
import com.metaweb.gridworks.model.Column;
|
||||
import com.metaweb.gridworks.model.Project;
|
||||
import com.metaweb.gridworks.model.Row;
|
||||
|
||||
public class ImporterUtilities {
|
||||
|
||||
static public Serializable parseCellValue(String text) {
|
||||
@ -51,4 +58,53 @@ public class ImporterUtilities {
|
||||
return value;
|
||||
}
|
||||
|
||||
static public void appendColumnName(List<String> columnNames, int index, String name) {
|
||||
name = name.trim();
|
||||
|
||||
while (columnNames.size() <= index) {
|
||||
columnNames.add("");
|
||||
}
|
||||
|
||||
if (!name.isEmpty()) {
|
||||
String oldName = columnNames.get(index);
|
||||
if (!oldName.isEmpty()) {
|
||||
name = oldName + " " + name;
|
||||
}
|
||||
|
||||
columnNames.set(index, name);
|
||||
}
|
||||
}
|
||||
|
||||
static public void ensureColumnsInRowExist(List<String> columnNames, Row row) {
|
||||
int count = row.cells.size();
|
||||
while (count > columnNames.size()) {
|
||||
columnNames.add("");
|
||||
}
|
||||
}
|
||||
|
||||
static public void setupColumns(Project project, List<String> columnNames) {
|
||||
Map<String, Integer> nameToIndex = new HashMap<String, Integer>();
|
||||
for (int c = 0; c < columnNames.size(); c++) {
|
||||
String cell = columnNames.get(c).trim();
|
||||
if (cell.isEmpty()) {
|
||||
cell = "Column";
|
||||
} else if (cell.startsWith("\"") && cell.endsWith("\"")) {
|
||||
cell = cell.substring(1, cell.length() - 1).trim();
|
||||
}
|
||||
|
||||
if (nameToIndex.containsKey(cell)) {
|
||||
int index = nameToIndex.get(cell);
|
||||
nameToIndex.put(cell, index + 1);
|
||||
|
||||
cell = cell.contains(" ") ? (cell + " " + index) : (cell + index);
|
||||
} else {
|
||||
nameToIndex.put(cell, 2);
|
||||
}
|
||||
|
||||
Column column = new Column(c, cell);
|
||||
|
||||
project.columnModel.columns.add(column);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -3,9 +3,8 @@ package com.metaweb.gridworks.importers;
|
||||
import java.io.InputStream;
|
||||
import java.io.LineNumberReader;
|
||||
import java.io.Reader;
|
||||
import java.util.HashMap;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
|
||||
import org.apache.commons.lang.NotImplementedException;
|
||||
@ -14,27 +13,32 @@ import org.apache.commons.lang.StringUtils;
|
||||
import com.metaweb.gridworks.importers.parsers.CSVRowParser;
|
||||
import com.metaweb.gridworks.importers.parsers.RowParser;
|
||||
import com.metaweb.gridworks.importers.parsers.SeparatorRowParser;
|
||||
import com.metaweb.gridworks.model.Column;
|
||||
import com.metaweb.gridworks.model.Project;
|
||||
import com.metaweb.gridworks.model.Row;
|
||||
|
||||
public class TsvCsvImporter implements Importer {
|
||||
|
||||
public void read(Reader reader, Project project, Properties options) throws Exception {
|
||||
String sep = options.getProperty("separator"); // auto-detect if not present
|
||||
int ignoreLines = ImporterUtilities.getIntegerOption("ignore", options, -1);
|
||||
int headerLines = ImporterUtilities.getIntegerOption("header-lines", options, 1);
|
||||
|
||||
int limit = ImporterUtilities.getIntegerOption("limit",options,-1);
|
||||
int skip = ImporterUtilities.getIntegerOption("skip",options,0);
|
||||
boolean guessValueType = ImporterUtilities.getBooleanOption("guess-value-type", options, true);
|
||||
|
||||
LineNumberReader lnReader = new LineNumberReader(reader);
|
||||
String sep = options.getProperty("separator"); // auto-detect if not present
|
||||
String line = null;
|
||||
boolean first = true;
|
||||
int cellCount = 1;
|
||||
RowParser parser = (sep == null || (sep.length() == 0)) ? null : new SeparatorRowParser(sep);
|
||||
List<String> columnNames = new ArrayList<String>();
|
||||
|
||||
LineNumberReader lnReader = new LineNumberReader(reader);
|
||||
RowParser parser = (sep == null || (sep.length() == 0)) ? null : new SeparatorRowParser(sep);
|
||||
|
||||
String line = null;
|
||||
int rowsWithData = 0;
|
||||
|
||||
while ((line = lnReader.readLine()) != null) {
|
||||
if (StringUtils.isBlank(line)) {
|
||||
if (ignoreLines > 0) {
|
||||
ignoreLines--;
|
||||
continue;
|
||||
} else if (StringUtils.isBlank(line)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -49,34 +53,17 @@ public class TsvCsvImporter implements Importer {
|
||||
}
|
||||
}
|
||||
|
||||
if (first) {
|
||||
List<String> cells = parser.split(line);
|
||||
Map<String, Integer> nameToIndex = new HashMap<String, Integer>();
|
||||
if (headerLines > 0) {
|
||||
headerLines--;
|
||||
|
||||
first = false;
|
||||
List<String> cells = parser.split(line);
|
||||
for (int c = 0; c < cells.size(); c++) {
|
||||
String cell = cells.get(c).trim();
|
||||
if (cell.startsWith("\"") && cell.endsWith("\"")) {
|
||||
cell = cell.substring(1, cell.length() - 1).trim();
|
||||
}
|
||||
|
||||
if (nameToIndex.containsKey(cell)) {
|
||||
int index = nameToIndex.get(cell);
|
||||
nameToIndex.put(cell, index + 1);
|
||||
|
||||
cell = cell.contains(" ") ? (cell + " " + index) : (cell + index);
|
||||
} else {
|
||||
nameToIndex.put(cell, 2);
|
||||
}
|
||||
|
||||
Column column = new Column(c, cell);
|
||||
|
||||
project.columnModel.columns.add(column);
|
||||
ImporterUtilities.appendColumnName(columnNames, c, cell);
|
||||
}
|
||||
|
||||
cellCount = cells.size();
|
||||
} else {
|
||||
Row row = new Row(cellCount);
|
||||
Row row = new Row(columnNames.size());
|
||||
|
||||
if (parser.parseRow(row, line, guessValueType)) {
|
||||
rowsWithData++;
|
||||
@ -85,6 +72,8 @@ public class TsvCsvImporter implements Importer {
|
||||
project.rows.add(row);
|
||||
project.columnModel.setMaxCellIndex(row.cells.size());
|
||||
|
||||
ImporterUtilities.ensureColumnsInRowExist(columnNames, row);
|
||||
|
||||
if (limit > 0 && project.rows.size() >= limit) {
|
||||
break;
|
||||
}
|
||||
@ -92,6 +81,8 @@ public class TsvCsvImporter implements Importer {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ImporterUtilities.setupColumns(project, columnNames);
|
||||
}
|
||||
|
||||
public void read(InputStream inputStream, Project project, Properties options) throws Exception {
|
||||
|
@ -1 +1 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<title>Freebase Gridworks</title>
<link rel="stylesheet" href="/styles/common.css" />
<link rel="stylesheet" href="/styles/index.css" />
<script type="text/javascript" src="externals/jquery-1.4.1.min.js"></script>
<script type="text/javascript" src="externals/date.js"></script>
<script type="text/javascript" src="scripts/util/string.js"></script>
<script type="text/javascript" src="scripts/version.js"></script>
<script type="text/javascript" src="scripts/index.js"></script>
<script type="text/javascript" src="http://www.freebase.com/labs/gridworks.js"></script>
</head>
<body>
<div id="header">
<a id="logo" href="http://www.freebase.com/" title="Freebase"><img alt="Freebase" src="images/freebase-headerlogo.png" /></a>
<div id="path"><span class="app-path-section"><a href="./index.html">Gridworks</a></span></div>
</div>
<div id="body">
<div id="body-empty">
<table><tr>
<td id="body-empty-logo-container"><img src="images/gridworks.png" /> Gridworks</td>
<td id="body-empty-create-project-panel-container"></td>
</tr></table>
</div>
<div id="body-nonempty">
<table><tr>
<td id="body-nonempty-logo-container"><img src="images/gridworks.png" /> Gridworks</td>
<td id="body-nonempty-projects-container">
<div id="projects"></div>
</td>
<td id="body-nonempty-create-project-panel-container"></td>
</tr></table>
</div>
</div>
<div id="footer">
<a href="about.html">About Freebase Gridworks</a>
•
© 2010 <a href="http://www.metaweb.com/">Metaweb Technologies, Inc.</a>
</div>
<div id="body-template">
<div id="create-project-panel">
<h1>Upload Data File</h1>
<form id="file-upload-form" method="post" enctype="multipart/form-data" action="/command/create-project-from-upload" accept-charset="UTF-8">
<div class="grid-layout layout-tight"><table class="import-project-panel-layout">
<tr><td>Data File:</td>
<td><input type="file" id="project-file-input" name="project-file" /></td></tr>
<tr><td>Project Name:</td>
<td><input type="text" size="20" id="project-name-input" name="project-name" /></td></tr>
<tr><td>Load up to:</td>
<td><input id="limit-input" name="limit" size="5" /> data rows (optional)</td></tr>
<tr><td>Skip:</td>
<td><input id="skip-input" name="skip" size="5" /> initial data rows (optional)</td></tr>
<tr><td>Column separator:</td>
<td><input id="separator-input" name="separator" size="2" /> (optional, default to comma or tab)</td></tr>
<tr><td>Guess Value Type:</td>
<td><input id="guess-value-type-input" name="guess-value-type" type="checkbox" checked="true" /> (try to parse cells' content into numbers, dates, etc.)</td></tr>
<tr><td></td><td><input type="submit" value="Create Project" id="upload-file-button" /></td></tr>
</table></div>
</form>
<h1>Import Existing Project</h1>
<form id="project-upload-form" method="post" enctype="multipart/form-data" action="/command/import-project" accept-charset="UTF-8">
<table class="import-project-panel-layout">
<tr><td>Project .tar or .tar.gz File:</td><td><input type="file" id="project-tar-file-input" name="project-file" /></td></tr>
<tr><td>Re-name Project:</td><td><input type="text" size="20" id="project-name-input" name="project-name" /> (optional)</td></tr>
<tr><td></td><td><input type="submit" value="Import Project" id="import-project-button" /></td></tr>
</table>
</form>
</div>
</div>
</body>
</html>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<title>Freebase Gridworks</title>
<link rel="stylesheet" href="/styles/common.css" />
<link rel="stylesheet" href="/styles/index.css" />
<script type="text/javascript" src="externals/jquery-1.4.1.min.js"></script>
<script type="text/javascript" src="externals/date.js"></script>
<script type="text/javascript" src="scripts/util/string.js"></script>
<script type="text/javascript" src="scripts/version.js"></script>
<script type="text/javascript" src="scripts/index.js"></script>
<script type="text/javascript" src="http://www.freebase.com/labs/gridworks.js"></script>
</head>
<body>
<div id="header">
<a id="logo" href="http://www.freebase.com/" title="Freebase"><img alt="Freebase" src="images/freebase-headerlogo.png" /></a>
<div id="path"><span class="app-path-section"><a href="./index.html">Gridworks</a></span></div>
</div>
<div id="body">
<div id="body-empty">
<table><tr>
<td id="body-empty-logo-container"><img src="images/gridworks.png" /> Gridworks</td>
<td id="body-empty-create-project-panel-container"></td>
</tr></table>
</div>
<div id="body-nonempty">
<table><tr>
<td id="body-nonempty-logo-container"><img src="images/gridworks.png" /> Gridworks</td>
<td id="body-nonempty-projects-container">
<div id="projects"></div>
</td>
<td id="body-nonempty-create-project-panel-container"></td>
</tr></table>
</div>
</div>
<div id="footer">
<a href="about.html">About Freebase Gridworks</a>
•
© 2010 <a href="http://www.metaweb.com/">Metaweb Technologies, Inc.</a>
</div>
<div id="body-template">
<div id="create-project-panel">
<h1>Upload Data File</h1>
<form id="file-upload-form" method="post" enctype="multipart/form-data" action="/command/create-project-from-upload" accept-charset="UTF-8">
<div class="grid-layout layout-tight"><table class="import-project-panel-layout">
<tr><td>Data File:</td>
<td><input type="file" id="project-file-input" name="project-file" /></td></tr>
<tr><td>Project Name:</td>
<td><input type="text" size="20" id="project-name-input" name="project-name" /></td></tr>
<tr><td>Column separator:</td>
<td><input id="separator-input" name="separator" size="2" /> leave blank to guess comma or tab</td></tr>
<tr><td>Ignore:</td>
<td><input id="ignore-input" name="ignore" size="5" /> initial lines (optional)</td></tr>
<tr><td>Header lines:</td>
<td><input id="header-lines-input" name="header-lines" size="5" /> (optional)</td></tr>
<tr><td>Skip:</td>
<td><input id="skip-input" name="skip" size="5" /> initial data rows (optional)</td></tr>
<tr><td>Load up to:</td>
<td><input id="limit-input" name="limit" size="5" /> data rows (leave blank to load all rows)</td></tr>
<tr><td>Guess Value Type:</td>
<td><input id="guess-value-type-input" name="guess-value-type" type="checkbox" checked="true" /> (try to parse cells' content into numbers, dates, etc.)</td></tr>
<tr><td></td><td><input type="submit" value="Create Project" id="upload-file-button" /></td></tr>
</table></div>
</form>
<h1>Import Existing Project</h1>
<form id="project-upload-form" method="post" enctype="multipart/form-data" action="/command/import-project" accept-charset="UTF-8">
<table class="import-project-panel-layout">
<tr><td>Project .tar or .tar.gz File:</td><td><input type="file" id="project-tar-file-input" name="project-file" /></td></tr>
<tr><td>Re-name Project:</td><td><input type="text" size="20" id="project-name-input" name="project-name" /> (optional)</td></tr>
<tr><td></td><td><input type="submit" value="Import Project" id="import-project-button" /></td></tr>
</table>
</form>
</div>
</div>
</body>
</html>
|
@ -8,9 +8,11 @@ function onClickUploadFileButton(evt) {
|
||||
} else {
|
||||
$("#file-upload-form").attr("action",
|
||||
"/command/create-project-from-upload?" + [
|
||||
"separator=" + $("#separator-input")[0].value,
|
||||
"ignore=" + $("#ignore-input")[0].value,
|
||||
"header-lines=" + $("#header-lines-input")[0].value,
|
||||
"skip=" + $("#skip-input")[0].value,
|
||||
"limit=" + $("#limit-input")[0].value,
|
||||
"separator=" + $("#separator-input")[0].value,
|
||||
"guess-value-type=" + $("#guess-value-type-input")[0].checked
|
||||
].join("&"));
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user