- added the ability to specify the separator character for CSV or TSV files that don't use commas or tabs (this was needed to parse a dataset that we got from the BBC to try things out)
- used the commons-lang split function instead of Java's String.split; this is necessary to avoid having to escape separators that might be mistaken for regular expressions

git-svn-id: http://google-refine.googlecode.com/svn/trunk@368 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
77b452e87f
commit
dced641599
@ -209,7 +209,7 @@ public class CreateProjectCommand extends Command {
|
||||
}
|
||||
|
||||
CharsetDetector detector = new CharsetDetector();
|
||||
detector.setDeclaredEncoding("utf8"); // the content on the web is encoded in UTF-8 so assume that
|
||||
detector.setDeclaredEncoding("utf8"); // most of the content on the web is encoded in UTF-8 so start with that
|
||||
|
||||
Reader reader = null;
|
||||
CharsetMatch[] charsetMatches = detector.setText(bytes).detectAll();
|
||||
|
@ -2,10 +2,6 @@ package com.metaweb.gridworks.importers;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.metaweb.gridworks.expr.ExpressionUtils;
|
||||
import com.metaweb.gridworks.model.Cell;
|
||||
import com.metaweb.gridworks.model.Row;
|
||||
|
||||
public class ImporterUtilities {
|
||||
|
||||
static public Serializable parseCellValue(String text) {
|
||||
@ -27,78 +23,4 @@ public class ImporterUtilities {
|
||||
return text;
|
||||
}
|
||||
|
||||
static public boolean parseCSVIntoRow(Row row, String line) {
|
||||
boolean hasData = false;
|
||||
|
||||
int start = 0;
|
||||
while (start < line.length()) {
|
||||
String text = null;
|
||||
|
||||
if (line.charAt(start) == '"') {
|
||||
StringBuffer sb = new StringBuffer();
|
||||
|
||||
start++; // skip over "
|
||||
while (start < line.length()) {
|
||||
int quote = line.indexOf('"', start);
|
||||
if (quote < 0) {
|
||||
sb.append(line.substring(start));
|
||||
start = line.length();
|
||||
break;
|
||||
} else {
|
||||
if (quote < line.length() - 1 && line.charAt(quote + 1) == '"') {
|
||||
sb.append(line.substring(start, quote + 1)); // include " as well
|
||||
start = quote + 2;
|
||||
} else {
|
||||
sb.append(line.substring(start, quote));
|
||||
start = quote + 1;
|
||||
if (start < line.length() && line.charAt(start) == ',') {
|
||||
start++; // skip ,
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
text = sb.toString();
|
||||
} else {
|
||||
int next = line.indexOf(',', start);
|
||||
if (next < 0) {
|
||||
text = line.substring(start);
|
||||
start = line.length();
|
||||
} else {
|
||||
text = line.substring(start, next);
|
||||
start = next + 1;
|
||||
}
|
||||
}
|
||||
|
||||
Serializable value = parseCellValue(text);
|
||||
if (ExpressionUtils.isNonBlankData(value)) {
|
||||
row.cells.add(new Cell(value, null));
|
||||
hasData = true;
|
||||
} else {
|
||||
row.cells.add(null);
|
||||
}
|
||||
}
|
||||
|
||||
return hasData;
|
||||
}
|
||||
|
||||
static public boolean parseTSVIntoRow(Row row, String line) {
|
||||
boolean hasData = false;
|
||||
|
||||
String[] cells = line.split("\t");
|
||||
for (int c = 0; c < cells.length; c++) {
|
||||
String text = cells[c];
|
||||
|
||||
Serializable value = parseCellValue(text);
|
||||
if (ExpressionUtils.isNonBlankData(value)) {
|
||||
row.cells.add(new Cell(value, null));
|
||||
hasData = true;
|
||||
} else {
|
||||
row.cells.add(null);
|
||||
}
|
||||
}
|
||||
return hasData;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -6,7 +6,11 @@ import java.io.Reader;
|
||||
import java.util.Properties;
|
||||
|
||||
import org.apache.commons.lang.NotImplementedException;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import com.metaweb.gridworks.importers.parsers.CSVRowParser;
|
||||
import com.metaweb.gridworks.importers.parsers.RowParser;
|
||||
import com.metaweb.gridworks.importers.parsers.SeparatorRowParser;
|
||||
import com.metaweb.gridworks.model.Column;
|
||||
import com.metaweb.gridworks.model.Project;
|
||||
import com.metaweb.gridworks.model.Row;
|
||||
@ -18,10 +22,11 @@ public class TsvCsvImporter implements Importer {
|
||||
|
||||
LineNumberReader lnReader = new LineNumberReader(reader);
|
||||
try {
|
||||
String sep = null; // auto-detect TSV or CSV
|
||||
String line = null;
|
||||
String sep = options.getProperty("separator"); // auto-detect if not present
|
||||
String line = null;
|
||||
boolean first = true;
|
||||
int cellCount = 1;
|
||||
RowParser parser = (sep == null || (sep.length() == 0)) ? null : new SeparatorRowParser(sep);
|
||||
|
||||
int rowsWithData = 0;
|
||||
while ((line = lnReader.readLine()) != null) {
|
||||
@ -29,18 +34,20 @@ public class TsvCsvImporter implements Importer {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (sep == null) {
|
||||
if (parser == null) {
|
||||
int tab = line.indexOf('\t');
|
||||
if (tab >= 0) {
|
||||
sep = "\t";
|
||||
parser = new SeparatorRowParser(sep);
|
||||
} else {
|
||||
sep = ",";
|
||||
parser = new CSVRowParser();
|
||||
}
|
||||
}
|
||||
|
||||
if (first) {
|
||||
String[] cells = line.split(sep);
|
||||
|
||||
String[] cells = StringUtils.splitPreserveAllTokens(line, sep);
|
||||
|
||||
first = false;
|
||||
for (int c = 0; c < cells.length; c++) {
|
||||
String cell = cells[c];
|
||||
@ -57,7 +64,7 @@ public class TsvCsvImporter implements Importer {
|
||||
} else {
|
||||
Row row = new Row(cellCount);
|
||||
|
||||
if ((sep.charAt(0) == ',') ? ImporterUtilities.parseCSVIntoRow(row, line) : ImporterUtilities.parseTSVIntoRow(row, line)) {
|
||||
if (parser.parseRow(row, line)) {
|
||||
rowsWithData++;
|
||||
|
||||
if (skip <= 0 || rowsWithData > skip) {
|
||||
|
@ -0,0 +1,68 @@
|
||||
package com.metaweb.gridworks.importers.parsers;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.metaweb.gridworks.expr.ExpressionUtils;
|
||||
import com.metaweb.gridworks.importers.ImporterUtilities;
|
||||
import com.metaweb.gridworks.model.Cell;
|
||||
import com.metaweb.gridworks.model.Row;
|
||||
|
||||
public class CSVRowParser extends RowParser {
|
||||
|
||||
public boolean parseRow(Row row, String line) {
|
||||
boolean hasData = false;
|
||||
|
||||
int start = 0;
|
||||
while (start < line.length()) {
|
||||
String text = null;
|
||||
|
||||
if (line.charAt(start) == '"') {
|
||||
StringBuffer sb = new StringBuffer();
|
||||
|
||||
start++; // skip over "
|
||||
while (start < line.length()) {
|
||||
int quote = line.indexOf('"', start);
|
||||
if (quote < 0) {
|
||||
sb.append(line.substring(start));
|
||||
start = line.length();
|
||||
break;
|
||||
} else {
|
||||
if (quote < line.length() - 1 && line.charAt(quote + 1) == '"') {
|
||||
sb.append(line.substring(start, quote + 1)); // include " as well
|
||||
start = quote + 2;
|
||||
} else {
|
||||
sb.append(line.substring(start, quote));
|
||||
start = quote + 1;
|
||||
if (start < line.length() && line.charAt(start) == ',') {
|
||||
start++; // skip ,
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
text = sb.toString();
|
||||
} else {
|
||||
int next = line.indexOf(',', start);
|
||||
if (next < 0) {
|
||||
text = line.substring(start);
|
||||
start = line.length();
|
||||
} else {
|
||||
text = line.substring(start, next);
|
||||
start = next + 1;
|
||||
}
|
||||
}
|
||||
|
||||
Serializable value = ImporterUtilities.parseCellValue(text);
|
||||
if (ExpressionUtils.isNonBlankData(value)) {
|
||||
row.cells.add(new Cell(value, null));
|
||||
hasData = true;
|
||||
} else {
|
||||
row.cells.add(null);
|
||||
}
|
||||
}
|
||||
|
||||
return hasData;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,8 @@
|
||||
package com.metaweb.gridworks.importers.parsers;
|
||||
|
||||
import com.metaweb.gridworks.model.Row;
|
||||
|
||||
/**
 * Base class for strategies that turn one line of delimited text into the
 * cells of a {@link Row}.
 */
public abstract class RowParser {
|
||||
|
||||
/**
 * Parses {@code line} and appends the resulting cells to {@code row}.
 *
 * @param row  the row that receives the parsed cells
 * @param line the raw text line to parse
 * @return whether the line produced data; the implementations in this
 *         package return true when at least one non-blank cell was added
 */
public abstract boolean parseRow(Row row, String line);
|
||||
}
|
@ -0,0 +1,38 @@
|
||||
package com.metaweb.gridworks.importers.parsers;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import com.metaweb.gridworks.expr.ExpressionUtils;
|
||||
import com.metaweb.gridworks.importers.ImporterUtilities;
|
||||
import com.metaweb.gridworks.model.Cell;
|
||||
import com.metaweb.gridworks.model.Row;
|
||||
|
||||
public class SeparatorRowParser extends RowParser {
|
||||
|
||||
String sep;
|
||||
|
||||
public SeparatorRowParser(String sep) {
|
||||
this.sep = sep;
|
||||
}
|
||||
|
||||
public boolean parseRow(Row row, String line) {
|
||||
boolean hasData = false;
|
||||
|
||||
String[] cells = StringUtils.splitPreserveAllTokens(line, sep);
|
||||
for (int c = 0; c < cells.length; c++) {
|
||||
String text = cells[c];
|
||||
|
||||
Serializable value = ImporterUtilities.parseCellValue(text);
|
||||
if (ExpressionUtils.isNonBlankData(value)) {
|
||||
row.cells.add(new Cell(value, null));
|
||||
hasData = true;
|
||||
} else {
|
||||
row.cells.add(null);
|
||||
}
|
||||
}
|
||||
return hasData;
|
||||
}
|
||||
|
||||
}
|
@ -1 +1 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<title>Freebase Gridworks</title>
<link rel="stylesheet" href="/styles/common.css" />
<link rel="stylesheet" href="/styles/index.css" />
<script type="text/javascript" src="externals/jquery-1.4.1.min.js"></script>
<script type="text/javascript" src="externals/date.js"></script>
<script type="text/javascript" src="scripts/util/string.js"></script>
<script type="text/javascript" src="scripts/version.js"></script>
<script type="text/javascript" src="scripts/index.js"></script>
<script type="text/javascript" src="http://www.freebase.com/labs/gridworks.js"></script>
</head>
<body>
<div id="header">
<a id="logo" href="http://www.freebase.com/" title="Freebase"><img alt="Freebase" src="images/freebase-headerlogo.png" /></a>
<div id="path"><span class="app-path-section"><a href="./index.html">Gridworks</a></span></div>
</div>
<div id="body">
<div id="body-empty">
<table><tr>
<td id="body-empty-logo-container"><img src="images/gridworks.png" /> Gridworks</td>
<td id="body-empty-create-project-panel-container"></td>
</tr></table>
</div>
<div id="body-nonempty">
<table><tr>
<td id="body-nonempty-logo-container"><img src="images/gridworks.png" /> Gridworks</td>
<td id="body-nonempty-projects-container">
<div id="projects"></div>
</td>
<td id="body-nonempty-create-project-panel-container"></td>
</tr></table>
</div>
</div>
<div id="footer">
<a href="about.html">About Freebase Gridworks</a>
•
© 2010 <a href="http://www.metaweb.com/">Metaweb Technologies, Inc.</a>
</div>
<div id="body-template">
<div id="create-project-panel">
<h1>Upload Data File</h1>
<form id="file-upload-form" method="post" enctype="multipart/form-data" action="/command/create-project-from-upload" accept-charset="UTF-8">
<div class="grid-layout layout-tight"><table>
<tr><td>Data File:</td><td>
<input type="file" id="project-file-input" name="project-file" />
</td></tr>
<tr><td>Project Name:</td><td>
<input type="text" size="30" id="project-name-input" name="project-name" />
</td></tr>
<tr><td>Load up to:</td><td>
<input id="limit-input" name="limit" size="5" /> data rows (optional)
</td></tr>
<tr><td>Skip:</td><td>
<input id="skip-input" name="skip" size="5" /> initial data rows (optional)
</td></tr>
<tr><td></td><td>
<input type="submit" value="Create Project" id="upload-file-button" />
</td></tr>
</table></div>
</form>
<h1>Import Existing Project</h1>
<form id="project-upload-form" method="post" enctype="multipart/form-data" action="/command/import-project" accept-charset="UTF-8">
<table id="import-project-panel-layout">
<tr><td>Project TAR File:</td><td>
<input type="file" id="project-tar-file-input" name="project-file" />
</td></tr>
<tr><td>Re-name Project:</td><td>
<input type="text" size="30" id="project-name-input" name="project-name" /> (optional)
</td></tr>
<tr><td></td><td>
<input type="submit" value="Import Project" id="import-project-button" />
</td></tr>
</table>
</form>
</div>
</div>
</body>
</html>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<title>Freebase Gridworks</title>
<link rel="stylesheet" href="/styles/common.css" />
<link rel="stylesheet" href="/styles/index.css" />
<script type="text/javascript" src="externals/jquery-1.4.1.min.js"></script>
<script type="text/javascript" src="externals/date.js"></script>
<script type="text/javascript" src="scripts/util/string.js"></script>
<script type="text/javascript" src="scripts/version.js"></script>
<script type="text/javascript" src="scripts/index.js"></script>
<script type="text/javascript" src="http://www.freebase.com/labs/gridworks.js"></script>
</head>
<body>
<div id="header">
<a id="logo" href="http://www.freebase.com/" title="Freebase"><img alt="Freebase" src="images/freebase-headerlogo.png" /></a>
<div id="path"><span class="app-path-section"><a href="./index.html">Gridworks</a></span></div>
</div>
<div id="body">
<div id="body-empty">
<table><tr>
<td id="body-empty-logo-container"><img src="images/gridworks.png" /> Gridworks</td>
<td id="body-empty-create-project-panel-container"></td>
</tr></table>
</div>
<div id="body-nonempty">
<table><tr>
<td id="body-nonempty-logo-container"><img src="images/gridworks.png" /> Gridworks</td>
<td id="body-nonempty-projects-container">
<div id="projects"></div>
</td>
<td id="body-nonempty-create-project-panel-container"></td>
</tr></table>
</div>
</div>
<div id="footer">
<a href="about.html">About Freebase Gridworks</a>
•
© 2010 <a href="http://www.metaweb.com/">Metaweb Technologies, Inc.</a>
</div>
<div id="body-template">
<div id="create-project-panel">
<h1>Upload Data File</h1>
<form id="file-upload-form" method="post" enctype="multipart/form-data" action="/command/create-project-from-upload" accept-charset="UTF-8">
<div class="grid-layout layout-tight"><table>
<tr><td>Data File:</td><td>
<input type="file" id="project-file-input" name="project-file" />
</td></tr>
<tr><td>Project Name:</td><td>
<input type="text" size="30" id="project-name-input" name="project-name" />
</td></tr>
<tr><td>Load up to:</td><td>
<input id="limit-input" name="limit" size="5" /> data rows (optional)
</td></tr>
<tr><td>Skip:</td><td>
<input id="skip-input" name="skip" size="5" /> initial data rows (optional)
</td></tr>
<tr><td>Separator:</td><td>
<input id="separator-input" name="separator" size="2" /> column separator (optional)
</td></tr>
<tr><td></td><td>
<input type="submit" value="Create Project" id="upload-file-button" />
</td></tr>
</table></div>
</form>
<h1>Import Existing Project</h1>
<form id="project-upload-form" method="post" enctype="multipart/form-data" action="/command/import-project" accept-charset="UTF-8">
<table id="import-project-panel-layout">
<tr><td>Project TAR File:</td><td>
<input type="file" id="project-tar-file-input" name="project-file" />
</td></tr>
<tr><td>Re-name Project:</td><td>
<input type="text" size="30" id="project-name-input" name="project-name" /> (optional)
</td></tr>
<tr><td></td><td>
<input type="submit" value="Import Project" id="import-project-button" />
</td></tr>
</table>
</form>
</div>
</div>
</body>
</html>
|
@ -29,7 +29,8 @@ function onClickUploadFileButton(evt) {
|
||||
$("#file-upload-form").attr("action",
|
||||
"/command/create-project-from-upload?" + [
|
||||
"skip=" + $("#skip-input")[0].value,
|
||||
"limit=" + $("#limit-input")[0].value
|
||||
"limit=" + $("#limit-input")[0].value,
|
||||
"separator=" + $("#separator-input")[0].value
|
||||
].join("&"));
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user