- added the ability to specify the character separator for CSV or TSV files that don't use commas or tabs (this was needed to parse a dataset that we got from the BBC to try things out)

- used the commons-lang split function instead of Java's String.split; this is necessary to avoid having to escape separators that would otherwise be interpreted as regular expressions


git-svn-id: http://google-refine.googlecode.com/svn/trunk@368 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Stefano Mazzocchi 2010-03-31 22:34:21 +00:00
parent 77b452e87f
commit dced641599
8 changed files with 131 additions and 87 deletions

View File

@ -209,7 +209,7 @@ public class CreateProjectCommand extends Command {
}
CharsetDetector detector = new CharsetDetector();
detector.setDeclaredEncoding("utf8"); // the content on the web is encoded in UTF-8 so assume that
detector.setDeclaredEncoding("utf8"); // most of the content on the web is encoded in UTF-8 so start with that
Reader reader = null;
CharsetMatch[] charsetMatches = detector.setText(bytes).detectAll();

View File

@ -2,10 +2,6 @@ package com.metaweb.gridworks.importers;
import java.io.Serializable;
import com.metaweb.gridworks.expr.ExpressionUtils;
import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.Row;
public class ImporterUtilities {
static public Serializable parseCellValue(String text) {
@ -27,78 +23,4 @@ public class ImporterUtilities {
return text;
}
/**
 * Splits one comma-separated line into fields and appends one entry per field
 * to {@code row.cells}.
 *
 * A field that begins with a double quote runs to the next closing quote; a
 * doubled quote inside such a field is kept as a single literal quote. Any other
 * field runs to the next comma or to the end of the line. Each field's text is
 * converted with {@code parseCellValue}; values rejected by
 * {@code ExpressionUtils.isNonBlankData} are stored as {@code null}.
 *
 * Because the loop stops as soon as {@code start} reaches the end of the line,
 * a line that ends with a comma does not produce a final empty cell.
 *
 * @param row  row whose {@code cells} list receives the parsed cells
 * @param line the text of one line to parse
 * @return {@code true} if at least one non-blank value was added
 */
static public boolean parseCSVIntoRow(Row row, String line) {
boolean hasData = false;
// Index of the first character of the field currently being read.
int start = 0;
while (start < line.length()) {
String text = null;
if (line.charAt(start) == '"') {
// Quoted field: accumulate pieces until the closing quote is found.
StringBuffer sb = new StringBuffer();
start++; // skip over "
while (start < line.length()) {
int quote = line.indexOf('"', start);
if (quote < 0) {
// No closing quote: the rest of the line becomes the field value.
sb.append(line.substring(start));
start = line.length();
break;
} else {
if (quote < line.length() - 1 && line.charAt(quote + 1) == '"') {
// Doubled quote: keep one quote character and continue scanning after the pair.
sb.append(line.substring(start, quote + 1)); // include " as well
start = quote + 2;
} else {
// Closing quote: the field ends here.
sb.append(line.substring(start, quote));
start = quote + 1;
// Only a comma directly after the closing quote is consumed; any other
// character is left in place and starts the next field.
if (start < line.length() && line.charAt(start) == ',') {
start++; // skip ,
}
break;
}
}
}
text = sb.toString();
} else {
// Unquoted field: everything up to the next comma, or to the end of the line.
int next = line.indexOf(',', start);
if (next < 0) {
text = line.substring(start);
start = line.length();
} else {
text = line.substring(start, next);
start = next + 1;
}
}
Serializable value = parseCellValue(text);
if (ExpressionUtils.isNonBlankData(value)) {
row.cells.add(new Cell(value, null));
hasData = true;
} else {
// A null entry is still added so later fields keep their column position.
row.cells.add(null);
}
}
return hasData;
}
/**
 * Splits one tab-separated line with {@code String.split("\t")} and appends one
 * entry per resulting field to {@code row.cells}.
 *
 * Each field is converted with {@code parseCellValue}; values rejected by
 * {@code ExpressionUtils.isNonBlankData} are stored as {@code null}.
 *
 * @param row  row whose {@code cells} list receives the parsed cells
 * @param line the text of one line to parse
 * @return {@code true} if at least one non-blank value was added
 */
static public boolean parseTSVIntoRow(Row row, String line) {
    boolean foundData = false;
    for (String field : line.split("\t")) {
        Serializable parsed = parseCellValue(field);
        if (!ExpressionUtils.isNonBlankData(parsed)) {
            // Blank field: store null so later fields keep their column position.
            row.cells.add(null);
            continue;
        }
        row.cells.add(new Cell(parsed, null));
        foundData = true;
    }
    return foundData;
}
}

View File

@ -6,7 +6,11 @@ import java.io.Reader;
import java.util.Properties;
import org.apache.commons.lang.NotImplementedException;
import org.apache.commons.lang.StringUtils;
import com.metaweb.gridworks.importers.parsers.CSVRowParser;
import com.metaweb.gridworks.importers.parsers.RowParser;
import com.metaweb.gridworks.importers.parsers.SeparatorRowParser;
import com.metaweb.gridworks.model.Column;
import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.model.Row;
@ -18,10 +22,11 @@ public class TsvCsvImporter implements Importer {
LineNumberReader lnReader = new LineNumberReader(reader);
try {
String sep = null; // auto-detect TSV or CSV
String line = null;
String sep = options.getProperty("separator"); // auto-detect if not present
String line = null;
boolean first = true;
int cellCount = 1;
RowParser parser = (sep == null || (sep.length() == 0)) ? null : new SeparatorRowParser(sep);
int rowsWithData = 0;
while ((line = lnReader.readLine()) != null) {
@ -29,18 +34,20 @@ public class TsvCsvImporter implements Importer {
continue;
}
if (sep == null) {
if (parser == null) {
int tab = line.indexOf('\t');
if (tab >= 0) {
sep = "\t";
parser = new SeparatorRowParser(sep);
} else {
sep = ",";
parser = new CSVRowParser();
}
}
if (first) {
String[] cells = line.split(sep);
String[] cells = StringUtils.splitPreserveAllTokens(line, sep);
first = false;
for (int c = 0; c < cells.length; c++) {
String cell = cells[c];
@ -57,7 +64,7 @@ public class TsvCsvImporter implements Importer {
} else {
Row row = new Row(cellCount);
if ((sep.charAt(0) == ',') ? ImporterUtilities.parseCSVIntoRow(row, line) : ImporterUtilities.parseTSVIntoRow(row, line)) {
if (parser.parseRow(row, line)) {
rowsWithData++;
if (skip <= 0 || rowsWithData > skip) {

View File

@ -0,0 +1,68 @@
package com.metaweb.gridworks.importers.parsers;
import java.io.Serializable;
import com.metaweb.gridworks.expr.ExpressionUtils;
import com.metaweb.gridworks.importers.ImporterUtilities;
import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.Row;
/**
 * Row parser for comma-separated lines with optional double-quoted fields.
 *
 * A field that begins with a double quote runs to the next closing quote; a
 * doubled quote inside such a field stands for one literal quote. Any other
 * field runs to the next comma or to the end of the line.
 */
public class CSVRowParser extends RowParser {

    /**
     * Splits {@code line} into fields and appends one entry per field to
     * {@code row.cells}.
     *
     * Each field's text is converted with {@code ImporterUtilities.parseCellValue};
     * values rejected by {@code ExpressionUtils.isNonBlankData} are stored as
     * {@code null}. An empty line adds no cells. A quoted field with no closing
     * quote takes the rest of the line as its value.
     *
     * A line that ends with a comma yields a final empty field, so the number of
     * cells matches what {@code SeparatorRowParser} produces for the equivalent
     * line through {@code StringUtils.splitPreserveAllTokens}.
     *
     * @param row  row whose {@code cells} list receives the parsed cells
     * @param line the text of one line to parse
     * @return {@code true} if at least one non-blank value was added
     */
    public boolean parseRow(Row row, String line) {
        final int length = line.length();
        boolean hasData = false;
        int start = 0;
        // True when the field just read was terminated by a comma, which means
        // another field (possibly empty) still has to be emitted.
        boolean endedWithSeparator = false;

        while (start < length || endedWithSeparator) {
            endedWithSeparator = false;
            String text;

            if (start >= length) {
                // The line ended right after a comma: emit the trailing empty field.
                text = "";
            } else if (line.charAt(start) == '"') {
                // Local buffer, so the unsynchronized StringBuilder is sufficient.
                StringBuilder sb = new StringBuilder();
                start++; // skip over the opening quote
                while (start < length) {
                    int quote = line.indexOf('"', start);
                    if (quote < 0) {
                        // No closing quote: the rest of the line becomes the field value.
                        sb.append(line, start, length);
                        start = length;
                        break;
                    }
                    if (quote < length - 1 && line.charAt(quote + 1) == '"') {
                        // Doubled quote: keep one quote character and continue after the pair.
                        sb.append(line, start, quote + 1);
                        start = quote + 2;
                    } else {
                        // Closing quote: the field ends here.
                        sb.append(line, start, quote);
                        start = quote + 1;
                        // Only a comma directly after the closing quote is consumed; any
                        // other character is left in place and starts the next field.
                        if (start < length && line.charAt(start) == ',') {
                            start++;
                            endedWithSeparator = true;
                        }
                        break;
                    }
                }
                text = sb.toString();
            } else {
                // Unquoted field: everything up to the next comma, or to the end of the line.
                int next = line.indexOf(',', start);
                if (next < 0) {
                    text = line.substring(start);
                    start = length;
                } else {
                    text = line.substring(start, next);
                    start = next + 1;
                    endedWithSeparator = true;
                }
            }

            Serializable value = ImporterUtilities.parseCellValue(text);
            if (ExpressionUtils.isNonBlankData(value)) {
                row.cells.add(new Cell(value, null));
                hasData = true;
            } else {
                // A null entry is still added so later fields keep their column position.
                row.cells.add(null);
            }
        }
        return hasData;
    }
}

View File

@ -0,0 +1,8 @@
package com.metaweb.gridworks.importers.parsers;
import com.metaweb.gridworks.model.Row;
/**
 * Base class for strategies that turn one line of text into the cells of a
 * {@link Row}.
 */
public abstract class RowParser {
/**
 * Parses {@code line} and stores the resulting cells in {@code row}.
 *
 * The two implementations in this package (CSVRowParser and SeparatorRowParser)
 * append one entry per field to {@code row.cells} and return {@code true} when
 * at least one non-blank value was added.
 *
 * @param row  row to populate
 * @param line the text of one line to parse
 * @return whether the line contributed any data to {@code row}
 */
public abstract boolean parseRow(Row row, String line);
}

View File

@ -0,0 +1,38 @@
package com.metaweb.gridworks.importers.parsers;
import java.io.Serializable;
import org.apache.commons.lang.StringUtils;
import com.metaweb.gridworks.expr.ExpressionUtils;
import com.metaweb.gridworks.importers.ImporterUtilities;
import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.Row;
/**
 * Row parser that cuts a line into fields with commons-lang
 * {@code StringUtils.splitPreserveAllTokens}, using a caller-supplied separator.
 *
 * NOTE(review): commons-lang documents the second argument of
 * {@code splitPreserveAllTokens} as a set of separator characters, so a
 * multi-character {@code sep} would split on each of its characters; confirm
 * that this is the intended behaviour.
 */
public class SeparatorRowParser extends RowParser {

    /** Separator handed to {@code StringUtils.splitPreserveAllTokens} for every line. */
    String sep;

    /**
     * @param sep separator to split lines on
     */
    public SeparatorRowParser(String sep) {
        this.sep = sep;
    }

    /**
     * Splits {@code line} on the configured separator and appends one entry per
     * token to {@code row.cells}.
     *
     * Each token is converted with {@code ImporterUtilities.parseCellValue};
     * values rejected by {@code ExpressionUtils.isNonBlankData} are stored as
     * {@code null}.
     *
     * @param row  row whose {@code cells} list receives the parsed cells
     * @param line the text of one line to parse
     * @return {@code true} if at least one non-blank value was added
     */
    public boolean parseRow(Row row, String line) {
        boolean foundData = false;
        for (String token : StringUtils.splitPreserveAllTokens(line, sep)) {
            Serializable parsed = ImporterUtilities.parseCellValue(token);
            if (!ExpressionUtils.isNonBlankData(parsed)) {
                // Blank token: store null so later tokens keep their column position.
                row.cells.add(null);
                continue;
            }
            row.cells.add(new Cell(parsed, null));
            foundData = true;
        }
        return foundData;
    }
}

View File

@ -1 +1 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> <html> <head> <title>Freebase Gridworks</title> <link rel="stylesheet" href="/styles/common.css" /> <link rel="stylesheet" href="/styles/index.css" /> <script type="text/javascript" src="externals/jquery-1.4.1.min.js"></script> <script type="text/javascript" src="externals/date.js"></script> <script type="text/javascript" src="scripts/util/string.js"></script> <script type="text/javascript" src="scripts/version.js"></script> <script type="text/javascript" src="scripts/index.js"></script> <script type="text/javascript" src="http://www.freebase.com/labs/gridworks.js"></script> </head> <body> <div id="header"> <a id="logo" href="http://www.freebase.com/" title="Freebase"><img alt="Freebase" src="images/freebase-headerlogo.png" /></a> <div id="path"><span class="app-path-section"><a href="./index.html">Gridworks</a></span></div> </div> <div id="body"> <div id="body-empty"> <table><tr> <td id="body-empty-logo-container"><img src="images/gridworks.png" /> Gridworks</td> <td id="body-empty-create-project-panel-container"></td> </tr></table> </div> <div id="body-nonempty"> <table><tr> <td id="body-nonempty-logo-container"><img src="images/gridworks.png" /> Gridworks</td> <td id="body-nonempty-projects-container"> <div id="projects"></div> </td> <td id="body-nonempty-create-project-panel-container"></td> </tr></table> </div> </div> <div id="footer"> <a href="about.html">About Freebase Gridworks</a> &bull; &copy; 2010 <a href="http://www.metaweb.com/">Metaweb Technologies, Inc.</a> </div> <div id="body-template"> <div id="create-project-panel"> <h1>Upload Data File</h1> <form id="file-upload-form" method="post" enctype="multipart/form-data" action="/command/create-project-from-upload" accept-charset="UTF-8"> <div class="grid-layout layout-tight"><table> <tr><td>Data File:</td><td> <input type="file" id="project-file-input" name="project-file" /> </td></tr> <tr><td>Project 
Name:</td><td> <input type="text" size="30" id="project-name-input" name="project-name" /> </td></tr> <tr><td>Load up to:</td><td> <input id="limit-input" name="limit" size="5" /> data rows (optional) </td></tr> <tr><td>Skip:</td><td> <input id="skip-input" name="skip" size="5" /> initial data rows (optional) </td></tr> <tr><td></td><td> <input type="submit" value="Create Project" id="upload-file-button" /> </td></tr> </table></div> </form> <h1>Import Existing Project</h1> <form id="project-upload-form" method="post" enctype="multipart/form-data" action="/command/import-project" accept-charset="UTF-8"> <table id="import-project-panel-layout"> <tr><td>Project TAR File:</td><td> <input type="file" id="project-tar-file-input" name="project-file" /> </td></tr> <tr><td>Re-name Project:</td><td> <input type="text" size="30" id="project-name-input" name="project-name" /> (optional) </td></tr> <tr><td></td><td> <input type="submit" value="Import Project" id="import-project-button" /> </td></tr> </table> </form> </div> </div> </body> </html>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> <html> <head> <title>Freebase Gridworks</title> <link rel="stylesheet" href="/styles/common.css" /> <link rel="stylesheet" href="/styles/index.css" /> <script type="text/javascript" src="externals/jquery-1.4.1.min.js"></script> <script type="text/javascript" src="externals/date.js"></script> <script type="text/javascript" src="scripts/util/string.js"></script> <script type="text/javascript" src="scripts/version.js"></script> <script type="text/javascript" src="scripts/index.js"></script> <script type="text/javascript" src="http://www.freebase.com/labs/gridworks.js"></script> </head> <body> <div id="header"> <a id="logo" href="http://www.freebase.com/" title="Freebase"><img alt="Freebase" src="images/freebase-headerlogo.png" /></a> <div id="path"><span class="app-path-section"><a href="./index.html">Gridworks</a></span></div> </div> <div id="body"> <div id="body-empty"> <table><tr> <td id="body-empty-logo-container"><img src="images/gridworks.png" /> Gridworks</td> <td id="body-empty-create-project-panel-container"></td> </tr></table> </div> <div id="body-nonempty"> <table><tr> <td id="body-nonempty-logo-container"><img src="images/gridworks.png" /> Gridworks</td> <td id="body-nonempty-projects-container"> <div id="projects"></div> </td> <td id="body-nonempty-create-project-panel-container"></td> </tr></table> </div> </div> <div id="footer"> <a href="about.html">About Freebase Gridworks</a> &bull; &copy; 2010 <a href="http://www.metaweb.com/">Metaweb Technologies, Inc.</a> </div> <div id="body-template"> <div id="create-project-panel"> <h1>Upload Data File</h1> <form id="file-upload-form" method="post" enctype="multipart/form-data" action="/command/create-project-from-upload" accept-charset="UTF-8"> <div class="grid-layout layout-tight"><table> <tr><td>Data File:</td><td> <input type="file" id="project-file-input" name="project-file" /> </td></tr> <tr><td>Project 
Name:</td><td> <input type="text" size="30" id="project-name-input" name="project-name" /> </td></tr> <tr><td>Load up to:</td><td> <input id="limit-input" name="limit" size="5" /> data rows (optional) </td></tr> <tr><td>Skip:</td><td> <input id="skip-input" name="skip" size="5" /> initial data rows (optional) </td></tr> <tr><td>Separator:</td><td> <input id="separator-input" name="separator" size="2" /> column separator (optional) </td></tr> <tr><td></td><td> <input type="submit" value="Create Project" id="upload-file-button" /> </td></tr> </table></div> </form> <h1>Import Existing Project</h1> <form id="project-upload-form" method="post" enctype="multipart/form-data" action="/command/import-project" accept-charset="UTF-8"> <table id="import-project-panel-layout"> <tr><td>Project TAR File:</td><td> <input type="file" id="project-tar-file-input" name="project-file" /> </td></tr> <tr><td>Re-name Project:</td><td> <input type="text" size="30" id="project-name-input" name="project-name" /> (optional) </td></tr> <tr><td></td><td> <input type="submit" value="Import Project" id="import-project-button" /> </td></tr> </table> </form> </div> </div> </body> </html>

View File

@ -29,7 +29,8 @@ function onClickUploadFileButton(evt) {
$("#file-upload-form").attr("action",
"/command/create-project-from-upload?" + [
"skip=" + $("#skip-input")[0].value,
"limit=" + $("#limit-input")[0].value
"limit=" + $("#limit-input")[0].value,
"separator=" + $("#separator-input")[0].value
].join("&"));
}
}