From dced6415990299f20a8c8d5c2fb63c89497675bd Mon Sep 17 00:00:00 2001 From: Stefano Mazzocchi Date: Wed, 31 Mar 2010 22:34:21 +0000 Subject: [PATCH] - added the ability to specify the character separator for CSV or TSV files that don't use commas or tabs (this was needed to parse a dataset that we got from the BBC to try things out) - used commons-lang split function instead of the java String.split one, this is necessary to avoid having to escape separators that might be confused for regexps git-svn-id: http://google-refine.googlecode.com/svn/trunk@368 7d457c2a-affb-35e4-300a-418c747d4874 --- .../commands/edit/CreateProjectCommand.java | 2 +- .../importers/ImporterUtilities.java | 78 ------------------- .../gridworks/importers/TsvCsvImporter.java | 19 +++-- .../importers/parsers/CSVRowParser.java | 68 ++++++++++++++++ .../importers/parsers/RowParser.java | 8 ++ .../importers/parsers/SeparatorRowParser.java | 38 +++++++++ src/main/webapp/index.html | 2 +- src/main/webapp/scripts/index.js | 3 +- 8 files changed, 131 insertions(+), 87 deletions(-) create mode 100644 src/main/java/com/metaweb/gridworks/importers/parsers/CSVRowParser.java create mode 100644 src/main/java/com/metaweb/gridworks/importers/parsers/RowParser.java create mode 100644 src/main/java/com/metaweb/gridworks/importers/parsers/SeparatorRowParser.java diff --git a/src/main/java/com/metaweb/gridworks/commands/edit/CreateProjectCommand.java b/src/main/java/com/metaweb/gridworks/commands/edit/CreateProjectCommand.java index 07d334f34..06fd015ae 100644 --- a/src/main/java/com/metaweb/gridworks/commands/edit/CreateProjectCommand.java +++ b/src/main/java/com/metaweb/gridworks/commands/edit/CreateProjectCommand.java @@ -209,7 +209,7 @@ public class CreateProjectCommand extends Command { } CharsetDetector detector = new CharsetDetector(); - detector.setDeclaredEncoding("utf8"); // the content on the web is encoded in UTF-8 so assume that + detector.setDeclaredEncoding("utf8"); // most of the content on the web is encoded in UTF-8 so start with that Reader reader = null; CharsetMatch[] charsetMatches = detector.setText(bytes).detectAll(); diff --git a/src/main/java/com/metaweb/gridworks/importers/ImporterUtilities.java b/src/main/java/com/metaweb/gridworks/importers/ImporterUtilities.java index d4e4387dc..31649182a 100644 --- a/src/main/java/com/metaweb/gridworks/importers/ImporterUtilities.java +++ b/src/main/java/com/metaweb/gridworks/importers/ImporterUtilities.java @@ -2,10 +2,6 @@ package com.metaweb.gridworks.importers; import java.io.Serializable; -import com.metaweb.gridworks.expr.ExpressionUtils; -import com.metaweb.gridworks.model.Cell; -import com.metaweb.gridworks.model.Row; - public class ImporterUtilities { static public Serializable parseCellValue(String text) { @@ -27,78 +23,4 @@ public class ImporterUtilities { return text; } - static public boolean parseCSVIntoRow(Row row, String line) { - boolean hasData = false; - - int start = 0; - while (start < line.length()) { - String text = null; - - if (line.charAt(start) == '"') { - StringBuffer sb = new StringBuffer(); - - start++; // skip over " - while (start < line.length()) { - int quote = line.indexOf('"', start); - if (quote < 0) { - sb.append(line.substring(start)); - start = line.length(); - break; - } else { - if (quote < line.length() - 1 && line.charAt(quote + 1) == '"') { - sb.append(line.substring(start, quote + 1)); // include " as well - start = quote + 2; - } else { - sb.append(line.substring(start, quote)); - start = quote + 1; - if (start < line.length() && line.charAt(start) == ',') { - start++; // skip , - } - break; - } - } - } - - text = sb.toString(); - } else { - int next = line.indexOf(',', start); - if (next < 0) { - text = line.substring(start); - start = line.length(); - } else { - text = line.substring(start, next); - start = next + 1; - } - } - - Serializable value = parseCellValue(text); - if (ExpressionUtils.isNonBlankData(value)) { - row.cells.add(new Cell(value, null)); - hasData = true; - } else { - row.cells.add(null); - } - } - - return hasData; - } - - static public boolean parseTSVIntoRow(Row row, String line) { - boolean hasData = false; - - String[] cells = line.split("\t"); - for (int c = 0; c < cells.length; c++) { - String text = cells[c]; - - Serializable value = parseCellValue(text); - if (ExpressionUtils.isNonBlankData(value)) { - row.cells.add(new Cell(value, null)); - hasData = true; - } else { - row.cells.add(null); - } - } - return hasData; - } - } diff --git a/src/main/java/com/metaweb/gridworks/importers/TsvCsvImporter.java b/src/main/java/com/metaweb/gridworks/importers/TsvCsvImporter.java index f91b3f9b6..64559733e 100644 --- a/src/main/java/com/metaweb/gridworks/importers/TsvCsvImporter.java +++ b/src/main/java/com/metaweb/gridworks/importers/TsvCsvImporter.java @@ -6,7 +6,11 @@ import java.io.Reader; import java.util.Properties; import org.apache.commons.lang.NotImplementedException; +import org.apache.commons.lang.StringUtils; +import com.metaweb.gridworks.importers.parsers.CSVRowParser; +import com.metaweb.gridworks.importers.parsers.RowParser; +import com.metaweb.gridworks.importers.parsers.SeparatorRowParser; import com.metaweb.gridworks.model.Column; import com.metaweb.gridworks.model.Project; import com.metaweb.gridworks.model.Row; @@ -18,10 +22,11 @@ public class TsvCsvImporter implements Importer { LineNumberReader lnReader = new LineNumberReader(reader); try { - String sep = null; // auto-detect TSV or CSV - String line = null; + String sep = options.getProperty("separator"); // auto-detect if not present + String line = null; boolean first = true; int cellCount = 1; + RowParser parser = (sep == null || (sep.length() == 0)) ? null : new SeparatorRowParser(sep); int rowsWithData = 0; while ((line = lnReader.readLine()) != null) { @@ -29,18 +34,20 @@ public class TsvCsvImporter implements Importer { continue; } - if (sep == null) { + if (parser == null) { int tab = line.indexOf('\t'); if (tab >= 0) { sep = "\t"; + parser = new SeparatorRowParser(sep); } else { sep = ","; + parser = new CSVRowParser(); } } if (first) { - String[] cells = line.split(sep); - + String[] cells = StringUtils.splitPreserveAllTokens(line, sep); + first = false; for (int c = 0; c < cells.length; c++) { String cell = cells[c]; @@ -57,7 +64,7 @@ public class TsvCsvImporter implements Importer { } else { Row row = new Row(cellCount); - if ((sep.charAt(0) == ',') ? ImporterUtilities.parseCSVIntoRow(row, line) : ImporterUtilities.parseTSVIntoRow(row, line)) { + if (parser.parseRow(row, line)) { rowsWithData++; if (skip <= 0 || rowsWithData > skip) { diff --git a/src/main/java/com/metaweb/gridworks/importers/parsers/CSVRowParser.java b/src/main/java/com/metaweb/gridworks/importers/parsers/CSVRowParser.java new file mode 100644 index 000000000..1ae8e4be2 --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/importers/parsers/CSVRowParser.java @@ -0,0 +1,68 @@ +package com.metaweb.gridworks.importers.parsers; + +import java.io.Serializable; + +import com.metaweb.gridworks.expr.ExpressionUtils; +import com.metaweb.gridworks.importers.ImporterUtilities; +import com.metaweb.gridworks.model.Cell; +import com.metaweb.gridworks.model.Row; + +public class CSVRowParser extends RowParser { + + public boolean parseRow(Row row, String line) { + boolean hasData = false; + + int start = 0; + while (start < line.length()) { + String text = null; + + if (line.charAt(start) == '"') { + StringBuffer sb = new StringBuffer(); + + start++; // skip over " + while (start < line.length()) { + int quote = line.indexOf('"', start); + if (quote < 0) { + sb.append(line.substring(start)); + start = line.length(); + break; + } else { + if (quote < line.length() - 1 && line.charAt(quote + 1) == '"') { + sb.append(line.substring(start, quote + 1)); // include " as well + start = quote + 2; + } else { + sb.append(line.substring(start, quote)); + start = quote + 1; + if (start < line.length() && line.charAt(start) == ',') { + start++; // skip , + } + break; + } + } + } + + text = sb.toString(); + } else { + int next = line.indexOf(',', start); + if (next < 0) { + text = line.substring(start); + start = line.length(); + } else { + text = line.substring(start, next); + start = next + 1; + } + } + + Serializable value = ImporterUtilities.parseCellValue(text); + if (ExpressionUtils.isNonBlankData(value)) { + row.cells.add(new Cell(value, null)); + hasData = true; + } else { + row.cells.add(null); + } + } + + return hasData; + } + +} diff --git a/src/main/java/com/metaweb/gridworks/importers/parsers/RowParser.java b/src/main/java/com/metaweb/gridworks/importers/parsers/RowParser.java new file mode 100644 index 000000000..e424312b4 --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/importers/parsers/RowParser.java @@ -0,0 +1,8 @@ +package com.metaweb.gridworks.importers.parsers; + +import com.metaweb.gridworks.model.Row; + +public abstract class RowParser { + + public abstract boolean parseRow(Row row, String line); +} diff --git a/src/main/java/com/metaweb/gridworks/importers/parsers/SeparatorRowParser.java b/src/main/java/com/metaweb/gridworks/importers/parsers/SeparatorRowParser.java new file mode 100644 index 000000000..1a1f0947c --- /dev/null +++ b/src/main/java/com/metaweb/gridworks/importers/parsers/SeparatorRowParser.java @@ -0,0 +1,38 @@ +package com.metaweb.gridworks.importers.parsers; + +import java.io.Serializable; + +import org.apache.commons.lang.StringUtils; + +import com.metaweb.gridworks.expr.ExpressionUtils; +import com.metaweb.gridworks.importers.ImporterUtilities; +import com.metaweb.gridworks.model.Cell; +import com.metaweb.gridworks.model.Row; + +public class SeparatorRowParser extends RowParser { + + String sep; + + public SeparatorRowParser(String sep) { + this.sep = sep; + } + + public boolean parseRow(Row row, String line) { + boolean hasData = false; + + String[] cells = StringUtils.splitPreserveAllTokens(line, sep); + for (int c = 0; c < cells.length; c++) { + String text = cells[c]; + + Serializable value = ImporterUtilities.parseCellValue(text); + if (ExpressionUtils.isNonBlankData(value)) { + row.cells.add(new Cell(value, null)); + hasData = true; + } else { + row.cells.add(null); + } + } + return hasData; + } + +} diff --git a/src/main/webapp/index.html b/src/main/webapp/index.html index 5c9a8930a..c5d737726 100644 --- a/src/main/webapp/index.html +++ b/src/main/webapp/index.html @@ -1 +1 @@ - Freebase Gridworks
Gridworks
Gridworks

Upload Data File

Data File:
Project Name:
Load up to: data rows (optional)
Skip: initial data rows (optional)

Import Existing Project

Project TAR File:
Re-name Project: (optional)
\ No newline at end of file + Freebase Gridworks
Gridworks
Gridworks

Upload Data File

Data File:
Project Name:
Load up to: data rows (optional)
Skip: initial data rows (optional)
Separator: column separator (optional)

Import Existing Project

Project TAR File:
Re-name Project: (optional)
\ No newline at end of file diff --git a/src/main/webapp/scripts/index.js b/src/main/webapp/scripts/index.js index d43e7065b..f47726763 100644 --- a/src/main/webapp/scripts/index.js +++ b/src/main/webapp/scripts/index.js @@ -29,7 +29,8 @@ function onClickUploadFileButton(evt) { $("#file-upload-form").attr("action", "/command/create-project-from-upload?" + [ "skip=" + $("#skip-input")[0].value, - "limit=" + $("#limit-input")[0].value + "limit=" + $("#limit-input")[0].value, + "separator=" + $("#separator-input")[0].value ].join("&")); } }