Fixed Issue 19 ("CSV import is too basic"): quoted cells in a CSV file can now contain line breaks. Added a CSV file that tests quoted cells spanning multiple lines.

git-svn-id: http://google-refine.googlecode.com/svn/trunk@717 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
David Huynh 2010-05-12 06:07:44 +00:00
parent c0e006861a
commit 7ad438078f
5 changed files with 24 additions and 13 deletions

View File

@ -64,7 +64,7 @@ public class TsvCsvImporter implements Importer {
if (headerLines > 0) { if (headerLines > 0) {
headerLines--; headerLines--;
List<String> cells = parser.split(line); List<String> cells = parser.split(line, lnReader);
for (int c = 0; c < cells.size(); c++) { for (int c = 0; c < cells.size(); c++) {
String cell = cells.get(c).trim(); String cell = cells.get(c).trim();
@ -73,7 +73,7 @@ public class TsvCsvImporter implements Importer {
} else { } else {
Row row = new Row(columnNames.size()); Row row = new Row(columnNames.size());
if (parser.parseRow(row, line, guessValueType)) { if (parser.parseRow(row, line, guessValueType, lnReader)) {
rowsWithData++; rowsWithData++;
if (skip <= 0 || rowsWithData > skip) { if (skip <= 0 || rowsWithData > skip) {

View File

@ -1,5 +1,7 @@
package com.metaweb.gridworks.importers.parsers; package com.metaweb.gridworks.importers.parsers;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Serializable; import java.io.Serializable;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -10,7 +12,7 @@ import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.Row; import com.metaweb.gridworks.model.Row;
public class CSVRowParser extends RowParser { public class CSVRowParser extends RowParser {
public List<String> split(String line) { public List<String> split(String line, LineNumberReader lineReader) {
List<String> results = new ArrayList<String>(); List<String> results = new ArrayList<String>();
int start = 0; int start = 0;
@ -25,8 +27,14 @@ public class CSVRowParser extends RowParser {
int quote = line.indexOf('"', start); int quote = line.indexOf('"', start);
if (quote < 0) { if (quote < 0) {
sb.append(line.substring(start)); sb.append(line.substring(start));
start = line.length();
break; start = 0;
try {
line = lineReader.readLine();
} catch (IOException e) {
line = "";
break;
}
} else { } else {
if (quote < line.length() - 1 && line.charAt(quote + 1) == '"') { if (quote < line.length() - 1 && line.charAt(quote + 1) == '"') {
sb.append(line.substring(start, quote + 1)); // include " as well sb.append(line.substring(start, quote + 1)); // include " as well
@ -60,10 +68,10 @@ public class CSVRowParser extends RowParser {
return results; return results;
} }
public boolean parseRow(Row row, String line, boolean guessValueType) { public boolean parseRow(Row row, String line, boolean guessValueType, LineNumberReader lineReader) {
boolean hasData = false; boolean hasData = false;
List<String> strings = split(line); List<String> strings = split(line, lineReader);
for (String s : strings) { for (String s : strings) {
Serializable value = guessValueType ? ImporterUtilities.parseCellValue(s) : s; Serializable value = guessValueType ? ImporterUtilities.parseCellValue(s) : s;

View File

@ -1,5 +1,6 @@
package com.metaweb.gridworks.importers.parsers; package com.metaweb.gridworks.importers.parsers;
import java.io.LineNumberReader;
import java.io.Serializable; import java.io.Serializable;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -11,7 +12,7 @@ import com.metaweb.gridworks.model.Row;
public class NonSplitRowParser extends RowParser { public class NonSplitRowParser extends RowParser {
public List<String> split(String line) { public List<String> split(String line, LineNumberReader lineReader) {
List<String> results = new ArrayList<String>(1); List<String> results = new ArrayList<String>(1);
results.add(line.trim()); results.add(line.trim());
@ -19,7 +20,7 @@ public class NonSplitRowParser extends RowParser {
return results; return results;
} }
public boolean parseRow(Row row, String line, boolean guessValueType) { public boolean parseRow(Row row, String line, boolean guessValueType, LineNumberReader lineReader) {
line = line.trim(); line = line.trim();
if (line.isEmpty()) { if (line.isEmpty()) {
return false; return false;

View File

@ -1,11 +1,12 @@
package com.metaweb.gridworks.importers.parsers; package com.metaweb.gridworks.importers.parsers;
import java.io.LineNumberReader;
import java.util.List; import java.util.List;
import com.metaweb.gridworks.model.Row; import com.metaweb.gridworks.model.Row;
public abstract class RowParser { public abstract class RowParser {
public abstract List<String> split(String line); public abstract List<String> split(String line, LineNumberReader lineReader);
public abstract boolean parseRow(Row row, String line, boolean guessValueType); public abstract boolean parseRow(Row row, String line, boolean guessValueType, LineNumberReader lineReader);
} }

View File

@ -1,5 +1,6 @@
package com.metaweb.gridworks.importers.parsers; package com.metaweb.gridworks.importers.parsers;
import java.io.LineNumberReader;
import java.io.Serializable; import java.io.Serializable;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -19,7 +20,7 @@ public class SeparatorRowParser extends RowParser {
this.sep = sep; this.sep = sep;
} }
public List<String> split(String line) { public List<String> split(String line, LineNumberReader lineReader) {
String[] cells = StringUtils.splitPreserveAllTokens(line, sep); String[] cells = StringUtils.splitPreserveAllTokens(line, sep);
List<String> results = new ArrayList<String>(); List<String> results = new ArrayList<String>();
@ -30,7 +31,7 @@ public class SeparatorRowParser extends RowParser {
return results; return results;
} }
public boolean parseRow(Row row, String line, boolean guessValueType) { public boolean parseRow(Row row, String line, boolean guessValueType, LineNumberReader lineReader) {
boolean hasData = false; boolean hasData = false;
String[] cells = StringUtils.splitPreserveAllTokens(line, sep); String[] cells = StringUtils.splitPreserveAllTokens(line, sep);