Fixed Issue 19: CSV import is too basic. Quoted cells in a CSV file can now contain line breaks. Added a CSV file to test that.

git-svn-id: http://google-refine.googlecode.com/svn/trunk@717 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
David Huynh 2010-05-12 06:07:44 +00:00
parent c0e006861a
commit 7ad438078f
5 changed files with 24 additions and 13 deletions

View File

@ -64,7 +64,7 @@ public class TsvCsvImporter implements Importer {
if (headerLines > 0) {
headerLines--;
List<String> cells = parser.split(line);
List<String> cells = parser.split(line, lnReader);
for (int c = 0; c < cells.size(); c++) {
String cell = cells.get(c).trim();
@ -73,7 +73,7 @@ public class TsvCsvImporter implements Importer {
} else {
Row row = new Row(columnNames.size());
if (parser.parseRow(row, line, guessValueType)) {
if (parser.parseRow(row, line, guessValueType, lnReader)) {
rowsWithData++;
if (skip <= 0 || rowsWithData > skip) {

View File

@ -1,5 +1,7 @@
package com.metaweb.gridworks.importers.parsers;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
@ -10,7 +12,7 @@ import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.Row;
public class CSVRowParser extends RowParser {
public List<String> split(String line) {
public List<String> split(String line, LineNumberReader lineReader) {
List<String> results = new ArrayList<String>();
int start = 0;
@ -25,8 +27,14 @@ public class CSVRowParser extends RowParser {
int quote = line.indexOf('"', start);
if (quote < 0) {
sb.append(line.substring(start));
start = line.length();
start = 0;
try {
line = lineReader.readLine();
} catch (IOException e) {
line = "";
break;
}
} else {
if (quote < line.length() - 1 && line.charAt(quote + 1) == '"') {
sb.append(line.substring(start, quote + 1)); // include " as well
@ -60,10 +68,10 @@ public class CSVRowParser extends RowParser {
return results;
}
public boolean parseRow(Row row, String line, boolean guessValueType) {
public boolean parseRow(Row row, String line, boolean guessValueType, LineNumberReader lineReader) {
boolean hasData = false;
List<String> strings = split(line);
List<String> strings = split(line, lineReader);
for (String s : strings) {
Serializable value = guessValueType ? ImporterUtilities.parseCellValue(s) : s;

View File

@ -1,5 +1,6 @@
package com.metaweb.gridworks.importers.parsers;
import java.io.LineNumberReader;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
@ -11,7 +12,7 @@ import com.metaweb.gridworks.model.Row;
public class NonSplitRowParser extends RowParser {
public List<String> split(String line) {
public List<String> split(String line, LineNumberReader lineReader) {
List<String> results = new ArrayList<String>(1);
results.add(line.trim());
@ -19,7 +20,7 @@ public class NonSplitRowParser extends RowParser {
return results;
}
public boolean parseRow(Row row, String line, boolean guessValueType) {
public boolean parseRow(Row row, String line, boolean guessValueType, LineNumberReader lineReader) {
line = line.trim();
if (line.isEmpty()) {
return false;

View File

@ -1,11 +1,12 @@
package com.metaweb.gridworks.importers.parsers;
import java.io.LineNumberReader;
import java.util.List;
import com.metaweb.gridworks.model.Row;
/**
 * Common base type for the row parsers in this package (CSVRowParser,
 * NonSplitRowParser and SeparatorRowParser extend it).
 *
 * NOTE(review): this listing is a diff rendering, so each method appears
 * twice: first the signature before the change, then the signature after it,
 * which adds the LineNumberReader parameter.
 */
public abstract class RowParser {
// Signature before this change: the parser only ever sees one physical line.
public abstract List<String> split(String line);
/**
 * Splits a line of input into its cell strings.
 *
 * @param line       the line of text to split
 * @param lineReader the reader the line came from; CSVRowParser calls
 *                   readLine() on it to pull in a further line while
 *                   splitting. The visible parts of the other subclasses do
 *                   not use it -- TODO confirm against their full bodies.
 * @return the cell strings found, in order
 */
public abstract List<String> split(String line, LineNumberReader lineReader);
// Signature before this change.
public abstract boolean parseRow(Row row, String line, boolean guessValueType);
/**
 * Parses a line of input into the given row.
 *
 * @param row            the row to fill
 * @param line           the line of text to parse
 * @param guessValueType when true, CSVRowParser passes each cell string
 *                       through ImporterUtilities.parseCellValue instead of
 *                       storing the raw string
 * @param lineReader     the reader the line came from; CSVRowParser forwards
 *                       it to split
 * @return presumably true when the row ended up with data (the importer
 *         counts such rows as rowsWithData) -- verify against the subclasses
 */
public abstract boolean parseRow(Row row, String line, boolean guessValueType, LineNumberReader lineReader);
}

View File

@ -1,5 +1,6 @@
package com.metaweb.gridworks.importers.parsers;
import java.io.LineNumberReader;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
@ -19,7 +20,7 @@ public class SeparatorRowParser extends RowParser {
this.sep = sep;
}
public List<String> split(String line) {
public List<String> split(String line, LineNumberReader lineReader) {
String[] cells = StringUtils.splitPreserveAllTokens(line, sep);
List<String> results = new ArrayList<String>();
@ -30,7 +31,7 @@ public class SeparatorRowParser extends RowParser {
return results;
}
public boolean parseRow(Row row, String line, boolean guessValueType) {
public boolean parseRow(Row row, String line, boolean guessValueType, LineNumberReader lineReader) {
boolean hasData = false;
String[] cells = StringUtils.splitPreserveAllTokens(line, sep);