Fixed Issue 19 ("CSV import is too basic"): quoted cells in a CSV file can now contain line breaks. Added a CSV file to test this case.
git-svn-id: http://google-refine.googlecode.com/svn/trunk@717 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
c0e006861a
commit
7ad438078f
@ -64,7 +64,7 @@ public class TsvCsvImporter implements Importer {
|
||||
if (headerLines > 0) {
|
||||
headerLines--;
|
||||
|
||||
List<String> cells = parser.split(line);
|
||||
List<String> cells = parser.split(line, lnReader);
|
||||
for (int c = 0; c < cells.size(); c++) {
|
||||
String cell = cells.get(c).trim();
|
||||
|
||||
@ -73,7 +73,7 @@ public class TsvCsvImporter implements Importer {
|
||||
} else {
|
||||
Row row = new Row(columnNames.size());
|
||||
|
||||
if (parser.parseRow(row, line, guessValueType)) {
|
||||
if (parser.parseRow(row, line, guessValueType, lnReader)) {
|
||||
rowsWithData++;
|
||||
|
||||
if (skip <= 0 || rowsWithData > skip) {
|
||||
|
@ -1,5 +1,7 @@
|
||||
package com.metaweb.gridworks.importers.parsers;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.LineNumberReader;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@ -10,7 +12,7 @@ import com.metaweb.gridworks.model.Cell;
|
||||
import com.metaweb.gridworks.model.Row;
|
||||
|
||||
public class CSVRowParser extends RowParser {
|
||||
public List<String> split(String line) {
|
||||
public List<String> split(String line, LineNumberReader lineReader) {
|
||||
List<String> results = new ArrayList<String>();
|
||||
|
||||
int start = 0;
|
||||
@ -25,8 +27,14 @@ public class CSVRowParser extends RowParser {
|
||||
int quote = line.indexOf('"', start);
|
||||
if (quote < 0) {
|
||||
sb.append(line.substring(start));
|
||||
start = line.length();
|
||||
break;
|
||||
|
||||
start = 0;
|
||||
try {
|
||||
line = lineReader.readLine();
|
||||
} catch (IOException e) {
|
||||
line = "";
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
if (quote < line.length() - 1 && line.charAt(quote + 1) == '"') {
|
||||
sb.append(line.substring(start, quote + 1)); // include " as well
|
||||
@ -60,10 +68,10 @@ public class CSVRowParser extends RowParser {
|
||||
return results;
|
||||
}
|
||||
|
||||
public boolean parseRow(Row row, String line, boolean guessValueType) {
|
||||
public boolean parseRow(Row row, String line, boolean guessValueType, LineNumberReader lineReader) {
|
||||
boolean hasData = false;
|
||||
|
||||
List<String> strings = split(line);
|
||||
List<String> strings = split(line, lineReader);
|
||||
for (String s : strings) {
|
||||
Serializable value = guessValueType ? ImporterUtilities.parseCellValue(s) : s;
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
package com.metaweb.gridworks.importers.parsers;
|
||||
|
||||
import java.io.LineNumberReader;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@ -11,7 +12,7 @@ import com.metaweb.gridworks.model.Row;
|
||||
|
||||
public class NonSplitRowParser extends RowParser {
|
||||
|
||||
public List<String> split(String line) {
|
||||
public List<String> split(String line, LineNumberReader lineReader) {
|
||||
List<String> results = new ArrayList<String>(1);
|
||||
|
||||
results.add(line.trim());
|
||||
@ -19,7 +20,7 @@ public class NonSplitRowParser extends RowParser {
|
||||
return results;
|
||||
}
|
||||
|
||||
public boolean parseRow(Row row, String line, boolean guessValueType) {
|
||||
public boolean parseRow(Row row, String line, boolean guessValueType, LineNumberReader lineReader) {
|
||||
line = line.trim();
|
||||
if (line.isEmpty()) {
|
||||
return false;
|
||||
|
@ -1,11 +1,12 @@
|
||||
package com.metaweb.gridworks.importers.parsers;
|
||||
|
||||
import java.io.LineNumberReader;
|
||||
import java.util.List;
|
||||
|
||||
import com.metaweb.gridworks.model.Row;
|
||||
|
||||
public abstract class RowParser {
|
||||
public abstract List<String> split(String line);
|
||||
public abstract List<String> split(String line, LineNumberReader lineReader);
|
||||
|
||||
public abstract boolean parseRow(Row row, String line, boolean guessValueType);
|
||||
public abstract boolean parseRow(Row row, String line, boolean guessValueType, LineNumberReader lineReader);
|
||||
}
|
||||
|
@ -1,5 +1,6 @@
|
||||
package com.metaweb.gridworks.importers.parsers;
|
||||
|
||||
import java.io.LineNumberReader;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@ -19,7 +20,7 @@ public class SeparatorRowParser extends RowParser {
|
||||
this.sep = sep;
|
||||
}
|
||||
|
||||
public List<String> split(String line) {
|
||||
public List<String> split(String line, LineNumberReader lineReader) {
|
||||
String[] cells = StringUtils.splitPreserveAllTokens(line, sep);
|
||||
|
||||
List<String> results = new ArrayList<String>();
|
||||
@ -30,7 +31,7 @@ public class SeparatorRowParser extends RowParser {
|
||||
return results;
|
||||
}
|
||||
|
||||
public boolean parseRow(Row row, String line, boolean guessValueType) {
|
||||
public boolean parseRow(Row row, String line, boolean guessValueType, LineNumberReader lineReader) {
|
||||
boolean hasData = false;
|
||||
|
||||
String[] cells = StringUtils.splitPreserveAllTokens(line, sep);
|
||||
|
Loading…
Reference in New Issue
Block a user