TsvCsvImporter passes all unit tests. TsvCsvImporter again uses opencsv library.
git-svn-id: http://google-refine.googlecode.com/svn/trunk@797 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
648e7ee5fa
commit
092d4602b9
@ -10,10 +10,10 @@ import java.util.Properties;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import com.metaweb.gridworks.importers.parsers.TsvCsvRowParser;
|
||||
import com.metaweb.gridworks.importers.parsers.NonSplitRowParser;
|
||||
import com.metaweb.gridworks.importers.parsers.RowParser;
|
||||
import com.metaweb.gridworks.importers.parsers.SeparatorRowParser;
|
||||
import au.com.bytecode.opencsv.CSVParser;
|
||||
|
||||
import com.metaweb.gridworks.expr.ExpressionUtils;
|
||||
import com.metaweb.gridworks.model.Cell;
|
||||
import com.metaweb.gridworks.model.Project;
|
||||
import com.metaweb.gridworks.model.Row;
|
||||
|
||||
@ -38,9 +38,31 @@ public class TsvCsvImporter implements Importer {
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param lnReader
|
||||
* LineNumberReader used to read file or string contents
|
||||
* @param project
|
||||
* The project into which the parsed data will be added
|
||||
* @param sep
|
||||
* The character used to denote the break between data points
|
||||
* @param limit
|
||||
* The maximum number of rows of data to import
|
||||
* @param skip
|
||||
* The number of initial data rows to skip
|
||||
* @param ignoreLines
|
||||
* The number of initial lines within the data source which should be ignored entirely
|
||||
* @param headerLines
|
||||
* The number of lines in the data source which describe each column
|
||||
* @param guessValueType
|
||||
* Whether the parser should try to guess the type of the value being parsed
|
||||
* @param splitIntoColumns
|
||||
* Whether the parser should try to split the data source into columns
|
||||
* @throws IOException
|
||||
*/
|
||||
public void read(LineNumberReader lnReader, Project project, String sep, int limit, int skip, int ignoreLines, int headerLines, boolean guessValueType, boolean splitIntoColumns ) throws IOException{
|
||||
RowParser parser = (sep != null && sep.length() > 0 && splitIntoColumns) ?
|
||||
new SeparatorRowParser(sep) : null;
|
||||
CSVParser parser = (sep != null && sep.length() > 0 && splitIntoColumns) ?
|
||||
new CSVParser(sep.toCharArray()[0]) : null;//HACK changing string to char - won't work for multi-char separators.
|
||||
List<String> columnNames = new ArrayList<String>();
|
||||
String line = null;
|
||||
int rowsWithData = 0;
|
||||
@ -53,46 +75,55 @@ public class TsvCsvImporter implements Importer {
|
||||
continue;
|
||||
}
|
||||
|
||||
//guess separator
|
||||
if (parser == null) {
|
||||
if (splitIntoColumns) {
|
||||
int tab = line.indexOf('\t');
|
||||
if (tab >= 0) {
|
||||
sep = "\t";
|
||||
parser = new TsvCsvRowParser('\t');
|
||||
} else {
|
||||
sep = ",";
|
||||
parser = new TsvCsvRowParser(',');
|
||||
}
|
||||
int tab = line.indexOf('\t');
|
||||
if (tab >= 0) {
|
||||
parser = new CSVParser('\t');
|
||||
} else {
|
||||
parser = new NonSplitRowParser();
|
||||
parser = new CSVParser(',');
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (headerLines > 0) {
|
||||
//column headers
|
||||
headerLines--;
|
||||
|
||||
List<String> cells = parser.split(line, lnReader);
|
||||
ArrayList<String> cells = getCells(line, parser, lnReader, splitIntoColumns);
|
||||
|
||||
for (int c = 0; c < cells.size(); c++) {
|
||||
String cell = cells.get(c).trim();
|
||||
|
||||
//add column even if cell is blank
|
||||
ImporterUtilities.appendColumnName(columnNames, c, cell);
|
||||
}
|
||||
} else {
|
||||
//data
|
||||
Row row = new Row(columnNames.size());
|
||||
|
||||
if (parser.parseRow(row, line, guessValueType, lnReader)) {
|
||||
ArrayList<String> cells = getCells(line, parser, lnReader, splitIntoColumns);
|
||||
|
||||
if( cells != null && cells.size() > 0 )
|
||||
rowsWithData++;
|
||||
|
||||
if (skip <= 0 || rowsWithData > skip) {
|
||||
project.rows.add(row);
|
||||
project.columnModel.setMaxCellIndex(row.cells.size());
|
||||
|
||||
ImporterUtilities.ensureColumnsInRowExist(columnNames, row);
|
||||
|
||||
if (limit > 0 && project.rows.size() >= limit) {
|
||||
break;
|
||||
if (skip <=0 || rowsWithData > skip){
|
||||
//add parsed data to row
|
||||
for(String s : cells){
|
||||
s = s.trim();
|
||||
if (ExpressionUtils.isNonBlankData(s)) {
|
||||
row.cells.add(new Cell(s, null));
|
||||
}else{
|
||||
row.cells.add(null);
|
||||
}
|
||||
}
|
||||
project.rows.add(row);
|
||||
project.columnModel.setMaxCellIndex(row.cells.size());
|
||||
|
||||
ImporterUtilities.ensureColumnsInRowExist(columnNames, row);
|
||||
|
||||
if (limit > 0 && project.rows.size() >= limit) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -100,6 +131,25 @@ public class TsvCsvImporter implements Importer {
|
||||
ImporterUtilities.setupColumns(project, columnNames);
|
||||
}
|
||||
|
||||
protected ArrayList<String> getCells(String line, CSVParser parser, LineNumberReader lnReader, boolean splitIntoColumns) throws IOException{
|
||||
ArrayList<String> cells = new ArrayList<String>();
|
||||
if(splitIntoColumns){
|
||||
String[] tokens = parser.parseLineMulti(line);
|
||||
for(String s : tokens){
|
||||
cells.add(s);
|
||||
}
|
||||
while(parser.isPending()){
|
||||
tokens = parser.parseLineMulti(lnReader.readLine());
|
||||
for(String s : tokens){
|
||||
cells.add(s);
|
||||
}
|
||||
}
|
||||
}else{
|
||||
cells.add(line);
|
||||
}
|
||||
return cells;
|
||||
}
|
||||
|
||||
public void read(InputStream inputStream, Project project, Properties options) throws Exception {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
@ -66,6 +66,23 @@ public class TsvCsvImporterTests {
|
||||
Assert.assertEquals(project.columnModel.columns.get(1).getName(), "col2");
|
||||
Assert.assertEquals(project.columnModel.columns.get(2).getName(), "col3");
|
||||
}
|
||||
|
||||
@Test(dataProvider = "CSV-or-null")
|
||||
public void readUnseperatedData(String sep){
|
||||
String input = "value1,value2,value3";
|
||||
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
|
||||
|
||||
try {
|
||||
SUT.read(lnReader, project, sep, -1, 0, 0, 0, false, false);
|
||||
} catch (IOException e) {
|
||||
Assert.fail();
|
||||
}
|
||||
Assert.assertEquals(project.columnModel.columns.size(), 1);
|
||||
Assert.assertEquals(project.columnModel.columns.get(0).getName(), "Column");
|
||||
Assert.assertEquals(project.rows.size(), 1);
|
||||
Assert.assertEquals(project.rows.get(0).cells.size(), 1);
|
||||
Assert.assertEquals(project.rows.get(0).cells.get(0).value, input);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "CSV-or-null")
|
||||
public void readSimpleData_CSV_1Header_1Row(String sep){
|
||||
@ -129,7 +146,7 @@ public class TsvCsvImporterTests {
|
||||
Assert.assertEquals(project.rows.get(0).cells.get(2).value, "data3");
|
||||
}
|
||||
|
||||
@Test(groups = { "broken" }, dataProvider = "CSV-or-null")
|
||||
@Test(groups = { }, dataProvider = "CSV-or-null")
|
||||
public void readDoesTrimsLeadingTrailingWhitespace(String sep){
|
||||
String input = " data1 , data2 , data3 ";
|
||||
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
|
||||
@ -229,7 +246,7 @@ public class TsvCsvImporterTests {
|
||||
Assert.assertEquals(project.rows.get(0).cells.get(5).value, "data6");
|
||||
}
|
||||
|
||||
@Test(groups = { "broken" }, dataProvider = "CSV-or-null")
|
||||
@Test(groups = { }, dataProvider = "CSV-or-null")
|
||||
public void readQuotedData(String sep){
|
||||
String input = "col1,col2,col3\n" +
|
||||
"\"\"\"To Be\"\" is often followed by \"\"or not To Be\"\"\",data2";
|
||||
@ -319,7 +336,7 @@ public class TsvCsvImporterTests {
|
||||
Assert.assertEquals(project.rows.get(0).cells.get(2).value, "data3");
|
||||
}
|
||||
|
||||
@Test(groups = { "broken" }, dataProvider = "CSV-or-null")
|
||||
@Test(groups = { }, dataProvider = "CSV-or-null")
|
||||
public void readIgnore3_Header2_Skip2_limit2(String sep){
|
||||
String input = "ignore1\n" +
|
||||
"ignore2\n" +
|
||||
@ -346,12 +363,13 @@ public class TsvCsvImporterTests {
|
||||
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "data-row1-cell1");
|
||||
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "data-row1-cell2");
|
||||
Assert.assertEquals(project.rows.get(0).cells.get(2).value, "data-row1-cell3");
|
||||
Assert.assertEquals(project.rows.get(1).cells.size(), 2);
|
||||
Assert.assertEquals(project.rows.get(1).cells.size(), 3);
|
||||
Assert.assertEquals(project.rows.get(1).cells.get(0).value, "data-row2-cell1");
|
||||
Assert.assertEquals(project.rows.get(1).cells.get(1).value, "data-row2-cell2");
|
||||
Assert.assertNull(project.rows.get(1).cells.get(2));
|
||||
}
|
||||
|
||||
@Test(groups = { "broken" }, dataProvider = "CSV-or-null")
|
||||
@Test(groups = { }, dataProvider = "CSV-or-null")
|
||||
public void readWithMultiLinedQuotedData(String sep){
|
||||
String input = "col1,col2,col3\n" +
|
||||
"\"\"\"To\n Be\"\" is often followed by \"\"or not To\n Be\"\"\",data2";
|
||||
@ -371,7 +389,7 @@ public class TsvCsvImporterTests {
|
||||
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "data2");
|
||||
}
|
||||
|
||||
@Test(groups = { "broken" }, dataProvider = "CSV-or-null")
|
||||
@Test(groups = { }, dataProvider = "CSV-or-null")
|
||||
public void readWithMultiLinedQuotedDataAndBlankLines(String sep){
|
||||
String input = "col1,col2,col3\n" +
|
||||
"\"A line with many \n\n\n\n\n empty lines\",data2";
|
||||
|
Loading…
Reference in New Issue
Block a user