TsvCsvImporter passes all unit tests. TsvCsvImporter again uses opencsv library.

git-svn-id: http://google-refine.googlecode.com/svn/trunk@797 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Iain Sproat 2010-05-17 11:57:26 +00:00
parent 648e7ee5fa
commit 092d4602b9
2 changed files with 101 additions and 33 deletions

View File

@ -10,10 +10,10 @@ import java.util.Properties;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
import com.metaweb.gridworks.importers.parsers.TsvCsvRowParser; import au.com.bytecode.opencsv.CSVParser;
import com.metaweb.gridworks.importers.parsers.NonSplitRowParser;
import com.metaweb.gridworks.importers.parsers.RowParser; import com.metaweb.gridworks.expr.ExpressionUtils;
import com.metaweb.gridworks.importers.parsers.SeparatorRowParser; import com.metaweb.gridworks.model.Cell;
import com.metaweb.gridworks.model.Project; import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.model.Row; import com.metaweb.gridworks.model.Row;
@ -38,9 +38,31 @@ public class TsvCsvImporter implements Importer {
); );
} }
/**
*
* @param lnReader
* LineNumberReader used to read file or string contents
* @param project
* The project into which the parsed data will be added
* @param sep
* The character used to denote the break between data points
* @param limit
* The maximum number of rows of data to import
* @param skip
* The number of initial data rows to skip
* @param ignoreLines
* The number of initial lines within the data source which should be ignored entirely
* @param headerLines
* The number of lines in the data source which describe each column
* @param guessValueType
* Whether the parser should try and guess the type of the value being parsed
* @param splitIntoColumns
* Whether the parser should try and split the data source into columns
* @throws IOException
*/
public void read(LineNumberReader lnReader, Project project, String sep, int limit, int skip, int ignoreLines, int headerLines, boolean guessValueType, boolean splitIntoColumns ) throws IOException{ public void read(LineNumberReader lnReader, Project project, String sep, int limit, int skip, int ignoreLines, int headerLines, boolean guessValueType, boolean splitIntoColumns ) throws IOException{
RowParser parser = (sep != null && sep.length() > 0 && splitIntoColumns) ? CSVParser parser = (sep != null && sep.length() > 0 && splitIntoColumns) ?
new SeparatorRowParser(sep) : null; new CSVParser(sep.toCharArray()[0]) : null;//HACK changing string to char - won't work for multi-char separators.
List<String> columnNames = new ArrayList<String>(); List<String> columnNames = new ArrayList<String>();
String line = null; String line = null;
int rowsWithData = 0; int rowsWithData = 0;
@ -53,46 +75,55 @@ public class TsvCsvImporter implements Importer {
continue; continue;
} }
//guess separator
if (parser == null) { if (parser == null) {
if (splitIntoColumns) { int tab = line.indexOf('\t');
int tab = line.indexOf('\t'); if (tab >= 0) {
if (tab >= 0) { parser = new CSVParser('\t');
sep = "\t";
parser = new TsvCsvRowParser('\t');
} else {
sep = ",";
parser = new TsvCsvRowParser(',');
}
} else { } else {
parser = new NonSplitRowParser(); parser = new CSVParser(',');
} }
} }
if (headerLines > 0) { if (headerLines > 0) {
//column headers
headerLines--; headerLines--;
List<String> cells = parser.split(line, lnReader); ArrayList<String> cells = getCells(line, parser, lnReader, splitIntoColumns);
for (int c = 0; c < cells.size(); c++) { for (int c = 0; c < cells.size(); c++) {
String cell = cells.get(c).trim(); String cell = cells.get(c).trim();
//add column even if cell is blank
ImporterUtilities.appendColumnName(columnNames, c, cell); ImporterUtilities.appendColumnName(columnNames, c, cell);
} }
} else { } else {
//data
Row row = new Row(columnNames.size()); Row row = new Row(columnNames.size());
if (parser.parseRow(row, line, guessValueType, lnReader)) { ArrayList<String> cells = getCells(line, parser, lnReader, splitIntoColumns);
if( cells != null && cells.size() > 0 )
rowsWithData++; rowsWithData++;
if (skip <= 0 || rowsWithData > skip) { if (skip <=0 || rowsWithData > skip){
project.rows.add(row); //add parsed data to row
project.columnModel.setMaxCellIndex(row.cells.size()); for(String s : cells){
s = s.trim();
ImporterUtilities.ensureColumnsInRowExist(columnNames, row); if (ExpressionUtils.isNonBlankData(s)) {
row.cells.add(new Cell(s, null));
if (limit > 0 && project.rows.size() >= limit) { }else{
break; row.cells.add(null);
} }
} }
project.rows.add(row);
project.columnModel.setMaxCellIndex(row.cells.size());
ImporterUtilities.ensureColumnsInRowExist(columnNames, row);
if (limit > 0 && project.rows.size() >= limit) {
break;
}
} }
} }
} }
@ -100,6 +131,25 @@ public class TsvCsvImporter implements Importer {
ImporterUtilities.setupColumns(project, columnNames); ImporterUtilities.setupColumns(project, columnNames);
} }
/**
 * Collects the cell strings for one record of the data source.
 *
 * If splitIntoColumns is false, the line is returned untouched as the only
 * cell and neither the parser nor the reader is used. Otherwise the line is
 * tokenised with parser.parseLineMulti, and for as long as the parser
 * reports isPending() another line is read from lnReader and tokenised into
 * the same list (presumably a quoted value spanning several physical lines
 * -- confirm against the opencsv documentation).
 *
 * @param line
 *            The line that has already been read from the data source
 * @param parser
 *            The parser used to tokenise the line; only used when splitting
 * @param lnReader
 *            The reader from which continuation lines are pulled while the
 *            parser is pending
 * @param splitIntoColumns
 *            Whether the line should be split into columns at all
 * @return a new list holding the cells in the order they were produced
 * @throws IOException
 */
protected ArrayList<String> getCells(String line, CSVParser parser, LineNumberReader lnReader, boolean splitIntoColumns) throws IOException{
    ArrayList<String> result = new ArrayList<String>();

    //no splitting requested: the whole line is the one and only cell
    if (!splitIntoColumns) {
        result.add(line);
        return result;
    }

    String current = line;
    while (true) {
        String[] parts = parser.parseLineMulti(current);
        for (int i = 0; i < parts.length; i++) {
            result.add(parts[i]);
        }
        if (!parser.isPending()) {
            break;
        }
        //parser still expects more input for this record, feed it the next line
        current = lnReader.readLine();
    }
    return result;
}
public void read(InputStream inputStream, Project project, Properties options) throws Exception { public void read(InputStream inputStream, Project project, Properties options) throws Exception {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }

View File

@ -66,6 +66,23 @@ public class TsvCsvImporterTests {
Assert.assertEquals(project.columnModel.columns.get(1).getName(), "col2"); Assert.assertEquals(project.columnModel.columns.get(1).getName(), "col2");
Assert.assertEquals(project.columnModel.columns.get(2).getName(), "col3"); Assert.assertEquals(project.columnModel.columns.get(2).getName(), "col3");
} }
/**
 * Checks that when column splitting is switched off, a line containing
 * commas is imported as a single cell in a single column, for every
 * separator supplied by the "CSV-or-null" data provider.
 */
@Test(dataProvider = "CSV-or-null")
public void readUnseperatedData(String sep){
    String input = "value1,value2,value3";
    LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
    try {
        //limit=-1, skip=0, ignoreLines=0, headerLines=0, guessValueType=false, splitIntoColumns=false
        SUT.read(lnReader, project, sep, -1, 0, 0, 0, false, false);
    } catch (IOException e) {
        //keep the cause so a failing run shows why the read broke
        Assert.fail("IOException while reading unseparated data", e);
    }
    Assert.assertEquals(project.columnModel.columns.size(), 1);
    Assert.assertEquals(project.columnModel.columns.get(0).getName(), "Column");
    Assert.assertEquals(project.rows.size(), 1);
    Assert.assertEquals(project.rows.get(0).cells.size(), 1);
    //the whole input line, commas included, must be the value of the only cell
    Assert.assertEquals(project.rows.get(0).cells.get(0).value, input);
}
@Test(dataProvider = "CSV-or-null") @Test(dataProvider = "CSV-or-null")
public void readSimpleData_CSV_1Header_1Row(String sep){ public void readSimpleData_CSV_1Header_1Row(String sep){
@ -129,7 +146,7 @@ public class TsvCsvImporterTests {
Assert.assertEquals(project.rows.get(0).cells.get(2).value, "data3"); Assert.assertEquals(project.rows.get(0).cells.get(2).value, "data3");
} }
@Test(groups = { "broken" }, dataProvider = "CSV-or-null") @Test(groups = { }, dataProvider = "CSV-or-null")
public void readDoesTrimsLeadingTrailingWhitespace(String sep){ public void readDoesTrimsLeadingTrailingWhitespace(String sep){
String input = " data1 , data2 , data3 "; String input = " data1 , data2 , data3 ";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input)); LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
@ -229,7 +246,7 @@ public class TsvCsvImporterTests {
Assert.assertEquals(project.rows.get(0).cells.get(5).value, "data6"); Assert.assertEquals(project.rows.get(0).cells.get(5).value, "data6");
} }
@Test(groups = { "broken" }, dataProvider = "CSV-or-null") @Test(groups = { }, dataProvider = "CSV-or-null")
public void readQuotedData(String sep){ public void readQuotedData(String sep){
String input = "col1,col2,col3\n" + String input = "col1,col2,col3\n" +
"\"\"\"To Be\"\" is often followed by \"\"or not To Be\"\"\",data2"; "\"\"\"To Be\"\" is often followed by \"\"or not To Be\"\"\",data2";
@ -319,7 +336,7 @@ public class TsvCsvImporterTests {
Assert.assertEquals(project.rows.get(0).cells.get(2).value, "data3"); Assert.assertEquals(project.rows.get(0).cells.get(2).value, "data3");
} }
@Test(groups = { "broken" }, dataProvider = "CSV-or-null") @Test(groups = { }, dataProvider = "CSV-or-null")
public void readIgnore3_Header2_Skip2_limit2(String sep){ public void readIgnore3_Header2_Skip2_limit2(String sep){
String input = "ignore1\n" + String input = "ignore1\n" +
"ignore2\n" + "ignore2\n" +
@ -346,12 +363,13 @@ public class TsvCsvImporterTests {
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "data-row1-cell1"); Assert.assertEquals(project.rows.get(0).cells.get(0).value, "data-row1-cell1");
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "data-row1-cell2"); Assert.assertEquals(project.rows.get(0).cells.get(1).value, "data-row1-cell2");
Assert.assertEquals(project.rows.get(0).cells.get(2).value, "data-row1-cell3"); Assert.assertEquals(project.rows.get(0).cells.get(2).value, "data-row1-cell3");
Assert.assertEquals(project.rows.get(1).cells.size(), 2); Assert.assertEquals(project.rows.get(1).cells.size(), 3);
Assert.assertEquals(project.rows.get(1).cells.get(0).value, "data-row2-cell1"); Assert.assertEquals(project.rows.get(1).cells.get(0).value, "data-row2-cell1");
Assert.assertEquals(project.rows.get(1).cells.get(1).value, "data-row2-cell2"); Assert.assertEquals(project.rows.get(1).cells.get(1).value, "data-row2-cell2");
Assert.assertNull(project.rows.get(1).cells.get(2));
} }
@Test(groups = { "broken" }, dataProvider = "CSV-or-null") @Test(groups = { }, dataProvider = "CSV-or-null")
public void readWithMultiLinedQuotedData(String sep){ public void readWithMultiLinedQuotedData(String sep){
String input = "col1,col2,col3\n" + String input = "col1,col2,col3\n" +
"\"\"\"To\n Be\"\" is often followed by \"\"or not To\n Be\"\"\",data2"; "\"\"\"To\n Be\"\" is often followed by \"\"or not To\n Be\"\"\",data2";
@ -371,7 +389,7 @@ public class TsvCsvImporterTests {
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "data2"); Assert.assertEquals(project.rows.get(0).cells.get(1).value, "data2");
} }
@Test(groups = { "broken" }, dataProvider = "CSV-or-null") @Test(groups = { }, dataProvider = "CSV-or-null")
public void readWithMultiLinedQuotedDataAndBlankLines(String sep){ public void readWithMultiLinedQuotedDataAndBlankLines(String sep){
String input = "col1,col2,col3\n" + String input = "col1,col2,col3\n" +
"\"A line with many \n\n\n\n\n empty lines\",data2"; "\"A line with many \n\n\n\n\n empty lines\",data2";