TsvCsvImporter passes all unit tests. TsvCsvImporter again uses opencsv library.
git-svn-id: http://google-refine.googlecode.com/svn/trunk@797 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
648e7ee5fa
commit
092d4602b9
@ -10,10 +10,10 @@ import java.util.Properties;
|
|||||||
|
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
|
||||||
import com.metaweb.gridworks.importers.parsers.TsvCsvRowParser;
|
import au.com.bytecode.opencsv.CSVParser;
|
||||||
import com.metaweb.gridworks.importers.parsers.NonSplitRowParser;
|
|
||||||
import com.metaweb.gridworks.importers.parsers.RowParser;
|
import com.metaweb.gridworks.expr.ExpressionUtils;
|
||||||
import com.metaweb.gridworks.importers.parsers.SeparatorRowParser;
|
import com.metaweb.gridworks.model.Cell;
|
||||||
import com.metaweb.gridworks.model.Project;
|
import com.metaweb.gridworks.model.Project;
|
||||||
import com.metaweb.gridworks.model.Row;
|
import com.metaweb.gridworks.model.Row;
|
||||||
|
|
||||||
@ -38,9 +38,31 @@ public class TsvCsvImporter implements Importer {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param lnReader
|
||||||
|
* LineNumberReader used to read file or string contents
|
||||||
|
* @param project
|
||||||
|
* The project into which the parsed data will be added
|
||||||
|
* @param sep
|
||||||
|
* The character used to denote the break between data points
|
||||||
|
* @param limit
|
||||||
|
* The maximum number of rows of data to import
|
||||||
|
* @param skip
|
||||||
|
* The number of initial data rows to skip
|
||||||
|
* @param ignoreLines
|
||||||
|
* The number of initial lines within the data source which should be ignored entirely
|
||||||
|
* @param headerLines
|
||||||
|
* The number of lines in the data source which describe each column
|
||||||
|
* @param guessValueType
|
||||||
|
* Whether the parser should try and guess the type of the value being parsed
|
||||||
|
* @param splitIntoColumns
|
||||||
|
* Whether the parser should try and split the data source into columns
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
public void read(LineNumberReader lnReader, Project project, String sep, int limit, int skip, int ignoreLines, int headerLines, boolean guessValueType, boolean splitIntoColumns ) throws IOException{
|
public void read(LineNumberReader lnReader, Project project, String sep, int limit, int skip, int ignoreLines, int headerLines, boolean guessValueType, boolean splitIntoColumns ) throws IOException{
|
||||||
RowParser parser = (sep != null && sep.length() > 0 && splitIntoColumns) ?
|
CSVParser parser = (sep != null && sep.length() > 0 && splitIntoColumns) ?
|
||||||
new SeparatorRowParser(sep) : null;
|
new CSVParser(sep.toCharArray()[0]) : null;//HACK changing string to char - won't work for multi-char separators.
|
||||||
List<String> columnNames = new ArrayList<String>();
|
List<String> columnNames = new ArrayList<String>();
|
||||||
String line = null;
|
String line = null;
|
||||||
int rowsWithData = 0;
|
int rowsWithData = 0;
|
||||||
@ -53,46 +75,55 @@ public class TsvCsvImporter implements Importer {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//guess separator
|
||||||
if (parser == null) {
|
if (parser == null) {
|
||||||
if (splitIntoColumns) {
|
int tab = line.indexOf('\t');
|
||||||
int tab = line.indexOf('\t');
|
if (tab >= 0) {
|
||||||
if (tab >= 0) {
|
parser = new CSVParser('\t');
|
||||||
sep = "\t";
|
|
||||||
parser = new TsvCsvRowParser('\t');
|
|
||||||
} else {
|
|
||||||
sep = ",";
|
|
||||||
parser = new TsvCsvRowParser(',');
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
parser = new NonSplitRowParser();
|
parser = new CSVParser(',');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (headerLines > 0) {
|
if (headerLines > 0) {
|
||||||
|
//column headers
|
||||||
headerLines--;
|
headerLines--;
|
||||||
|
|
||||||
List<String> cells = parser.split(line, lnReader);
|
ArrayList<String> cells = getCells(line, parser, lnReader, splitIntoColumns);
|
||||||
|
|
||||||
for (int c = 0; c < cells.size(); c++) {
|
for (int c = 0; c < cells.size(); c++) {
|
||||||
String cell = cells.get(c).trim();
|
String cell = cells.get(c).trim();
|
||||||
|
//add column even if cell is blank
|
||||||
ImporterUtilities.appendColumnName(columnNames, c, cell);
|
ImporterUtilities.appendColumnName(columnNames, c, cell);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
//data
|
||||||
Row row = new Row(columnNames.size());
|
Row row = new Row(columnNames.size());
|
||||||
|
|
||||||
if (parser.parseRow(row, line, guessValueType, lnReader)) {
|
ArrayList<String> cells = getCells(line, parser, lnReader, splitIntoColumns);
|
||||||
|
|
||||||
|
if( cells != null && cells.size() > 0 )
|
||||||
rowsWithData++;
|
rowsWithData++;
|
||||||
|
|
||||||
if (skip <= 0 || rowsWithData > skip) {
|
if (skip <=0 || rowsWithData > skip){
|
||||||
project.rows.add(row);
|
//add parsed data to row
|
||||||
project.columnModel.setMaxCellIndex(row.cells.size());
|
for(String s : cells){
|
||||||
|
s = s.trim();
|
||||||
ImporterUtilities.ensureColumnsInRowExist(columnNames, row);
|
if (ExpressionUtils.isNonBlankData(s)) {
|
||||||
|
row.cells.add(new Cell(s, null));
|
||||||
if (limit > 0 && project.rows.size() >= limit) {
|
}else{
|
||||||
break;
|
row.cells.add(null);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
project.rows.add(row);
|
||||||
|
project.columnModel.setMaxCellIndex(row.cells.size());
|
||||||
|
|
||||||
|
ImporterUtilities.ensureColumnsInRowExist(columnNames, row);
|
||||||
|
|
||||||
|
if (limit > 0 && project.rows.size() >= limit) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -100,6 +131,25 @@ public class TsvCsvImporter implements Importer {
|
|||||||
ImporterUtilities.setupColumns(project, columnNames);
|
ImporterUtilities.setupColumns(project, columnNames);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected ArrayList<String> getCells(String line, CSVParser parser, LineNumberReader lnReader, boolean splitIntoColumns) throws IOException{
|
||||||
|
ArrayList<String> cells = new ArrayList<String>();
|
||||||
|
if(splitIntoColumns){
|
||||||
|
String[] tokens = parser.parseLineMulti(line);
|
||||||
|
for(String s : tokens){
|
||||||
|
cells.add(s);
|
||||||
|
}
|
||||||
|
while(parser.isPending()){
|
||||||
|
tokens = parser.parseLineMulti(lnReader.readLine());
|
||||||
|
for(String s : tokens){
|
||||||
|
cells.add(s);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}else{
|
||||||
|
cells.add(line);
|
||||||
|
}
|
||||||
|
return cells;
|
||||||
|
}
|
||||||
|
|
||||||
public void read(InputStream inputStream, Project project, Properties options) throws Exception {
|
public void read(InputStream inputStream, Project project, Properties options) throws Exception {
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
|
@ -66,6 +66,23 @@ public class TsvCsvImporterTests {
|
|||||||
Assert.assertEquals(project.columnModel.columns.get(1).getName(), "col2");
|
Assert.assertEquals(project.columnModel.columns.get(1).getName(), "col2");
|
||||||
Assert.assertEquals(project.columnModel.columns.get(2).getName(), "col3");
|
Assert.assertEquals(project.columnModel.columns.get(2).getName(), "col3");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test(dataProvider = "CSV-or-null")
|
||||||
|
public void readUnseperatedData(String sep){
|
||||||
|
String input = "value1,value2,value3";
|
||||||
|
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
|
||||||
|
|
||||||
|
try {
|
||||||
|
SUT.read(lnReader, project, sep, -1, 0, 0, 0, false, false);
|
||||||
|
} catch (IOException e) {
|
||||||
|
Assert.fail();
|
||||||
|
}
|
||||||
|
Assert.assertEquals(project.columnModel.columns.size(), 1);
|
||||||
|
Assert.assertEquals(project.columnModel.columns.get(0).getName(), "Column");
|
||||||
|
Assert.assertEquals(project.rows.size(), 1);
|
||||||
|
Assert.assertEquals(project.rows.get(0).cells.size(), 1);
|
||||||
|
Assert.assertEquals(project.rows.get(0).cells.get(0).value, input);
|
||||||
|
}
|
||||||
|
|
||||||
@Test(dataProvider = "CSV-or-null")
|
@Test(dataProvider = "CSV-or-null")
|
||||||
public void readSimpleData_CSV_1Header_1Row(String sep){
|
public void readSimpleData_CSV_1Header_1Row(String sep){
|
||||||
@ -129,7 +146,7 @@ public class TsvCsvImporterTests {
|
|||||||
Assert.assertEquals(project.rows.get(0).cells.get(2).value, "data3");
|
Assert.assertEquals(project.rows.get(0).cells.get(2).value, "data3");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(groups = { "broken" }, dataProvider = "CSV-or-null")
|
@Test(groups = { }, dataProvider = "CSV-or-null")
|
||||||
public void readDoesTrimsLeadingTrailingWhitespace(String sep){
|
public void readDoesTrimsLeadingTrailingWhitespace(String sep){
|
||||||
String input = " data1 , data2 , data3 ";
|
String input = " data1 , data2 , data3 ";
|
||||||
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
|
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
|
||||||
@ -229,7 +246,7 @@ public class TsvCsvImporterTests {
|
|||||||
Assert.assertEquals(project.rows.get(0).cells.get(5).value, "data6");
|
Assert.assertEquals(project.rows.get(0).cells.get(5).value, "data6");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(groups = { "broken" }, dataProvider = "CSV-or-null")
|
@Test(groups = { }, dataProvider = "CSV-or-null")
|
||||||
public void readQuotedData(String sep){
|
public void readQuotedData(String sep){
|
||||||
String input = "col1,col2,col3\n" +
|
String input = "col1,col2,col3\n" +
|
||||||
"\"\"\"To Be\"\" is often followed by \"\"or not To Be\"\"\",data2";
|
"\"\"\"To Be\"\" is often followed by \"\"or not To Be\"\"\",data2";
|
||||||
@ -319,7 +336,7 @@ public class TsvCsvImporterTests {
|
|||||||
Assert.assertEquals(project.rows.get(0).cells.get(2).value, "data3");
|
Assert.assertEquals(project.rows.get(0).cells.get(2).value, "data3");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(groups = { "broken" }, dataProvider = "CSV-or-null")
|
@Test(groups = { }, dataProvider = "CSV-or-null")
|
||||||
public void readIgnore3_Header2_Skip2_limit2(String sep){
|
public void readIgnore3_Header2_Skip2_limit2(String sep){
|
||||||
String input = "ignore1\n" +
|
String input = "ignore1\n" +
|
||||||
"ignore2\n" +
|
"ignore2\n" +
|
||||||
@ -346,12 +363,13 @@ public class TsvCsvImporterTests {
|
|||||||
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "data-row1-cell1");
|
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "data-row1-cell1");
|
||||||
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "data-row1-cell2");
|
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "data-row1-cell2");
|
||||||
Assert.assertEquals(project.rows.get(0).cells.get(2).value, "data-row1-cell3");
|
Assert.assertEquals(project.rows.get(0).cells.get(2).value, "data-row1-cell3");
|
||||||
Assert.assertEquals(project.rows.get(1).cells.size(), 2);
|
Assert.assertEquals(project.rows.get(1).cells.size(), 3);
|
||||||
Assert.assertEquals(project.rows.get(1).cells.get(0).value, "data-row2-cell1");
|
Assert.assertEquals(project.rows.get(1).cells.get(0).value, "data-row2-cell1");
|
||||||
Assert.assertEquals(project.rows.get(1).cells.get(1).value, "data-row2-cell2");
|
Assert.assertEquals(project.rows.get(1).cells.get(1).value, "data-row2-cell2");
|
||||||
|
Assert.assertNull(project.rows.get(1).cells.get(2));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(groups = { "broken" }, dataProvider = "CSV-or-null")
|
@Test(groups = { }, dataProvider = "CSV-or-null")
|
||||||
public void readWithMultiLinedQuotedData(String sep){
|
public void readWithMultiLinedQuotedData(String sep){
|
||||||
String input = "col1,col2,col3\n" +
|
String input = "col1,col2,col3\n" +
|
||||||
"\"\"\"To\n Be\"\" is often followed by \"\"or not To\n Be\"\"\",data2";
|
"\"\"\"To\n Be\"\" is often followed by \"\"or not To\n Be\"\"\",data2";
|
||||||
@ -371,7 +389,7 @@ public class TsvCsvImporterTests {
|
|||||||
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "data2");
|
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "data2");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(groups = { "broken" }, dataProvider = "CSV-or-null")
|
@Test(groups = { }, dataProvider = "CSV-or-null")
|
||||||
public void readWithMultiLinedQuotedDataAndBlankLines(String sep){
|
public void readWithMultiLinedQuotedDataAndBlankLines(String sep){
|
||||||
String input = "col1,col2,col3\n" +
|
String input = "col1,col2,col3\n" +
|
||||||
"\"A line with many \n\n\n\n\n empty lines\",data2";
|
"\"A line with many \n\n\n\n\n empty lines\",data2";
|
||||||
|
Loading…
Reference in New Issue
Block a user