TsvCsvImporter passes all unit tests. TsvCsvImporter again uses opencsv library.

git-svn-id: http://google-refine.googlecode.com/svn/trunk@797 7d457c2a-affb-35e4-300a-418c747d4874
2010-05-17 11:57:26 +00:00 · 2010-05-17 11:57:26 +00:00 · 092d4602b9
commit 092d4602b9
parent 648e7ee5fa
2 changed files with 101 additions and 33 deletions
--- a/src/main/java/com/metaweb/gridworks/importers/TsvCsvImporter.java
+++ b/src/main/java/com/metaweb/gridworks/importers/TsvCsvImporter.java
@ -10,10 +10,10 @@ import java.util.Properties;

 import org.apache.commons.lang.StringUtils;

-import com.metaweb.gridworks.importers.parsers.TsvCsvRowParser;
-import com.metaweb.gridworks.importers.parsers.NonSplitRowParser;
-import com.metaweb.gridworks.importers.parsers.RowParser;
-import com.metaweb.gridworks.importers.parsers.SeparatorRowParser;
+import au.com.bytecode.opencsv.CSVParser;
+
+import com.metaweb.gridworks.expr.ExpressionUtils;
+import com.metaweb.gridworks.model.Cell;
 import com.metaweb.gridworks.model.Project;
 import com.metaweb.gridworks.model.Row;

@ -38,9 +38,31 @@ public class TsvCsvImporter implements Importer {
    	);
    }

+    /**
+     *
+     * @param lnReader
+     *           LineNumberReader used to read file or string contents
+     * @param project
+     *           The project into which the parsed data will be added
+     * @param sep
+     *           The character used to denote the break between data points
+     * @param limit
+     *           The maximum number of rows of data to import
+     * @param skip
+     *           The number of initial data rows to skip
+     * @param ignoreLines
+     *           The number of initial lines within the data source which should be ignored entirely
+     * @param headerLines
+     *           The number of lines in the data source which describe each column
+     * @param guessValueType
+     *           Whether the parser should try and guess the type of the value being parsed
+     * @param splitIntoColumns
+     *           Whether the parser should try and split the data source into columns
+     * @throws IOException
+     */
    public void read(LineNumberReader lnReader, Project project, String sep, int limit, int skip, int ignoreLines, int headerLines, boolean guessValueType, boolean splitIntoColumns ) throws IOException{
-        RowParser parser = (sep != null && sep.length() > 0 && splitIntoColumns) ?
-                new SeparatorRowParser(sep) : null;
+        CSVParser parser = (sep != null && sep.length() > 0 && splitIntoColumns) ?
+                        new CSVParser(sep.toCharArray()[0]) : null;//HACK changing string to char - won't work for multi-char separators.
        List<String> columnNames = new ArrayList<String>();
        String line = null;
        int rowsWithData = 0;
@ -53,46 +75,55 @@ public class TsvCsvImporter implements Importer {
                continue;
            }

+            //guess separator
            if (parser == null) {
-                if (splitIntoColumns) {
-                    int tab = line.indexOf('\t');
-                    if (tab >= 0) {
-                        sep = "\t";
-                        parser = new TsvCsvRowParser('\t');
-                    } else {
-                        sep = ",";
-                        parser = new TsvCsvRowParser(',');
-                    }
+                int tab = line.indexOf('\t');
+                if (tab >= 0) {
+                    parser = new CSVParser('\t');
                } else {
-                    parser = new NonSplitRowParser();
+                    parser = new CSVParser(',');
                }
            }

+
            if (headerLines > 0) {
+                //column headers
                headerLines--;

-                List<String> cells = parser.split(line, lnReader);
+                ArrayList<String> cells = getCells(line, parser, lnReader, splitIntoColumns);
+
                for (int c = 0; c < cells.size(); c++) {
                    String cell = cells.get(c).trim();
-
+                    //add column even if cell is blank
                    ImporterUtilities.appendColumnName(columnNames, c, cell);
                }
            } else {
+                //data
                Row row = new Row(columnNames.size());

-                if (parser.parseRow(row, line, guessValueType, lnReader)) {
+                ArrayList<String> cells = getCells(line, parser, lnReader, splitIntoColumns);
+
+                if( cells != null && cells.size() > 0 )
                    rowsWithData++;

-                    if (skip <= 0 || rowsWithData > skip) {
-                        project.rows.add(row);
-                        project.columnModel.setMaxCellIndex(row.cells.size());
-
-                        ImporterUtilities.ensureColumnsInRowExist(columnNames, row);
-
-                        if (limit > 0 && project.rows.size() >= limit) {
-                            break;
+                if (skip <=0  || rowsWithData > skip){
+                    //add parsed data to row
+                    for(String s : cells){
+                        s = s.trim();
+                        if (ExpressionUtils.isNonBlankData(s)) {
+                            row.cells.add(new Cell(s, null));
+                        }else{
+                            row.cells.add(null);
                        }
                    }
+                    project.rows.add(row);
+                    project.columnModel.setMaxCellIndex(row.cells.size());
+
+                    ImporterUtilities.ensureColumnsInRowExist(columnNames, row);
+
+                    if (limit > 0 && project.rows.size() >= limit) {
+                        break;
+                    }
                }
            }
        }
@ -100,6 +131,25 @@ public class TsvCsvImporter implements Importer {
        ImporterUtilities.setupColumns(project, columnNames);
    }

+    protected ArrayList<String> getCells(String line, CSVParser parser, LineNumberReader lnReader, boolean splitIntoColumns) throws IOException{
+        ArrayList<String> cells = new ArrayList<String>();
+        if(splitIntoColumns){
+            String[] tokens = parser.parseLineMulti(line);
+            for(String s : tokens){
+                cells.add(s);
+            }
+            while(parser.isPending()){
+                tokens = parser.parseLineMulti(lnReader.readLine());
+                for(String s : tokens){
+                    cells.add(s);
+                }
+            }
+        }else{
+            cells.add(line);
+        }
+        return cells;
+    }
+
    public void read(InputStream inputStream, Project project, Properties options) throws Exception {
        throw new UnsupportedOperationException();
    }
--- a/tests/java/src/com/metaweb/gridworks/tests/importers/TsvCsvImporterTests.java
+++ b/tests/java/src/com/metaweb/gridworks/tests/importers/TsvCsvImporterTests.java
@ -66,6 +66,23 @@ public class TsvCsvImporterTests {
        Assert.assertEquals(project.columnModel.columns.get(1).getName(), "col2");
        Assert.assertEquals(project.columnModel.columns.get(2).getName(), "col3");
    }
+    
+    @Test(dataProvider = "CSV-or-null")
+    public void readUnseperatedData(String sep){
+        String input = "value1,value2,value3";
+        LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
+
+        try {
+            SUT.read(lnReader, project, sep, -1, 0, 0, 0, false, false);
+        } catch (IOException e) {
+            Assert.fail();
+        }
+        Assert.assertEquals(project.columnModel.columns.size(), 1);
+        Assert.assertEquals(project.columnModel.columns.get(0).getName(), "Column");
+        Assert.assertEquals(project.rows.size(), 1);
+        Assert.assertEquals(project.rows.get(0).cells.size(), 1);
+        Assert.assertEquals(project.rows.get(0).cells.get(0).value, input);
+    }

    @Test(dataProvider = "CSV-or-null")
    public void readSimpleData_CSV_1Header_1Row(String sep){
@ -129,7 +146,7 @@ public class TsvCsvImporterTests {
        Assert.assertEquals(project.rows.get(0).cells.get(2).value, "data3");
    }

-    @Test(groups = { "broken" }, dataProvider = "CSV-or-null")
+    @Test(groups = {  }, dataProvider = "CSV-or-null")
    public void readDoesTrimsLeadingTrailingWhitespace(String sep){
        String input = " data1 , data2 , data3 ";
        LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
@ -229,7 +246,7 @@ public class TsvCsvImporterTests {
        Assert.assertEquals(project.rows.get(0).cells.get(5).value, "data6");
    }

-    @Test(groups = { "broken" }, dataProvider = "CSV-or-null")
+    @Test(groups = { }, dataProvider = "CSV-or-null")
    public void readQuotedData(String sep){
        String input = "col1,col2,col3\n" +
                       "\"\"\"To Be\"\" is often followed by \"\"or not To Be\"\"\",data2";
@ -319,7 +336,7 @@ public class TsvCsvImporterTests {
        Assert.assertEquals(project.rows.get(0).cells.get(2).value, "data3");
    }

-    @Test(groups = { "broken" }, dataProvider = "CSV-or-null")
+    @Test(groups = {  }, dataProvider = "CSV-or-null")
    public void readIgnore3_Header2_Skip2_limit2(String sep){
        String input = "ignore1\n" +
                       "ignore2\n" +
@ -346,12 +363,13 @@ public class TsvCsvImporterTests {
        Assert.assertEquals(project.rows.get(0).cells.get(0).value, "data-row1-cell1");
        Assert.assertEquals(project.rows.get(0).cells.get(1).value, "data-row1-cell2");
        Assert.assertEquals(project.rows.get(0).cells.get(2).value, "data-row1-cell3");
-        Assert.assertEquals(project.rows.get(1).cells.size(), 2);
+        Assert.assertEquals(project.rows.get(1).cells.size(), 3);
        Assert.assertEquals(project.rows.get(1).cells.get(0).value, "data-row2-cell1");
        Assert.assertEquals(project.rows.get(1).cells.get(1).value, "data-row2-cell2");
+        Assert.assertNull(project.rows.get(1).cells.get(2));
    }

-    @Test(groups = { "broken" }, dataProvider = "CSV-or-null")
+    @Test(groups = { }, dataProvider = "CSV-or-null")
    public void readWithMultiLinedQuotedData(String sep){
        String input = "col1,col2,col3\n" +
        	"\"\"\"To\n Be\"\" is often followed by \"\"or not To\n Be\"\"\",data2";
@ -371,7 +389,7 @@ public class TsvCsvImporterTests {
        Assert.assertEquals(project.rows.get(0).cells.get(1).value, "data2");
    }

-    @Test(groups = { "broken" }, dataProvider = "CSV-or-null")
+    @Test(groups = {  }, dataProvider = "CSV-or-null")
    public void readWithMultiLinedQuotedDataAndBlankLines(String sep){
        String input = "col1,col2,col3\n" +
            "\"A line with many \n\n\n\n\n empty lines\",data2";