New feature for importing text files (CSV and TSV): selecting the new checkbox in index.html makes the importer ignore the effects of quotation marks around data values.

Unit test added for this.

This has required a further branch of opencsv; a patch has been sent to the opencsv project and can be tracked at https://sourceforge.net/tracker/?func=detail&aid=3018599&group_id=148905&atid=773543

git-svn-id: http://google-refine.googlecode.com/svn/trunk@1010 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Iain Sproat 2010-06-20 14:47:45 +00:00
parent 619f914b80
commit 7ced0cb31e
5 changed files with 159 additions and 78 deletions

View File

@ -30,14 +30,14 @@ public class TsvCsvImporter implements Importer {
int limit = ImporterUtilities.getIntegerOption("limit",options,-1);
int skip = ImporterUtilities.getIntegerOption("skip",options,0);
boolean guessValueType = ImporterUtilities.getBooleanOption("guess-value-type", options, true);
boolean ignoreQuotes = ImporterUtilities.getBooleanOption("ignore-quotes", options, false);
LineNumberReader lnReader = new LineNumberReader(reader);
read(lnReader, project, sep,
limit, skip, ignoreLines, headerLines,
guessValueType, splitIntoColumns
);
limit, skip, ignoreLines, headerLines,
guessValueType, splitIntoColumns, ignoreQuotes
);
}
/**
@ -60,11 +60,18 @@ public class TsvCsvImporter implements Importer {
* Whether the parser should try and guess the type of the value being parsed
* @param splitIntoColumns
* Whether the parser should try and split the data source into columns
* @param ignoreQuotes
* Quotation marks are ignored, and all separators and newlines treated as such regardless of whether they are within quoted values
* @throws IOException
*/
public void read(LineNumberReader lnReader, Project project, String sep, int limit, int skip, int ignoreLines, int headerLines, boolean guessValueType, boolean splitIntoColumns ) throws IOException{
public void read(LineNumberReader lnReader, Project project, String sep, int limit, int skip, int ignoreLines, int headerLines, boolean guessValueType, boolean splitIntoColumns, boolean ignoreQuotes ) throws IOException{
CSVParser parser = (sep != null && sep.length() > 0 && splitIntoColumns) ?
new CSVParser(sep.toCharArray()[0]) : null;//HACK changing string to char - won't work for multi-char separators.
new CSVParser(sep.toCharArray()[0],//HACK changing string to char - won't work for multi-char separators.
CSVParser.DEFAULT_QUOTE_CHARACTER,
CSVParser.DEFAULT_ESCAPE_CHARACTER,
CSVParser.DEFAULT_STRICT_QUOTES,
CSVParser.DEFAULT_IGNORE_LEADING_WHITESPACE,
ignoreQuotes) : null;
List<String> columnNames = new ArrayList<String>();
String line = null;
int rowsWithData = 0;
@ -81,9 +88,19 @@ public class TsvCsvImporter implements Importer {
if (parser == null) {
int tab = line.indexOf('\t');
if (tab >= 0) {
parser = new CSVParser('\t');
parser = new CSVParser('\t',
CSVParser.DEFAULT_QUOTE_CHARACTER,
CSVParser.DEFAULT_ESCAPE_CHARACTER,
CSVParser.DEFAULT_STRICT_QUOTES,
CSVParser.DEFAULT_IGNORE_LEADING_WHITESPACE,
ignoreQuotes);
} else {
parser = new CSVParser(',');
parser = new CSVParser(',',
CSVParser.DEFAULT_QUOTE_CHARACTER,
CSVParser.DEFAULT_ESCAPE_CHARACTER,
CSVParser.DEFAULT_STRICT_QUOTES,
CSVParser.DEFAULT_IGNORE_LEADING_WHITESPACE,
ignoreQuotes);
}
}

View File

@ -60,7 +60,7 @@ public class TsvCsvImporterTests extends GridworksTest {
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -76,7 +76,7 @@ public class TsvCsvImporterTests extends GridworksTest {
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 0, false, false);
SUT.read(lnReader, project, sep, -1, 0, 0, 0, false, false, false);
} catch (IOException e) {
Assert.fail();
}
@ -93,7 +93,7 @@ public class TsvCsvImporterTests extends GridworksTest {
"data1,data2,data3";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -114,7 +114,7 @@ public class TsvCsvImporterTests extends GridworksTest {
"data1\tdata2\tdata3";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, "\t", -1, 0, 0, 1, false, true);
SUT.read(lnReader, project, "\t", -1, 0, 0, 1, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -135,7 +135,7 @@ public class TsvCsvImporterTests extends GridworksTest {
"data1,234,data3";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 1, true, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 1, true, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -156,7 +156,7 @@ public class TsvCsvImporterTests extends GridworksTest {
String input = "data1,data2,data3";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 0, false, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 0, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -176,7 +176,7 @@ public class TsvCsvImporterTests extends GridworksTest {
String input = " data1 , data2 , data3 ";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 0, false, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 0, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -193,7 +193,7 @@ public class TsvCsvImporterTests extends GridworksTest {
String input = " data1, data2, data3";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 0, true, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 0, true, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -210,7 +210,7 @@ public class TsvCsvImporterTests extends GridworksTest {
String input = " data1, , data3";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 0, true, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 0, true, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -229,7 +229,7 @@ public class TsvCsvImporterTests extends GridworksTest {
"data1,data2,data3";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 2, false, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 2, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -250,7 +250,7 @@ public class TsvCsvImporterTests extends GridworksTest {
"data1,data2,data3,data4,data5,data6";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -277,7 +277,7 @@ public class TsvCsvImporterTests extends GridworksTest {
"\"\"\"To Be\"\" is often followed by \"\"or not To Be\"\"\",data2";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -298,7 +298,7 @@ public class TsvCsvImporterTests extends GridworksTest {
"data1,data2,data3";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 1, 1, false, true);
SUT.read(lnReader, project, sep, -1, 0, 1, 1, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -320,7 +320,7 @@ public class TsvCsvImporterTests extends GridworksTest {
"data1,data2,data3";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 1, 0, 1, false, true);
SUT.read(lnReader, project, sep, -1, 1, 0, 1, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -346,7 +346,7 @@ public class TsvCsvImporterTests extends GridworksTest {
"data1,data2,data3";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 1, 3, 2, false, true);
SUT.read(lnReader, project, sep, -1, 1, 3, 2, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -375,7 +375,7 @@ public class TsvCsvImporterTests extends GridworksTest {
"data-row3-cell1,data-row3-cell2,data-row1-cell3";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, 2, 2, 3, 2, false, true);
SUT.read(lnReader, project, sep, 2, 2, 3, 2, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -394,13 +394,30 @@ public class TsvCsvImporterTests extends GridworksTest {
Assert.assertNull(project.rows.get(1).cells.get(2));
}
/**
 * Checks that a stray quotation mark in the input does not prevent the
 * following separators from splitting the line when {@code ignoreQuotes} is on.
 *
 * The importer is called with no limit (-1), no skipped rows, no ignored
 * lines, no header lines, value-type guessing off, column splitting on and
 * {@code ignoreQuotes} set to {@code true}.
 *
 * @param sep separator supplied by the "CSV-or-null" data provider
 *            (presumably "," or null for auto-detection - confirm against the provider)
 */
@Test(dataProvider = "CSV-or-null")
public void ignoreQuotes(String sep){
    String input = "data1,data2\",data3,data4";
    LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
    try {
        SUT.read(lnReader, project, sep, -1, 0, 0, 0, false, true, true);
    } catch (IOException e) {
        // Keep the cause so a failing import is diagnosable from the test report.
        Assert.fail("Unexpected IOException while importing with ignoreQuotes enabled", e);
    }
    // NOTE(review): the two size assertions below were committed commented out;
    // confirm whether they are expected to hold before re-enabling them.
    //Assert.assertEquals(project.columnModel.columns.size(), 4);
    Assert.assertEquals(project.rows.size(), 1);
    //Assert.assertEquals(project.rows.get(0).cells.size(), 4);
    Assert.assertEquals(project.rows.get(0).cells.get(0).value, "data1");
    Assert.assertEquals(project.rows.get(0).cells.get(1).value, "data2");
    Assert.assertEquals(project.rows.get(0).cells.get(2).value, "data3");
}
@Test(groups = { }, dataProvider = "CSV-or-null")
public void readWithMultiLinedQuotedData(String sep){
String input = "col1,col2,col3\n" +
"\"\"\"To\n Be\"\" is often followed by \"\"or not To\n Be\"\"\",data2";
"\"\"\"To\n Be\"\" is often followed by \"\"or not To\n Be\"\"\",data2";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -420,7 +437,7 @@ public class TsvCsvImporterTests extends GridworksTest {
"\"A line with many \n\n\n\n\n empty lines\",data2";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -444,6 +461,7 @@ public class TsvCsvImporterTests extends GridworksTest {
whenGetIntegerOption("header-lines",properties,0);
whenGetIntegerOption("limit",properties,-1);
whenGetIntegerOption("skip",properties,0);
whenGetIntegerOption("ignore-quotes",properties,0);
try {
SUT.read(reader, project, properties);
@ -459,10 +477,45 @@ public class TsvCsvImporterTests extends GridworksTest {
Assert.assertEquals((String)project.rows.get(0).cells.get(2).value, "Water");
verify(properties, times(1)).getProperty("separator");
verifyGetIntegerOption("ignore",properties);
verifyGetIntegerOption("header-lines",properties);
verifyGetIntegerOption("limit",properties);
verifyGetIntegerOption("skip",properties);
verifyGetOption("ignore",properties);
verifyGetOption("header-lines",properties);
verifyGetOption("limit",properties);
verifyGetOption("skip",properties);
verifyGetOption("ignore-quotes",properties);
}
/**
 * Exercises the {@code Properties}-driven {@code read} entry point with the
 * "ignore-quotes" option stubbed to {@code true}, and checks that a line
 * containing a stray quotation mark is still split into four cells.
 *
 * Also verifies that each option consulted by the importer was looked up
 * exactly once on the stubbed {@code properties}.
 */
@Test
public void readCsvWithPropertiesIgnoreQuotes(){
    String input = "data1,data2\",data3,data4";
    StringReader reader = new StringReader(input);

    // Stub every option the importer reads from the properties object.
    when(properties.getProperty("separator")).thenReturn(",");
    whenGetIntegerOption("ignore",properties,0);
    whenGetIntegerOption("header-lines",properties,0);
    whenGetIntegerOption("limit",properties,-1);
    whenGetIntegerOption("skip",properties,0);
    whenGetBooleanOption("ignore-quotes",properties,true);

    try {
        SUT.read(reader, project, properties);
    } catch (Exception e) {
        // Keep the cause so a failing import is diagnosable from the test report.
        Assert.fail("Unexpected exception while importing with ignore-quotes=true", e);
    }

    Assert.assertEquals(project.rows.size(), 1);
    Assert.assertEquals(project.rows.get(0).cells.size(), 4);
    Assert.assertEquals((String)project.rows.get(0).cells.get(0).value, "data1");
    Assert.assertEquals((String)project.rows.get(0).cells.get(1).value, "data2");
    Assert.assertEquals((String)project.rows.get(0).cells.get(2).value, "data3");
    Assert.assertEquals((String)project.rows.get(0).cells.get(3).value, "data4");

    verify(properties, times(1)).getProperty("separator");
    verifyGetOption("ignore",properties);
    verifyGetOption("header-lines",properties);
    verifyGetOption("limit",properties);
    verifyGetOption("skip",properties);
    verifyGetOption("ignore-quotes",properties);
}
//--helpers--
@ -477,12 +530,17 @@ public class TsvCsvImporterTests extends GridworksTest {
}};
}
/**
 * Stubs {@code properties} so that the option {@code name} is reported as
 * present and its value is the string form of {@code def}.
 *
 * @param name       option key to stub
 * @param properties stubbed properties object (presumably a Mockito mock - confirm in the fixture)
 * @param def        boolean value whose string form is returned for the key
 */
public void whenGetBooleanOption(String name, Properties properties, Boolean def){
    when(properties.containsKey(name)).thenReturn(true);
    when(properties.getProperty(name)).thenReturn(def.toString());
}
/**
 * Stubs {@code properties} so that the option {@code name} is reported as
 * present and its value is the decimal string form of {@code def}.
 *
 * @param name       option key to stub
 * @param properties stubbed properties object (presumably a Mockito mock - confirm in the fixture)
 * @param def        integer value whose string form is returned for the key
 */
public void whenGetIntegerOption(String name, Properties properties, int def){
    when(properties.containsKey(name)).thenReturn(true);
    when(properties.getProperty(name)).thenReturn(String.valueOf(def));
}
public void verifyGetIntegerOption(String name, Properties properties){
public void verifyGetOption(String name, Properties properties){
verify(properties, times(1)).containsKey(name);
verify(properties, times(1)).getProperty(name);
}

View File

@ -106,6 +106,11 @@
<input id="header-lines-input" name="header-lines" size="5" value="1" /></div>
<div class="field-hint">use 0 if there is no header line</div>
</div>
<div class="field-group">
<div><input id="ignore-quotes-input" name="ignore-quotes" type="checkbox" />Ignore Quotation Marks </div>
<div class="field-hint">Ignore quotation marks, using all newlines and separators</div>
</div>
</td>
</tr></table>
</div>

View File

@ -15,7 +15,8 @@ function onClickUploadFileButton(evt) {
"header-lines=" + $("#header-lines-input")[0].value,
"skip=" + $("#skip-input")[0].value,
"limit=" + $("#limit-input")[0].value,
"guess-value-type=" + $("#guess-value-type-input")[0].checked
"guess-value-type=" + $("#guess-value-type-input")[0].checked,
"ignore-quotes=" + $("#ignore-quotes-input")[0].checked
].join("&"));
return true;