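New feature for importing text files (CSV and TSV): ticking the new "Ignore Quotation Marks" checkbox in index.html makes the importer ignore quotation marks around data values, so that all separators and newlines are treated as such even when they fall inside quoted values.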

Unit tests added for this (one calling read() directly, one going through the Properties-based read()).

This required a further modification to opencsv; the patch has been sent to the opencsv project and can be tracked at https://sourceforge.net/tracker/?func=detail&aid=3018599&group_id=148905&atid=773543

git-svn-id: http://google-refine.googlecode.com/svn/trunk@1010 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Iain Sproat 2010-06-20 14:47:45 +00:00
parent 619f914b80
commit 7ced0cb31e
5 changed files with 159 additions and 78 deletions

View File

@ -30,14 +30,14 @@ public class TsvCsvImporter implements Importer {
int limit = ImporterUtilities.getIntegerOption("limit",options,-1);
int skip = ImporterUtilities.getIntegerOption("skip",options,0);
boolean guessValueType = ImporterUtilities.getBooleanOption("guess-value-type", options, true);
boolean ignoreQuotes = ImporterUtilities.getBooleanOption("ignore-quotes", options, false);
LineNumberReader lnReader = new LineNumberReader(reader);
read(lnReader, project, sep,
limit, skip, ignoreLines, headerLines,
guessValueType, splitIntoColumns
);
limit, skip, ignoreLines, headerLines,
guessValueType, splitIntoColumns, ignoreQuotes
);
}
/**
@ -60,11 +60,18 @@ public class TsvCsvImporter implements Importer {
* Whether the parser should try and guess the type of the value being parsed
* @param splitIntoColumns
* Whether the parser should try and split the data source into columns
* @param ignoreQuotes
* Quotation marks are ignored, and all separators and newlines treated as such regardless of whether they are within quoted values
* @throws IOException
*/
public void read(LineNumberReader lnReader, Project project, String sep, int limit, int skip, int ignoreLines, int headerLines, boolean guessValueType, boolean splitIntoColumns ) throws IOException{
public void read(LineNumberReader lnReader, Project project, String sep, int limit, int skip, int ignoreLines, int headerLines, boolean guessValueType, boolean splitIntoColumns, boolean ignoreQuotes ) throws IOException{
CSVParser parser = (sep != null && sep.length() > 0 && splitIntoColumns) ?
new CSVParser(sep.toCharArray()[0]) : null;//HACK changing string to char - won't work for multi-char separators.
new CSVParser(sep.toCharArray()[0],//HACK changing string to char - won't work for multi-char separators.
CSVParser.DEFAULT_QUOTE_CHARACTER,
CSVParser.DEFAULT_ESCAPE_CHARACTER,
CSVParser.DEFAULT_STRICT_QUOTES,
CSVParser.DEFAULT_IGNORE_LEADING_WHITESPACE,
ignoreQuotes) : null;
List<String> columnNames = new ArrayList<String>();
String line = null;
int rowsWithData = 0;
@ -81,9 +88,19 @@ public class TsvCsvImporter implements Importer {
if (parser == null) {
int tab = line.indexOf('\t');
if (tab >= 0) {
parser = new CSVParser('\t');
parser = new CSVParser('\t',
CSVParser.DEFAULT_QUOTE_CHARACTER,
CSVParser.DEFAULT_ESCAPE_CHARACTER,
CSVParser.DEFAULT_STRICT_QUOTES,
CSVParser.DEFAULT_IGNORE_LEADING_WHITESPACE,
ignoreQuotes);
} else {
parser = new CSVParser(',');
parser = new CSVParser(',',
CSVParser.DEFAULT_QUOTE_CHARACTER,
CSVParser.DEFAULT_ESCAPE_CHARACTER,
CSVParser.DEFAULT_STRICT_QUOTES,
CSVParser.DEFAULT_IGNORE_LEADING_WHITESPACE,
ignoreQuotes);
}
}

View File

@ -23,12 +23,12 @@ import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.tests.GridworksTest;
public class TsvCsvImporterTests extends GridworksTest {
/**
 * Set-up hook run via {@code @BeforeTest}: assigns {@code logger} a logger
 * obtained from {@code LoggerFactory} for this object's runtime class.
 */
@BeforeTest
public void init() {
    logger = LoggerFactory.getLogger(getClass());
}
//constants
String SAMPLE_ROW = "NDB_No,Shrt_Desc,Water";
@ -60,7 +60,7 @@ public class TsvCsvImporterTests extends GridworksTest {
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -76,7 +76,7 @@ public class TsvCsvImporterTests extends GridworksTest {
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 0, false, false);
SUT.read(lnReader, project, sep, -1, 0, 0, 0, false, false, false);
} catch (IOException e) {
Assert.fail();
}
@ -93,7 +93,7 @@ public class TsvCsvImporterTests extends GridworksTest {
"data1,data2,data3";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -114,7 +114,7 @@ public class TsvCsvImporterTests extends GridworksTest {
"data1\tdata2\tdata3";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, "\t", -1, 0, 0, 1, false, true);
SUT.read(lnReader, project, "\t", -1, 0, 0, 1, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -135,7 +135,7 @@ public class TsvCsvImporterTests extends GridworksTest {
"data1,234,data3";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 1, true, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 1, true, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -156,7 +156,7 @@ public class TsvCsvImporterTests extends GridworksTest {
String input = "data1,data2,data3";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 0, false, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 0, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -176,7 +176,7 @@ public class TsvCsvImporterTests extends GridworksTest {
String input = " data1 , data2 , data3 ";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 0, false, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 0, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -193,7 +193,7 @@ public class TsvCsvImporterTests extends GridworksTest {
String input = " data1, data2, data3";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 0, true, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 0, true, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -210,7 +210,7 @@ public class TsvCsvImporterTests extends GridworksTest {
String input = " data1, , data3";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 0, true, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 0, true, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -229,7 +229,7 @@ public class TsvCsvImporterTests extends GridworksTest {
"data1,data2,data3";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 2, false, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 2, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -250,7 +250,7 @@ public class TsvCsvImporterTests extends GridworksTest {
"data1,data2,data3,data4,data5,data6";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -277,7 +277,7 @@ public class TsvCsvImporterTests extends GridworksTest {
"\"\"\"To Be\"\" is often followed by \"\"or not To Be\"\"\",data2";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -298,7 +298,7 @@ public class TsvCsvImporterTests extends GridworksTest {
"data1,data2,data3";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 1, 1, false, true);
SUT.read(lnReader, project, sep, -1, 0, 1, 1, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -320,7 +320,7 @@ public class TsvCsvImporterTests extends GridworksTest {
"data1,data2,data3";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 1, 0, 1, false, true);
SUT.read(lnReader, project, sep, -1, 1, 0, 1, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -346,7 +346,7 @@ public class TsvCsvImporterTests extends GridworksTest {
"data1,data2,data3";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 1, 3, 2, false, true);
SUT.read(lnReader, project, sep, -1, 1, 3, 2, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -375,7 +375,7 @@ public class TsvCsvImporterTests extends GridworksTest {
"data-row3-cell1,data-row3-cell2,data-row1-cell3";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, 2, 2, 3, 2, false, true);
SUT.read(lnReader, project, sep, 2, 2, 3, 2, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -394,13 +394,30 @@ public class TsvCsvImporterTests extends GridworksTest {
Assert.assertNull(project.rows.get(1).cells.get(2));
}
/**
 * Reads a single line containing a stray quotation mark with the
 * {@code ignoreQuotes} flag switched on, and checks that one row is produced
 * whose first three cells are {@code data1}, {@code data2} and {@code data3}.
 *
 * @param sep separator handed to the importer, supplied by the
 *            "CSV-or-null" data provider
 */
@Test(dataProvider = "CSV-or-null")
public void ignoreQuotes(String sep){
    // A lone quotation mark sits between "data2" and the separator that follows it.
    final String csvLine = "data1,data2\",data3,data4";
    final LineNumberReader lineReader = new LineNumberReader(new StringReader(csvLine));

    try {
        // After sep: limit=-1, skip=0, ignoreLines=0, headerLines=0,
        // guessValueType=false, splitIntoColumns=true, ignoreQuotes=true.
        SUT.read(lineReader, project, sep, -1, 0, 0, 0, false, true, true);
    } catch (IOException e) {
        Assert.fail();
    }

    // NOTE(review): the column-count and cell-count checks below are disabled;
    // the reason is not visible here - confirm before re-enabling.
    //Assert.assertEquals(project.columnModel.columns.size(), 4);
    Assert.assertEquals(project.rows.size(), 1);
    //Assert.assertEquals(project.rows.get(0).cells.size(), 4);

    final String[] expectedCells = { "data1", "data2", "data3" };
    for (int i = 0; i < expectedCells.length; i++) {
        Assert.assertEquals(project.rows.get(0).cells.get(i).value, expectedCells[i]);
    }
}
@Test(groups = { }, dataProvider = "CSV-or-null")
public void readWithMultiLinedQuotedData(String sep){
String input = "col1,col2,col3\n" +
"\"\"\"To\n Be\"\" is often followed by \"\"or not To\n Be\"\"\",data2";
"\"\"\"To\n Be\"\" is often followed by \"\"or not To\n Be\"\"\",data2";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -420,7 +437,7 @@ public class TsvCsvImporterTests extends GridworksTest {
"\"A line with many \n\n\n\n\n empty lines\",data2";
LineNumberReader lnReader = new LineNumberReader(new StringReader(input));
try {
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true);
SUT.read(lnReader, project, sep, -1, 0, 0, 1, false, true, false);
} catch (IOException e) {
Assert.fail();
}
@ -444,6 +461,7 @@ public class TsvCsvImporterTests extends GridworksTest {
whenGetIntegerOption("header-lines",properties,0);
whenGetIntegerOption("limit",properties,-1);
whenGetIntegerOption("skip",properties,0);
whenGetIntegerOption("ignore-quotes",properties,0);
try {
SUT.read(reader, project, properties);
@ -459,10 +477,45 @@ public class TsvCsvImporterTests extends GridworksTest {
Assert.assertEquals((String)project.rows.get(0).cells.get(2).value, "Water");
verify(properties, times(1)).getProperty("separator");
verifyGetIntegerOption("ignore",properties);
verifyGetIntegerOption("header-lines",properties);
verifyGetIntegerOption("limit",properties);
verifyGetIntegerOption("skip",properties);
verifyGetOption("ignore",properties);
verifyGetOption("header-lines",properties);
verifyGetOption("limit",properties);
verifyGetOption("skip",properties);
verifyGetOption("ignore-quotes",properties);
}
/**
 * Exercises the Properties-based {@code read} with "ignore-quotes" stubbed to
 * {@code true}: a line containing a stray quotation mark must yield one row of
 * four cells, and each stubbed option must have been consulted exactly once.
 */
@Test
public void readCsvWithPropertiesIgnoreQuotes(){
    final String csvLine = "data1,data2\",data3,data4";
    final StringReader source = new StringReader(csvLine);

    // Stub every option the importer looks up.
    when(properties.getProperty("separator")).thenReturn(",");
    whenGetIntegerOption("ignore",properties,0);
    whenGetIntegerOption("header-lines",properties,0);
    whenGetIntegerOption("limit",properties,-1);
    whenGetIntegerOption("skip",properties,0);
    whenGetBooleanOption("ignore-quotes",properties,true);

    try {
        SUT.read(source, project, properties);
    } catch (Exception e) {
        Assert.fail();
    }

    Assert.assertEquals(project.rows.size(), 1);
    Assert.assertEquals(project.rows.get(0).cells.size(), 4);

    final String[] expectedCells = { "data1", "data2", "data3", "data4" };
    for (int i = 0; i < expectedCells.length; i++) {
        Assert.assertEquals((String)project.rows.get(0).cells.get(i).value, expectedCells[i]);
    }

    verify(properties, times(1)).getProperty("separator");

    final String[] consultedOptions = { "ignore", "header-lines", "limit", "skip", "ignore-quotes" };
    for (String optionName : consultedOptions) {
        verifyGetOption(optionName, properties);
    }
}
//--helpers--
@ -477,12 +530,17 @@ public class TsvCsvImporterTests extends GridworksTest {
}};
}
/**
 * Stubs {@code properties} so that the option {@code name} is reported as
 * present and its value is the string form of {@code def}.
 *
 * @param name       option key to stub
 * @param properties object on which {@code containsKey} and {@code getProperty} are stubbed
 * @param def        boolean value returned, as text, for the option
 */
public void whenGetBooleanOption(String name, Properties properties, Boolean def){
    when(properties.containsKey(name))
        .thenReturn(true);
    when(properties.getProperty(name))
        .thenReturn(def.toString());
}
/**
 * Stubs {@code properties} so that the option {@code name} is reported as
 * present and its value is the decimal string form of {@code def}.
 *
 * @param name       option key to stub
 * @param properties object on which {@code containsKey} and {@code getProperty} are stubbed
 * @param def        integer value returned, as text, for the option
 */
public void whenGetIntegerOption(String name, Properties properties, int def){
    when(properties.containsKey(name))
        .thenReturn(true);
    when(properties.getProperty(name))
        .thenReturn(String.valueOf(def));
}
public void verifyGetIntegerOption(String name, Properties properties){
/**
 * Verifies that the option {@code name} was looked up on {@code properties}
 * exactly once: one {@code containsKey(name)} call and one
 * {@code getProperty(name)} call.
 *
 * @param name       option key whose look-up is verified
 * @param properties object whose recorded interactions are checked
 */
public void verifyGetOption(String name, Properties properties){
verify(properties, times(1)).containsKey(name);
verify(properties, times(1)).getProperty(name);
}

View File

@ -4,22 +4,22 @@
<title>Freebase Gridworks</title>
<link rel="icon" type="image/png" href="images/favicon.png">
<link type="text/css" rel="stylesheet" href="externals/jquery-ui/css/ui-lightness/jquery-ui-1.8.custom.css" />
<link rel="stylesheet" href="/styles/common.css" />
<link rel="stylesheet" href="/styles/freebase.css" />
<link rel="stylesheet" href="/styles/index.css" />
<link rel="stylesheet" href="/styles/jquery-ui-overrides.css" />
<script type="text/javascript" src="externals/jquery-1.4.2.min.js"></script>
<script type="text/javascript" src="externals/jquery-ui/jquery-ui-1.8.custom.min.js"></script>
<script type="text/javascript" src="externals/date.js"></script>
<script type="text/javascript" src="scripts/util/string.js"></script>
<script type="text/javascript" src="scripts/version.js"></script>
<script type="text/javascript" src="scripts/index.js"></script>
</head>
<body>
<div id="header">
@ -27,7 +27,7 @@
<a id="logo" href="http://www.freebase.com/" title="Freebase"><img alt="Freebase" src="images/freebase-headerlogo.png" /></a>
<div id="path"><span class="app-path-section"><a href="./index.html">Gridworks</a></span></div>
</div>
<div id="body">
<div class="grid-layout layout-loose"><table>
<tr>
@ -43,7 +43,7 @@
<p><a href="about.html" class="quiet-link">Credits and Licenses &raquo;</a>
</p>
</td>
<td id="projects-panel">
<div class="island">
<div id="projects" class="round-corners">
@ -52,11 +52,11 @@
</div>
</div>
</td>
<td id="forms">
<div class="island">
<form id="file-upload-form" method="post" enctype="multipart/form-data" action="/command/create-project-from-upload" accept-charset="UTF-8">
<div id="form-tabs-create-project" class="round-corners">
<a class="form-tab-link" href="javascript:showHide('file-upload-form', 'project-upload-form')">or Import an Existing Project</a>
<h1>Create a New Project</h1>
@ -70,48 +70,53 @@
<td><input type="submit" value="Create Project" id="upload-file-button" class="button-primary" /></td></tr>
</table></div>
</div>
<div id="form-create-project-more-options">
<h2>Advanced Options</h2>
<table><tr>
<td>
<div class="field-label">Limit load to:</div>
<div class="field-body"><input id="limit-input" name="limit" size="5" /> rows (blank for all)</div>
<div class="field-label">Ignore:</div>
<div class="field-body"><input id="ignore-input" name="ignore" size="5" value="0" /> initial non-blank lines</div>
<div class="field-label">Skip:</div>
<div class="field-body"><input id="skip-input" name="skip" size="5" value="0" /> initial data rows</div>
</td>
<td>
<div class="field-label">When parsing text files:</div>
<div class="field-group">
<div><input id="split-into-columns-input" type="checkbox" checked="true" name="split-into-columns" /> Split into columns</div>
<div>Column separator:
<input id="separator-input" name="separator" size="2" /></div>
<div class="field-hint">leave blank to guess comma or tab</div>
</div>
<div class="field-group">
<div><input id="guess-value-type-input" name="guess-value-type" type="checkbox" checked="true" /> Guess cells' value types</div>
<div class="field-hint">try to parse numbers, dates, etc.</div>
</div>
<div class="field-group">
<div>Header lines:
<input id="header-lines-input" name="header-lines" size="5" value="1" /></div>
<div class="field-hint">use 0 if there is no header line</div>
</div>
<div class="field-group">
<div><input id="ignore-quotes-input" name="ignore-quotes" type="checkbox" />Ignore Quotation Marks </div>
<div class="field-hint">Ignore quotation marks, using all newlines and separators</div>
</div>
</td>
</tr></table>
</div>
</form>
<form id="project-upload-form" method="post" enctype="multipart/form-data" action="/command/import-project" accept-charset="UTF-8" style="display:none;">
<div id="form-tabs-import-project" class="round-corners">
<a class="form-tab-link" href="javascript:showHide('project-upload-form', 'file-upload-form')">or Create a New Project</a>

View File

@ -2,12 +2,12 @@ function onClickUploadFileButton(evt) {
var projectName = $("#project-name-input")[0].value;
if (! $.trim(projectName).length) {
window.alert("You must specify a project name.");
} else if ($("#project-file-input")[0].files.length === 0) {
window.alert("You must specify select a file to upload.");
} else {
$("#file-upload-form").attr("action",
$("#file-upload-form").attr("action",
"/command/create-project-from-upload?" + [
"split-into-columns=" + $("#split-into-columns-input")[0].checked,
"separator=" + $("#separator-input")[0].value,
@ -15,12 +15,13 @@ function onClickUploadFileButton(evt) {
"header-lines=" + $("#header-lines-input")[0].value,
"skip=" + $("#skip-input")[0].value,
"limit=" + $("#limit-input")[0].value,
"guess-value-type=" + $("#guess-value-type-input")[0].checked
"guess-value-type=" + $("#guess-value-type-input")[0].checked,
"ignore-quotes=" + $("#ignore-quotes-input")[0].checked
].join("&"));
return true;
}
evt.preventDefault();
return false;
}
@ -52,18 +53,18 @@ function formatDate(d) {
/**
 * Reports whether the newest published release is newer than the running build.
 *
 * Revisions are compared numerically using the digits captured by /r([0-9]+)/
 * from GridworksVersion.revision and GridworksReleases.releases[0].revision.
 *
 * @returns {boolean} true only when both revisions can be parsed and the
 *     latest release's revision number is greater than this build's; false
 *     when either revision does not match the pattern or no release is listed.
 */
function isThereNewRelease() {
    var revisionPattern = /r([0-9]+)/;

    var thisMatch = revisionPattern.exec(GridworksVersion.revision);
    if (!thisMatch) { // probably "trunk"
        return false;
    }

    // Guard against an empty or missing release list instead of throwing.
    var releases = GridworksReleases.releases;
    if (!releases || !releases.length) {
        return false;
    }

    // exec() yields null on a non-matching revision; treat that as "no newer release".
    var latestMatch = revisionPattern.exec(releases[0].revision);
    if (!latestMatch) {
        return false;
    }

    return parseInt(latestMatch[1], 10) > parseInt(thisMatch[1], 10);
}
@ -89,7 +90,7 @@ function renderProjects(data) {
}
}
projects.sort(function(a, b) { return b.date.getTime() - a.date.getTime(); });
var container = $("#projects-container").empty();
if (!projects.length) {
$('<div>')
@ -105,16 +106,16 @@ function renderProjects(data) {
'<th></th>' +
'</tr></table>'
).appendTo(container)[0];
var renderProject = function(project) {
var tr = table.insertRow(table.rows.length);
tr.className = "project";
var nameLink = $('<a></a>')
.text(project.name)
.attr("href", "/project?project=" + project.id)
.appendTo(tr.insertCell(tr.cells.length));
var renameLink = $('<a></a>')
.text("rename")
.attr("href", "javascript:{}")
@ -124,12 +125,12 @@ function renderProjects(data) {
if (name == null) {
return;
}
name = $.trim(name);
if (project.name == name || name.length == 0) {
return;
}
$.ajax({
type: "POST",
url: "/command/rename-project",
@ -144,13 +145,13 @@ function renderProjects(data) {
}
});
}).appendTo(tr.insertCell(tr.cells.length));
$('<div></div>')
.html(formatDate(project.date))
.addClass("last-modified")
.attr("title", project.date.toString())
.appendTo(tr.insertCell(tr.cells.length));
$('<a></a>')
.addClass("delete-project")
.attr("title","Delete this project")
@ -168,18 +169,18 @@ function renderProjects(data) {
fetchProjects();
}
}
});
});
}
return false;
}).appendTo(tr.insertCell(tr.cells.length));
$(tr).mouseenter(function() {
renameLink.css("visibility", "visible");
}).mouseleave(function() {
renameLink.css("visibility", "hidden");
});
};
for (var i = 0; i < projects.length; i++) {
renderProject(projects[i]);
}
@ -193,21 +194,21 @@ function showHide(toHide, toShow) {
function onLoad() {
fetchProjects();
$("#upload-file-button").click(onClickUploadFileButton);
$("#more-options-link").click(function() {
$("#more-options-controls").hide();
$("#more-options").show();
});
var version = (GridworksVersion.version != "$VERSION") ? "Version " + GridworksVersion.version + "-" + GridworksVersion.revision : "";
$("#gridworks-version").text(version);
var script = $('<script></script>')
.attr("src", "http://freebase-gridworks.googlecode.com/svn/support/releases.js")
.attr("type", "text/javascript")
.appendTo(document.body);
var poll = function() {
if ("GridworksReleases" in window) {
if (isThereNewRelease()) {