Use InputStream instead of Reader for JSON import - fixes #698

This commit is contained in:
Tom Morris 2013-03-23 18:36:05 -04:00
parent 6b3592982e
commit 6a91b5d75b
3 changed files with 893 additions and 902 deletions

View File

@@ -1,386 +1,379 @@
/* /*
Copyright 2010,2012 Google Inc. Copyright 2010,2012 Google Inc.
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
met: met:
* Redistributions of source code must retain the above copyright * Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer. notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above * Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the in the documentation and/or other materials provided with the
distribution. distribution.
* Neither the name of Google Inc. nor the names of its * Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from contributors may be used to endorse or promote products derived from
this software without specific prior written permission. this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
package com.google.refine.importers; package com.google.refine.importers;
import java.io.File; import java.io.File;
import java.io.FileInputStream; import java.io.IOException;
import java.io.IOException; import java.io.InputStream;
import java.io.InputStream; import java.io.Serializable;
import java.io.Reader; import java.util.List;
import java.io.Serializable;
import java.util.List; import org.codehaus.jackson.JsonFactory;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.JsonFactory; import org.codehaus.jackson.JsonParser;
import org.codehaus.jackson.JsonParseException; import org.codehaus.jackson.JsonParser.NumberType;
import org.codehaus.jackson.JsonParser; import org.codehaus.jackson.JsonToken;
import org.codehaus.jackson.JsonParser.NumberType; import org.json.JSONArray;
import org.codehaus.jackson.JsonToken; import org.json.JSONObject;
import org.json.JSONArray; import org.slf4j.Logger;
import org.json.JSONObject; import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import com.google.refine.ProjectMetadata;
import com.google.refine.importers.tree.ImportColumnGroup;
import com.google.refine.ProjectMetadata; import com.google.refine.importers.tree.TreeImportingParserBase;
import com.google.refine.importers.tree.ImportColumnGroup; import com.google.refine.importers.tree.TreeReader;
import com.google.refine.importers.tree.TreeImportingParserBase; import com.google.refine.importers.tree.TreeReaderException;
import com.google.refine.importers.tree.TreeReader; import com.google.refine.importing.ImportingJob;
import com.google.refine.importers.tree.TreeReaderException; import com.google.refine.importing.ImportingUtilities;
import com.google.refine.importing.ImportingJob; import com.google.refine.model.Project;
import com.google.refine.importing.ImportingUtilities; import com.google.refine.util.JSONUtilities;
import com.google.refine.model.Project;
import com.google.refine.util.JSONUtilities; public class JsonImporter extends TreeImportingParserBase {
static final Logger logger = LoggerFactory.getLogger(JsonImporter.class);
public class JsonImporter extends TreeImportingParserBase {
static final Logger logger = LoggerFactory.getLogger(JsonImporter.class); public final static String ANONYMOUS = "_";
public final static String ANONYMOUS = "_"; public JsonImporter() {
super(true);
public JsonImporter() { }
super(false);
} static private class PreviewParsingState {
int tokenCount;
static private class PreviewParsingState { }
int tokenCount;
} final static private int PREVIEW_PARSING_LIMIT = 1000;
final static private int PREVIEW_PARSING_LIMIT = 1000; @Override
public JSONObject createParserUIInitializationData(
@Override ImportingJob job, List<JSONObject> fileRecords, String format) {
public JSONObject createParserUIInitializationData( JSONObject options = super.createParserUIInitializationData(job, fileRecords, format);
ImportingJob job, List<JSONObject> fileRecords, String format) { if (fileRecords.size() > 0) {
JSONObject options = super.createParserUIInitializationData(job, fileRecords, format); try {
if (fileRecords.size() > 0) { JSONObject firstFileRecord = fileRecords.get(0);
try { File file = ImportingUtilities.getFile(job, firstFileRecord);
JSONObject firstFileRecord = fileRecords.get(0); JsonFactory factory = new JsonFactory();
File file = ImportingUtilities.getFile(job, firstFileRecord); JsonParser parser = factory.createJsonParser(file);
InputStream is = new FileInputStream(file);
try { PreviewParsingState state = new PreviewParsingState();
JsonFactory factory = new JsonFactory(); Object rootValue = parseForPreview(parser, state);
JsonParser parser = factory.createJsonParser(is); if (rootValue != null) {
JSONUtilities.safePut(options, "dom", rootValue);
PreviewParsingState state = new PreviewParsingState(); }
Object rootValue = parseForPreview(parser, state); } catch (IOException e) {
if (rootValue != null) { logger.error("Error generating parser UI initialization data for JSON file", e);
JSONUtilities.safePut(options, "dom", rootValue); }
} }
} finally {
is.close(); return options;
} }
} catch (IOException e) {
logger.error("Error generating parser UI initialization data for JSON file", e); final static private Object parseForPreview(JsonParser parser, PreviewParsingState state, JsonToken token)
} throws JsonParseException, IOException {
} if (token != null) {
switch (token) {
return options; case START_ARRAY:
} return parseArrayForPreview(parser, state);
case START_OBJECT:
final static private Object parseForPreview(JsonParser parser, PreviewParsingState state, JsonToken token) return parseObjectForPreview(parser, state);
throws JsonParseException, IOException { case VALUE_STRING:
if (token != null) { return parser.getText();
switch (token) { case VALUE_NUMBER_INT:
case START_ARRAY: return Long.valueOf(parser.getLongValue());
return parseArrayForPreview(parser, state); case VALUE_NUMBER_FLOAT:
case START_OBJECT: return Double.valueOf(parser.getDoubleValue());
return parseObjectForPreview(parser, state); case VALUE_TRUE:
case VALUE_STRING: return Boolean.TRUE;
return parser.getText(); case VALUE_FALSE:
case VALUE_NUMBER_INT: return Boolean.FALSE;
return Long.valueOf(parser.getLongValue()); case VALUE_NULL:
case VALUE_NUMBER_FLOAT: return null;
return Double.valueOf(parser.getDoubleValue()); case END_ARRAY:
case VALUE_TRUE: case END_OBJECT:
return Boolean.TRUE; case FIELD_NAME:
case VALUE_FALSE: case NOT_AVAILABLE:
return Boolean.FALSE; case VALUE_EMBEDDED_OBJECT:
case VALUE_NULL: default:
return null; break;
case END_ARRAY: }
case END_OBJECT: }
case FIELD_NAME: return null;
case NOT_AVAILABLE: }
case VALUE_EMBEDDED_OBJECT:
default: final static private Object parseForPreview(JsonParser parser, PreviewParsingState state) {
break; try {
} JsonToken token = parser.nextToken();
} state.tokenCount++;
return null; return parseForPreview(parser, state, token);
} } catch (IOException e) {
return null;
final static private Object parseForPreview(JsonParser parser, PreviewParsingState state) { }
try { }
JsonToken token = parser.nextToken();
state.tokenCount++; final static private JSONObject parseObjectForPreview(JsonParser parser, PreviewParsingState state) {
return parseForPreview(parser, state, token); JSONObject result = new JSONObject();
} catch (IOException e) { loop:while (state.tokenCount < PREVIEW_PARSING_LIMIT) {
return null; try {
} JsonToken token = parser.nextToken();
} if (token == null) {
break;
final static private JSONObject parseObjectForPreview(JsonParser parser, PreviewParsingState state) { }
JSONObject result = new JSONObject(); state.tokenCount++;
loop:while (state.tokenCount < PREVIEW_PARSING_LIMIT) {
try { switch (token) {
JsonToken token = parser.nextToken(); case FIELD_NAME:
if (token == null) { String fieldName = parser.getText();
break; Object fieldValue = parseForPreview(parser, state);
} JSONUtilities.safePut(result, fieldName, fieldValue);
state.tokenCount++; break;
case END_OBJECT:
switch (token) { break loop;
case FIELD_NAME: default:
String fieldName = parser.getText(); break loop;
Object fieldValue = parseForPreview(parser, state); }
JSONUtilities.safePut(result, fieldName, fieldValue); } catch (IOException e) {
break; break;
case END_OBJECT: }
break loop; }
default: return result;
break loop; }
}
} catch (IOException e) { final static private JSONArray parseArrayForPreview(JsonParser parser, PreviewParsingState state) {
break; JSONArray result = new JSONArray();
} loop:while (state.tokenCount < PREVIEW_PARSING_LIMIT) {
} try {
return result; JsonToken token = parser.nextToken();
} if (token == null) {
break;
final static private JSONArray parseArrayForPreview(JsonParser parser, PreviewParsingState state) { }
JSONArray result = new JSONArray(); state.tokenCount++;
loop:while (state.tokenCount < PREVIEW_PARSING_LIMIT) {
try { switch (token) {
JsonToken token = parser.nextToken(); case END_ARRAY:
if (token == null) { break loop;
break; default:
} Object element = parseForPreview(parser, state, token);
state.tokenCount++; JSONUtilities.append(result, element);
}
switch (token) { } catch (IOException e) {
case END_ARRAY: break;
break loop; }
default: }
Object element = parseForPreview(parser, state, token); return result;
JSONUtilities.append(result, element); }
}
} catch (IOException e) { @Override
break; public void parseOneFile(Project project, ProjectMetadata metadata,
} ImportingJob job, String fileSource, InputStream is,
} ImportColumnGroup rootColumnGroup, int limit, JSONObject options, List<Exception> exceptions) {
return result;
} parseOneFile(project, metadata, job, fileSource,
new JSONTreeReader(is), rootColumnGroup, limit, options, exceptions);
@Override }
public void parseOneFile(Project project, ProjectMetadata metadata,
ImportingJob job, String fileSource, Reader reader, static public class JSONTreeReader implements TreeReader {
ImportColumnGroup rootColumnGroup, int limit, JSONObject options, List<Exception> exceptions) { final static Logger logger = LoggerFactory.getLogger("JsonParser");
parseOneFile(project, metadata, job, fileSource, JsonFactory factory = new JsonFactory();
new JSONTreeReader(reader), rootColumnGroup, limit, options, exceptions); JsonParser parser = null;
}
private JsonToken current = null;
static public class JSONTreeReader implements TreeReader { private JsonToken next = null;
final static Logger logger = LoggerFactory.getLogger("JsonParser"); private String fieldName = ANONYMOUS;
private Serializable fieldValue = null;
JsonFactory factory = new JsonFactory();
JsonParser parser = null;
public JSONTreeReader(InputStream is) {
private JsonToken current = null; try {
private JsonToken next = null; parser = factory.createJsonParser(is);
private String fieldName = ANONYMOUS; current = null;
private Serializable fieldValue = null; next = parser.nextToken();
} catch (IOException e) {
e.printStackTrace();
public JSONTreeReader(Reader reader) { }
try { }
parser = factory.createJsonParser(reader);
current = null; /**
next = parser.nextToken(); * Does nothing. All Json is treated as elements
} catch (IOException e) { */
e.printStackTrace(); @Override
} public int getAttributeCount() {
} return 0;
}
/**
* Does nothing. All Json is treated as elements /**
*/ * Does nothing. All Json is treated as elements
@Override */
public int getAttributeCount() { @Override
return 0; public String getAttributeLocalName(int index) {
} return null;
}
/**
* Does nothing. All Json is treated as elements /**
*/ * Does nothing. All Json is treated as elements
@Override */
public String getAttributeLocalName(int index) { @Override
return null; public String getAttributePrefix(int index) {
} return null;
}
/**
* Does nothing. All Json is treated as elements /**
*/ * Does nothing. All Json is treated as elements
@Override */
public String getAttributePrefix(int index) { @Override
return null; public String getAttributeValue(int index) {
} return null;
}
/**
* Does nothing. All Json is treated as elements @Override
*/ public Token current() {
@Override if (current != null) {
public String getAttributeValue(int index) { return this.mapToToken(current);
return null; } else {
} return null;
}
@Override }
public Token current() {
if (current != null) { @Override
return this.mapToToken(current); public String getFieldName() throws TreeReaderException {
} else { return fieldName;
return null; }
}
} /**
* Does nothing. Json does not have prefixes
@Override */
public String getFieldName() throws TreeReaderException { @Override
return fieldName; public String getPrefix() {
} return null;
}
/**
* Does nothing. Json does not have prefixes @Override
*/ public String getFieldValue() throws TreeReaderException {
@Override return fieldValue.toString();
public String getPrefix() { }
return null;
} @Override
public Serializable getValue()
@Override throws TreeReaderException {
public String getFieldValue() throws TreeReaderException { return fieldValue;
return fieldValue.toString(); }
} @Override
public boolean hasNext() {
@Override return next != null;
public Serializable getValue() }
throws TreeReaderException {
return fieldValue; private Serializable getValue(JsonParser parser, JsonToken token) throws IOException {
} if (token != null) {
@Override switch (token) {
public boolean hasNext() { case VALUE_STRING:
return next != null; return parser.getText();
} case VALUE_NUMBER_INT:
if (parser.getNumberType() == NumberType.INT || parser.getNumberType() == NumberType.LONG) {
private Serializable getValue(JsonParser parser, JsonToken token) throws IOException { return Long.valueOf(parser.getLongValue());
if (token != null) { } else {
switch (token) { return parser.getNumberValue();
case VALUE_STRING: }
return parser.getText(); case VALUE_NUMBER_FLOAT:
case VALUE_NUMBER_INT: if (parser.getNumberType() == NumberType.FLOAT) {
if (parser.getNumberType() == NumberType.INT || parser.getNumberType() == NumberType.LONG) { return Float.valueOf(parser.getFloatValue());
return Long.valueOf(parser.getLongValue()); } else if (parser.getNumberType() == NumberType.DOUBLE) {
} else { return Double.valueOf(parser.getDoubleValue());
return parser.getNumberValue(); } else {
} return parser.getNumberValue();
case VALUE_NUMBER_FLOAT: }
if (parser.getNumberType() == NumberType.FLOAT) { case VALUE_TRUE:
return Float.valueOf(parser.getFloatValue()); return Boolean.TRUE;
} else if (parser.getNumberType() == NumberType.DOUBLE) { case VALUE_FALSE:
return Double.valueOf(parser.getDoubleValue()); return Boolean.FALSE;
} else { case VALUE_NULL:
return parser.getNumberValue(); return null;
} case END_ARRAY:
case VALUE_TRUE: default:
return Boolean.TRUE; break;
case VALUE_FALSE: }
return Boolean.FALSE; }
case VALUE_NULL: return null;
return null; }
case END_ARRAY:
default: @Override
break; public Token next() throws TreeReaderException {
} JsonToken previous = current;
} current = next;
return null; next = null; // in case an exception is thrown
} try {
if (current != null) {
@Override if (current.isScalarValue()) {
public Token next() throws TreeReaderException { fieldValue = getValue(parser,current);
JsonToken previous = current; } else {
current = next; fieldValue = null;
next = null; // in case an exception is thrown }
try { if (current == JsonToken.FIELD_NAME) {
if (current != null) { fieldName = parser.getText();
if (current.isScalarValue()) { } else if (current == JsonToken.START_ARRAY
fieldValue = getValue(parser,current); || current == JsonToken.START_OBJECT) {
} else { // Use current field name for next level object
fieldValue = null; // ie elide one level of anonymous fields
} if (previous != JsonToken.FIELD_NAME) {
if (current == JsonToken.FIELD_NAME) { fieldName = ANONYMOUS;
fieldName = parser.getText(); }
} else if (current == JsonToken.START_ARRAY }
|| current == JsonToken.START_OBJECT) { }
// Use current field name for next level object next = parser.nextToken();
// ie elide one level of anonymous fields } catch (IOException e) {
if (previous != JsonToken.FIELD_NAME) { throw new TreeReaderException(e);
fieldName = ANONYMOUS; }
} return current();
} }
}
next = parser.nextToken(); protected Token mapToToken(JsonToken token){
} catch (IOException e) { switch(token){
throw new TreeReaderException(e); case START_ARRAY: return Token.StartEntity;
} case END_ARRAY: return Token.EndEntity;
return current(); case START_OBJECT: return Token.StartEntity;
} case END_OBJECT: return Token.EndEntity;
case VALUE_STRING: return Token.Value;
protected Token mapToToken(JsonToken token){ case FIELD_NAME: return Token.Ignorable; //returned by the getLocalName function()
switch(token){ case VALUE_NUMBER_INT: return Token.Value;
case START_ARRAY: return Token.StartEntity; //Json does not have START_DOCUMENT token type (so ignored as default)
case END_ARRAY: return Token.EndEntity; //Json does not have END_DOCUMENT token type (so ignored as default)
case START_OBJECT: return Token.StartEntity; case VALUE_TRUE : return Token.Value;
case END_OBJECT: return Token.EndEntity; case VALUE_NUMBER_FLOAT : return Token.Value;
case VALUE_STRING: return Token.Value; case VALUE_NULL : return Token.Value;
case FIELD_NAME: return Token.Ignorable; //returned by the getLocalName function() case VALUE_FALSE : return Token.Value;
case VALUE_NUMBER_INT: return Token.Value; case VALUE_EMBEDDED_OBJECT : return Token.Ignorable;
//Json does not have START_DOCUMENT token type (so ignored as default) case NOT_AVAILABLE : return Token.Ignorable;
//Json does not have END_DOCUMENT token type (so ignored as default) default: return Token.Ignorable;
case VALUE_TRUE : return Token.Value; }
case VALUE_NUMBER_FLOAT : return Token.Value; }
case VALUE_NULL : return Token.Value; }
case VALUE_FALSE : return Token.Value; }
case VALUE_EMBEDDED_OBJECT : return Token.Ignorable;
case NOT_AVAILABLE : return Token.Ignorable;
default: return Token.Ignorable;
}
}
}
}

View File

@@ -35,7 +35,6 @@ package com.google.refine.tests.importers;
import java.io.ByteArrayInputStream; import java.io.ByteArrayInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@@ -197,7 +196,7 @@ public class XmlImportUtilitiesTests extends RefineTest {
loadSampleJson(); loadSampleJson();
String[] path = XmlImportUtilitiesStub.detectRecordElement( String[] path = XmlImportUtilitiesStub.detectRecordElement(
new JSONTreeReader(new InputStreamReader(inputStream))); new JSONTreeReader(inputStream));
Assert.assertNotNull(path); Assert.assertNotNull(path);
Assert.assertEquals(path.length, 2); Assert.assertEquals(path.length, 2);
Assert.assertEquals(path[0], JsonImporter.ANONYMOUS); Assert.assertEquals(path[0], JsonImporter.ANONYMOUS);
@@ -455,7 +454,7 @@ public class XmlImportUtilitiesTests extends RefineTest {
} }
} }
public TreeReader createJsonParser(){ public TreeReader createJsonParser(){
parser = new JSONTreeReader(new InputStreamReader(inputStream)); parser = new JSONTreeReader(inputStream);
return parser; return parser;
} }
} }