- Issue 112: Refactor Importer API (patch from tfmorris)

- Added support for storing custom metadata in ProjectMetadata.

git-svn-id: http://google-refine.googlecode.com/svn/trunk@1138 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
David Huynh 2010-08-06 05:04:25 +00:00
parent 00c6865d95
commit f411dc9104
12 changed files with 316 additions and 163 deletions

View File

@ -1,6 +1,10 @@
package com.google.gridworks;
import java.io.Serializable;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import org.json.JSONException;
@ -23,7 +27,8 @@ public class ProjectMetadata implements Jsonizable {
private String _encoding;
private int _encodingConfidence;
private PreferenceStore _preferenceStore = new PreferenceStore();
private Map<String, Serializable> _customMetadata = new HashMap<String, Serializable>();
private PreferenceStore _preferenceStore = new PreferenceStore();
final Logger logger = LoggerFactory.getLogger("project_metadata");
@ -51,13 +56,20 @@ public class ProjectMetadata implements Jsonizable {
writer.key("encoding"); writer.value(_encoding);
writer.key("encodingConfidence"); writer.value(_encodingConfidence);
writer.key("customMetadata"); writer.object();
for (String key : _customMetadata.keySet()) {
Serializable value = _customMetadata.get(key);
writer.key(key);
writer.value(value);
}
writer.endObject();
writer.key("preferences"); _preferenceStore.write(writer, options);
}
writer.endObject();
}
public void write(JSONWriter jsonWriter) throws Exception {
Properties options = new Properties();
options.setProperty("mode", "save");
@ -92,6 +104,24 @@ public class ProjectMetadata implements Jsonizable {
}
}
if (obj.has("customMetadata") && !obj.isNull("customMetadata")) {
try {
JSONObject obj2 = obj.getJSONObject("customMetadata");
@SuppressWarnings("unchecked")
Iterator<String> keys = obj2.keys();
while (keys.hasNext()) {
String key = keys.next();
Object value = obj2.get(key);
if (value != null && value instanceof Serializable) {
pm._customMetadata.put(key, (Serializable) value);
}
}
} catch (JSONException e) {
// ignore
}
}
return pm;
}
@ -153,4 +183,16 @@ public class ProjectMetadata implements Jsonizable {
/**
 * Returns the preference store held by this metadata object.
 *
 * @return the {@code _preferenceStore} instance itself (not a copy)
 */
public PreferenceStore getPreferenceStore() {
return _preferenceStore;
}
/**
 * Looks up a custom metadata value by key.
 *
 * @param key name of the custom metadata entry
 * @return the value stored under {@code key}, or {@code null} when the
 *         backing map has no entry for it
 */
public Serializable getCustomMetadata(String key) {
return _customMetadata.get(key);
}
/**
 * Stores or clears a custom metadata entry.
 *
 * @param key   name of the custom metadata entry
 * @param value value to associate with {@code key}; passing {@code null}
 *              removes any existing entry instead of storing a null
 */
public void setCustomMetadata(String key, Serializable value) {
    if (value != null) {
        _customMetadata.put(key, value);
        return;
    }
    // A null value means "forget this key".
    _customMetadata.remove(key);
}
}

View File

@ -43,7 +43,10 @@ import com.google.gridworks.ProjectManager;
import com.google.gridworks.ProjectMetadata;
import com.google.gridworks.commands.Command;
import com.google.gridworks.importers.Importer;
import com.google.gridworks.importers.ReaderImporter;
import com.google.gridworks.importers.StreamImporter;
import com.google.gridworks.importers.TsvCsvImporter;
import com.google.gridworks.importers.UrlImporter;
import com.google.gridworks.model.Project;
import com.google.gridworks.util.IOUtils;
import com.google.gridworks.util.ParsingUtilities;
@ -400,39 +403,42 @@ public class CreateProjectCommand extends Command {
return result;
}
protected void internalImportURL(
HttpServletRequest request,
Project project,
Properties options,
String urlString
) throws Exception {
protected void internalImportURL(HttpServletRequest request,
Project project, Properties options, String urlString)
throws Exception {
URL url = new URL(urlString);
URLConnection connection = null;
try {
connection = url.openConnection();
connection.setConnectTimeout(5000);
connection.connect();
} catch (Exception e) {
throw new Exception("Cannot connect to " + urlString, e);
}
// Try for a URL importer first
Importer importer = guessUrlImporter(url);
if (importer instanceof UrlImporter) {
((UrlImporter) importer).read(url, project, options);
return;
} else {
// If we couldn't find one, try opening URL and treating as a stream
try {
connection = url.openConnection();
connection.setConnectTimeout(5000);
connection.connect();
} catch (Exception e) {
throw new Exception("Cannot connect to " + urlString, e);
}
InputStream inputStream = null;
try {
inputStream = connection.getInputStream();
} catch (Exception e) {
throw new Exception("Cannot retrieve content from " + url, e);
}
InputStream inputStream = null;
try {
inputStream = connection.getInputStream();
} catch (Exception e) {
throw new Exception("Cannot retrieve content from " + url, e);
}
try {
Importer importer = guessImporter(
connection.getContentType(),
url.getPath()
);
internalInvokeImporter(project, importer, options, inputStream, connection.getContentEncoding());
} finally {
inputStream.close();
try {
importer = guessImporter(connection.getContentType(),
url.getPath());
internalInvokeImporter(project, importer, options, inputStream,
connection.getContentEncoding());
} finally {
inputStream.close();
}
}
}
@ -443,7 +449,7 @@ public class CreateProjectCommand extends Command {
InputStream rawInputStream,
String encoding
) throws Exception {
if (importer.takesReader()) {
if (importer instanceof ReaderImporter) {
BufferedInputStream inputStream = new BufferedInputStream(rawInputStream);
@ -482,28 +488,45 @@ public class CreateProjectCommand extends Command {
new InputStreamReader(inputStream);
}
importer.read(reader, project, options);
((ReaderImporter) importer).read(reader, project, options);
} else {
importer.read(rawInputStream, project, options);
((StreamImporter) importer).read(rawInputStream, project, options);
}
}
protected void internalInvokeImporter(
Project project,
Importer importer,
Properties options,
Reader reader
Project project,
ReaderImporter importer,
Properties options,
Reader reader
) throws Exception {
importer.read(reader, project, options);
}
protected Importer guessImporter(String contentType, String fileName) {
for(Importer i : importers.values()){
protected Importer guessImporter(String contentType, String fileName, boolean provideDefault) {
for (Importer i : importers.values()){
if(i.canImportData(contentType, fileName)){
return i;
}
}
if (provideDefault) {
return new TsvCsvImporter(); // default
} else {
return null;
}
}
/**
 * Convenience overload that calls
 * {@code guessImporter(contentType, filename, true)}, i.e. with
 * {@code provideDefault} switched on.
 *
 * @param contentType content type passed through to the three-argument overload
 * @param filename    file name passed through to the three-argument overload
 * @return whatever the three-argument overload returns
 */
protected Importer guessImporter(String contentType, String filename) {
return guessImporter(contentType, filename, true);
}
return new TsvCsvImporter(); //default
/**
 * Finds an importer able to read directly from the given URL.
 *
 * Walks {@code importers.values()} in iteration order and returns the
 * first entry that is a {@link UrlImporter} and whose
 * {@code canImportData(url)} answers {@code true}.
 *
 * @param url the URL to be imported
 * @return the first matching importer, or {@code null} when none matches
 */
protected Importer guessUrlImporter(URL url) {
    for (Importer candidate : importers.values()) {
        if (!(candidate instanceof UrlImporter)) {
            continue;
        }
        UrlImporter urlCapable = (UrlImporter) candidate;
        if (urlCapable.canImportData(url)) {
            return candidate;
        }
    }
    return null;
}
}

View File

@ -2,7 +2,6 @@ package com.google.gridworks.importers;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
@ -28,18 +27,11 @@ import com.google.gridworks.model.ReconCandidate;
import com.google.gridworks.model.Row;
import com.google.gridworks.model.Recon.Judgment;
public class ExcelImporter implements Importer {
public class ExcelImporter implements StreamImporter {
protected boolean _xmlBased;
public boolean takesReader() {
return false;
}
public void read(Reader reader, Project project, Properties options) throws Exception {
throw new UnsupportedOperationException();
}
public void read(InputStream inputStream, Project project, Properties options) throws Exception {
@Override
public void read(InputStream inputStream, Project project, Properties options) throws ImportException {
int ignoreLines = ImporterUtilities.getIntegerOption("ignore", options, -1);
int headerLines = ImporterUtilities.getIntegerOption("header-lines", options, 1);
int limit = ImporterUtilities.getIntegerOption("limit", options, -1);
@ -51,7 +43,7 @@ public class ExcelImporter implements Importer {
new XSSFWorkbook(inputStream) :
new HSSFWorkbook(new POIFSFileSystem(inputStream));
} catch (IOException e) {
throw new Exception(
throw new ImportException(
"Attempted to parse file as Excel file but failed. " +
"Try to use Excel to re-save the file as a different Excel version or as TSV and upload again.",
e
@ -94,8 +86,9 @@ public class ExcelImporter implements Importer {
for (int c = firstCell; c <= lastCell; c++) {
org.apache.poi.ss.usermodel.Cell cell = row.getCell(c);
if (cell != null) {
String text = cell.getStringCellValue().trim();
if (text.length() > 0) {
Serializable value = extractCell(cell);
String text = value != null ? value.toString() : null;
if (text != null && text.length() > 0) {
while (columnNames.size() < c + 1) {
columnNames.add(null);
}
@ -194,7 +187,7 @@ public class ExcelImporter implements Importer {
}
}
protected Cell extractCell(org.apache.poi.ss.usermodel.Cell cell, Map<String, Recon> reconMap) {
protected Serializable extractCell(org.apache.poi.ss.usermodel.Cell cell) {
int cellType = cell.getCellType();
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_ERROR ||
cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_BLANK) {
@ -222,6 +215,12 @@ public class ExcelImporter implements Importer {
}
}
return value;
}
protected Cell extractCell(org.apache.poi.ss.usermodel.Cell cell, Map<String, Recon> reconMap) {
Serializable value = extractCell(cell);
if (value != null) {
Recon recon = null;
@ -273,6 +272,7 @@ public class ExcelImporter implements Importer {
}
}
@Override
public boolean canImportData(String contentType, String fileName) {
if (contentType != null) {
contentType = contentType.toLowerCase().trim();

View File

@ -0,0 +1,15 @@
package com.google.gridworks.importers;
/**
 * Checked exception raised by importers when an import cannot be completed.
 * It normally wraps the lower-level exception that triggered the failure,
 * which remains available through {@link #getCause()}.
 */
public class ImportException extends Exception {

    private static final long serialVersionUID = 7077314805989174181L;

    /**
     * Creates an import failure with a description and its underlying cause.
     *
     * @param message human-readable description of what went wrong
     * @param cause   the exception that caused the import to fail
     */
    public ImportException(String message, Throwable cause) {
        super(message, cause);
    }
}

View File

@ -1,16 +1,14 @@
package com.google.gridworks.importers;
import java.io.InputStream;
import java.io.Reader;
import java.util.Properties;
import com.google.gridworks.model.Project;
public interface Importer {
public boolean takesReader();
public void read(Reader reader, Project project, Properties options) throws Exception;
public void read(InputStream inputStream, Project project, Properties options) throws Exception;
/**
* Determine whether importer can handle given contentType and filename.
*
* @param contentType
* @param fileName
* @return true if the importer can handle this
*/
public boolean canImportData(String contentType, String fileName);
}

View File

@ -2,10 +2,11 @@ package com.google.gridworks.importers;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Reader;
import java.util.Properties;
import org.marc4j.MarcPermissiveStreamReader;
@ -15,27 +16,23 @@ import org.marc4j.marc.Record;
import com.google.gridworks.model.Project;
public class MarcImporter implements Importer {
public boolean takesReader() {
return false;
}
public void read(Reader reader, Project project, Properties options)
throws Exception {
throw new UnsupportedOperationException();
}
public class MarcImporter implements StreamImporter {
@Override
public void read(
InputStream inputStream,
Project project,
Properties options
) throws Exception {
) throws ImportException {
int limit = ImporterUtilities.getIntegerOption("limit",options,-1);
int skip = ImporterUtilities.getIntegerOption("skip",options,0);
File tempFile = File.createTempFile("gridworks-import-", ".marc.xml");
File tempFile;
try {
tempFile = File.createTempFile("gridworks-import-", ".marc.xml");
} catch (IOException e) {
throw new ImportException("Unexpected error creating temp file",e);
}
try {
OutputStream os = new FileOutputStream(tempFile);
try {
@ -62,20 +59,31 @@ public class MarcImporter implements Importer {
}
writer.close();
} finally {
os.close();
try {
os.close();
} catch (IOException e) {
// Just ignore - not much we can do anyway
}
}
InputStream is = new FileInputStream(tempFile);
try {
new XmlImporter().read(is, project, options);
} finally {
is.close();
try {
is.close();
} catch (IOException e) {
// Just ignore - not much we can do anyway
}
}
} catch (FileNotFoundException e) {
throw new ImportException("Input file not found", e);
} finally {
tempFile.delete();
}
}
@Override
public boolean canImportData(String contentType, String fileName) {
if (contentType != null) {
contentType = contentType.toLowerCase().trim();

View File

@ -1,6 +1,6 @@
package com.google.gridworks.importers;
import java.io.InputStream;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
@ -14,6 +14,8 @@ import org.jrdf.SortedMemoryJRDFFactory;
import org.jrdf.collection.MemMapFactory;
import org.jrdf.graph.Graph;
import org.jrdf.graph.Triple;
import org.jrdf.parser.ParseException;
import org.jrdf.parser.StatementHandlerException;
import org.jrdf.parser.line.GraphLineParser;
import org.jrdf.parser.line.LineHandler;
import org.jrdf.parser.ntriples.NTriplesParserFactory;
@ -29,25 +31,33 @@ import com.google.gridworks.model.ModelException;
import com.google.gridworks.model.Project;
import com.google.gridworks.model.Row;
public class RdfTripleImporter implements Importer{
JRDFFactory JrdfFactory;
NTriplesParserFactory nTriplesParserFactory;
MemMapFactory newMapFactory;
public class RdfTripleImporter implements ReaderImporter{
private JRDFFactory _jrdfFactory;
private NTriplesParserFactory _nTriplesParserFactory;
private MemMapFactory _newMapFactory;
public RdfTripleImporter(){
JrdfFactory = SortedMemoryJRDFFactory.getFactory();
nTriplesParserFactory = new NTriplesParserFactory();
newMapFactory = new MemMapFactory();
_jrdfFactory = SortedMemoryJRDFFactory.getFactory();
_nTriplesParserFactory = new NTriplesParserFactory();
_newMapFactory = new MemMapFactory();
}
@Override
public void read(Reader reader, Project project, Properties options) throws Exception {
public void read(Reader reader, Project project, Properties options) throws ImportException {
String baseUrl = options.getProperty("base-url");
Graph graph = JrdfFactory.getNewGraph();
LineHandler lineHandler = nTriplesParserFactory.createParser(graph, newMapFactory);
Graph graph = _jrdfFactory.getNewGraph();
LineHandler lineHandler = _nTriplesParserFactory.createParser(graph, _newMapFactory);
GraphLineParser parser = new GraphLineParser(graph, lineHandler);
parser.parse(reader, baseUrl); // fills JRDF graph
try {
parser.parse(reader, baseUrl); // fills JRDF graph
} catch (IOException e) {
throw new ImportException("i/o error while parsing RDF",e);
} catch (ParseException e) {
throw new ImportException("error parsing RDF",e);
} catch (StatementHandlerException e) {
throw new ImportException("error parsing RDF",e);
}
Map<String, List<Row>> subjectToRows = new HashMap<String, List<Row>>();
@ -64,62 +74,53 @@ public class RdfTripleImporter implements Importer{
Column column = project.columnModel.getColumnByName(predicate);
if (column == null) {
column = new Column(project.columnModel.allocateNewCellIndex(), predicate);
try {
project.columnModel.addColumn(-1, column, true);
} catch (ModelException e) {
// ignore
}
column = new Column(project.columnModel.allocateNewCellIndex(), predicate);
try {
project.columnModel.addColumn(-1, column, true);
} catch (ModelException e) {
// ignore
}
}
int cellIndex = column.getCellIndex();
if (subjectToRows.containsKey(subject)) {
List<Row> rows = subjectToRows.get(subject);
for (Row row : rows) {
if (!ExpressionUtils.isNonBlankData(row.getCellValue(cellIndex))) {
row.setCell(cellIndex, new Cell(object, null));
object = null;
break;
}
}
List<Row> rows = subjectToRows.get(subject);
for (Row row : rows) {
if (!ExpressionUtils.isNonBlankData(row.getCellValue(cellIndex))) {
row.setCell(cellIndex, new Cell(object, null));
object = null;
break;
}
}
if (object != null) {
Row row = new Row(project.columnModel.getMaxCellIndex() + 1);
rows.add(row);
if (object != null) {
Row row = new Row(project.columnModel.getMaxCellIndex() + 1);
rows.add(row);
row.setCell(cellIndex, new Cell(object, null));
}
row.setCell(cellIndex, new Cell(object, null));
}
} else {
List<Row> rows = new ArrayList<Row>();
subjectToRows.put(subject, rows);
List<Row> rows = new ArrayList<Row>();
subjectToRows.put(subject, rows);
Row row = new Row(project.columnModel.getMaxCellIndex() + 1);
rows.add(row);
Row row = new Row(project.columnModel.getMaxCellIndex() + 1);
rows.add(row);
row.setCell(subjectColumn.getCellIndex(), new Cell(subject, null));
row.setCell(cellIndex, new Cell(object, null));
row.setCell(subjectColumn.getCellIndex(), new Cell(subject, null));
row.setCell(cellIndex, new Cell(object, null));
}
}
for (Entry<String, List<Row>> entry : subjectToRows.entrySet()) {
project.rows.addAll(entry.getValue());
project.rows.addAll(entry.getValue());
}
} finally {
triples.iterator().close();
}
}
@Override
public void read(InputStream inputStream, Project project, Properties options) throws Exception {
// TODO
throw new UnsupportedOperationException();
}
@Override
public boolean takesReader() {
return true;
}
public boolean canImportData(String contentType, String fileName) {
if (contentType != null) {
contentType = contentType.toLowerCase().trim();

View File

@ -0,0 +1,27 @@
package com.google.gridworks.importers;
import java.io.Reader;
import java.util.Properties;
import com.google.gridworks.model.Project;
/**
 * Interface for importers which take a {@link Reader} (character data)
 * as input.
 */
public interface ReaderImporter extends Importer {
/**
 * Read data from an input reader into a project.
 *
 * @param reader
 *            reader to import data from. It is assumed to be positioned at
 *            the correct point and ready to go.
 * @param project
 *            project which will contain the data
 * @param options
 *            set of properties with import options
 * @throws ImportException
 *             if the import fails; implementations typically wrap the
 *             underlying I/O or parse exception as the cause
 */
public void read(Reader reader, Project project, Properties options)
throws ImportException;
}

View File

@ -0,0 +1,19 @@
package com.google.gridworks.importers;
import java.io.InputStream;
import java.util.Properties;
import com.google.gridworks.model.Project;
/**
 * Interface for importers which take an {@link InputStream} (raw bytes)
 * as input.
 */
public interface StreamImporter extends Importer {
/**
 * Read data from an input stream into a project.
 *
 * @param inputStream stream to be imported
 * @param project project to import stream into
 * @param options set of properties with import options
 * @throws ImportException if the import fails
 */
public void read(InputStream inputStream, Project project,
Properties options) throws ImportException;
}

View File

@ -19,8 +19,10 @@ import com.google.gridworks.model.Cell;
import com.google.gridworks.model.Project;
import com.google.gridworks.model.Row;
public class TsvCsvImporter implements Importer {
public void read(Reader reader, Project project, Properties options) throws Exception {
public class TsvCsvImporter implements ReaderImporter,StreamImporter {
@Override
public void read(Reader reader, Project project, Properties options) throws ImportException {
boolean splitIntoColumns = ImporterUtilities.getBooleanOption("split-into-columns", options, true);
String sep = options.getProperty("separator"); // auto-detect if not present
@ -33,11 +35,15 @@ public class TsvCsvImporter implements Importer {
boolean ignoreQuotes = ImporterUtilities.getBooleanOption("ignore-quotes", options, false);
LineNumberReader lnReader = new LineNumberReader(reader);
read(lnReader, project, sep,
limit, skip, ignoreLines, headerLines,
guessValueType, splitIntoColumns, ignoreQuotes
);
try {
read(lnReader, project, sep,
limit, skip, ignoreLines, headerLines,
guessValueType, splitIntoColumns, ignoreQuotes
);
} catch (IOException e) {
throw new ImportException("Import failed",e);
}
}
/**
@ -170,18 +176,22 @@ public class TsvCsvImporter implements Importer {
return cells;
}
public void read(InputStream inputStream, Project project, Properties options) throws Exception {
/**
 * Stream-based entry point: wraps the stream in an
 * {@link InputStreamReader} and delegates to the {@link Reader}-based
 * {@code read}.
 *
 * NOTE(review): the reader is created without an explicit charset, so
 * bytes are decoded with the platform default encoding - confirm that
 * callers of this overload do not expect a specific encoding.
 *
 * @param inputStream stream to import from
 * @param project project to import into
 * @param options import options, passed through unchanged
 * @throws ImportException propagated from the Reader-based {@code read}
 */
@Override
public void read(InputStream inputStream, Project project,
Properties options) throws ImportException {
read(new InputStreamReader(inputStream), project, options);
}
public boolean takesReader() {
return true;
}
@Override
public boolean canImportData(String contentType, String fileName) {
if (contentType != null) {
contentType = contentType.toLowerCase().trim();
return false;
return
"text/plain".equals(contentType) ||
"text/csv".equals(contentType) ||
"text/x-csv".equals(contentType) ||
"text/tab-separated-value".equals(contentType);
} else if (fileName != null) {
fileName = fileName.toLowerCase();
if (fileName.endsWith(".tsv")) {

View File

@ -0,0 +1,14 @@
package com.google.gridworks.importers;
import java.net.URL;
import java.util.Properties;
import com.google.gridworks.model.Project;
/**
 * Interface for importers which are handed a {@link URL} directly, rather
 * than an already-opened stream or reader.
 */
public interface UrlImporter extends Importer {
/**
 * Import the data found at the given URL into a project.
 *
 * Note that this method is declared with the broad {@code Exception}
 * rather than {@code ImportException}.
 *
 * @param url location of the data to import
 * @param project project to import into
 * @param options set of properties with import options
 * @throws Exception if the import fails
 */
public void read(URL url, Project project, Properties options) throws Exception;
/**
 * Determine whether this importer can handle the given URL.
 *
 * @param url location of the data to import
 * @return true if the importer can handle this URL
 */
public boolean canImportData(URL url);
}

View File

@ -1,9 +1,9 @@
package com.google.gridworks.importers;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.io.Reader;
import java.util.Properties;
import org.slf4j.Logger;
@ -12,27 +12,18 @@ import org.slf4j.LoggerFactory;
import com.google.gridworks.importers.XmlImportUtilities.ImportColumnGroup;
import com.google.gridworks.model.Project;
public class XmlImporter implements Importer {
public class XmlImporter implements StreamImporter {
final static Logger logger = LoggerFactory.getLogger("XmlImporter");
public static final int BUFFER_SIZE = 64 * 1024;
public boolean takesReader() {
return false;
}
public void read(Reader reader, Project project, Properties options)
throws Exception {
throw new UnsupportedOperationException();
}
@Override
public void read(
InputStream inputStream,
Project project,
Properties options
) throws Exception {
) throws ImportException {
logger.trace("XmlImporter.read");
PushbackInputStream pis = new PushbackInputStream(inputStream,BUFFER_SIZE);
@ -40,13 +31,17 @@ public class XmlImporter implements Importer {
{
byte[] buffer = new byte[BUFFER_SIZE];
int bytes_read = 0;
while (bytes_read < BUFFER_SIZE) {
int c = pis.read(buffer, bytes_read, BUFFER_SIZE - bytes_read);
if (c == -1) break;
bytes_read +=c ;
try {
while (bytes_read < BUFFER_SIZE) {
int c = pis.read(buffer, bytes_read, BUFFER_SIZE - bytes_read);
if (c == -1) break;
bytes_read +=c ;
}
pis.unread(buffer, 0, bytes_read);
} catch (IOException e) {
throw new ImportException("Read error",e);
}
pis.unread(buffer, 0, bytes_read);
if (options.containsKey("importer-record-tag")) {
recordPath = XmlImportUtilities.detectPathFromTag(
new ByteArrayInputStream(buffer, 0, bytes_read),
@ -68,6 +63,7 @@ public class XmlImporter implements Importer {
project.columnModel.update();
}
@Override
public boolean canImportData(String contentType, String fileName) {
if (contentType != null) {
contentType = contentType.toLowerCase().trim();