Refactor of CreateProjectCommand.java and Importers

The code for determining if an importer is suitable to import a file is now in each respective importer rather than in CreateProjectCommand.  There is an additional method, canImportData, on the Importer interface to support this.

CreateProjectCommand now registers Importers in a map, loading them from a static table of importer names and class names (this is a copy of Tom's code for registering commands in Gridworks Servlet).  Plugging in new importers should be simpler.

git-svn-id: http://google-refine.googlecode.com/svn/trunk@861 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Iain Sproat 2010-05-26 13:18:48 +00:00
parent 017a825600
commit 1c47ff476b
7 changed files with 291 additions and 172 deletions

View File

@ -45,12 +45,8 @@ import com.metaweb.gridworks.Gridworks;
import com.metaweb.gridworks.ProjectManager; import com.metaweb.gridworks.ProjectManager;
import com.metaweb.gridworks.ProjectMetadata; import com.metaweb.gridworks.ProjectMetadata;
import com.metaweb.gridworks.commands.Command; import com.metaweb.gridworks.commands.Command;
import com.metaweb.gridworks.importers.ExcelImporter;
import com.metaweb.gridworks.importers.Importer; import com.metaweb.gridworks.importers.Importer;
import com.metaweb.gridworks.importers.MarcImporter;
import com.metaweb.gridworks.importers.RdfTripleImporter;
import com.metaweb.gridworks.importers.TsvCsvImporter; import com.metaweb.gridworks.importers.TsvCsvImporter;
import com.metaweb.gridworks.importers.XmlImporter;
import com.metaweb.gridworks.model.Project; import com.metaweb.gridworks.model.Project;
import com.metaweb.gridworks.util.IOUtils; import com.metaweb.gridworks.util.IOUtils;
import com.metaweb.gridworks.util.ParsingUtilities; import com.metaweb.gridworks.util.ParsingUtilities;
@ -58,11 +54,76 @@ import com.metaweb.gridworks.util.ParsingUtilities;
public class CreateProjectCommand extends Command { public class CreateProjectCommand extends Command {
final static Logger logger = LoggerFactory.getLogger("create-project_command"); final static Logger logger = LoggerFactory.getLogger("create-project_command");
// Registry of importer instances keyed by importer name.
// An insertion-ordered map is used on purpose: guessImporter() returns the
// first registered importer whose canImportData() accepts the data, so the
// iteration order of this map is the importer priority. A plain HashMap
// would make that priority depend on unspecified hash order.
static final private Map<String, Importer> importers = new java.util.LinkedHashMap<String, Importer>();

// Built-in importers as {importer name, fully qualified class name} pairs,
// listed in the order in which they are registered.
private static final String[][] importerNames = {
    {"ExcelImporter", "com.metaweb.gridworks.importers.ExcelImporter"},
    {"XmlImporter", "com.metaweb.gridworks.importers.XmlImporter"},
    {"RdfTripleImporter", "com.metaweb.gridworks.importers.RdfTripleImporter"},
    {"MarcImporter", "com.metaweb.gridworks.importers.MarcImporter"},
    {"TsvCsvImporter", "com.metaweb.gridworks.importers.TsvCsvImporter"},
};

// Runs after the two fields above are initialised (static initialisers run
// in textual order), so the registry exists before it is filled.
static {
    registerImporters(importerNames);
}
/**
 * Instantiates and registers every importer listed in the given table.
 *
 * Each row is expected to hold the importer name at index 0 and the fully
 * qualified name of a class implementing Importer at index 1. A row that
 * cannot be loaded is logged and skipped; the remaining rows are still
 * processed.
 *
 * @param importers
 *            table of {importer name, class name} pairs; a null table
 *            registers nothing
 * @return true only if every row was loaded and registered successfully;
 *         false if any row was malformed, could not be instantiated, or
 *         used a name that was already registered
 */
static public boolean registerImporters(String[][] importers) {
    if (importers == null) {
        return true; // nothing to register, so nothing failed
    }

    boolean status = true;
    // Iterate the table that was passed in, not the built-in default table.
    for (String[] importer : importers) {
        if (importer == null || importer.length < 2) {
            logger.error("Skipping malformed importer entry; expected {name, class name}");
            status = false;
            continue;
        }

        String importerName = importer[0];
        String className = importer[1];
        logger.debug("Loading importer " + importerName + " class: " + className);

        Importer cmd = null;
        try {
            // TODO: May need to use the servlet container's class loader here
            cmd = (Importer) Class.forName(className).newInstance();
        } catch (InstantiationException e) {
            logger.error("Failed to load importer class " + className, e);
        } catch (IllegalAccessException e) {
            logger.error("Failed to load importer class " + className, e);
        } catch (ClassNotFoundException e) {
            logger.error("Failed to load importer class " + className, e);
        } catch (ClassCastException e) {
            // The class exists but does not implement Importer.
            logger.error("Failed to load importer class " + className, e);
        }

        // A failed instantiation or a rejected (duplicate) name both make
        // the overall result false; status can only go from true to false.
        if (cmd == null || !registerImporter(importerName, cmd)) {
            status = false;
        }
    }
    return status;
}
/**
 * Adds a single importer to the registry under the given name.
 *
 * @param name
 *            name under which the importer is registered
 * @param importerObject
 *            object implementing the importer
 * @return true if the importer was registered; false if the name was
 *         already in use, in which case the existing entry is left
 *         untouched
 */
static public boolean registerImporter(String name,
        Importer importerObject) {
    final boolean nameIsFree = !importers.containsKey(name);
    if (nameIsFree) {
        importers.put(name, importerObject);
    }
    return nameIsFree;
}
// Removes the registry entry for the given name. Currently only for test
// purposes. Returns true when the removed entry held a non-null importer.
static protected boolean unregisterImporter(String verb) {
    final Importer removed = importers.remove(verb);
    return removed != null;
}
@Override @Override
public void doPost(HttpServletRequest request, HttpServletResponse response) public void doPost(HttpServletRequest request, HttpServletResponse response)
throws ServletException, IOException { throws ServletException, IOException {
ProjectManager.singleton.setBusy(true); ProjectManager.singleton.setBusy(true);
try { try {
/* /*
@ -73,9 +134,9 @@ public class CreateProjectCommand extends Command {
* Don't call request.getParameter() before calling internalImport(). * Don't call request.getParameter() before calling internalImport().
*/ */
Properties options = ParsingUtilities.parseUrlParameters(request); Properties options = ParsingUtilities.parseUrlParameters(request);
Project project = new Project(); Project project = new Project();
internalImport(request, project, options); internalImport(request, project, options);
/* /*
@ -91,7 +152,7 @@ public class CreateProjectCommand extends Command {
ProjectManager.singleton.registerProject(project, pm); ProjectManager.singleton.registerProject(project, pm);
project.update(); project.update();
redirect(response, "/project.html?project=" + project.id); redirect(response, "/project.html?project=" + project.id);
} catch (Exception e) { } catch (Exception e) {
redirect(response, "/error.html?redirect=index.html&msg=" + redirect(response, "/error.html?redirect=index.html&msg=" +
@ -102,7 +163,7 @@ public class CreateProjectCommand extends Command {
ProjectManager.singleton.setBusy(false); ProjectManager.singleton.setBusy(false);
} }
} }
protected void internalImport( protected void internalImport(
HttpServletRequest request, HttpServletRequest request,
Project project, Project project,
@ -111,7 +172,7 @@ public class CreateProjectCommand extends Command {
ServletFileUpload upload = new ServletFileUpload(); ServletFileUpload upload = new ServletFileUpload();
String url = null; String url = null;
FileItemIterator iter = upload.getItemIterator(request); FileItemIterator iter = upload.getItemIterator(request);
while (iter.hasNext()) { while (iter.hasNext()) {
FileItemStream item = iter.next(); FileItemStream item = iter.next();
@ -138,32 +199,32 @@ public class CreateProjectCommand extends Command {
stream.close(); stream.close();
} }
} }
} }
if (url != null && url.length() > 0) { if (url != null && url.length() > 0) {
internalImportURL(request, project, options, url); internalImportURL(request, project, options, url);
} }
} }
static class SafeInputStream extends FilterInputStream { static class SafeInputStream extends FilterInputStream {
public SafeInputStream(InputStream stream) { public SafeInputStream(InputStream stream) {
super(stream); super(stream);
} }
@Override @Override
public void close() { public void close() {
// some libraries attempt to close the input stream while they can't // some libraries attempt to close the input stream while they can't
// read anymore from it... unfortunately this behavior prevents // read anymore from it... unfortunately this behavior prevents
// the zip input stream from functioning correctly so we just have // the zip input stream from functioning correctly so we just have
// to ignore those close() calls and just close it ourselves // to ignore those close() calls and just close it ourselves
// forcefully later // forcefully later
} }
public void reallyClose() throws IOException { public void reallyClose() throws IOException {
super.close(); super.close();
} }
} }
protected void internalImportFile( protected void internalImportFile(
Project project, Project project,
Properties options, Properties options,
@ -172,13 +233,13 @@ public class CreateProjectCommand extends Command {
) throws Exception { ) throws Exception {
logger.info("Importing '{}'", fileName); logger.info("Importing '{}'", fileName);
if (fileName.endsWith(".zip") || fileName.endsWith(".tar.gz") || fileName.endsWith(".tgz") || fileName.endsWith(".tar.bz2")) { if (fileName.endsWith(".zip") || fileName.endsWith(".tar.gz") || fileName.endsWith(".tgz") || fileName.endsWith(".tar.bz2")) {
// first, save the file on disk, since we need two passes and we might // first, save the file on disk, since we need two passes and we might
// not have enough memory to keep it all in there // not have enough memory to keep it all in there
File file = save(inputStream); File file = save(inputStream);
// in the first pass, gather statistics about what files are in there // in the first pass, gather statistics about what files are in there
// unfortunately, we have to rely on files extensions, which is horrible but // unfortunately, we have to rely on files extensions, which is horrible but
// better than nothing // better than nothing
@ -186,9 +247,9 @@ public class CreateProjectCommand extends Command {
FileInputStream fis = new FileInputStream(file); FileInputStream fis = new FileInputStream(file);
InputStream is = getStream(fileName, fis); InputStream is = getStream(fileName, fis);
// NOTE(SM): unfortunately, java.io does not provide any generalized class for // NOTE(SM): unfortunately, java.io does not provide any generalized class for
// archive-like input streams so while both TarInputStream and ZipInputStream // archive-like input streams so while both TarInputStream and ZipInputStream
// behave precisely the same, there is no polymorphic behavior so we have // behave precisely the same, there is no polymorphic behavior so we have
// to treat each instance explicitly... one of those times you wish you had // to treat each instance explicitly... one of those times you wish you had
// closures // closures
@ -224,10 +285,10 @@ public class CreateProjectCommand extends Command {
if (values.size() == 0) { if (values.size() == 0) {
throw new RuntimeException("The archive contains no files."); throw new RuntimeException("The archive contains no files.");
} }
// this will contain the set of extensions we'll load from the archive // this will contain the set of extensions we'll load from the archive
HashSet<String> exts = new HashSet<String>(); HashSet<String> exts = new HashSet<String>();
// find the extension that is most frequent or those who share the highest frequency value // find the extension that is most frequent or those who share the highest frequency value
if (values.size() == 1) { if (values.size() == 1) {
exts.add(values.get(0).getKey()); exts.add(values.get(0).getKey());
@ -245,7 +306,7 @@ public class CreateProjectCommand extends Command {
} }
} }
} }
logger.info("Most frequent extensions: {}", exts.toString()); logger.info("Most frequent extensions: {}", exts.toString());
// second pass, load the data for real // second pass, load the data for real
@ -299,9 +360,9 @@ public class CreateProjectCommand extends Command {
return o2.getValue() - o1.getValue(); return o2.getValue() - o1.getValue();
} }
} }
private void load(Project project, Properties options, String fileName, InputStream inputStream) throws Exception { private void load(Project project, Properties options, String fileName, InputStream inputStream) throws Exception {
Importer importer = guessImporter(options, null, fileName); Importer importer = guessImporter(null, fileName);
internalInvokeImporter(project, importer, options, inputStream, null); internalInvokeImporter(project, importer, options, inputStream, null);
} }
@ -312,7 +373,7 @@ public class CreateProjectCommand extends Command {
is.close(); is.close();
return temp; return temp;
} }
private void mapExtension(String name, Map<String,Integer> ext_map) { private void mapExtension(String name, Map<String,Integer> ext_map) {
String ext = getExtension(name)[1]; String ext = getExtension(name)[1];
if (ext_map.containsKey(ext)) { if (ext_map.containsKey(ext)) {
@ -323,7 +384,7 @@ public class CreateProjectCommand extends Command {
} }
private InputStream getStream(String fileName, InputStream is) throws IOException { private InputStream getStream(String fileName, InputStream is) throws IOException {
if (fileName.endsWith(".tar.gz") || fileName.endsWith(".tgz")) { if (fileName.endsWith(".tar.gz") || fileName.endsWith(".tgz")) {
return new TarInputStream(new GZIPInputStream(is)); return new TarInputStream(new GZIPInputStream(is));
} else if (fileName.endsWith(".tar.bz2")) { } else if (fileName.endsWith(".tar.bz2")) {
return new TarInputStream(new CBZip2InputStream(is)); return new TarInputStream(new CBZip2InputStream(is));
@ -331,7 +392,7 @@ public class CreateProjectCommand extends Command {
return new ZipInputStream(is); return new ZipInputStream(is);
} }
} }
private String[] getExtension(String filename) { private String[] getExtension(String filename) {
String[] result = new String[2]; String[] result = new String[2];
int ext_index = filename.lastIndexOf('.'); int ext_index = filename.lastIndexOf('.');
@ -339,7 +400,7 @@ public class CreateProjectCommand extends Command {
result[1] = (ext_index == -1) ? "" : filename.substring(ext_index + 1); result[1] = (ext_index == -1) ? "" : filename.substring(ext_index + 1);
return result; return result;
} }
protected void internalImportURL( protected void internalImportURL(
HttpServletRequest request, HttpServletRequest request,
Project project, Project project,
@ -348,7 +409,7 @@ public class CreateProjectCommand extends Command {
) throws Exception { ) throws Exception {
URL url = new URL(urlString); URL url = new URL(urlString);
URLConnection connection = null; URLConnection connection = null;
try { try {
connection = url.openConnection(); connection = url.openConnection();
connection.setConnectTimeout(5000); connection.setConnectTimeout(5000);
@ -356,27 +417,26 @@ public class CreateProjectCommand extends Command {
} catch (Exception e) { } catch (Exception e) {
throw new Exception("Cannot connect to " + urlString, e); throw new Exception("Cannot connect to " + urlString, e);
} }
InputStream inputStream = null; InputStream inputStream = null;
try { try {
inputStream = connection.getInputStream(); inputStream = connection.getInputStream();
} catch (Exception e) { } catch (Exception e) {
throw new Exception("Cannot retrieve content from " + url, e); throw new Exception("Cannot retrieve content from " + url, e);
} }
try { try {
Importer importer = guessImporter( Importer importer = guessImporter(
options,
connection.getContentType(), connection.getContentType(),
url.getPath() url.getPath()
); );
internalInvokeImporter(project, importer, options, inputStream, connection.getContentEncoding()); internalInvokeImporter(project, importer, options, inputStream, connection.getContentEncoding());
} finally { } finally {
inputStream.close(); inputStream.close();
} }
} }
protected void internalInvokeImporter( protected void internalInvokeImporter(
Project project, Project project,
Importer importer, Importer importer,
@ -387,48 +447,48 @@ public class CreateProjectCommand extends Command {
if (importer.takesReader()) { if (importer.takesReader()) {
BufferedInputStream inputStream = new BufferedInputStream(rawInputStream); BufferedInputStream inputStream = new BufferedInputStream(rawInputStream);
// NOTE(SM): The ICU4J char detection code requires the input stream to support mark/reset. // NOTE(SM): The ICU4J char detection code requires the input stream to support mark/reset.
// Unfortunately, not all ServletInputStream implementations are marking, so we need do // Unfortunately, not all ServletInputStream implementations are marking, so we need do
// this memory-expensive wrapping to make it work. It's far from ideal but I don't have // this memory-expensive wrapping to make it work. It's far from ideal but I don't have
// a more efficient solution. // a more efficient solution.
byte[] bytes = new byte[1024 * 4]; byte[] bytes = new byte[1024 * 4];
inputStream.mark(bytes.length); inputStream.mark(bytes.length);
inputStream.read(bytes); inputStream.read(bytes);
inputStream.reset(); inputStream.reset();
CharsetDetector detector = new CharsetDetector(); CharsetDetector detector = new CharsetDetector();
detector.setDeclaredEncoding("utf8"); // most of the content on the web is encoded in UTF-8 so start with that detector.setDeclaredEncoding("utf8"); // most of the content on the web is encoded in UTF-8 so start with that
Reader reader = null; Reader reader = null;
CharsetMatch[] charsetMatches = detector.setText(bytes).detectAll(); CharsetMatch[] charsetMatches = detector.setText(bytes).detectAll();
for (CharsetMatch charsetMatch : charsetMatches) { for (CharsetMatch charsetMatch : charsetMatches) {
try { try {
reader = new InputStreamReader(inputStream, charsetMatch.getName()); reader = new InputStreamReader(inputStream, charsetMatch.getName());
options.setProperty("encoding", charsetMatch.getName()); options.setProperty("encoding", charsetMatch.getName());
options.setProperty("encoding_confidence", Integer.toString(charsetMatch.getConfidence())); options.setProperty("encoding_confidence", Integer.toString(charsetMatch.getConfidence()));
logger.info("Best encoding guess: {} [confidence: {}]", charsetMatch.getName(), charsetMatch.getConfidence()); logger.info("Best encoding guess: {} [confidence: {}]", charsetMatch.getName(), charsetMatch.getConfidence());
break; break;
} catch (UnsupportedEncodingException e) { } catch (UnsupportedEncodingException e) {
// silent // silent
} }
} }
if (reader == null) { // when all else fails if (reader == null) { // when all else fails
reader = encoding != null ? reader = encoding != null ?
new InputStreamReader(inputStream, encoding) : new InputStreamReader(inputStream, encoding) :
new InputStreamReader(inputStream); new InputStreamReader(inputStream);
} }
importer.read(reader, project, options); importer.read(reader, project, options);
} else { } else {
importer.read(rawInputStream, project, options); importer.read(rawInputStream, project, options);
} }
} }
protected void internalInvokeImporter( protected void internalInvokeImporter(
Project project, Project project,
Importer importer, Importer importer,
@ -437,58 +497,14 @@ public class CreateProjectCommand extends Command {
) throws Exception { ) throws Exception {
importer.read(reader, project, options); importer.read(reader, project, options);
} }
protected Importer guessImporter( protected Importer guessImporter(String contentType, String fileName) {
Properties options, String contentType, String fileName) { for(Importer i : importers.values()){
if(i.canImportData(contentType, fileName)){
if (contentType != null) { return i;
contentType = contentType.toLowerCase().trim();
if ("application/msexcel".equals(contentType) ||
"application/x-msexcel".equals(contentType) ||
"application/x-ms-excel".equals(contentType) ||
"application/vnd.ms-excel".equals(contentType) ||
"application/x-excel".equals(contentType) ||
"application/xls".equals(contentType)) {
return new ExcelImporter(false);
} else if("application/x-xls".equals(contentType)) {
return new ExcelImporter(true);
} else if("application/xml".equals(contentType) ||
"text/xml".equals(contentType) ||
"application/rss+xml".equals(contentType) ||
"application/atom+xml".equals(contentType)) {
return new XmlImporter();
} else if("application/rdf+xml".equals(contentType)) {
return new RdfTripleImporter();
} else if ("application/marc".equals(contentType)) {
return new MarcImporter();
}
} else if (fileName != null) {
fileName = fileName.toLowerCase();
if (fileName.endsWith(".xls")) {
return new ExcelImporter(false);
} else if (fileName.endsWith(".xlsx")) {
return new ExcelImporter(true);
} else if (
fileName.endsWith(".xml") ||
fileName.endsWith(".atom") ||
fileName.endsWith(".rss")
) {
return new XmlImporter();
} else if (
fileName.endsWith(".rdf")) {
return new RdfTripleImporter();
} else if (
fileName.endsWith(".mrc") ||
fileName.endsWith(".marc") ||
fileName.contains(".mrc.") ||
fileName.contains(".marc.")
) {
return new MarcImporter();
} }
} }
return new TsvCsvImporter(); return new TsvCsvImporter(); //default
} }
} }

View File

@ -27,16 +27,12 @@ import com.metaweb.gridworks.model.Row;
import com.metaweb.gridworks.model.Recon.Judgment; import com.metaweb.gridworks.model.Recon.Judgment;
public class ExcelImporter implements Importer { public class ExcelImporter implements Importer {
final protected boolean _xmlBased; protected boolean _xmlBased;
public ExcelImporter(boolean xmlBased) {
_xmlBased = xmlBased;
}
public boolean takesReader() { public boolean takesReader() {
return false; return false;
} }
public void read(Reader reader, Project project, Properties options) throws Exception { public void read(Reader reader, Project project, Properties options) throws Exception {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
@ -45,11 +41,11 @@ public class ExcelImporter implements Importer {
int ignoreLines = ImporterUtilities.getIntegerOption("ignore", options, -1); int ignoreLines = ImporterUtilities.getIntegerOption("ignore", options, -1);
int limit = ImporterUtilities.getIntegerOption("limit",options,-1); int limit = ImporterUtilities.getIntegerOption("limit",options,-1);
int skip = ImporterUtilities.getIntegerOption("skip",options,0); int skip = ImporterUtilities.getIntegerOption("skip",options,0);
Workbook wb = null; Workbook wb = null;
try { try {
wb = _xmlBased ? wb = _xmlBased ?
new XSSFWorkbook(inputStream) : new XSSFWorkbook(inputStream) :
new HSSFWorkbook(new POIFSFileSystem(inputStream)); new HSSFWorkbook(new POIFSFileSystem(inputStream));
} catch (IOException e) { } catch (IOException e) {
throw new Exception( throw new Exception(
@ -58,16 +54,16 @@ public class ExcelImporter implements Importer {
e e
); );
} }
Sheet sheet = wb.getSheetAt(0); Sheet sheet = wb.getSheetAt(0);
int firstRow = sheet.getFirstRowNum(); int firstRow = sheet.getFirstRowNum();
int lastRow = sheet.getLastRowNum(); int lastRow = sheet.getLastRowNum();
int r = firstRow; int r = firstRow;
List<Integer> nonBlankIndices = null; List<Integer> nonBlankIndices = null;
List<String> nonBlankHeaderStrings = null; List<String> nonBlankHeaderStrings = null;
/* /*
* Find the header row * Find the header row
*/ */
@ -79,13 +75,13 @@ public class ExcelImporter implements Importer {
ignoreLines--; ignoreLines--;
continue; continue;
} }
short firstCell = row.getFirstCellNum(); short firstCell = row.getFirstCellNum();
short lastCell = row.getLastCellNum(); short lastCell = row.getLastCellNum();
if (firstCell >= 0 && firstCell <= lastCell) { if (firstCell >= 0 && firstCell <= lastCell) {
nonBlankIndices = new ArrayList<Integer>(lastCell - firstCell + 1); nonBlankIndices = new ArrayList<Integer>(lastCell - firstCell + 1);
nonBlankHeaderStrings = new ArrayList<String>(lastCell - firstCell + 1); nonBlankHeaderStrings = new ArrayList<String>(lastCell - firstCell + 1);
for (int c = firstCell; c <= lastCell; c++) { for (int c = firstCell; c <= lastCell; c++) {
org.apache.poi.ss.usermodel.Cell cell = row.getCell(c); org.apache.poi.ss.usermodel.Cell cell = row.getCell(c);
if (cell != null) { if (cell != null) {
@ -96,18 +92,18 @@ public class ExcelImporter implements Importer {
} }
} }
} }
if (nonBlankIndices.size() > 0) { if (nonBlankIndices.size() > 0) {
r++; r++;
break; break;
} }
} }
} }
if (nonBlankIndices == null || nonBlankIndices.size() == 0) { if (nonBlankIndices == null || nonBlankIndices.size() == 0) {
return; return;
} }
/* /*
* Create columns * Create columns
*/ */
@ -117,59 +113,59 @@ public class ExcelImporter implements Importer {
if (nameToIndex.containsKey(cell)) { if (nameToIndex.containsKey(cell)) {
int index = nameToIndex.get(cell); int index = nameToIndex.get(cell);
nameToIndex.put(cell, index + 1); nameToIndex.put(cell, index + 1);
cell = cell.contains(" ") ? (cell + " " + index) : (cell + index); cell = cell.contains(" ") ? (cell + " " + index) : (cell + index);
} else { } else {
nameToIndex.put(cell, 2); nameToIndex.put(cell, 2);
} }
Column column = new Column(c, cell); Column column = new Column(c, cell);
project.columnModel.columns.add(column); project.columnModel.columns.add(column);
} }
/* /*
* Now process the data rows * Now process the data rows
*/ */
int rowsWithData = 0; int rowsWithData = 0;
Map<String, Recon> reconMap = new HashMap<String, Recon>(); Map<String, Recon> reconMap = new HashMap<String, Recon>();
for (; r <= lastRow; r++) { for (; r <= lastRow; r++) {
org.apache.poi.ss.usermodel.Row row = sheet.getRow(r); org.apache.poi.ss.usermodel.Row row = sheet.getRow(r);
if (row == null) { if (row == null) {
continue; continue;
} }
short firstCell = row.getFirstCellNum(); short firstCell = row.getFirstCellNum();
short lastCell = row.getLastCellNum(); short lastCell = row.getLastCellNum();
if (firstCell >= 0 && firstCell <= lastCell) { if (firstCell >= 0 && firstCell <= lastCell) {
Row newRow = new Row(nonBlankIndices.size()); Row newRow = new Row(nonBlankIndices.size());
boolean hasData = false; boolean hasData = false;
for (int c = 0; c < nonBlankIndices.size(); c++) { for (int c = 0; c < nonBlankIndices.size(); c++) {
if (c < firstCell || c > lastCell) { if (c < firstCell || c > lastCell) {
continue; continue;
} }
org.apache.poi.ss.usermodel.Cell cell = row.getCell(c); org.apache.poi.ss.usermodel.Cell cell = row.getCell(c);
if (cell == null) { if (cell == null) {
continue; continue;
} }
int cellType = cell.getCellType(); int cellType = cell.getCellType();
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_ERROR || if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_ERROR ||
cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_BLANK) { cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_BLANK) {
continue; continue;
} }
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_FORMULA) { if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_FORMULA) {
cellType = cell.getCachedFormulaResultType(); cellType = cell.getCachedFormulaResultType();
} }
Serializable value = null; Serializable value = null;
if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_BOOLEAN) { if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_BOOLEAN) {
value = cell.getBooleanCellValue(); value = cell.getBooleanCellValue();
} else if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_NUMERIC) { } else if (cellType == org.apache.poi.ss.usermodel.Cell.CELL_TYPE_NUMERIC) {
double d = cell.getNumericCellValue(); double d = cell.getNumericCellValue();
if (HSSFDateUtil.isCellDateFormatted(cell)) { if (HSSFDateUtil.isCellDateFormatted(cell)) {
value = HSSFDateUtil.getJavaDate(d); value = HSSFDateUtil.getJavaDate(d);
} else { } else {
@ -181,23 +177,23 @@ public class ExcelImporter implements Importer {
value = text; value = text;
} }
} }
if (value != null) { if (value != null) {
Recon recon = null; Recon recon = null;
Hyperlink hyperlink = cell.getHyperlink(); Hyperlink hyperlink = cell.getHyperlink();
if (hyperlink != null) { if (hyperlink != null) {
String url = hyperlink.getAddress(); String url = hyperlink.getAddress();
if (url.startsWith("http://") || if (url.startsWith("http://") ||
url.startsWith("https://")) { url.startsWith("https://")) {
final String sig = "freebase.com/view"; final String sig = "freebase.com/view";
int i = url.indexOf(sig); int i = url.indexOf(sig);
if (i > 0) { if (i > 0) {
String id = url.substring(i + sig.length()); String id = url.substring(i + sig.length());
int q = id.indexOf('?'); int q = id.indexOf('?');
if (q > 0) { if (q > 0) {
id = id.substring(0, q); id = id.substring(0, q);
@ -206,7 +202,7 @@ public class ExcelImporter implements Importer {
if (h > 0) { if (h > 0) {
id = id.substring(0, h); id = id.substring(0, h);
} }
if (reconMap.containsKey(id)) { if (reconMap.containsKey(id)) {
recon = reconMap.get(id); recon = reconMap.get(id);
recon.judgmentBatchSize++; recon.judgmentBatchSize++;
@ -219,26 +215,26 @@ public class ExcelImporter implements Importer {
recon.judgmentAction = "auto"; recon.judgmentAction = "auto";
recon.judgmentBatchSize = 1; recon.judgmentBatchSize = 1;
recon.addCandidate(recon.match); recon.addCandidate(recon.match);
reconMap.put(id, recon); reconMap.put(id, recon);
} }
} }
} }
} }
newRow.setCell(c, new Cell(value, recon)); newRow.setCell(c, new Cell(value, recon));
hasData = true; hasData = true;
} }
} }
if (hasData) { if (hasData) {
rowsWithData++; rowsWithData++;
if (skip <= 0 || rowsWithData > skip) { if (skip <= 0 || rowsWithData > skip) {
project.rows.add(newRow); project.rows.add(newRow);
project.columnModel.setMaxCellIndex(newRow.cells.size()); project.columnModel.setMaxCellIndex(newRow.cells.size());
if (limit > 0 && project.rows.size() >= limit) { if (limit > 0 && project.rows.size() >= limit) {
break; break;
} }
@ -247,4 +243,32 @@ public class ExcelImporter implements Importer {
} }
} }
} }
/**
 * Decides whether this importer can handle the described data.
 *
 * As a side effect, a positive answer also records in _xmlBased whether
 * the data is treated as an XML-based workbook or a binary one; read()
 * uses that flag to choose the workbook implementation.
 *
 * The content type takes precedence: the file name is consulted only when
 * no content type is given at all.
 *
 * NOTE(review): this mutates instance state. If a single instance is
 * shared between concurrent imports (TODO confirm), the flag can be
 * overwritten between this call and the following read().
 *
 * @param contentType
 *            media type of the data, optionally followed by parameters
 *            such as "; charset=..."; may be null
 * @param fileName
 *            name or path of the data; may be null
 * @return true if the data looks like an Excel workbook
 */
public boolean canImportData(String contentType, String fileName) {
    if (contentType != null) {
        // Drop any media type parameters, then normalise. A fixed locale
        // is used so that case folding does not depend on the default
        // locale of the JVM (e.g. Turkish maps 'I' to a dotless 'i').
        String mediaType = contentType;
        int parametersStart = mediaType.indexOf(';');
        if (parametersStart >= 0) {
            mediaType = mediaType.substring(0, parametersStart);
        }
        mediaType = mediaType.trim().toLowerCase(java.util.Locale.ENGLISH);

        if ("application/msexcel".equals(mediaType) ||
            "application/x-msexcel".equals(mediaType) ||
            "application/x-ms-excel".equals(mediaType) ||
            "application/vnd.ms-excel".equals(mediaType) ||
            "application/x-excel".equals(mediaType) ||
            "application/xls".equals(mediaType)) {
            this._xmlBased = false;
            return true;
        } else if ("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet".equals(mediaType)) {
            // Registered media type for .xlsx workbooks.
            this._xmlBased = true;
            return true;
        } else if ("application/x-xls".equals(mediaType)) {
            // NOTE(review): kept as in the original mapping; confirm that
            // this media type really denotes the XML-based format.
            this._xmlBased = true;
            return true;
        }
    } else if (fileName != null) {
        String lowerCaseName = fileName.toLowerCase(java.util.Locale.ENGLISH);
        if (lowerCaseName.endsWith(".xls")) {
            this._xmlBased = false;
            return true;
        } else if (lowerCaseName.endsWith(".xlsx")) {
            this._xmlBased = true;
            return true;
        }
    }
    return false;
}
} }

View File

@ -8,7 +8,9 @@ import com.metaweb.gridworks.model.Project;
public interface Importer { public interface Importer {
public boolean takesReader(); public boolean takesReader();
public void read(Reader reader, Project project, Properties options) throws Exception; public void read(Reader reader, Project project, Properties options) throws Exception;
public void read(InputStream inputStream, Project project, Properties options) throws Exception; public void read(InputStream inputStream, Project project, Properties options) throws Exception;
public boolean canImportData(String contentType, String fileName);
} }

View File

@ -20,21 +20,21 @@ public class MarcImporter implements Importer {
public boolean takesReader() { public boolean takesReader() {
return false; return false;
} }
public void read(Reader reader, Project project, Properties options) public void read(Reader reader, Project project, Properties options)
throws Exception { throws Exception {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
public void read( public void read(
InputStream inputStream, InputStream inputStream,
Project project, Project project,
Properties options Properties options
) throws Exception { ) throws Exception {
int limit = ImporterUtilities.getIntegerOption("limit",options,-1); int limit = ImporterUtilities.getIntegerOption("limit",options,-1);
int skip = ImporterUtilities.getIntegerOption("skip",options,0); int skip = ImporterUtilities.getIntegerOption("skip",options,0);
File tempFile = File.createTempFile("gridworks-import-", ".marc.xml"); File tempFile = File.createTempFile("gridworks-import-", ".marc.xml");
try { try {
OutputStream os = new FileOutputStream(tempFile); OutputStream os = new FileOutputStream(tempFile);
@ -45,7 +45,7 @@ public class MarcImporter implements Importer {
true true
); );
MarcWriter writer = new MarcXmlWriter(os, true); MarcWriter writer = new MarcXmlWriter(os, true);
int count = 0; int count = 0;
while (reader.hasNext()) { while (reader.hasNext()) {
Record record = reader.next(); Record record = reader.next();
@ -64,7 +64,7 @@ public class MarcImporter implements Importer {
} finally { } finally {
os.close(); os.close();
} }
InputStream is = new FileInputStream(tempFile); InputStream is = new FileInputStream(tempFile);
try { try {
new XmlImporter().read(is, project, options); new XmlImporter().read(is, project, options);
@ -75,4 +75,25 @@ public class MarcImporter implements Importer {
tempFile.delete(); tempFile.delete();
} }
} }
/**
 * Reports whether this importer accepts data described by the given
 * content type or file name.
 *
 * When a content type is supplied it alone decides the answer; the file
 * name is only consulted when {@code contentType} is {@code null}.
 *
 * @param contentType MIME type of the data, optionally followed by
 *                    parameters such as {@code "; charset=UTF-8"}; may be null
 * @param fileName    name of the file being imported; may be null
 * @return true if the content type is {@code application/marc}, or (with no
 *         content type) the file name ends with or contains a
 *         {@code .mrc} / {@code .marc} extension
 */
public boolean canImportData(String contentType, String fileName) {
    if (contentType != null) {
        // Compare only the media type: drop any "; name=value" parameters.
        int paramStart = contentType.indexOf(';');
        String mediaType = (paramStart < 0)
            ? contentType
            : contentType.substring(0, paramStart);
        // Fixed locale: the default-locale fold breaks on e.g. Turkish,
        // where "I" lower-cases to a dotless "ı".
        mediaType = mediaType.trim().toLowerCase(java.util.Locale.ENGLISH);
        return "application/marc".equals(mediaType);
    }
    if (fileName != null) {
        String name = fileName.toLowerCase(java.util.Locale.ENGLISH);
        // The ".ext." forms catch compound names such as "records.marc.gz".
        return name.endsWith(".mrc")
            || name.endsWith(".marc")
            || name.contains(".mrc.")
            || name.contains(".marc.");
    }
    return false;
}
} }

View File

@ -43,25 +43,25 @@ public class RdfTripleImporter implements Importer{
@Override @Override
public void read(Reader reader, Project project, Properties options) throws Exception { public void read(Reader reader, Project project, Properties options) throws Exception {
String baseUrl = options.getProperty("base-url"); String baseUrl = options.getProperty("base-url");
Graph graph = JrdfFactory.getNewGraph(); Graph graph = JrdfFactory.getNewGraph();
LineHandler lineHandler = nTriplesParserFactory.createParser(graph, newMapFactory); LineHandler lineHandler = nTriplesParserFactory.createParser(graph, newMapFactory);
GraphLineParser parser = new GraphLineParser(graph, lineHandler); GraphLineParser parser = new GraphLineParser(graph, lineHandler);
parser.parse(reader, baseUrl); // fills JRDF graph parser.parse(reader, baseUrl); // fills JRDF graph
Map<String, List<Row>> subjectToRows = new HashMap<String, List<Row>>(); Map<String, List<Row>> subjectToRows = new HashMap<String, List<Row>>();
Column subjectColumn = new Column(0, "subject"); Column subjectColumn = new Column(0, "subject");
project.columnModel.columns.add(0, subjectColumn); project.columnModel.columns.add(0, subjectColumn);
project.columnModel.setKeyColumnIndex(0); project.columnModel.setKeyColumnIndex(0);
ClosableIterable<Triple> triples = graph.find(ANY_SUBJECT_NODE, ANY_PREDICATE_NODE, ANY_OBJECT_NODE); ClosableIterable<Triple> triples = graph.find(ANY_SUBJECT_NODE, ANY_PREDICATE_NODE, ANY_OBJECT_NODE);
try { try {
for (Triple triple : triples) { for (Triple triple : triples) {
String subject = triple.getSubject().toString(); String subject = triple.getSubject().toString();
String predicate = triple.getPredicate().toString(); String predicate = triple.getPredicate().toString();
String object = triple.getObject().toString(); String object = triple.getObject().toString();
Column column = project.columnModel.getColumnByName(predicate); Column column = project.columnModel.getColumnByName(predicate);
if (column == null) { if (column == null) {
column = new Column(project.columnModel.allocateNewCellIndex(), predicate); column = new Column(project.columnModel.allocateNewCellIndex(), predicate);
@ -71,7 +71,7 @@ public class RdfTripleImporter implements Importer{
// ignore // ignore
} }
} }
int cellIndex = column.getCellIndex(); int cellIndex = column.getCellIndex();
if (subjectToRows.containsKey(subject)) { if (subjectToRows.containsKey(subject)) {
List<Row> rows = subjectToRows.get(subject); List<Row> rows = subjectToRows.get(subject);
@ -82,20 +82,20 @@ public class RdfTripleImporter implements Importer{
break; break;
} }
} }
if (object != null) { if (object != null) {
Row row = new Row(project.columnModel.getMaxCellIndex() + 1); Row row = new Row(project.columnModel.getMaxCellIndex() + 1);
rows.add(row); rows.add(row);
row.setCell(cellIndex, new Cell(object, null)); row.setCell(cellIndex, new Cell(object, null));
} }
} else { } else {
List<Row> rows = new ArrayList<Row>(); List<Row> rows = new ArrayList<Row>();
subjectToRows.put(subject, rows); subjectToRows.put(subject, rows);
Row row = new Row(project.columnModel.getMaxCellIndex() + 1); Row row = new Row(project.columnModel.getMaxCellIndex() + 1);
rows.add(row); rows.add(row);
row.setCell(subjectColumn.getCellIndex(), new Cell(subject, null)); row.setCell(subjectColumn.getCellIndex(), new Cell(subject, null));
row.setCell(cellIndex, new Cell(object, null)); row.setCell(cellIndex, new Cell(object, null));
} }
@ -120,4 +120,21 @@ public class RdfTripleImporter implements Importer{
return true; return true;
} }
/**
 * Reports whether this importer accepts data described by the given
 * content type or file name.
 *
 * When a content type is supplied it alone decides the answer; the file
 * name is only consulted when {@code contentType} is {@code null}.
 *
 * NOTE(review): read() in this class uses an N-Triples line parser, while
 * application/rdf+xml and ".rdf" commonly denote RDF/XML - confirm that
 * these are the intended triggers.
 *
 * @param contentType MIME type of the data, optionally followed by
 *                    parameters such as {@code "; charset=UTF-8"}; may be null
 * @param fileName    name of the file being imported; may be null
 * @return true if the content type is {@code application/rdf+xml}, or (with
 *         no content type) the file name ends with {@code .rdf}
 */
public boolean canImportData(String contentType, String fileName) {
    if (contentType != null) {
        // Compare only the media type: drop any "; name=value" parameters.
        int paramStart = contentType.indexOf(';');
        String mediaType = (paramStart < 0)
            ? contentType
            : contentType.substring(0, paramStart);
        // Fixed locale: the default-locale fold breaks on e.g. Turkish,
        // where "I" lower-cases to a dotless "ı".
        mediaType = mediaType.trim().toLowerCase(java.util.Locale.ENGLISH);
        return "application/rdf+xml".equals(mediaType);
    }
    if (fileName != null) {
        return fileName.toLowerCase(java.util.Locale.ENGLISH).endsWith(".rdf");
    }
    return false;
}
} }

View File

@ -2,6 +2,7 @@ package com.metaweb.gridworks.importers;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader; import java.io.LineNumberReader;
import java.io.Reader; import java.io.Reader;
import java.util.ArrayList; import java.util.ArrayList;
@ -151,10 +152,25 @@ public class TsvCsvImporter implements Importer {
} }
public void read(InputStream inputStream, Project project, Properties options) throws Exception { public void read(InputStream inputStream, Project project, Properties options) throws Exception {
throw new UnsupportedOperationException(); read(new InputStreamReader(inputStream), project, options);
} }
public boolean takesReader() { public boolean takesReader() {
return true; return true;
} }
/**
 * Reports whether this importer accepts data described by the given
 * content type or file name.
 *
 * Any non-null content type is rejected outright; only when no content
 * type is given is the file name checked for a {@code .tsv} or
 * {@code .csv} extension (case-insensitively).
 *
 * NOTE(review): even "text/csv" is rejected here - presumably the caller
 * falls back to this importer by default; confirm against the caller.
 *
 * @param contentType MIME type of the data; may be null
 * @param fileName    name of the file being imported; may be null
 * @return true only if {@code contentType} is null and {@code fileName}
 *         ends with {@code .tsv} or {@code .csv}
 */
public boolean canImportData(String contentType, String fileName) {
    // A supplied content type is never claimed, and with no file name
    // there is nothing left to inspect.
    if (contentType != null || fileName == null) {
        return false;
    }
    String lowered = fileName.toLowerCase();
    return lowered.endsWith(".tsv") || lowered.endsWith(".csv");
}
} }

View File

@ -12,24 +12,24 @@ import com.metaweb.gridworks.model.Project;
public class XmlImporter implements Importer { public class XmlImporter implements Importer {
public static final int BUFFER_SIZE = 64 * 1024; public static final int BUFFER_SIZE = 64 * 1024;
public boolean takesReader() { public boolean takesReader() {
return false; return false;
} }
public void read(Reader reader, Project project, Properties options) public void read(Reader reader, Project project, Properties options)
throws Exception { throws Exception {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
public void read( public void read(
InputStream inputStream, InputStream inputStream,
Project project, Project project,
Properties options Properties options
) throws Exception { ) throws Exception {
PushbackInputStream pis = new PushbackInputStream(inputStream,BUFFER_SIZE); PushbackInputStream pis = new PushbackInputStream(inputStream,BUFFER_SIZE);
String[] recordPath = null; String[] recordPath = null;
{ {
byte[] buffer = new byte[BUFFER_SIZE]; byte[] buffer = new byte[BUFFER_SIZE];
@ -40,10 +40,10 @@ public class XmlImporter implements Importer {
bytes_read +=c ; bytes_read +=c ;
} }
pis.unread(buffer, 0, bytes_read); pis.unread(buffer, 0, bytes_read);
if (options.containsKey("importer-record-tag")) { if (options.containsKey("importer-record-tag")) {
recordPath = XmlImportUtilities.detectPathFromTag( recordPath = XmlImportUtilities.detectPathFromTag(
new ByteArrayInputStream(buffer, 0, bytes_read), new ByteArrayInputStream(buffer, 0, bytes_read),
options.getProperty("importer-record-tag")); options.getProperty("importer-record-tag"));
} else { } else {
recordPath = XmlImportUtilities.detectRecordElement( recordPath = XmlImportUtilities.detectRecordElement(
@ -52,11 +52,34 @@ public class XmlImporter implements Importer {
} }
ImportColumnGroup rootColumnGroup = new ImportColumnGroup(); ImportColumnGroup rootColumnGroup = new ImportColumnGroup();
XmlImportUtilities.importXml(pis, project, recordPath, rootColumnGroup); XmlImportUtilities.importXml(pis, project, recordPath, rootColumnGroup);
XmlImportUtilities.createColumnsFromImport(project, rootColumnGroup); XmlImportUtilities.createColumnsFromImport(project, rootColumnGroup);
project.columnModel.update(); project.columnModel.update();
} }
/**
 * Reports whether this importer accepts data described by the given
 * content type or file name.
 *
 * When a content type is supplied it alone decides the answer; the file
 * name is only consulted when {@code contentType} is {@code null}.
 *
 * @param contentType MIME type of the data, optionally followed by
 *                    parameters such as {@code "; charset=UTF-8"}; may be null
 * @param fileName    name of the file being imported; may be null
 * @return true if the content type is one of {@code application/xml},
 *         {@code text/xml}, {@code application/rss+xml} or
 *         {@code application/atom+xml}, or (with no content type) the file
 *         name ends with {@code .xml}, {@code .atom} or {@code .rss}
 */
public boolean canImportData(String contentType, String fileName) {
    if (contentType != null) {
        // Compare only the media type: drop any "; name=value" parameters.
        int paramStart = contentType.indexOf(';');
        String mediaType = (paramStart < 0)
            ? contentType
            : contentType.substring(0, paramStart);
        // Fixed locale: the default-locale fold breaks on e.g. Turkish,
        // where "I" lower-cases to a dotless "ı".
        mediaType = mediaType.trim().toLowerCase(java.util.Locale.ENGLISH);
        return "application/xml".equals(mediaType)
            || "text/xml".equals(mediaType)
            || "application/rss+xml".equals(mediaType)
            || "application/atom+xml".equals(mediaType);
    }
    if (fileName != null) {
        String name = fileName.toLowerCase(java.util.Locale.ENGLISH);
        return name.endsWith(".xml")
            || name.endsWith(".atom")
            || name.endsWith(".rss");
    }
    return false;
}
} }