- archive and compressed file importer (supports zip, tar, gz, bz2, tar.gz and tar.bz2)

(works by loading the files that have the most common extensions in the archive)
- changed default max heap to 3 GB (3072M)


git-svn-id: http://google-refine.googlecode.com/svn/trunk@381 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Stefano Mazzocchi 2010-04-04 07:48:47 +00:00
parent 65c5aea079
commit 798b2a36ca
5 changed files with 207 additions and 141 deletions

View File

@ -380,7 +380,7 @@ fi
add_option "$JAVA_OPTIONS"
if [ "$GRIDWORKS_MEMORY" == "" ] ; then
GRIDWORKS_MEMORY="1024M"
GRIDWORKS_MEMORY="3072M"
fi
add_option "-Xms256M -Xmx$GRIDWORKS_MEMORY"

View File

@ -125,7 +125,7 @@ set JAVA_OPTIONS=
set OPTS=%OPTS% %JAVA_OPTIONS%
if not "%GRIDWORKS_MEMORY%" == "" goto gotMemory
set GRIDWORKS_MEMORY=1024M
set GRIDWORKS_MEMORY=3072M
:gotMemory
set OPTS=%OPTS% -Xms256M -Xmx%GRIDWORKS_MEMORY%

View File

@ -4,6 +4,7 @@ import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
@ -18,9 +19,12 @@ import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Map.Entry;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
@ -108,7 +112,13 @@ public class CreateProjectCommand extends Command {
if (part.isFile()) {
FilePart filePart = (FilePart) part;
internalImportFile(project, options, filePart.getFileName(), filePart.getInputStream());
InputStream stream = filePart.getInputStream();
String name = filePart.getFileName().toLowerCase();
try {
internalImportFile(project, options, name, stream);
} finally {
stream.close();
}
} else if (part.isParam()) {
ParamPart paramPart = (ParamPart) part;
@ -135,6 +145,25 @@ public class CreateProjectCommand extends Command {
}
}
/**
 * Stream wrapper whose {@link #close()} does nothing, so that a consumer
 * which closes its input when it is done cannot shut down a stream that
 * still has to be read from (e.g. an archive stream positioned on one of
 * several entries). The owner releases the underlying stream explicitly
 * through {@link #reallyClose()}.
 */
class SafeInputStream extends FilterInputStream {

    public SafeInputStream(InputStream delegate) {
        super(delegate);
    }

    /**
     * Intentionally a no-op: some libraries close the stream they were
     * handed once they cannot read any further from it, which would stop a
     * zip input stream from working for the remaining entries. Those calls
     * are swallowed here; the real close happens in {@link #reallyClose()}.
     */
    @Override
    public void close() {
        // ignored on purpose, see the method comment
    }

    /**
     * Closes the wrapped stream for real.
     *
     * @throws IOException if closing the wrapped stream fails
     */
    public void reallyClose() throws IOException {
        in.close();
    }
}
protected void internalImportFile(
Project project,
Properties options,
@ -142,7 +171,10 @@ public class CreateProjectCommand extends Command {
InputStream inputStream
) throws Exception {
if (fileName.endsWith(".tar.gz") || fileName.endsWith(".tar.bz2")) {
Gridworks.info("Importing " + fileName + "");
if (fileName.endsWith(".zip") || fileName.endsWith(".tar.gz") || fileName.endsWith(".tgz") || fileName.endsWith(".tar.bz2")) {
// first, save the file on disk, since we need two passes and we might
// not have enough memory to keep it all in there
File file = save(inputStream);
@ -150,24 +182,33 @@ public class CreateProjectCommand extends Command {
// in the first pass, gather statistics about what files are in there
// unfortunately, we have to rely on files extensions, which is horrible but
// better than nothing
BufferedInputStream stream = new BufferedInputStream(new FileInputStream(file));
InputStream is = (fileName.endsWith(".tar.gz")) ? new GZIPInputStream(stream): new CBZip2InputStream(stream);
TarInputStream tis = new TarInputStream(is);
HashMap<String,Integer> ext_map = new HashMap<String,Integer>();
while (true) {
TarEntry entry = tis.getNextEntry();
if (entry == null) break;
if (!entry.isDirectory()) {
String name = entry.getName();
String ext = getExtension(name)[1];
if (ext_map.containsKey(ext)) {
ext_map.put(ext, ext_map.get(ext) + 1);
} else {
ext_map.put(ext, 1);
InputStream is = getStream(fileName, new FileInputStream(file));
// NOTE(SM): unfortunately, java.io does not provide any generalized class for
// archive-like input streams so while both TarInputStream and ZipInputStream
// behave precisely the same, there is no polymorphic behavior so we have
// to treat each instance explicitly... one of those times you wish you had
// closures
if (is instanceof TarInputStream) {
TarInputStream tis = (TarInputStream) is;
TarEntry te;
while ((te = tis.getNextEntry()) != null) {
if (!te.isDirectory()) {
mapExtension(te.getName(),ext_map);
}
}
} else if (is instanceof ZipInputStream) {
ZipInputStream zis = (ZipInputStream) is;
ZipEntry ze;
while ((ze = zis.getNextEntry()) != null) {
if (!ze.isDirectory()) {
mapExtension(ze.getName(),ext_map);
}
}
}
stream.close();
is.close();
// sort extensions by how often they appear
List<Entry<String,Integer>> values = new ArrayList<Entry<String,Integer>>(ext_map.entrySet());
@ -193,17 +234,42 @@ public class CreateProjectCommand extends Command {
}
}
}
Gridworks.log("**** Most frequent extensions: " + exts.toString());
Gridworks.log("Most frequent extensions: " + exts.toString());
} else if (fileName.endsWith(".zip")) {
// second pass, load the data for real
is = getStream(fileName, new FileInputStream(file));
SafeInputStream sis = new SafeInputStream(is);
if (is instanceof TarInputStream) {
TarInputStream tis = (TarInputStream) is;
TarEntry te;
while ((te = tis.getNextEntry()) != null) {
if (!te.isDirectory()) {
String name = te.getName();
String ext = getExtension(name)[1];
if (exts.contains(ext)) {
internalImportFile(project, options, name, sis);
}
}
}
} else if (is instanceof ZipInputStream) {
ZipInputStream zis = (ZipInputStream) is;
ZipEntry ze;
while ((ze = zis.getNextEntry()) != null) {
if (!ze.isDirectory()) {
String name = ze.getName();
String ext = getExtension(name)[1];
if (exts.contains(ext)) {
internalImportFile(project, options, name, sis);
}
}
}
}
sis.reallyClose();
} else if (fileName.endsWith(".gz")) {
String[] frags = getExtension(fileName);
internalImportFile(project, options, frags[0], new GZIPInputStream(inputStream));
internalImportFile(project, options, getExtension(fileName)[0], new GZIPInputStream(inputStream));
} else if (fileName.endsWith(".bz2")) {
String[] frags = getExtension(fileName);
internalImportFile(project, options, frags[0], new CBZip2InputStream(inputStream));
internalImportFile(project, options, getExtension(fileName)[0], new CBZip2InputStream(inputStream));
} else {
load(project, options, fileName, inputStream);
}
@ -218,7 +284,6 @@ public class CreateProjectCommand extends Command {
/**
 * Imports a single (non-archive) file into the project: asks guessImporter
 * for an importer based on the options and the file name, then hands the
 * stream to internalInvokeImporter.
 *
 * @param project     project receiving the imported data
 * @param options     import options forwarded to the importer
 * @param fileName    name used to guess the importer
 * @param inputStream content to import
 * @throws Exception whatever importer selection or the import itself throws
 */
private void load(Project project, Properties options, String fileName, InputStream inputStream) throws Exception {
// null is passed for the middle argument of guessImporter and for the
// trailing argument of internalInvokeImporter (the latter appears to be
// the encoding -- TODO confirm against those methods)
Importer importer = guessImporter(options, null, fileName);
internalInvokeImporter(project, importer, options, inputStream, null);
// NOTE(review): the hunk header for this region (-218,7 +284,6) says this
// commit drops one line here; presumably it is this close() call, since
// the upload loop now closes the stream in a finally block and archive
// entries are all read from one shared stream -- confirm against the
// committed file.
inputStream.close();
}
private File save(InputStream is) throws IOException {
@ -229,6 +294,27 @@ public class CreateProjectCommand extends Command {
return temp;
}
/**
 * Counts one more occurrence of the extension of {@code name} in
 * {@code ext_map}. The key is the second element of the array returned by
 * {@code getExtension(name)}.
 *
 * @param name    file name (e.g. an archive entry name) to tally
 * @param ext_map running tally, keyed by extension, updated in place
 */
private void mapExtension(String name, Map<String,Integer> ext_map) {
    String key = getExtension(name)[1];
    // start from zero for an extension that has not been seen yet
    int seenSoFar = ext_map.containsKey(key) ? ext_map.get(key) : 0;
    ext_map.put(key, seenSoFar + 1);
}
/**
 * Wraps a raw stream in the archive reader that matches the file name.
 *
 * <ul>
 *   <li>{@code .tar.gz} / {@code .tgz}: a TarInputStream over a GZIPInputStream</li>
 *   <li>{@code .tar.bz2}: a TarInputStream over a CBZip2InputStream</li>
 *   <li>{@code .zip}: a ZipInputStream</li>
 * </ul>
 *
 * The match is case-sensitive, so callers are expected to pass a
 * lower-cased name.
 *
 * @param fileName name whose suffix selects the archive format
 * @param is       raw stream to wrap; the returned stream takes it over
 * @return the archive stream, or {@code null} when the suffix is not a
 *         supported archive format (in which case {@code is} is left
 *         untouched and still belongs to the caller)
 * @throws IOException if a decompressor cannot be set up on {@code is};
 *         {@code is} has been closed by the time this is thrown
 */
private InputStream getStream(String fileName, InputStream is) throws IOException {
    try {
        if (fileName.endsWith(".tar.gz") || fileName.endsWith(".tgz")) {
            return new TarInputStream(new GZIPInputStream(is));
        } else if (fileName.endsWith(".tar.bz2")) {
            return new TarInputStream(new CBZip2InputStream(is));
        } else if (fileName.endsWith(".zip")) {
            return new ZipInputStream(is);
        } else {
            return null;
        }
    } catch (IOException e) {
        // A decompressor constructor can fail while reading the header
        // (e.g. not actually gzip data). Callers typically create the raw
        // stream inline, so nobody else could close it: release it here.
        try {
            is.close();
        } catch (IOException closeFailure) {
            // keep reporting the original failure, it is the useful one
        }
        throw e;
    }
}
private String[] getExtension(String filename) {
String[] result = new String[2];
int ext_index = filename.lastIndexOf(".");
@ -296,39 +382,21 @@ public class CreateProjectCommand extends Command {
String encoding
) throws Exception {
int limit = -1;
int skip = 0;
if (options.containsKey("limit")) {
String s = options.getProperty("limit");
try {
limit = Integer.parseInt(s);
} catch (Exception e) {
}
}
if (options.containsKey("skip")) {
String s = options.getProperty("skip");
try {
skip = Integer.parseInt(s);
} catch (Exception e) {
}
}
BufferedInputStream inputStream = new BufferedInputStream(rawInputStream);
int limit = getIntegerOption("limit",options,-1);
int skip = getIntegerOption("skip",options,0);
if (importer.takesReader()) {
/*
* NOTE(SM): The ICU4J char detection code requires the input stream to support mark/reset.
* Unfortunately, not all ServletInputStream implementations are marking, so we need do
* this memory-expensive wrapping to make it work. It's far from ideal but I don't have
* a more efficient solution.
*/
BufferedInputStream inputStream = new BufferedInputStream(rawInputStream);
// NOTE(SM): The ICU4J char detection code requires the input stream to support mark/reset.
// Unfortunately, not all ServletInputStream implementations are marking, so we need do
// this memory-expensive wrapping to make it work. It's far from ideal but I don't have
// a more efficient solution.
byte[] bytes = new byte[1024 * 4];
{
inputStream.mark(bytes.length);
inputStream.read(bytes);
inputStream.reset();
}
inputStream.mark(bytes.length);
inputStream.read(bytes);
inputStream.reset();
CharsetDetector detector = new CharsetDetector();
detector.setDeclaredEncoding("utf8"); // most of the content on the web is encoded in UTF-8 so start with that
@ -361,7 +429,7 @@ public class CreateProjectCommand extends Command {
importer.read(reader, project, options, skip, limit);
} else {
importer.read(inputStream, project, options, skip, limit);
importer.read(rawInputStream, project, options, skip, limit);
}
}
@ -372,23 +440,8 @@ public class CreateProjectCommand extends Command {
Reader reader
) throws Exception {
int limit = -1;
int skip = 0;
if (options.containsKey("limit")) {
String s = options.getProperty("limit");
try {
limit = Integer.parseInt(s);
} catch (Exception e) {
}
}
if (options.containsKey("skip")) {
String s = options.getProperty("skip");
try {
skip = Integer.parseInt(s);
} catch (Exception e) {
}
}
int limit = getIntegerOption("limit",options,-1);
int skip = getIntegerOption("skip",options,0);
importer.read(reader, project, options, skip, limit);
}
@ -434,4 +487,16 @@ public class CreateProjectCommand extends Command {
return new TsvCsvImporter();
}
/**
 * Reads an integer-valued import option, falling back to a default.
 *
 * Lookup goes through {@link Properties#getProperty(String)}, so a value
 * stored as a non-String object counts as absent, and values supplied via
 * the Properties' own defaults are honoured. Whitespace around the number
 * is ignored.
 *
 * @param name    key of the option to read
 * @param options option set to read from
 * @param def     value to return when the option is absent or is not a
 *                valid integer
 * @return the parsed option, or {@code def} when it cannot be obtained
 */
private int getIntegerOption(String name, Properties options, int def) {
    String raw = options.getProperty(name);
    if (raw == null) {
        return def;
    }
    try {
        return Integer.parseInt(raw.trim());
    } catch (NumberFormatException ignored) {
        // options are best-effort: a malformed number means "use the default"
        return def;
    }
}
}

View File

@ -21,65 +21,61 @@ public class TsvCsvImporter implements Importer {
throws Exception {
LineNumberReader lnReader = new LineNumberReader(reader);
try {
String sep = options.getProperty("separator"); // auto-detect if not present
String line = null;
boolean first = true;
int cellCount = 1;
RowParser parser = (sep == null || (sep.length() == 0)) ? null : new SeparatorRowParser(sep);
String sep = options.getProperty("separator"); // auto-detect if not present
String line = null;
boolean first = true;
int cellCount = 1;
RowParser parser = (sep == null || (sep.length() == 0)) ? null : new SeparatorRowParser(sep);
int rowsWithData = 0;
while ((line = lnReader.readLine()) != null) {
if (line.trim().length() == 0) {
continue;
}
int rowsWithData = 0;
while ((line = lnReader.readLine()) != null) {
if (line.trim().length() == 0) {
continue;
}
if (parser == null) {
int tab = line.indexOf('\t');
if (tab >= 0) {
sep = "\t";
parser = new SeparatorRowParser(sep);
} else {
sep = ",";
parser = new CSVRowParser();
}
}
if (first) {
String[] cells = StringUtils.splitPreserveAllTokens(line, sep);
first = false;
for (int c = 0; c < cells.length; c++) {
String cell = cells[c];
if (cell.startsWith("\"") && cell.endsWith("\"")) {
cell = cell.substring(1, cell.length() - 1);
}
Column column = new Column(c, cell);
project.columnModel.columns.add(column);
}
cellCount = cells.length;
if (parser == null) {
int tab = line.indexOf('\t');
if (tab >= 0) {
sep = "\t";
parser = new SeparatorRowParser(sep);
} else {
Row row = new Row(cellCount);
sep = ",";
parser = new CSVRowParser();
}
}
if (parser.parseRow(row, line)) {
rowsWithData++;
if (first) {
String[] cells = StringUtils.splitPreserveAllTokens(line, sep);
if (skip <= 0 || rowsWithData > skip) {
project.rows.add(row);
project.columnModel.setMaxCellIndex(row.cells.size());
first = false;
for (int c = 0; c < cells.length; c++) {
String cell = cells[c];
if (cell.startsWith("\"") && cell.endsWith("\"")) {
cell = cell.substring(1, cell.length() - 1);
}
if (limit > 0 && project.rows.size() >= limit) {
break;
}
Column column = new Column(c, cell);
project.columnModel.columns.add(column);
}
cellCount = cells.length;
} else {
Row row = new Row(cellCount);
if (parser.parseRow(row, line)) {
rowsWithData++;
if (skip <= 0 || rowsWithData > skip) {
project.rows.add(row);
project.columnModel.setMaxCellIndex(row.cells.size());
if (limit > 0 && project.rows.size() >= limit) {
break;
}
}
}
}
} finally {
lnReader.close();
}
}

View File

@ -1,8 +1,8 @@
package com.metaweb.gridworks.importers;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.io.Reader;
import java.util.Properties;
@ -13,6 +13,8 @@ import com.metaweb.gridworks.model.Project;
public class XmlImporter implements Importer {
public static final int BUFFER_SIZE = 64 * 1024;
/**
 * Reports whether this importer wants its input as a Reader.
 *
 * @return always {@code false}: this importer works on the raw
 *         InputStream (its read method buffers the bytes itself)
 */
public boolean takesReader() {
return false;
}
@ -30,29 +32,32 @@ public class XmlImporter implements Importer {
int skip,
int limit
) throws Exception {
BufferedInputStream bis = new BufferedInputStream(inputStream);
PushbackInputStream pis = new PushbackInputStream(inputStream,BUFFER_SIZE);
String[] recordPath = null;
{
byte[] buffer = new byte[64 * 1024];
bis.mark(buffer.length);
int c = bis.read(buffer);
bis.reset();
byte[] buffer = new byte[BUFFER_SIZE];
int bytes_read = 0;
while (bytes_read < BUFFER_SIZE) {
int c = pis.read(buffer, bytes_read, BUFFER_SIZE - bytes_read);
if (c == -1) break;
bytes_read +=c ;
}
pis.unread(buffer, 0, bytes_read);
if (options.containsKey("importer-record-tag")) {
recordPath = XmlImportUtilities.detectPathFromTag(
new ByteArrayInputStream(buffer, 0, c),
new ByteArrayInputStream(buffer, 0, bytes_read),
options.getProperty("importer-record-tag"));
} else {
recordPath = XmlImportUtilities.detectRecordElement(
new ByteArrayInputStream(buffer, 0, c));
new ByteArrayInputStream(buffer, 0, bytes_read));
}
}
ImportColumnGroup rootColumnGroup = new ImportColumnGroup();
XmlImportUtilities.importXml(bis, project, recordPath, rootColumnGroup);
XmlImportUtilities.importXml(pis, project, recordPath, rootColumnGroup);
XmlImportUtilities.createColumnsFromImport(project, rootColumnGroup);
project.columnModel.update();