/*
Copyright 2011, Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package com.google.refine.importing;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.apache.commons.fileupload.FileItem;
import org.apache.commons.fileupload.ProgressListener;
import org.apache.commons.fileupload.disk.DiskFileItemFactory;
import org.apache.commons.fileupload.servlet.ServletFileUpload;
import org.apache.commons.fileupload.util.Streams;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang3.exception.ExceptionUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DecompressingHttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.apache.tools.bzip2.CBZip2InputStream;
import org.apache.tools.tar.TarEntry;
import org.apache.tools.tar.TarInputStream;
import org.json.JSONArray;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.refine.ProjectManager;
import com.google.refine.RefineServlet;
import com.google.refine.importing.ImportingManager.Format;
import com.google.refine.importing.UrlRewriter.Result;
import com.google.refine.model.Cell;
import com.google.refine.model.Column;
import com.google.refine.model.ColumnModel;
import com.google.refine.model.Project;
import com.google.refine.model.Row;
import com.google.refine.model.metadata.DataPackageMetadata;
import com.google.refine.model.metadata.IMetadata;
import com.google.refine.model.metadata.MetadataFactory;
import com.google.refine.model.metadata.MetadataFormat;
import com.google.refine.model.metadata.PackageExtension;
import com.google.refine.model.metadata.ProjectMetadata;
import com.google.refine.preference.PreferenceStore;
import com.google.refine.util.JSONUtilities;
import io.frictionlessdata.datapackage.Package;
import io.frictionlessdata.tableschema.Field;
import io.frictionlessdata.tableschema.Schema;
import io.frictionlessdata.tableschema.TypeInferrer;
import io.frictionlessdata.tableschema.exceptions.TypeInferringException;
public class ImportingUtilities {
final static protected Logger logger = LoggerFactory.getLogger("importing-utilities");
private final static String METADATA_FILE_KEY = "metadataFile";
private static final int INFER_ROW_LIMIT = 100;
static public interface Progress {
public void setProgress(String message, int percent);
public boolean isCanceled();
}
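/**
 * Retrieve the raw data posted with the request into the job's raw data directory and fill
 * {@code config} with the retrieval record, the automatic file selection and the ranked list
 * of candidate formats, leaving the state at "ready" (or "error" if the upload failed).
 *
 * A minimal sketch of how a controller might drive this; the job and config objects are
 * assumed to come from the importing controller, and the key names mirror what this method
 * writes:
 *
 * <pre>{@code
 * ImportingUtilities.loadDataAndPrepareJob(request, response, new Properties(), job, config);
 * if ("ready".equals(JSONUtilities.getString(config, "state", null))) {
 *     JSONArray rankedFormats = JSONUtilities.getArray(config, "rankedFormats");
 *     // rankedFormats.getString(0) is the best guess to hand to the parser
 * }
 * }</pre>
 */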
static public void loadDataAndPrepareJob(
HttpServletRequest request,
HttpServletResponse response,
Properties parameters,
final ImportingJob job,
JSONObject config) throws IOException, ServletException {
JSONObject retrievalRecord = new JSONObject();
JSONUtilities.safePut(config, "retrievalRecord", retrievalRecord);
JSONUtilities.safePut(config, "state", "loading-raw-data");
final JSONObject progress = new JSONObject();
JSONUtilities.safePut(config, "progress", progress);
try {
ImportingUtilities.retrieveContentFromPostRequest(
request,
parameters,
job.getRawDataDir(),
retrievalRecord,
new Progress() {
@Override
public void setProgress(String message, int percent) {
if (message != null) {
JSONUtilities.safePut(progress, "message", message);
}
JSONUtilities.safePut(progress, "percent", percent);
}
@Override
public boolean isCanceled() {
return job.canceled;
}
}
);
} catch (Exception e) {
JSONUtilities.safePut(config, "state", "error");
JSONUtilities.safePut(config, "error", "Error uploading data");
JSONUtilities.safePut(config, "errorDetails", e.getLocalizedMessage());
return;
}
JSONArray fileSelectionIndexes = new JSONArray();
JSONUtilities.safePut(config, "fileSelection", fileSelectionIndexes);
String bestFormat = ImportingUtilities.autoSelectFiles(job, retrievalRecord, fileSelectionIndexes);
bestFormat = ImportingUtilities.guessBetterFormat(job, bestFormat);
JSONArray rankedFormats = new JSONArray();
ImportingUtilities.rankFormats(job, bestFormat, rankedFormats);
JSONUtilities.safePut(config, "rankedFormats", rankedFormats);
JSONUtilities.safePut(config, "state", "ready");
JSONUtilities.safePut(config, "hasData", true);
config.remove("progress");
}
static public void updateJobWithNewFileSelection(ImportingJob job, JSONArray fileSelectionArray) {
job.setFileSelection(fileSelectionArray);
String bestFormat = ImportingUtilities.getCommonFormatForSelectedFiles(job, fileSelectionArray);
bestFormat = ImportingUtilities.guessBetterFormat(job, bestFormat);
JSONArray rankedFormats = new JSONArray();
ImportingUtilities.rankFormats(job, bestFormat, rankedFormats);
job.setRankedFormats(rankedFormats);
}
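/**
 * Read the multipart POST body and store every piece of raw data under {@code rawDataDir}.
 * Form fields named "clipboard", "download" and "data-package" are handled specially (pasted
 * text, a URL to fetch, and a URL run through the registered UrlRewriters); any other form
 * field is copied into {@code parameters}; file parts are saved to disk and, if they turn out
 * to be archives, exploded into individual entries.
 *
 * Sketch of the shape of a record appended to the "files" array of {@code retrievalRecord}
 * (key names are taken from the code below; the values are illustrative only):
 *
 * <pre>{@code
 * { "origin": "upload",
 *   "declaredEncoding": "UTF-8",
 *   "declaredMimeType": "text/csv",
 *   "fileName": "data.csv",
 *   "location": "data.csv",
 *   "size": 12345 }
 * }</pre>
 */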
static public void retrieveContentFromPostRequest(
HttpServletRequest request,
Properties parameters,
File rawDataDir,
JSONObject retrievalRecord,
final Progress progress
) throws Exception {
JSONArray fileRecords = new JSONArray();
JSONUtilities.safePut(retrievalRecord, "files", fileRecords);
JSONUtilities.safePut(retrievalRecord, "downloadCount", 0);
JSONUtilities.safePut(retrievalRecord, "archiveCount", 0);
int clipboardCount = 0;
int uploadCount = 0;
// This tracks the total progress, which involves uploading data from the client
// as well as downloading data from URLs.
final SavingUpdate update = new SavingUpdate() {
@Override
public void savedMore() {
progress.setProgress(null, calculateProgressPercent(totalExpectedSize, totalRetrievedSize));
}
@Override
public boolean isCanceled() {
return progress.isCanceled();
}
};
DiskFileItemFactory fileItemFactory = new DiskFileItemFactory();
ServletFileUpload upload = new ServletFileUpload(fileItemFactory);
upload.setProgressListener(new ProgressListener() {
boolean setContentLength = false;
long lastBytesRead = 0;
@Override
public void update(long bytesRead, long contentLength, int itemCount) {
if (!setContentLength) {
// Only try to set the content length if we really know it.
if (contentLength >= 0) {
update.totalExpectedSize += contentLength;
setContentLength = true;
}
}
if (setContentLength) {
update.totalRetrievedSize += (bytesRead - lastBytesRead);
lastBytesRead = bytesRead;
update.savedMore();
}
}
});
@SuppressWarnings("unchecked")
List<FileItem> tempFiles = (List<FileItem>)upload.parseRequest(request);
progress.setProgress("Uploading data ...", -1);
for (FileItem fileItem : tempFiles) {
if (progress.isCanceled()) {
break;
}
InputStream stream = fileItem.getInputStream();
String name = fileItem.getFieldName().toLowerCase();
if (fileItem.isFormField()) {
if (name.equals("clipboard")) {
String encoding = request.getCharacterEncoding();
if (encoding == null) {
encoding = "UTF-8";
}
File file = allocateFile(rawDataDir, "clipboard.txt");
JSONObject fileRecord = new JSONObject();
JSONUtilities.safePut(fileRecord, "origin", "clipboard");
JSONUtilities.safePut(fileRecord, "declaredEncoding", encoding);
JSONUtilities.safePut(fileRecord, "declaredMimeType", (String) null);
JSONUtilities.safePut(fileRecord, "format", "text");
JSONUtilities.safePut(fileRecord, "fileName", "(clipboard)");
JSONUtilities.safePut(fileRecord, "location", getRelativePath(file, rawDataDir));
progress.setProgress("Uploading pasted clipboard text",
calculateProgressPercent(update.totalExpectedSize, update.totalRetrievedSize));
JSONUtilities.safePut(fileRecord, "size", saveStreamToFile(stream, file, null));
JSONUtilities.append(fileRecords, fileRecord);
clipboardCount++;
} else if (name.equals("download")) {
String urlString = Streams.asString(stream);
download(rawDataDir, retrievalRecord, progress, fileRecords, update, urlString);
processDataPackage(retrievalRecord, fileRecords);
} else if (name.equals("data-package")) {
String urlString = Streams.asString(stream);
List<Result> results = null;
for (UrlRewriter rewriter : ImportingManager.urlRewriters) {
results = rewriter.rewrite(urlString);
if (results != null) {
for (Result result : results) {
download(rawDataDir, retrievalRecord, progress, fileRecords,
update, result.rewrittenUrl, result.metaDataFormat);
}
}
}
} else {
String value = Streams.asString(stream);
parameters.put(name, value);
// TODO: We really want to store this on the request so it's available for everyone
// request.getParameterMap().put(name, value);
}
} else { // is file content
String fileName = fileItem.getName();
if (fileName.length() > 0) {
long fileSize = fileItem.getSize();
File file = allocateFile(rawDataDir, fileName);
JSONObject fileRecord = new JSONObject();
JSONUtilities.safePut(fileRecord, "origin", "upload");
JSONUtilities.safePut(fileRecord, "declaredEncoding", request.getCharacterEncoding());
JSONUtilities.safePut(fileRecord, "declaredMimeType", fileItem.getContentType());
JSONUtilities.safePut(fileRecord, "fileName", fileName);
JSONUtilities.safePut(fileRecord, "location", getRelativePath(file, rawDataDir));
progress.setProgress(
"Saving file " + fileName + " locally (" + formatBytes(fileSize) + " bytes)",
calculateProgressPercent(update.totalExpectedSize, update.totalRetrievedSize));
JSONUtilities.safePut(fileRecord, "size", saveStreamToFile(stream, file, null));
if (postProcessRetrievedFile(rawDataDir, file, fileRecord, fileRecords, progress)) {
JSONUtilities.safeInc(retrievalRecord, "archiveCount");
}
processDataPackage(retrievalRecord, fileRecords);
uploadCount++;
}
}
stream.close();
}
// Delete all temp files.
for (FileItem fileItem : tempFiles) {
fileItem.delete();
}
JSONUtilities.safePut(retrievalRecord, "uploadCount", uploadCount);
JSONUtilities.safePut(retrievalRecord, "clipboardCount", clipboardCount);
}
private static void processDataPackage(JSONObject retrievalRecord, JSONArray fileRecords) {
int dataPackageJSONFileIndex = getDataPackageJSONFile(fileRecords);
if (dataPackageJSONFileIndex >= 0) {
JSONObject dataPackageJSONFile = (JSONObject) fileRecords.get(dataPackageJSONFileIndex);
JSONUtilities.safePut(dataPackageJSONFile, "metaDataFormat", MetadataFormat.DATAPACKAGE_METADATA.name());
JSONUtilities.safePut(retrievalRecord, METADATA_FILE_KEY, dataPackageJSONFile);
fileRecords.remove(dataPackageJSONFileIndex);
}
}
private static int getDataPackageJSONFile(JSONArray fileRecords) {
for (int i = 0; i < fileRecords.length(); i++) {
JSONObject file = fileRecords.getJSONObject(i);
if (file.has("archiveFileName") &&
file.has("fileName") &&
file.get("fileName").equals(DataPackageMetadata.DEFAULT_FILE_NAME)) {
return i;
}
}
return -1;
}
private static void download(File rawDataDir, JSONObject retrievalRecord, final Progress progress,
JSONArray fileRecords, final SavingUpdate update, String urlString)
throws URISyntaxException, IOException, ClientProtocolException, Exception {
download(rawDataDir, retrievalRecord, progress, fileRecords, update, urlString, null);
}
/**
 * Download the content at {@code urlString} into the raw data directory and append a file
 * record for it, updating the retrieval record's download and archive counts.
 *
 * @param rawDataDir directory where the job's raw data is stored
 * @param retrievalRecord JSON object tracking counts and the metadata file record
 * @param progress progress reporter for the import job
 * @param fileRecords JSON array collecting one record per retrieved file
 * @param update accumulator of expected and retrieved byte counts
 * @param urlString the URL to download
 * @param metaDataFormat metadata format of the downloaded file, or null for plain data
 * @throws URISyntaxException
 * @throws IOException
 * @throws ClientProtocolException
 * @throws Exception
 */
private static void download(File rawDataDir, JSONObject retrievalRecord, final Progress progress,
JSONArray fileRecords, final SavingUpdate update, String urlString, String metaDataFormat)
throws URISyntaxException, IOException, ClientProtocolException, Exception {
URL url = new URL(urlString);
JSONObject fileRecord = new JSONObject();
JSONUtilities.safePut(fileRecord, "origin", "download");
JSONUtilities.safePut(fileRecord, "url", urlString);
if ("http".equals(url.getProtocol()) || "https".equals(url.getProtocol())) {
DefaultHttpClient client = new DefaultHttpClient();
DecompressingHttpClient httpclient =
new DecompressingHttpClient(client);
HttpGet httpGet = new HttpGet(url.toURI());
httpGet.setHeader("User-Agent", RefineServlet.getUserAgent());
if ("https".equals(url.getProtocol())) {
// HTTPS only - no sending password in the clear over HTTP
String userinfo = url.getUserInfo();
if (userinfo != null) {
int s = userinfo.indexOf(':');
if (s > 0) {
String user = userinfo.substring(0, s);
String pw = userinfo.substring(s + 1, userinfo.length());
client.getCredentialsProvider().setCredentials(
new AuthScope(url.getHost(), 443),
new UsernamePasswordCredentials(user, pw));
}
}
}
HttpResponse response = httpclient.execute(httpGet);
try {
int code = response.getStatusLine().getStatusCode();
if (code != HttpStatus.SC_OK) {
throw new Exception("HTTP response code: " + code +
" when accessing URL: "+ url.toString());
}
HttpEntity entity = response.getEntity();
if (entity == null) {
throw new Exception("No content found in " + url.toString());
}
InputStream stream2 = entity.getContent();
String encoding = null;
if (entity.getContentEncoding() != null) {
encoding = entity.getContentEncoding().getValue();
}
JSONUtilities.safePut(fileRecord, "declaredEncoding", encoding);
String contentType = null;
if (entity.getContentType() != null) {
contentType = entity.getContentType().getValue();
}
JSONUtilities.safePut(fileRecord, "declaredMimeType", contentType);
if (saveStream(stream2, url, rawDataDir, progress, update,
fileRecord, fileRecords,
entity.getContentLength())) {
JSONUtilities.safeInc(retrievalRecord, "archiveCount");
}
if (metaDataFormat != null) {
JSONUtilities.safePut(fileRecord, "metaDataFormat", metaDataFormat);
JSONUtilities.safePut(retrievalRecord, METADATA_FILE_KEY, fileRecord);
fileRecords.remove(0);
}
JSONUtilities.safeInc(retrievalRecord, "downloadCount");
EntityUtils.consume(entity);
} finally {
httpGet.releaseConnection();
}
} else {
// Fallback handling for non-HTTP connections (only FTP?)
URLConnection urlConnection = url.openConnection();
urlConnection.setConnectTimeout(5000);
urlConnection.connect();
InputStream stream2 = urlConnection.getInputStream();
JSONUtilities.safePut(fileRecord, "declaredEncoding",
urlConnection.getContentEncoding());
JSONUtilities.safePut(fileRecord, "declaredMimeType",
urlConnection.getContentType());
try {
if (saveStream(stream2, url, rawDataDir, progress,
update, fileRecord, fileRecords,
urlConnection.getContentLength())) {
JSONUtilities.safeInc(retrievalRecord, "archiveCount");
}
if (metaDataFormat != null)
JSONUtilities.safePut(fileRecord, "metaDataFormat", metaDataFormat);
JSONUtilities.safeInc(retrievalRecord, "downloadCount");
} finally {
stream2.close();
}
}
}
private static boolean saveStream(InputStream stream, URL url, File rawDataDir, final Progress progress,
final SavingUpdate update, JSONObject fileRecord, JSONArray fileRecords, long length)
throws IOException, Exception {
String localname = url.getPath();
if (localname.isEmpty() || localname.endsWith("/")) {
localname = localname + "temp";
}
File file = allocateFile(rawDataDir, localname);
JSONUtilities.safePut(fileRecord, "fileName", file.getName());
JSONUtilities.safePut(fileRecord, "location", getRelativePath(file, rawDataDir));
update.totalExpectedSize += length;
progress.setProgress("Downloading " + url.toString(),
calculateProgressPercent(update.totalExpectedSize, update.totalRetrievedSize));
long actualLength = saveStreamToFile(stream, file, update);
JSONUtilities.safePut(fileRecord, "size", actualLength);
if (actualLength == 0) {
throw new Exception("No content found in " + url.toString());
} else if (length >= 0) {
update.totalExpectedSize += (actualLength - length);
} else {
update.totalExpectedSize += actualLength;
}
progress.setProgress("Saving " + url.toString() + " locally",
calculateProgressPercent(update.totalExpectedSize, update.totalRetrievedSize));
return postProcessRetrievedFile(rawDataDir, file, fileRecord, fileRecords, progress);
}
static public String getRelativePath(File file, File dir) {
String location = file.getAbsolutePath().substring(dir.getAbsolutePath().length());
return (location.startsWith(File.separator)) ? location.substring(1) : location;
}
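/**
 * Allocate a file under {@code dir} for the given name, stripping any query string and
 * de-duplicating by appending "-2", "-3", ... before the first extension when a file with
 * that name already exists. Parent directories are created as needed; the file itself is not.
 *
 * For example (paths are hypothetical):
 *
 * <pre>{@code
 * File first = ImportingUtilities.allocateFile(rawDataDir, "report.csv");
 * // returns rawDataDir/report.csv, or rawDataDir/report-2.csv if report.csv already exists
 * }</pre>
 */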
static public File allocateFile(File dir, String name) {
int q = name.indexOf('?');
if (q > 0) {
name = name.substring(0, q);
}
File file = new File(dir, name);
int dot = name.indexOf('.');
String prefix = dot < 0 ? name : name.substring(0, dot);
String suffix = dot < 0 ? "" : name.substring(dot);
int index = 2;
while (file.exists()) {
file = new File(dir, prefix + "-" + index++ + suffix);
}
file.getParentFile().mkdirs();
return file;
}
static public Reader getFileReader(ImportingJob job, JSONObject fileRecord, String commonEncoding)
throws FileNotFoundException {
return getFileReader(getFile(job, JSONUtilities.getString(fileRecord, "location", "")), fileRecord, commonEncoding);
}
static public Reader getFileReader(File file, JSONObject fileRecord, String commonEncoding) throws FileNotFoundException {
return getReaderFromStream(new FileInputStream(file), fileRecord, commonEncoding);
}
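/**
 * Wrap the given stream in a Reader, preferring the encoding recorded on the file record
 * ("encoding", then "declaredEncoding"), then {@code commonEncoding}, and finally the
 * platform default when the named charset is absent or unsupported.
 *
 * A minimal sketch; the file record and file are hypothetical:
 *
 * <pre>{@code
 * JSONObject fileRecord = new JSONObject();
 * fileRecord.put("declaredEncoding", "ISO-8859-1");
 * try (InputStream in = new FileInputStream(someFile)) {
 *     Reader reader = ImportingUtilities.getReaderFromStream(in, fileRecord, "UTF-8");
 *     // decodes as ISO-8859-1: the record's encoding wins over commonEncoding
 * }
 * }</pre>
 */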
static public Reader getReaderFromStream(InputStream inputStream, JSONObject fileRecord, String commonEncoding) {
String encoding = getEncoding(fileRecord);
if (encoding == null) {
encoding = commonEncoding;
}
if (encoding != null) {
try {
return new InputStreamReader(inputStream, encoding);
} catch (UnsupportedEncodingException e) {
// Ignore and fall through
}
}
return new InputStreamReader(inputStream);
}
static public File getFile(ImportingJob job, JSONObject fileRecord) {
return getFile(job, JSONUtilities.getString(fileRecord, "location", ""));
}
static public File getFile(ImportingJob job, String location) {
return new File(job.getRawDataDir(), location);
}
static public String getFileSource(JSONObject fileRecord) {
return JSONUtilities.getString(
fileRecord,
"url",
JSONUtilities.getString(fileRecord, "fileName", "unknown")
);
}
static private abstract class SavingUpdate {
public long totalExpectedSize = 0;
public long totalRetrievedSize = 0;
abstract public void savedMore();
abstract public boolean isCanceled();
}
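/**
 * Copy the stream to the given file in 16 KB chunks, optionally reporting progress through a
 * SavingUpdate and stopping early when it reports cancellation. Returns the number of bytes
 * written.
 *
 * A minimal usage sketch without progress tracking (paths are hypothetical):
 *
 * <pre>{@code
 * File target = new File(rawDataDir, "download.bin");
 * try (InputStream in = new FileInputStream(source)) {
 *     long written = ImportingUtilities.saveStreamToFile(in, target, null);
 * }
 * }</pre>
 */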
static public long saveStreamToFile(InputStream stream, File file, SavingUpdate update) throws IOException {
long length = 0;
FileOutputStream fos = new FileOutputStream(file);
try {
byte[] bytes = new byte[16*1024];
int c;
while ((update == null || !update.isCanceled()) && (c = stream.read(bytes)) > 0) {
fos.write(bytes, 0, c);
length += c;
if (update != null) {
update.totalRetrievedSize += c;
update.savedMore();
}
}
return length;
} finally {
fos.close();
}
}
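/**
 * Decide what to do with a freshly retrieved file: if it opens as an archive (zip, tar,
 * tar.gz, tar.bz2, kmz) its entries are extracted and recorded individually and the archive
 * is deleted; if it is merely compressed (gz, bz2) it is uncompressed in place; otherwise the
 * file record is completed and appended to {@code fileRecords} as-is. Returns true only when
 * the file was treated as an archive.
 *
 * <pre>{@code
 * // fileRecord and fileRecords as built by retrieveContentFromPostRequest (illustrative)
 * boolean wasArchive = ImportingUtilities.postProcessRetrievedFile(
 *     rawDataDir, savedFile, fileRecord, fileRecords, progress);
 * if (wasArchive) {
 *     // fileRecords now holds one record per extracted entry instead of the archive itself
 * }
 * }</pre>
 */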
static public boolean postProcessRetrievedFile(
File rawDataDir, File file, JSONObject fileRecord, JSONArray fileRecords, final Progress progress) {
String mimeType = JSONUtilities.getString(fileRecord, "declaredMimeType", null);
String contentEncoding = JSONUtilities.getString(fileRecord, "declaredEncoding", null);
InputStream archiveIS = tryOpenAsArchive(file, mimeType, contentEncoding);
if (archiveIS != null) {
try {
if (explodeArchive(rawDataDir, archiveIS, fileRecord, fileRecords, progress)) {
file.delete();
return true;
}
} finally {
try {
archiveIS.close();
} catch (IOException e) {
// TODO: what to do?
}
}
}
InputStream uncompressedIS = tryOpenAsCompressedFile(file, mimeType, contentEncoding);
if (uncompressedIS != null) {
try {
File file2 = uncompressFile(rawDataDir, uncompressedIS, fileRecord, progress);
file.delete();
file = file2;
} catch (IOException e) {
// TODO: what to do?
e.printStackTrace();
} finally {
try {
uncompressedIS.close();
} catch (IOException e) {
// TODO: what to do?
}
}
}
postProcessSingleRetrievedFile(file, fileRecord);
JSONUtilities.append(fileRecords, fileRecord);
return false;
}
static public void postProcessSingleRetrievedFile(File file, JSONObject fileRecord) {
if (!fileRecord.has("format")) {
JSONUtilities.safePut(fileRecord, "format",
ImportingManager.getFormat(
file.getName(),
JSONUtilities.getString(fileRecord, "declaredMimeType", null)));
}
}
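/**
 * Try to open the file as an archive, detected primarily by file extension plus a few
 * declared zip/tar content types. Returns a TarInputStream or ZipInputStream, or null when
 * the file does not look like an archive or fails to open.
 *
 * For example (the zip file is hypothetical):
 *
 * <pre>{@code
 * InputStream archive = ImportingUtilities.tryOpenAsArchive(zipFile, null);
 * if (archive instanceof ZipInputStream) {
 *     // hand it to explodeArchive(...) and close it when done
 * }
 * }</pre>
 */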
static public InputStream tryOpenAsArchive(File file, String mimeType) {
return tryOpenAsArchive(file, mimeType, null);
}
static public InputStream tryOpenAsArchive(File file, String mimeType, String contentType) {
String fileName = file.getName();
try {
if (fileName.endsWith(".tar.gz") || fileName.endsWith(".tgz")) {
return new TarInputStream(new GZIPInputStream(new FileInputStream(file)));
} else if (fileName.endsWith(".tar.bz2")) {
return new TarInputStream(new CBZip2InputStream(new FileInputStream(file)));
} else if (fileName.endsWith(".tar") || "application/x-tar".equals(contentType)) {
return new TarInputStream(new FileInputStream(file));
} else if (fileName.endsWith(".zip")
|| "application/x-zip-compressed".equals(contentType)
|| "application/zip".equals(contentType)
|| "application/x-compressed".equals(contentType)
|| "multipar/x-zip".equals(contentType)) {
return new ZipInputStream(new FileInputStream(file));
} else if (fileName.endsWith(".kmz")) {
return new ZipInputStream(new FileInputStream(file));
}
} catch (IOException e) {
}
return null;
}
static public boolean explodeArchive(
File rawDataDir,
InputStream archiveIS,
JSONObject archiveFileRecord,
JSONArray fileRecords,
final Progress progress
) {
if (archiveIS instanceof TarInputStream) {
TarInputStream tis = (TarInputStream) archiveIS;
try {
TarEntry te;
while (!progress.isCanceled() && (te = tis.getNextEntry()) != null) {
if (!te.isDirectory()) {
String fileName2 = te.getName();
File file2 = allocateFile(rawDataDir, fileName2);
progress.setProgress("Extracting " + fileName2, -1);
JSONObject fileRecord2 = new JSONObject();
JSONUtilities.safePut(fileRecord2, "origin", JSONUtilities.getString(archiveFileRecord, "origin", null));
JSONUtilities.safePut(fileRecord2, "declaredEncoding", (String) null);
JSONUtilities.safePut(fileRecord2, "declaredMimeType", (String) null);
JSONUtilities.safePut(fileRecord2, "fileName", fileName2);
JSONUtilities.safePut(fileRecord2, "archiveFileName", JSONUtilities.getString(archiveFileRecord, "fileName", null));
JSONUtilities.safePut(fileRecord2, "location", getRelativePath(file2, rawDataDir));
JSONUtilities.safePut(fileRecord2, "size", saveStreamToFile(tis, file2, null));
postProcessSingleRetrievedFile(file2, fileRecord2);
JSONUtilities.append(fileRecords, fileRecord2);
}
}
} catch (IOException e) {
// TODO: what to do?
e.printStackTrace();
}
return true;
} else if (archiveIS instanceof ZipInputStream) {
ZipInputStream zis = (ZipInputStream) archiveIS;
try {
ZipEntry ze;
while (!progress.isCanceled() && (ze = zis.getNextEntry()) != null) {
if (!ze.isDirectory()) {
String fileName2 = ze.getName();
File file2 = allocateFile(rawDataDir, fileName2);
progress.setProgress("Extracting " + fileName2, -1);
JSONObject fileRecord2 = new JSONObject();
JSONUtilities.safePut(fileRecord2, "origin", JSONUtilities.getString(archiveFileRecord, "origin", null));
JSONUtilities.safePut(fileRecord2, "declaredEncoding", (String) null);
JSONUtilities.safePut(fileRecord2, "declaredMimeType", (String) null);
JSONUtilities.safePut(fileRecord2, "fileName", fileName2);
JSONUtilities.safePut(fileRecord2, "archiveFileName", JSONUtilities.getString(archiveFileRecord, "fileName", null));
JSONUtilities.safePut(fileRecord2, "location", getRelativePath(file2, rawDataDir));
JSONUtilities.safePut(fileRecord2, "size", saveStreamToFile(zis, file2, null));
postProcessSingleRetrievedFile(file2, fileRecord2);
JSONUtilities.append(fileRecords, fileRecord2);
}
}
} catch (IOException e) {
// TODO: what to do?
e.printStackTrace();
}
return true;
}
return false;
}
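/**
 * Try to open the file as a plain compressed stream (gzip or bzip2), detected by file
 * extension, content encoding or MIME type. Returns a decompressing InputStream, or null if
 * the file does not look compressed or cannot be opened.
 *
 * For example (the file is hypothetical):
 *
 * <pre>{@code
 * InputStream decompressed = ImportingUtilities.tryOpenAsCompressedFile(gzFile, "application/x-gzip");
 * if (decompressed != null) {
 *     // pass it to uncompressFile(...) to materialize the uncompressed data on disk
 * }
 * }</pre>
 */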
static public InputStream tryOpenAsCompressedFile(File file, String mimeType) {
return tryOpenAsCompressedFile(file, mimeType, null);
}
static public InputStream tryOpenAsCompressedFile(File file, String mimeType, String contentEncoding) {
String fileName = file.getName();
try {
if (fileName.endsWith(".gz")
|| "gzip".equals(contentEncoding)
|| "x-gzip".equals(contentEncoding)
|| "application/x-gzip".equals(mimeType)) {
return new GZIPInputStream(new FileInputStream(file));
} else if (fileName.endsWith(".bz2")
||"application/x-bzip2".equals(mimeType)) {
InputStream is = new FileInputStream(file);
is.mark(4);
if (!(is.read() == 'B' && is.read() == 'Z')) {
// No BZ prefix as appended by command line tools. Reset and hope for the best
is.reset();
}
return new CBZip2InputStream(is);
}
} catch (IOException e) {
logger.warn("Something that looked like a compressed file gave an error on open: "+file,e);
}
return null;
}
static public File uncompressFile(
File rawDataDir,
InputStream uncompressedIS,
JSONObject fileRecord,
final Progress progress
) throws IOException {
String fileName = JSONUtilities.getString(fileRecord, "location", "unknown");
for (String ext : new String[] {".gz",".bz2"}) {
if (fileName.endsWith(ext)) {
fileName = fileName.substring(0, fileName.length()-ext.length());
break;
}
}
File file2 = allocateFile(rawDataDir, fileName);
progress.setProgress("Uncompressing " + fileName, -1);
saveStreamToFile(uncompressedIS, file2, null);
JSONUtilities.safePut(fileRecord, "declaredEncoding", (String) null);
JSONUtilities.safePut(fileRecord, "declaredMimeType", (String) null);
JSONUtilities.safePut(fileRecord, "location", getRelativePath(file2, rawDataDir));
return file2;
}
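/**
 * Convert retrieved/expected byte counts into a whole percentage for progress reporting,
 * returning -1 (indeterminate) while the expected size is still unknown. For example,
 * 25 bytes retrieved out of an expected 200 yields 12.
 */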
static private int calculateProgressPercent(long totalExpectedSize, long totalRetrievedSize) {
return totalExpectedSize == 0 ? -1 : (int) (totalRetrievedSize * 100 / totalExpectedSize);
}
static private String formatBytes(long bytes) {
return NumberFormat.getIntegerInstance().format(bytes);
}
static public String getEncoding(JSONObject fileRecord) {
String encoding = JSONUtilities.getString(fileRecord, "encoding", null);
if (encoding == null || encoding.isEmpty()) {
encoding = JSONUtilities.getString(fileRecord, "declaredEncoding", null);
}
return encoding;
}
/**
* Figure out the best (most common) format for the set of files, select
* all files which match that format, and return the format found.
*
* @param job ImportingJob object
* @param retrievalRecord JSON object containing "files" key with all our files
* @param fileSelectionIndexes JSON array of selected file indices matching best format
* @return best (highest frequency) format
*/
static public String autoSelectFiles(ImportingJob job, JSONObject retrievalRecord, JSONArray fileSelectionIndexes) {
final Map<String, Integer> formatToCount = new HashMap<String, Integer>();
List<String> formats = new ArrayList<String>();
JSONArray fileRecords = JSONUtilities.getArray(retrievalRecord, "files");
int count = fileRecords.length();
for (int i = 0; i < count; i++) {
JSONObject fileRecord = JSONUtilities.getObjectElement(fileRecords, i);
String format = JSONUtilities.getString(fileRecord, "format", null);
if (format != null) {
if (formatToCount.containsKey(format)) {
formatToCount.put(format, formatToCount.get(format) + 1);
} else {
formatToCount.put(format, 1);
formats.add(format);
}
}
}
Collections.sort(formats, new Comparator<String>() {
@Override
public int compare(String o1, String o2) {
return formatToCount.get(o2) - formatToCount.get(o1);
}
});
// Default to text/line-based to avoid parsing as binary/excel.
String bestFormat = formats.size() > 0 ? formats.get(0) : "text/line-based";
if (JSONUtilities.getInt(retrievalRecord, "archiveCount", 0) == 0) {
// If there's no archive, then select everything
for (int i = 0; i < count; i++) {
JSONUtilities.append(fileSelectionIndexes, i);
}
} else {
// Otherwise, select files matching the best format
for (int i = 0; i < count; i++) {
JSONObject fileRecord = JSONUtilities.getObjectElement(fileRecords, i);
String format = JSONUtilities.getString(fileRecord, "format", null);
if (format != null && format.equals(bestFormat)) {
JSONUtilities.append(fileSelectionIndexes, i);
}
}
// If nothing matches the best format but we have some files,
// then select them all
if (fileSelectionIndexes.length() == 0 && count > 0) {
for (int i = 0; i < count; i++) {
JSONUtilities.append(fileSelectionIndexes, i);
}
}
}
return bestFormat;
}
static public String getCommonFormatForSelectedFiles(ImportingJob job, JSONArray fileSelectionIndexes) {
JSONObject retrievalRecord = job.getRetrievalRecord();
final Map<String, Integer> formatToCount = new HashMap<String, Integer>();
List<String> formats = new ArrayList<String>();
JSONArray fileRecords = JSONUtilities.getArray(retrievalRecord, "files");
int count = fileSelectionIndexes.length();
for (int i = 0; i < count; i++) {
int index = JSONUtilities.getIntElement(fileSelectionIndexes, i, -1);
if (index >= 0 && index < fileRecords.length()) {
JSONObject fileRecord = JSONUtilities.getObjectElement(fileRecords, index);
String format = JSONUtilities.getString(fileRecord, "format", null);
if (format != null) {
if (formatToCount.containsKey(format)) {
formatToCount.put(format, formatToCount.get(format) + 1);
} else {
formatToCount.put(format, 1);
formats.add(format);
}
}
}
}
Collections.sort(formats, new Comparator<String>() {
@Override
public int compare(String o1, String o2) {
return formatToCount.get(o2) - formatToCount.get(o1);
}
});
return formats.size() > 0 ? formats.get(0) : null;
}
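/**
 * Refine an initial format guess by running the format guessers registered for it against the
 * first selected file, repeating until the guessers stop proposing a different (usually more
 * specific) format.
 *
 * <pre>{@code
 * // bestFormat typically comes from autoSelectFiles(...)
 * String refined = ImportingUtilities.guessBetterFormat(job, bestFormat);
 * // e.g. a generic "text/line-based" guess may be narrowed to a delimiter-separated format
 * }</pre>
 */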
static String guessBetterFormat(ImportingJob job, String bestFormat) {
JSONObject retrievalRecord = job.getRetrievalRecord();
return retrievalRecord != null ? guessBetterFormat(job, retrievalRecord, bestFormat) : bestFormat;
}
static String guessBetterFormat(ImportingJob job, JSONObject retrievalRecord, String bestFormat) {
JSONArray fileRecords = JSONUtilities.getArray(retrievalRecord, "files");
return fileRecords != null ? guessBetterFormat(job, fileRecords, bestFormat) : bestFormat;
}
static String guessBetterFormat(ImportingJob job, JSONArray fileRecords, String bestFormat) {
if (bestFormat != null && fileRecords != null && fileRecords.length() > 0) {
JSONObject firstFileRecord = JSONUtilities.getObjectElement(fileRecords, 0);
String encoding = getEncoding(firstFileRecord);
String location = JSONUtilities.getString(firstFileRecord, "location", null);
if (location != null) {
File file = new File(job.getRawDataDir(), location);
while (true) {
String betterFormat = null;
List<FormatGuesser> guessers = ImportingManager.formatToGuessers.get(bestFormat);
if (guessers != null) {
for (FormatGuesser guesser : guessers) {
betterFormat = guesser.guess(file, encoding, bestFormat);
if (betterFormat != null) {
break;
}
}
}
if (betterFormat != null && !betterFormat.equals(bestFormat)) {
bestFormat = betterFormat;
} else {
break;
}
}
}
}
return bestFormat;
}
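/**
 * Build the list of selectable formats, putting the guessed best format first and ordering
 * the rest by how many leading path segments they share with it (formats are hierarchical
 * identifiers such as "text/line-based"). Only formats with both a parser and a UI class,
 * and whose download flag matches the best format's, are included.
 *
 * <pre>{@code
 * JSONArray ranked = new JSONArray();
 * ImportingUtilities.rankFormats(job, "text/line-based", ranked);
 * // ranked.getString(0) is "text/line-based"; other "text/..." formats come before
 * // unrelated ones, with ties broken alphabetically
 * }</pre>
 */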
static void rankFormats(ImportingJob job, final String bestFormat, JSONArray rankedFormats) {
final Map<String, String[]> formatToSegments = new HashMap<String, String[]>();
boolean download = bestFormat == null ? true :
ImportingManager.formatToRecord.get(bestFormat).download;
List<String> formats = new ArrayList<String>(ImportingManager.formatToRecord.keySet().size());
for (String format : ImportingManager.formatToRecord.keySet()) {
Format record = ImportingManager.formatToRecord.get(format);
if (record.uiClass != null && record.parser != null && record.download == download) {
formats.add(format);
formatToSegments.put(format, format.split("/"));
}
}
if (bestFormat == null) {
Collections.sort(formats);
} else {
Collections.sort(formats, new Comparator<String>() {
@Override
public int compare(String format1, String format2) {
if (format1.equals(bestFormat)) {
return -1;
} else if (format2.equals(bestFormat)) {
return 1;
} else {
return compareBySegments(format1, format2);
}
}
int compareBySegments(String format1, String format2) {
int c = commonSegments(format2) - commonSegments(format1);
return c != 0 ? c : format1.compareTo(format2);
}
int commonSegments(String format) {
String[] bestSegments = formatToSegments.get(bestFormat);
String[] segments = formatToSegments.get(format);
if (bestSegments == null || segments == null) {
return 0;
} else {
int i;
for (i = 0; i < bestSegments.length && i < segments.length; i++) {
if (!bestSegments[i].equals(segments[i])) {
break;
}
}
return i;
}
}
});
}
for (String format : formats) {
JSONUtilities.append(rankedFormats, format);
}
}
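/**
 * Parse the job's selected files with the parser registered for {@code format}, limited to
 * 100 rows, into a scratch project held on the job so the UI can show a preview. Parsing
 * errors are collected in {@code exceptions} rather than thrown.
 *
 * <pre>{@code
 * List<Exception> exceptions = new ArrayList<>();
 * ImportingUtilities.previewParse(job, format, optionObj, exceptions);
 * // job.project now holds the (up to) 100-row preview; check exceptions before trusting it
 * }</pre>
 */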
static public void previewParse(ImportingJob job, String format, JSONObject optionObj, List<Exception> exceptions) {
Format record = ImportingManager.formatToRecord.get(format);
if (record == null || record.parser == null) {
// TODO: what to do?
return;
}
job.prepareNewProject();
record.parser.parse(
job.project,
job.metadata,
job,
job.getSelectedFileRecords(),
format,
100,
optionObj,
exceptions
);
job.project.update(); // update all internal models, indexes, caches, etc.
}
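/**
 * Create a new project from the job's selected files using the parser registered for
 * {@code format}. When {@code synchronous} is false the parsing runs on a background thread,
 * so callers should wait for the job state to reach "created-project" before using the
 * returned project id.
 *
 * <pre>{@code
 * List<Exception> exceptions = new ArrayList<>();
 * long projectId = ImportingUtilities.createProject(job, format, optionObj, exceptions, false);
 * // poll the job; once its state is "created-project" the project is registered
 * // with ProjectManager under projectId
 * }</pre>
 */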
static public long createProject(
final ImportingJob job,
final String format,
final JSONObject optionObj,
final List<Exception> exceptions,
boolean synchronous) {
final Format record = ImportingManager.formatToRecord.get(format);
if (record == null || record.parser == null) {
// TODO: what to do?
return -1;
}
job.setState("creating-project");
final Project project = new Project();
if (synchronous) {
createProjectSynchronously(
job, format, optionObj, exceptions, record, project);
} else {
new Thread() {
@Override
public void run() {
createProjectSynchronously(
job, format, optionObj, exceptions, record, project);
}
}.start();
}
return project.id;
}
static private void createProjectSynchronously(
final ImportingJob job,
final String format,
final JSONObject optionObj,
final List<Exception> exceptions,
final Format record,
final Project project
) {
ProjectMetadata pm = createProjectMetadata(optionObj);
record.parser.parse(
project,
pm,
job,
job.getSelectedFileRecords(),
format,
-1,
optionObj,
exceptions
);
if (!job.canceled) {
if (exceptions.size() == 0) {
project.update(); // update all internal models, indexes, caches, etc.
boolean hasMetadataFileRecord = ((JSONObject)job.getRetrievalRecord()).has(METADATA_FILE_KEY);
if (hasMetadataFileRecord) {
JSONObject metadataFileRecord = (JSONObject) job.getRetrievalRecord().get(METADATA_FILE_KEY);
String metadataFormat = (String)metadataFileRecord.get("metaDataFormat");
IMetadata metadata = MetadataFactory.buildMetadata(MetadataFormat.valueOf(metadataFormat));
String relativePath = metadataFileRecord.getString("location");
File metadataFile = new File(job.getRawDataDir(), relativePath);
metadata.loadFromFile(metadataFile);
// process the data package metadata
if (MetadataFormat.valueOf(metadataFormat) == MetadataFormat.DATAPACKAGE_METADATA) {
populateDataPackageMetadata(project, pm, (DataPackageMetadata) metadata);
}
logger.info(metadataFileRecord.get("metaDataFormat") + " metadata is set for project " + project.id);
}
ProjectManager.singleton.registerProject(project, pm);
// infer the column type
inferColumnType(project);
job.setProjectID(project.id);
job.setState("created-project");
} else {
job.setError(exceptions);
}
job.touch();
job.updating = false;
}
}
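/**
 * Infer column types from up to the first 100 rows (INFER_ROW_LIMIT) using the Table Schema
 * TypeInferrer, but only when the first column has no type yet. Null cells are replaced by
 * empty strings on copies of the rows so the inferrer never sees nulls.
 *
 * <pre>{@code
 * // called after the parser has populated project.rows and the project has been registered
 * ImportingUtilities.inferColumnType(project);
 * String type = project.columnModel.columns.get(0).getType(); // e.g. "string" or "integer"
 * }</pre>
 */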
public static void inferColumnType(final Project project) {
if (project.columnModel.columns.get(0).getType().isEmpty()) {
List<Object[]> listCells = new ArrayList<Object[]>(INFER_ROW_LIMIT);
List<Row> rows = project.rows
.stream()
.limit(INFER_ROW_LIMIT)
.map(Row::dup)
.collect(Collectors.toList());
// Replace null cells with empty ones so type inference does not hit a NullPointerException
for (Row row : rows) {
for (int i = 0; i < row.cells.size(); i++) {
Cell cell = row.cells.get(i);
if (cell == null) {
row.cells.set(i, new Cell(StringUtils.EMPTY, null));
}
}
listCells.add(row.cells.toArray());
}
try {
JSONObject fieldsJSON = TypeInferrer.getInstance().infer(listCells,
project.columnModel.getColumnNames().toArray(new String[0]),
100);
populateColumnTypes(project.columnModel, fieldsJSON.getJSONArray(Schema.JSON_KEY_FIELDS));
} catch (TypeInferringException e) {
logger.error("infer column type exception.", ExceptionUtils.getStackTrace(e));
}
}
}
private static void populateDataPackageMetadata(Project project, ProjectMetadata pmd, DataPackageMetadata metadata) {
// project metadata
JSONObject pkg = metadata.getPackage().getJson();
pmd.setName(getDataPackageProperty(pkg, Package.JSON_KEY_NAME));
pmd.setDescription(getDataPackageProperty(pkg, PackageExtension.JSON_KEY_DESCRIPTION));
pmd.setTitle(getDataPackageProperty(pkg, PackageExtension.JSON_KEY_TITLE));
pmd.setHomepage(getDataPackageProperty(pkg, PackageExtension.JSON_KEY_HOMEPAGE));
pmd.setImage(getDataPackageProperty(pkg, PackageExtension.JSON_KEY_IMAGE));
pmd.setLicense(getDataPackageProperty(pkg, PackageExtension.JSON_KEY_LICENSE));
pmd.setVersion(getDataPackageProperty(pkg, PackageExtension.JSON_KEY_VERSION));
if (pkg.has(PackageExtension.JSON_KEY_KEYWORKS)) {
String[] tags = pkg.getJSONArray(PackageExtension.JSON_KEY_KEYWORKS).toList().toArray(new String[0]);
pmd.appendTags(tags);
}
// column model
JSONObject schema = metadata.getPackage().getResources().get(0).getSchema();
if (schema != null) {
populateColumnTypes(project.columnModel, schema.getJSONArray(Schema.JSON_KEY_FIELDS));
}
}
private static String getDataPackageProperty(JSONObject pkg, String key) {
return JSONUtilities.getString(pkg, key, StringUtils.EMPTY);
}
/**
 * Populate column types, formats, titles, descriptions and constraints on the column model
 * from a Table Schema "fields" array, matching fields to columns by cell index.
 * @param columnModel the project's column model to update
 * @param fieldsJSON the Table Schema fields, one JSON object per column
 */
private static void populateColumnTypes(ColumnModel columnModel, JSONArray fieldsJSON) {
int cellIndex = 0;
Iterator<Object> iter = fieldsJSON.iterator();
while(iter.hasNext()){
JSONObject fieldJsonObj = (JSONObject)iter.next();
Field field = new Field(fieldJsonObj);
Column column = columnModel.getColumnByCellIndex(cellIndex);
column.setType(field.getType());
column.setFormat(field.getFormat());
column.setDescription(field.getDescription());
column.setTitle(field.getTitle());
column.setConstraints(field.getConstraints());
cellIndex++;
}
}
/**
 * Create project metadata from the importing options, pulling the "USER_NAME" preference
 * from the PreferenceStore as the creator.
 * @param optionObj importing options (project name, tags, title, encoding, ...)
 * @return the populated ProjectMetadata
 */
static public ProjectMetadata createProjectMetadata(JSONObject optionObj) {
ProjectMetadata pm = new ProjectMetadata();
PreferenceStore ps = ProjectManager.singleton.getPreferenceStore();
pm.setName(JSONUtilities.getString(optionObj, "projectName", "Untitled"));
pm.setTags(JSONUtilities.getStringArray(optionObj, "projectTags"));
pm.setTitle(JSONUtilities.getString(optionObj, "title", ""));
pm.setHomepage(JSONUtilities.getString(optionObj, "homepage", ""));
pm.setImage(JSONUtilities.getString(optionObj, "image", ""));
pm.setLicense(JSONUtilities.getString(optionObj, "license", ""));
String encoding = JSONUtilities.getString(optionObj, "encoding", "UTF-8");
if ("".equals(encoding)) {
// encoding can be present, but empty, which won't trigger JSONUtilities default processing
encoding = "UTF-8";
}
pm.setEncoding(encoding);
if (ps.get(PreferenceStore.USER_NAME) != null) {
String creator = (String) ps.get(PreferenceStore.USER_NAME);
pm.setCreator(creator);
}
return pm;
}
}