Switch to Apache HTTP client for downloads - fixes #748

This commit is contained in:
Tom Morris 2013-08-02 18:13:41 -04:00
parent d7531bbbd8
commit 4f7da9d18e
2 changed files with 105 additions and 65 deletions

View File

@ -370,7 +370,10 @@ public class RefineServlet extends Butterfly {
}
static public void setUserAgent(HttpURLConnection httpConnection) {
httpConnection.addRequestProperty("User-Agent", "OpenRefine/" + FULL_VERSION);
httpConnection.addRequestProperty("User-Agent", getUserAgent());
}
static public String getUserAgent() {
return "OpenRefine/" + FULL_VERSION;
}
}

View File

@ -42,7 +42,6 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
@ -65,6 +64,12 @@ import org.apache.commons.fileupload.ProgressListener;
import org.apache.commons.fileupload.disk.DiskFileItemFactory;
import org.apache.commons.fileupload.servlet.ServletFileUpload;
import org.apache.commons.fileupload.util.Streams;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DecompressingHttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.apache.tools.bzip2.CBZip2InputStream;
import org.apache.tools.tar.TarEntry;
import org.apache.tools.tar.TarInputStream;
@ -210,16 +215,15 @@ public class ImportingUtilities {
}
});
@SuppressWarnings("rawtypes")
List tempFiles = upload.parseRequest(request);
@SuppressWarnings("unchecked")
List<FileItem> tempFiles = (List<FileItem>)upload.parseRequest(request);
progress.setProgress("Uploading data ...", -1);
parts: for (Object obj : tempFiles) {
parts: for (FileItem fileItem : tempFiles) {
if (progress.isCanceled()) {
break;
}
FileItem fileItem = (FileItem) obj;
InputStream stream = fileItem.getInputStream();
String name = fileItem.getFieldName().toLowerCase();
@ -244,10 +248,10 @@ public class ImportingUtilities {
calculateProgressPercent(update.totalExpectedSize, update.totalRetrievedSize));
JSONUtilities.safePut(fileRecord, "size", saveStreamToFile(stream, file, null));
JSONUtilities.append(fileRecords, fileRecord);
clipboardCount++;
JSONUtilities.append(fileRecords, fileRecord);
} else if (name.equals("download")) {
String urlString = Streams.asString(stream);
URL url = new URL(urlString);
@ -272,56 +276,62 @@ public class ImportingUtilities {
}
}
URLConnection urlConnection = url.openConnection();
urlConnection.setConnectTimeout(5000);
if (urlConnection instanceof HttpURLConnection) {
HttpURLConnection httpConnection = (HttpURLConnection) urlConnection;
RefineServlet.setUserAgent(httpConnection);
}
// TODO: Set Accept-Encoding on connection so we don't get stuff we can't handle?
urlConnection.connect();
if ("http".equals(url.getProtocol()) || "https".equals(url.getProtocol())) {
DecompressingHttpClient httpclient =
new DecompressingHttpClient(new DefaultHttpClient());
HttpGet httpGet = new HttpGet(url.toURI());
httpGet.setHeader("User-Agent", RefineServlet.getUserAgent());
HttpResponse response = httpclient.execute(httpGet);
InputStream stream2 = urlConnection.getInputStream();
try {
String localname = url.getPath();
if (localname.isEmpty() || localname.endsWith("/")) {
localname = localname + "temp";
response.getStatusLine();
HttpEntity entity = response.getEntity();
if (entity == null) {
throw new Exception("No content found in " + url.toString());
}
File file = allocateFile(rawDataDir, localname);
int contentLength = urlConnection.getContentLength();
if (contentLength > 0) {
update.totalExpectedSize += contentLength;
InputStream stream2 = entity.getContent();
String encoding = null;
if (entity.getContentEncoding() != null) {
encoding = entity.getContentEncoding().getValue();
}
JSONUtilities.safePut(fileRecord, "declaredEncoding", urlConnection.getContentEncoding());
JSONUtilities.safePut(fileRecord, "declaredMimeType", urlConnection.getContentType());
JSONUtilities.safePut(fileRecord, "fileName", file.getName());
JSONUtilities.safePut(fileRecord, "location", getRelativePath(file, rawDataDir));
progress.setProgress("Downloading " + urlString,
calculateProgressPercent(update.totalExpectedSize, update.totalRetrievedSize));
long actualLength = saveStreamToFile(stream2, file, update);
JSONUtilities.safePut(fileRecord, "size", actualLength);
if (actualLength == 0) {
throw new Exception("No content found in " + urlString);
} else if (contentLength >= 0) {
update.totalExpectedSize += (actualLength - contentLength);
} else {
update.totalExpectedSize += actualLength;
JSONUtilities.safePut(fileRecord, "declaredEncoding", encoding);
String contentType = null;
if (entity.getContentType().getValue() != null) {
contentType = entity.getContentType().getValue();
}
progress.setProgress("Saving " + urlString + " locally",
calculateProgressPercent(update.totalExpectedSize, update.totalRetrievedSize));
if (postProcessRetrievedFile(rawDataDir, file, fileRecord, fileRecords, progress)) {
JSONUtilities.safePut(fileRecord, "declaredMimeType", contentType);
if (saveStream(stream2, url, rawDataDir, progress, update,
fileRecord, fileRecords,
entity.getContentLength())) {
archiveCount++;
}
downloadCount++;
EntityUtils.consume(entity);
} finally {
httpGet.releaseConnection();
}
} else {
// Fallback handling for non HTTP connections (only FTP?)
URLConnection urlConnection = url.openConnection();
urlConnection.setConnectTimeout(5000);
urlConnection.connect();
InputStream stream2 = urlConnection.getInputStream();
JSONUtilities.safePut(fileRecord, "declaredEncoding",
urlConnection.getContentEncoding());
JSONUtilities.safePut(fileRecord, "declaredMimeType",
urlConnection.getContentType());
try {
if (saveStream(stream2, url, rawDataDir, progress,
update, fileRecord, fileRecords,
urlConnection.getContentLength())) {
archiveCount++;
}
downloadCount++;
} finally {
stream2.close();
}
}
} else {
String value = Streams.asString(stream);
parameters.put(name, value);
@ -361,8 +371,8 @@ public class ImportingUtilities {
}
// Delete all temp files.
for (Object obj : tempFiles) {
((FileItem)obj).delete();
for (FileItem fileItem : tempFiles) {
fileItem.delete();
}
JSONUtilities.safePut(retrievalRecord, "uploadCount", uploadCount);
@ -371,6 +381,37 @@ public class ImportingUtilities {
JSONUtilities.safePut(retrievalRecord, "archiveCount", archiveCount);
}
private static boolean saveStream(InputStream stream, URL url, File rawDataDir, final Progress progress,
final SavingUpdate update, JSONObject fileRecord, JSONArray fileRecords, long length)
throws IOException, Exception {
String localname = url.getPath();
if (localname.isEmpty() || localname.endsWith("/")) {
localname = localname + "temp";
}
File file = allocateFile(rawDataDir, localname);
JSONUtilities.safePut(fileRecord, "fileName", file.getName());
JSONUtilities.safePut(fileRecord, "location", getRelativePath(file, rawDataDir));
update.totalExpectedSize += length;
progress.setProgress("Downloading " + url.toString(),
calculateProgressPercent(update.totalExpectedSize, update.totalRetrievedSize));
long actualLength = saveStreamToFile(stream, file, update);
JSONUtilities.safePut(fileRecord, "size", actualLength);
if (actualLength == 0) {
throw new Exception("No content found in " + url.toString());
} else if (length >= 0) {
update.totalExpectedSize += (actualLength - length);
} else {
update.totalExpectedSize += actualLength;
}
progress.setProgress("Saving " + url.toString() + " locally",
calculateProgressPercent(update.totalExpectedSize, update.totalRetrievedSize));
return postProcessRetrievedFile(rawDataDir, file, fileRecord, fileRecords, progress);
}
static public String getRelativePath(File file, File dir) {
String location = file.getAbsolutePath().substring(dir.getAbsolutePath().length());
return (location.startsWith(File.separator)) ? location.substring(1) : location;
@ -627,17 +668,13 @@ public class ImportingUtilities {
static public InputStream tryOpenAsCompressedFile(File file, String mimeType, String contentEncoding) {
String fileName = file.getName();
try {
/*
* TODO: Do we need to support MIME types as well as content encodings?
* application/x-bzip2
* application/x-gzip
* multipart/x-gzip
*/
if (fileName.endsWith(".gz")
|| "gzip".equals(contentEncoding)
|| "x-gzip".equals(contentEncoding)) {
|| "x-gzip".equals(contentEncoding)
|| "application/x-gzip".equals(mimeType)) {
return new GZIPInputStream(new FileInputStream(file));
} else if (fileName.endsWith(".bz2")) {
} else if (fileName.endsWith(".bz2")
||"application/x-bzip2".equals(mimeType)) {
InputStream is = new FileInputStream(file);
is.mark(4);
if (!(is.read() == 'B' && is.read() == 'Z')) {