From 4f7da9d18e05361a6b1135528394b59f1e13b244 Mon Sep 17 00:00:00 2001 From: Tom Morris Date: Fri, 2 Aug 2013 18:13:41 -0400 Subject: [PATCH] Switch to Apache HTTP client for downloads - fixes #748 --- main/src/com/google/refine/RefineServlet.java | 5 +- .../refine/importing/ImportingUtilities.java | 165 +++++++++++------- 2 files changed, 105 insertions(+), 65 deletions(-) diff --git a/main/src/com/google/refine/RefineServlet.java b/main/src/com/google/refine/RefineServlet.java index 8a16c7514..50bf557aa 100644 --- a/main/src/com/google/refine/RefineServlet.java +++ b/main/src/com/google/refine/RefineServlet.java @@ -370,7 +370,10 @@ public class RefineServlet extends Butterfly { } static public void setUserAgent(HttpURLConnection httpConnection) { - httpConnection.addRequestProperty("User-Agent", "OpenRefine/" + FULL_VERSION); + httpConnection.addRequestProperty("User-Agent", getUserAgent()); } + static public String getUserAgent() { + return "OpenRefine/" + FULL_VERSION; + } } \ No newline at end of file diff --git a/main/src/com/google/refine/importing/ImportingUtilities.java b/main/src/com/google/refine/importing/ImportingUtilities.java index 5fd2a1cda..508a3b5a0 100644 --- a/main/src/com/google/refine/importing/ImportingUtilities.java +++ b/main/src/com/google/refine/importing/ImportingUtilities.java @@ -42,7 +42,6 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.io.UnsupportedEncodingException; -import java.net.HttpURLConnection; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; @@ -65,6 +64,12 @@ import org.apache.commons.fileupload.ProgressListener; import org.apache.commons.fileupload.disk.DiskFileItemFactory; import org.apache.commons.fileupload.servlet.ServletFileUpload; import org.apache.commons.fileupload.util.Streams; +import org.apache.http.HttpEntity; +import org.apache.http.HttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.DecompressingHttpClient; +import org.apache.http.impl.client.DefaultHttpClient; +import org.apache.http.util.EntityUtils; import org.apache.tools.bzip2.CBZip2InputStream; import org.apache.tools.tar.TarEntry; import org.apache.tools.tar.TarInputStream; @@ -210,16 +215,15 @@ public class ImportingUtilities { } }); - @SuppressWarnings("rawtypes") - List tempFiles = upload.parseRequest(request); + @SuppressWarnings("unchecked") + List tempFiles = (List)upload.parseRequest(request); progress.setProgress("Uploading data ...", -1); - parts: for (Object obj : tempFiles) { + parts: for (FileItem fileItem : tempFiles) { if (progress.isCanceled()) { break; } - FileItem fileItem = (FileItem) obj; InputStream stream = fileItem.getInputStream(); String name = fileItem.getFieldName().toLowerCase(); @@ -244,10 +248,10 @@ public class ImportingUtilities { calculateProgressPercent(update.totalExpectedSize, update.totalRetrievedSize)); JSONUtilities.safePut(fileRecord, "size", saveStreamToFile(stream, file, null)); + JSONUtilities.append(fileRecords, fileRecord); clipboardCount++; - JSONUtilities.append(fileRecords, fileRecord); } else if (name.equals("download")) { String urlString = Streams.asString(stream); URL url = new URL(urlString); @@ -271,56 +275,62 @@ public class ImportingUtilities { } } } - - URLConnection urlConnection = url.openConnection(); - urlConnection.setConnectTimeout(5000); - if (urlConnection instanceof HttpURLConnection) { - HttpURLConnection httpConnection = (HttpURLConnection) urlConnection; - RefineServlet.setUserAgent(httpConnection); - } - // TODO: Set Accept-Encoding on connection so we don't get stuff we can't handle? - urlConnection.connect(); - - InputStream stream2 = urlConnection.getInputStream(); - try { - String localname = url.getPath(); - if (localname.isEmpty() || localname.endsWith("/")) { - localname = localname + "temp"; - } - File file = allocateFile(rawDataDir, localname); - - int contentLength = urlConnection.getContentLength(); - if (contentLength > 0) { - update.totalExpectedSize += contentLength; - } - - JSONUtilities.safePut(fileRecord, "declaredEncoding", urlConnection.getContentEncoding()); - JSONUtilities.safePut(fileRecord, "declaredMimeType", urlConnection.getContentType()); - JSONUtilities.safePut(fileRecord, "fileName", file.getName()); - JSONUtilities.safePut(fileRecord, "location", getRelativePath(file, rawDataDir)); - progress.setProgress("Downloading " + urlString, - calculateProgressPercent(update.totalExpectedSize, update.totalRetrievedSize)); - - long actualLength = saveStreamToFile(stream2, file, update); - JSONUtilities.safePut(fileRecord, "size", actualLength); - if (actualLength == 0) { - throw new Exception("No content found in " + urlString); - } else if (contentLength >= 0) { - update.totalExpectedSize += (actualLength - contentLength); - } else { - update.totalExpectedSize += actualLength; - } - progress.setProgress("Saving " + urlString + " locally", - calculateProgressPercent(update.totalExpectedSize, update.totalRetrievedSize)); - - if (postProcessRetrievedFile(rawDataDir, file, fileRecord, fileRecords, progress)) { - archiveCount++; - } + if ("http".equals(url.getProtocol()) || "https".equals(url.getProtocol())) { + DecompressingHttpClient httpclient = + new DecompressingHttpClient(new DefaultHttpClient()); + HttpGet httpGet = new HttpGet(url.toURI()); + httpGet.setHeader("User-Agent", RefineServlet.getUserAgent()); - downloadCount++; - } finally { - stream2.close(); + HttpResponse response = httpclient.execute(httpGet); + + try { + response.getStatusLine(); + HttpEntity entity = response.getEntity(); + if (entity == null) { + throw new Exception("No content found in " + url.toString()); + } + InputStream stream2 = entity.getContent(); + String encoding = null; + if (entity.getContentEncoding() != null) { + encoding = entity.getContentEncoding().getValue(); + } + JSONUtilities.safePut(fileRecord, "declaredEncoding", encoding); + String contentType = null; + if (entity.getContentType().getValue() != null) { + contentType = entity.getContentType().getValue(); + } + JSONUtilities.safePut(fileRecord, "declaredMimeType", contentType); + if (saveStream(stream2, url, rawDataDir, progress, update, + fileRecord, fileRecords, + entity.getContentLength())) { + archiveCount++; + } + downloadCount++; + EntityUtils.consume(entity); + } finally { + httpGet.releaseConnection(); + } + } else { + // Fallback handling for non HTTP connections (only FTP?) + URLConnection urlConnection = url.openConnection(); + urlConnection.setConnectTimeout(5000); + urlConnection.connect(); + InputStream stream2 = urlConnection.getInputStream(); + JSONUtilities.safePut(fileRecord, "declaredEncoding", + urlConnection.getContentEncoding()); + JSONUtilities.safePut(fileRecord, "declaredMimeType", + urlConnection.getContentType()); + try { + if (saveStream(stream2, url, rawDataDir, progress, + update, fileRecord, fileRecords, + urlConnection.getContentLength())) { + archiveCount++; + } + downloadCount++; + } finally { + stream2.close(); + } } } else { String value = Streams.asString(stream); @@ -361,8 +371,8 @@ public class ImportingUtilities { } // Delete all temp files. - for (Object obj : tempFiles) { - ((FileItem)obj).delete(); + for (FileItem fileItem : tempFiles) { + fileItem.delete(); } JSONUtilities.safePut(retrievalRecord, "uploadCount", uploadCount); @@ -370,6 +380,37 @@ public class ImportingUtilities { JSONUtilities.safePut(retrievalRecord, "clipboardCount", clipboardCount); JSONUtilities.safePut(retrievalRecord, "archiveCount", archiveCount); } + + private static boolean saveStream(InputStream stream, URL url, File rawDataDir, final Progress progress, + final SavingUpdate update, JSONObject fileRecord, JSONArray fileRecords, long length) + throws IOException, Exception { + String localname = url.getPath(); + if (localname.isEmpty() || localname.endsWith("/")) { + localname = localname + "temp"; + } + File file = allocateFile(rawDataDir, localname); + + JSONUtilities.safePut(fileRecord, "fileName", file.getName()); + JSONUtilities.safePut(fileRecord, "location", getRelativePath(file, rawDataDir)); + + update.totalExpectedSize += length; + + progress.setProgress("Downloading " + url.toString(), + calculateProgressPercent(update.totalExpectedSize, update.totalRetrievedSize)); + + long actualLength = saveStreamToFile(stream, file, update); + JSONUtilities.safePut(fileRecord, "size", actualLength); + if (actualLength == 0) { + throw new Exception("No content found in " + url.toString()); + } else if (length >= 0) { + update.totalExpectedSize += (actualLength - length); + } else { + update.totalExpectedSize += actualLength; + } + progress.setProgress("Saving " + url.toString() + " locally", + calculateProgressPercent(update.totalExpectedSize, update.totalRetrievedSize)); + return postProcessRetrievedFile(rawDataDir, file, fileRecord, fileRecords, progress); + } static public String getRelativePath(File file, File dir) { String location = file.getAbsolutePath().substring(dir.getAbsolutePath().length()); @@ -627,17 +668,13 @@ public class ImportingUtilities { static public InputStream tryOpenAsCompressedFile(File file, String mimeType, String contentEncoding) { String fileName = file.getName(); try { - /* - * TODO: Do we need to support MIME types as well as content encodings? - * application/x-bzip2 - * application/x-gzip - * multipart/x-gzip - */ if (fileName.endsWith(".gz") || "gzip".equals(contentEncoding) - || "x-gzip".equals(contentEncoding)) { + || "x-gzip".equals(contentEncoding) + || "application/x-gzip".equals(mimeType)) { return new GZIPInputStream(new FileInputStream(file)); - } else if (fileName.endsWith(".bz2")) { + } else if (fileName.endsWith(".bz2") + ||"application/x-bzip2".equals(mimeType)) { InputStream is = new FileInputStream(file); is.mark(4); if (!(is.read() == 'B' && is.read() == 'Z')) {