Create Project import improvements (#2806)

* Fix charset encoding & MIME type handling

  The character set (i.e. what we call "encoding") is part of the Content-Type, *not* the Content-Encoding, which specifies compression (e.g. gzip). This change sets the character set encoding correctly and cleans the MIME type so that no additional parsing needs to be done downstream (and removes that downstream code).

* Use "text" instead of "text/line-based" as default fallback format

  The TextLineBasedGuesser only tries a limited number of formats (CSV, TSV, fixed-width), so once we start there we can't get out of that hole to find JSON, XML, etc. Start with a more general format instead to improve our guessing odds.

* Support content type Structured Name Syntax Suffixes (+json, +xml)

  If we can't find a fully specified content type in our lookup, fall back to just the suffix (which is registered with a leading +).

Fixes #2800
Fixes #2805
2020-06-25 02:36:57 -04:00 · 2020-06-25 02:36:57 -04:00 · 4b146acc6e
commit 4b146acc6e
parent 3aa610d6aa
4 changed files with 82 additions and 16 deletions
--- a/main/src/com/google/refine/importing/ImportingManager.java
+++ b/main/src/com/google/refine/importing/ImportingManager.java
@ -256,14 +256,21 @@ public class ImportingManager {
    }
    
    static public String getFormatFromMimeType(String mimeType) {
-        return mimeTypeToFormat.get(mimeType);
+        // Looks up the importer format registered for a MIME type.
+        // mimeType: the media type to look up (e.g. "application/ld+json"); may be null.
+        // Returns the registered format, or null when neither the full type nor its suffix is known.
+        if (mimeType == null) {
+            return null; // nothing declared, so nothing to look up (and no NullPointerException below)
+        }
+        String format = mimeTypeToFormat.get(mimeType);
+        if (format == null) {
+            // Try Structured Name Syntax Suffix
+            // https://tools.ietf.org/html/rfc6838#section-4.2.8
+            // The suffix is whatever follows the LAST plus sign. A type that merely ends in "+"
+            // has no suffix at all, so only look one up when something follows the plus.
+            int plus = mimeType.lastIndexOf('+');
+            if (plus >= 0 && plus < mimeType.length() - 1) {
+                // These are registered with a leading plus sign (+) to show they're suffixes
+                format = mimeTypeToFormat.get(mimeType.substring(plus));
+            }
+        }
+        return format;
    }
    
    static public String getFormat(String fileName, String mimeType) {
        String fileNameFormat = getFormatFromFileName(fileName);
-        if (mimeType != null) {
-            mimeType = mimeType.split(";")[0];
-        }
        String mimeTypeFormat = mimeType == null ? null : getFormatFromMimeType(mimeType);
        if (mimeTypeFormat == null) {
            return fileNameFormat;
--- a/main/src/com/google/refine/importing/ImportingUtilities.java
+++ b/main/src/com/google/refine/importing/ImportingUtilities.java
@ -44,6 +44,7 @@ import java.io.Reader;
 import java.io.UnsupportedEncodingException;
 import java.net.URL;
 import java.net.URLConnection;
+import java.nio.charset.Charset;
 import java.text.NumberFormat;
 import java.util.ArrayList;
 import java.util.Collections;
@ -74,6 +75,7 @@ import org.apache.http.auth.UsernamePasswordCredentials;
 import org.apache.http.client.CredentialsProvider;
 import org.apache.http.client.methods.CloseableHttpResponse;
 import org.apache.http.client.methods.HttpGet;
+import org.apache.http.entity.ContentType;
 import org.apache.http.impl.client.BasicCredentialsProvider;
 import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.impl.client.HttpClientBuilder;
@ -313,16 +315,19 @@ public class ImportingUtilities {
                                throw new Exception("No content found in " + url.toString());
                            }
                            InputStream stream2 = entity.getContent();
-                            String encoding = null;
-                            if (entity.getContentEncoding() != null) {
-                                encoding = entity.getContentEncoding().getValue();
+
+                            String mimeType = null;
+                            String charset = null;
+                            ContentType contentType = ContentType.get(entity);
+                            if (contentType != null) {
+                                mimeType = contentType.getMimeType();
+                                Charset cs = contentType.getCharset();
+                                if (cs != null) {
+                                    charset = cs.toString();
                                }
-                            JSONUtilities.safePut(fileRecord, "declaredEncoding", encoding);
-                            String contentType = null;
-                            if (entity.getContentType() != null) {
-                                contentType = entity.getContentType().getValue();
                            }
-                            JSONUtilities.safePut(fileRecord, "declaredMimeType", contentType);
+                            JSONUtilities.safePut(fileRecord, "declaredMimeType", mimeType);
+                            JSONUtilities.safePut(fileRecord, "declaredEncoding", charset);
                            if (saveStream(stream2, url, rawDataDir, progress, update,
                                    fileRecord, fileRecords,
                                    entity.getContentLength())) {
@ -803,8 +808,9 @@ public class ImportingUtilities {
            }
        });
        
-        // Default to text/line-based to to avoid parsing as binary/excel.
-        String bestFormat = formats.size() > 0 ? formats.get(0) : "text/line-based";
+        // Default to "text" to avoid parsing as "binary/excel".
+        // "text" is more general than "text/line-based", so a better starting point
+        String bestFormat = formats.size() > 0 ? formats.get(0) : "text";
        if (JSONUtilities.getInt(retrievalRecord, "archiveCount", 0) == 0) {
            // If there's no archive, then select everything
            for (int i = 0; i < count; i++) {
--- a/main/tests/server/src/com/google/refine/importing/ImportingManagerTests.java
+++ b/main/tests/server/src/com/google/refine/importing/ImportingManagerTests.java
@ -0,0 +1,51 @@
+/*******************************************************************************
+ * Copyright (C) 2018, OpenRefine contributors
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ ******************************************************************************/
+package com.google.refine.importing;
+
+import static org.testng.Assert.assertEquals;
+
+import org.testng.annotations.BeforeMethod;
+import org.testng.annotations.Test;
+
+import com.google.refine.importers.ImporterTest;
+// Unit tests for the MIME-type-to-format lookup in ImportingManager.
+public class ImportingManagerTests extends ImporterTest {
+    // Delegates to ImporterTest.setUp(); presumably re-declared so @BeforeMethod applies in this class - TODO confirm.
+    @Override
+    @BeforeMethod
+    public void setUp(){
+        super.setUp();
+    }
+
+    // Expects "text/json" via the "+json" suffix registered here; assumes no exact entry for the full type - TODO confirm.
+    @Test
+    public void testStructuredNameSuffixFallback() {
+        ImportingManager.registerMimeType("+json", "text/json");
+        assertEquals(ImportingManager.getFormatFromMimeType("application/sparql-results+json"), "text/json");
+    }
+
+}
--- a/main/webapp/modules/core/MOD-INF/controller.js
+++ b/main/webapp/modules/core/MOD-INF/controller.js
@ -280,6 +280,7 @@ function registerImporting() {
  IM.registerMimeType("text/turtle", "text/rdf/ttl");
  IM.registerMimeType("application/xml", "text/xml");
  IM.registerMimeType("text/xml", "text/xml");
+  IM.registerMimeType("+xml", "text/xml"); // suffix will be tried only as fallback
  IM.registerMimeType("application/rdf+xml", "text/rdf/xml");
  IM.registerMimeType("application/ld+json", "text/rdf/ld+json");
  IM.registerMimeType("application/atom+xml", "text/xml");
@ -299,6 +300,7 @@ function registerImporting() {
  IM.registerMimeType("application/json", "text/json");
  IM.registerMimeType("application/javascript", "text/json");
  IM.registerMimeType("text/json", "text/json");
+  IM.registerMimeType("+json", "text/json"); // suffix will be tried only as fallback

  IM.registerMimeType("application/marc", "text/marc");