Create Project import improvements (#2806)

* Fix charset encoding & MIME type handling

Character set (i.e., what we call "encoding") is part of the Content-Type,
*not* the Content-Encoding, which specifies compression (e.g. gzip).

This takes the character set from the Content-Type and stores the
MIME type without its parameters, so that no additional parsing needs
to be done downstream (that downstream parsing code is removed).

* Use "text" instead of "text/line-based" as default fallback format

The TextLineBasedGuesser only tries a limited number of
formats (CSV, TSV, fixed), so once we start from "text/line-based"
the guesser can never arrive at JSON, XML, etc.

Start with a more general format instead to improve our
guessing odds.

* Support content type Structured Name Syntax Suffixes (+json +xml)

If we can't find a fully specified content type in our lookup,
fall back to just the suffix (which is registered with a leading +).
Fixes #2800 Fixes #2805
This commit is contained in:
Tom Morris 2020-06-25 02:36:57 -04:00 committed by GitHub
parent 3aa610d6aa
commit 4b146acc6e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 82 additions and 16 deletions

View File

@ -256,14 +256,21 @@ public class ImportingManager {
} }
static public String getFormatFromMimeType(String mimeType) { static public String getFormatFromMimeType(String mimeType) {
return mimeTypeToFormat.get(mimeType); String format = mimeTypeToFormat.get(mimeType);
if (format == null) {
// Try Structured Name Syntax Suffix
// https://tools.ietf.org/html/rfc6838#section-4.2.8
if (mimeType.contains("+")) {
// These are registered with a leading plus sign (+) to show they're suffixes
mimeType = "+" + mimeType.split("\\+")[1];
format = mimeTypeToFormat.get(mimeType);
}
}
return format;
} }
static public String getFormat(String fileName, String mimeType) { static public String getFormat(String fileName, String mimeType) {
String fileNameFormat = getFormatFromFileName(fileName); String fileNameFormat = getFormatFromFileName(fileName);
if (mimeType != null) {
mimeType = mimeType.split(";")[0];
}
String mimeTypeFormat = mimeType == null ? null : getFormatFromMimeType(mimeType); String mimeTypeFormat = mimeType == null ? null : getFormatFromMimeType(mimeType);
if (mimeTypeFormat == null) { if (mimeTypeFormat == null) {
return fileNameFormat; return fileNameFormat;

View File

@ -44,6 +44,7 @@ import java.io.Reader;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.net.URL; import java.net.URL;
import java.net.URLConnection; import java.net.URLConnection;
import java.nio.charset.Charset;
import java.text.NumberFormat; import java.text.NumberFormat;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
@ -74,6 +75,7 @@ import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CredentialsProvider; import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpGet;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.BasicCredentialsProvider; import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder; import org.apache.http.impl.client.HttpClientBuilder;
@ -313,16 +315,19 @@ public class ImportingUtilities {
throw new Exception("No content found in " + url.toString()); throw new Exception("No content found in " + url.toString());
} }
InputStream stream2 = entity.getContent(); InputStream stream2 = entity.getContent();
String encoding = null;
if (entity.getContentEncoding() != null) { String mimeType = null;
encoding = entity.getContentEncoding().getValue(); String charset = null;
ContentType contentType = ContentType.get(entity);
if (contentType != null) {
mimeType = contentType.getMimeType();
Charset cs = contentType.getCharset();
if (cs != null) {
charset = cs.toString();
} }
JSONUtilities.safePut(fileRecord, "declaredEncoding", encoding);
String contentType = null;
if (entity.getContentType() != null) {
contentType = entity.getContentType().getValue();
} }
JSONUtilities.safePut(fileRecord, "declaredMimeType", contentType); JSONUtilities.safePut(fileRecord, "declaredMimeType", mimeType);
JSONUtilities.safePut(fileRecord, "declaredEncoding", charset);
if (saveStream(stream2, url, rawDataDir, progress, update, if (saveStream(stream2, url, rawDataDir, progress, update,
fileRecord, fileRecords, fileRecord, fileRecords,
entity.getContentLength())) { entity.getContentLength())) {
@ -803,8 +808,9 @@ public class ImportingUtilities {
} }
}); });
// Default to text/line-based to to avoid parsing as binary/excel. // Default to "text" to to avoid parsing as "binary/excel".
String bestFormat = formats.size() > 0 ? formats.get(0) : "text/line-based"; // "text" is more general than "text/line-based", so a better starting point
String bestFormat = formats.size() > 0 ? formats.get(0) : "text";
if (JSONUtilities.getInt(retrievalRecord, "archiveCount", 0) == 0) { if (JSONUtilities.getInt(retrievalRecord, "archiveCount", 0) == 0) {
// If there's no archive, then select everything // If there's no archive, then select everything
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {

View File

@ -0,0 +1,51 @@
/*******************************************************************************
* Copyright (C) 2018, OpenRefine contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package com.google.refine.importing;
import static org.testng.Assert.assertEquals;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
import com.google.refine.importers.ImporterTest;
/**
 * Tests for the MIME type to format lookup exposed by {@link ImportingManager}.
 */
public class ImportingManagerTests extends ImporterTest {

    /**
     * Delegates to the parent fixture set-up before every test method.
     */
    @Override
    @BeforeMethod
    public void setUp() {
        super.setUp();
    }

    /**
     * Registers a format under the bare {@code +json} suffix key and checks
     * that looking up {@code application/sparql-results+json} yields that
     * format. The lookup is expected to resolve through the suffix entry
     * (structured syntax suffix, RFC 6838 section 4.2.8).
     */
    @Test
    public void testStructuredNameSuffixFallback() {
        final String suffixKey = "+json";
        final String expectedFormat = "text/json";
        ImportingManager.registerMimeType(suffixKey, expectedFormat);

        final String resolvedFormat =
                ImportingManager.getFormatFromMimeType("application/sparql-results+json");

        assertEquals(resolvedFormat, expectedFormat);
    }
}

View File

@ -280,6 +280,7 @@ function registerImporting() {
IM.registerMimeType("text/turtle", "text/rdf/ttl"); IM.registerMimeType("text/turtle", "text/rdf/ttl");
IM.registerMimeType("application/xml", "text/xml"); IM.registerMimeType("application/xml", "text/xml");
IM.registerMimeType("text/xml", "text/xml"); IM.registerMimeType("text/xml", "text/xml");
IM.registerMimeType("+xml", "text/xml"); // suffix will be tried only as fallback
IM.registerMimeType("application/rdf+xml", "text/rdf/xml"); IM.registerMimeType("application/rdf+xml", "text/rdf/xml");
IM.registerMimeType("application/ld+json", "text/rdf/ld+json"); IM.registerMimeType("application/ld+json", "text/rdf/ld+json");
IM.registerMimeType("application/atom+xml", "text/xml"); IM.registerMimeType("application/atom+xml", "text/xml");
@ -299,6 +300,7 @@ function registerImporting() {
IM.registerMimeType("application/json", "text/json"); IM.registerMimeType("application/json", "text/json");
IM.registerMimeType("application/javascript", "text/json"); IM.registerMimeType("application/javascript", "text/json");
IM.registerMimeType("text/json", "text/json"); IM.registerMimeType("text/json", "text/json");
IM.registerMimeType("+json", "text/json"); // suffix will be tried only as fallback
IM.registerMimeType("application/marc", "text/marc"); IM.registerMimeType("application/marc", "text/marc");