2010-09-22 19:04:10 +02:00
|
|
|
package com.google.refine.commands.project;
|
2010-05-05 01:24:48 +02:00
|
|
|
|
|
|
|
import java.io.BufferedInputStream;
|
|
|
|
import java.io.File;
|
|
|
|
import java.io.FileInputStream;
|
|
|
|
import java.io.FilterInputStream;
|
|
|
|
import java.io.IOException;
|
|
|
|
import java.io.InputStream;
|
|
|
|
import java.io.InputStreamReader;
|
|
|
|
import java.io.Reader;
|
|
|
|
import java.io.Serializable;
|
|
|
|
import java.io.UnsupportedEncodingException;
|
|
|
|
import java.net.URL;
|
|
|
|
import java.net.URLConnection;
|
|
|
|
import java.util.ArrayList;
|
|
|
|
import java.util.Collections;
|
|
|
|
import java.util.Comparator;
|
|
|
|
import java.util.HashMap;
|
|
|
|
import java.util.HashSet;
|
|
|
|
import java.util.List;
|
|
|
|
import java.util.Map;
|
|
|
|
import java.util.Properties;
|
|
|
|
import java.util.Map.Entry;
|
|
|
|
import java.util.zip.GZIPInputStream;
|
|
|
|
import java.util.zip.ZipEntry;
|
|
|
|
import java.util.zip.ZipInputStream;
|
|
|
|
|
|
|
|
import javax.servlet.ServletException;
|
|
|
|
import javax.servlet.http.HttpServletRequest;
|
|
|
|
import javax.servlet.http.HttpServletResponse;
|
|
|
|
|
|
|
|
import org.apache.commons.fileupload.FileItemIterator;
|
|
|
|
import org.apache.commons.fileupload.FileItemStream;
|
|
|
|
import org.apache.commons.fileupload.servlet.ServletFileUpload;
|
|
|
|
import org.apache.commons.fileupload.util.Streams;
|
|
|
|
import org.apache.tools.bzip2.CBZip2InputStream;
|
|
|
|
import org.apache.tools.tar.TarEntry;
|
|
|
|
import org.apache.tools.tar.TarInputStream;
|
2010-10-13 06:51:01 +02:00
|
|
|
import org.apache.velocity.VelocityContext;
|
2010-05-05 01:24:48 +02:00
|
|
|
import org.slf4j.Logger;
|
|
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
|
2010-09-22 19:04:10 +02:00
|
|
|
import com.google.refine.ProjectManager;
|
|
|
|
import com.google.refine.ProjectMetadata;
|
|
|
|
import com.google.refine.commands.Command;
|
|
|
|
import com.google.refine.importers.Importer;
|
|
|
|
import com.google.refine.importers.ImporterRegistry;
|
|
|
|
import com.google.refine.importers.ReaderImporter;
|
|
|
|
import com.google.refine.importers.StreamImporter;
|
|
|
|
import com.google.refine.importers.TsvCsvImporter;
|
|
|
|
import com.google.refine.importers.UrlImporter;
|
|
|
|
import com.google.refine.model.Project;
|
|
|
|
import com.google.refine.util.IOUtils;
|
|
|
|
import com.google.refine.util.ParsingUtilities;
|
2010-05-05 01:24:48 +02:00
|
|
|
import com.ibm.icu.text.CharsetDetector;
|
|
|
|
import com.ibm.icu.text.CharsetMatch;
|
|
|
|
|
|
|
|
public class CreateProjectCommand extends Command {
|
|
|
|
|
|
|
|
// Logger shared by this command; obtained under the logger name "create-project_command".
final static Logger logger = LoggerFactory.getLogger("create-project_command");
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
@Override
|
|
|
|
public void doPost(HttpServletRequest request, HttpServletResponse response)
|
|
|
|
throws ServletException, IOException {
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-12 11:02:41 +02:00
|
|
|
ProjectManager.singleton.setBusy(true);
|
2010-05-05 01:24:48 +02:00
|
|
|
try {
|
|
|
|
/*
|
|
|
|
* The uploaded file is in the POST body as a "file part". If
|
|
|
|
* we call request.getParameter() then the POST body will get
|
|
|
|
* read and we won't have a chance to parse the body ourselves.
|
|
|
|
* This is why we have to parse the URL for parameters ourselves.
|
|
|
|
* Don't call request.getParameter() before calling internalImport().
|
|
|
|
*/
|
|
|
|
Properties options = ParsingUtilities.parseUrlParameters(request);
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
Project project = new Project();
|
2010-09-17 03:00:23 +02:00
|
|
|
ProjectMetadata pm = new ProjectMetadata();
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-09-17 03:00:23 +02:00
|
|
|
internalImport(request, project, pm, options);
|
2010-05-05 01:24:48 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The import process above populates options with parameters
|
|
|
|
* in the POST body. That's why we're constructing the project
|
|
|
|
* metadata object after calling internalImport().
|
|
|
|
*/
|
|
|
|
pm.setName(options.getProperty("project-name"));
|
|
|
|
pm.setPassword(options.getProperty("project-password"));
|
|
|
|
pm.setEncoding(options.getProperty("encoding"));
|
|
|
|
pm.setEncodingConfidence(options.getProperty("encoding_confidence"));
|
|
|
|
ProjectManager.singleton.registerProject(project, pm);
|
|
|
|
|
2010-05-19 06:22:45 +02:00
|
|
|
project.update();
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-06-15 22:28:18 +02:00
|
|
|
redirect(response, "/project?project=" + project.id);
|
2010-05-05 01:24:48 +02:00
|
|
|
} catch (Exception e) {
|
2010-10-13 06:51:01 +02:00
|
|
|
respondWithErrorPage(request, response, "Failed to import file", e);
|
2010-05-09 06:34:36 +02:00
|
|
|
} finally {
|
2010-05-12 11:02:41 +02:00
|
|
|
ProjectManager.singleton.setBusy(false);
|
2010-05-05 01:24:48 +02:00
|
|
|
}
|
|
|
|
}
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
protected void internalImport(
|
|
|
|
HttpServletRequest request,
|
|
|
|
Project project,
|
2010-09-17 03:00:23 +02:00
|
|
|
ProjectMetadata metadata,
|
2010-05-05 01:24:48 +02:00
|
|
|
Properties options
|
|
|
|
) throws Exception {
|
|
|
|
|
|
|
|
ServletFileUpload upload = new ServletFileUpload();
|
2010-08-06 08:15:05 +02:00
|
|
|
String url = options.getProperty("url");
|
|
|
|
boolean imported = false;
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
FileItemIterator iter = upload.getItemIterator(request);
|
|
|
|
while (iter.hasNext()) {
|
|
|
|
FileItemStream item = iter.next();
|
|
|
|
String name = item.getFieldName().toLowerCase();
|
|
|
|
InputStream stream = item.openStream();
|
|
|
|
if (item.isFormField()) {
|
|
|
|
if (name.equals("raw-text")) {
|
|
|
|
Reader reader = new InputStreamReader(stream,"UTF-8");
|
|
|
|
try {
|
2010-09-17 03:00:23 +02:00
|
|
|
internalInvokeImporter(project, new TsvCsvImporter(), metadata, options, reader);
|
2010-08-06 08:15:05 +02:00
|
|
|
imported = true;
|
2010-05-05 01:24:48 +02:00
|
|
|
} finally {
|
|
|
|
reader.close();
|
|
|
|
}
|
2010-08-06 08:15:05 +02:00
|
|
|
} else if (name.equals("project-url")) {
|
2010-05-05 01:24:48 +02:00
|
|
|
url = Streams.asString(stream);
|
|
|
|
} else {
|
|
|
|
options.put(name, Streams.asString(stream));
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
String fileName = item.getName().toLowerCase();
|
2010-08-06 08:15:05 +02:00
|
|
|
if (fileName.length() > 0) {
|
|
|
|
try {
|
2010-09-17 03:00:23 +02:00
|
|
|
internalImportFile(project, metadata, options, fileName, stream);
|
2010-08-06 08:15:05 +02:00
|
|
|
imported = true;
|
|
|
|
} finally {
|
|
|
|
stream.close();
|
|
|
|
}
|
2010-05-05 01:24:48 +02:00
|
|
|
}
|
|
|
|
}
|
2010-05-26 15:18:48 +02:00
|
|
|
}
|
2010-05-05 01:24:48 +02:00
|
|
|
|
2010-08-06 08:15:05 +02:00
|
|
|
if (!imported && url != null && url.length() > 0) {
|
2010-09-17 03:00:23 +02:00
|
|
|
internalImportURL(request, project, metadata, options, url);
|
2010-05-05 01:24:48 +02:00
|
|
|
}
|
|
|
|
}
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
static class SafeInputStream extends FilterInputStream {
|
|
|
|
public SafeInputStream(InputStream stream) {
|
|
|
|
super(stream);
|
|
|
|
}
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
@Override
|
|
|
|
public void close() {
|
2010-05-26 15:18:48 +02:00
|
|
|
// some libraries attempt to close the input stream while they can't
|
|
|
|
// read anymore from it... unfortunately this behavior prevents
|
2010-05-05 01:24:48 +02:00
|
|
|
// the zip input stream from functioning correctly so we just have
|
|
|
|
// to ignore those close() calls and just close it ourselves
|
|
|
|
// forcefully later
|
|
|
|
}
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
public void reallyClose() throws IOException {
|
|
|
|
super.close();
|
|
|
|
}
|
|
|
|
}
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
protected void internalImportFile(
|
2010-09-17 03:00:23 +02:00
|
|
|
Project project,
|
|
|
|
ProjectMetadata metadata,
|
|
|
|
Properties options,
|
|
|
|
String fileName,
|
|
|
|
InputStream inputStream
|
2010-05-05 01:24:48 +02:00
|
|
|
) throws Exception {
|
|
|
|
|
|
|
|
logger.info("Importing '{}'", fileName);
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
if (fileName.endsWith(".zip") || fileName.endsWith(".tar.gz") || fileName.endsWith(".tgz") || fileName.endsWith(".tar.bz2")) {
|
|
|
|
|
2010-05-26 15:18:48 +02:00
|
|
|
// first, save the file on disk, since we need two passes and we might
|
2010-05-05 01:24:48 +02:00
|
|
|
// not have enough memory to keep it all in there
|
|
|
|
File file = save(inputStream);
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
// in the first pass, gather statistics about what files are in there
|
|
|
|
// unfortunately, we have to rely on files extensions, which is horrible but
|
|
|
|
// better than nothing
|
|
|
|
HashMap<String,Integer> ext_map = new HashMap<String,Integer>();
|
|
|
|
|
2010-05-05 03:35:51 +02:00
|
|
|
FileInputStream fis = new FileInputStream(file);
|
|
|
|
InputStream is = getStream(fileName, fis);
|
2010-05-26 15:18:48 +02:00
|
|
|
|
|
|
|
// NOTE(SM): unfortunately, java.io does not provide any generalized class for
|
|
|
|
// archive-like input streams so while both TarInputStream and ZipInputStream
|
2010-05-05 01:24:48 +02:00
|
|
|
// behave precisely the same, there is no polymorphic behavior so we have
|
|
|
|
// to treat each instance explicitly... one of those times you wish you had
|
|
|
|
// closures
|
|
|
|
try {
|
|
|
|
if (is instanceof TarInputStream) {
|
|
|
|
TarInputStream tis = (TarInputStream) is;
|
|
|
|
TarEntry te;
|
|
|
|
while ((te = tis.getNextEntry()) != null) {
|
|
|
|
if (!te.isDirectory()) {
|
|
|
|
mapExtension(te.getName(),ext_map);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (is instanceof ZipInputStream) {
|
|
|
|
ZipInputStream zis = (ZipInputStream) is;
|
|
|
|
ZipEntry ze;
|
|
|
|
while ((ze = zis.getNextEntry()) != null) {
|
|
|
|
if (!ze.isDirectory()) {
|
|
|
|
mapExtension(ze.getName(),ext_map);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} finally {
|
|
|
|
try {
|
|
|
|
is.close();
|
2010-05-05 03:35:51 +02:00
|
|
|
fis.close();
|
2010-05-05 01:24:48 +02:00
|
|
|
} catch (IOException e) {}
|
|
|
|
}
|
|
|
|
|
|
|
|
// sort extensions by how often they appear
|
|
|
|
List<Entry<String,Integer>> values = new ArrayList<Entry<String,Integer>>(ext_map.entrySet());
|
|
|
|
Collections.sort(values, new ValuesComparator());
|
|
|
|
|
|
|
|
if (values.size() == 0) {
|
|
|
|
throw new RuntimeException("The archive contains no files.");
|
|
|
|
}
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
// this will contain the set of extensions we'll load from the archive
|
|
|
|
HashSet<String> exts = new HashSet<String>();
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
// find the extension that is most frequent or those who share the highest frequency value
|
|
|
|
if (values.size() == 1) {
|
|
|
|
exts.add(values.get(0).getKey());
|
|
|
|
} else {
|
|
|
|
Entry<String,Integer> most_frequent = values.get(0);
|
|
|
|
Entry<String,Integer> second_most_frequent = values.get(1);
|
|
|
|
if (most_frequent.getValue() > second_most_frequent.getValue()) { // we have a winner
|
|
|
|
exts.add(most_frequent.getKey());
|
|
|
|
} else { // multiple extensions have the same frequency
|
|
|
|
int winning_frequency = most_frequent.getValue();
|
|
|
|
for (Entry<String,Integer> e : values) {
|
|
|
|
if (e.getValue() == winning_frequency) {
|
|
|
|
exts.add(e.getKey());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
logger.info("Most frequent extensions: {}", exts.toString());
|
|
|
|
|
|
|
|
// second pass, load the data for real
|
|
|
|
is = getStream(fileName, new FileInputStream(file));
|
|
|
|
SafeInputStream sis = new SafeInputStream(is);
|
|
|
|
try {
|
|
|
|
if (is instanceof TarInputStream) {
|
|
|
|
TarInputStream tis = (TarInputStream) is;
|
|
|
|
TarEntry te;
|
|
|
|
while ((te = tis.getNextEntry()) != null) {
|
|
|
|
if (!te.isDirectory()) {
|
|
|
|
String name = te.getName();
|
|
|
|
String ext = getExtension(name)[1];
|
|
|
|
if (exts.contains(ext)) {
|
2010-09-17 03:00:23 +02:00
|
|
|
internalImportFile(project, metadata, options, name, sis);
|
2010-05-05 01:24:48 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (is instanceof ZipInputStream) {
|
|
|
|
ZipInputStream zis = (ZipInputStream) is;
|
|
|
|
ZipEntry ze;
|
|
|
|
while ((ze = zis.getNextEntry()) != null) {
|
|
|
|
if (!ze.isDirectory()) {
|
|
|
|
String name = ze.getName();
|
|
|
|
String ext = getExtension(name)[1];
|
|
|
|
if (exts.contains(ext)) {
|
2010-09-17 03:00:23 +02:00
|
|
|
internalImportFile(project, metadata, options, name, sis);
|
2010-05-05 01:24:48 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} finally {
|
|
|
|
try {
|
|
|
|
sis.reallyClose();
|
|
|
|
} catch (IOException e) {}
|
|
|
|
}
|
|
|
|
|
|
|
|
} else if (fileName.endsWith(".gz")) {
|
2010-09-17 03:00:23 +02:00
|
|
|
internalImportFile(project, metadata, options, getExtension(fileName)[0], new GZIPInputStream(inputStream));
|
2010-05-05 01:24:48 +02:00
|
|
|
} else if (fileName.endsWith(".bz2")) {
|
2010-09-17 03:00:23 +02:00
|
|
|
internalImportFile(project, metadata, options, getExtension(fileName)[0], new CBZip2InputStream(inputStream));
|
2010-05-05 01:24:48 +02:00
|
|
|
} else {
|
2010-09-17 03:00:23 +02:00
|
|
|
load(project, metadata, options, fileName, inputStream);
|
2010-05-05 01:24:48 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
public static class ValuesComparator implements Comparator<Entry<String,Integer>>, Serializable {
|
|
|
|
private static final long serialVersionUID = 8845863616149837657L;
|
|
|
|
|
|
|
|
public int compare(Entry<String,Integer> o1, Entry<String,Integer> o2) {
|
|
|
|
return o2.getValue() - o1.getValue();
|
|
|
|
}
|
|
|
|
}
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-09-17 03:00:23 +02:00
|
|
|
private void load(Project project, ProjectMetadata metadata, Properties options, String fileName, InputStream inputStream) throws Exception {
|
2010-08-22 03:46:32 +02:00
|
|
|
Importer importer = ImporterRegistry.guessImporter(null, fileName);
|
2010-09-17 03:00:23 +02:00
|
|
|
internalInvokeImporter(project, importer, metadata, options, inputStream, null);
|
2010-05-05 01:24:48 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
private File save(InputStream is) throws IOException {
|
2010-05-29 01:19:08 +02:00
|
|
|
File temp = this.servlet.getTempFile(Long.toString(System.currentTimeMillis()));
|
2010-05-05 01:24:48 +02:00
|
|
|
temp.deleteOnExit();
|
|
|
|
IOUtils.copy(is,temp);
|
|
|
|
is.close();
|
|
|
|
return temp;
|
|
|
|
}
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
private void mapExtension(String name, Map<String,Integer> ext_map) {
|
|
|
|
String ext = getExtension(name)[1];
|
|
|
|
if (ext_map.containsKey(ext)) {
|
|
|
|
ext_map.put(ext, ext_map.get(ext) + 1);
|
|
|
|
} else {
|
|
|
|
ext_map.put(ext, 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private InputStream getStream(String fileName, InputStream is) throws IOException {
|
2010-05-26 15:18:48 +02:00
|
|
|
if (fileName.endsWith(".tar.gz") || fileName.endsWith(".tgz")) {
|
2010-05-05 01:24:48 +02:00
|
|
|
return new TarInputStream(new GZIPInputStream(is));
|
|
|
|
} else if (fileName.endsWith(".tar.bz2")) {
|
|
|
|
return new TarInputStream(new CBZip2InputStream(is));
|
|
|
|
} else {
|
|
|
|
return new ZipInputStream(is);
|
|
|
|
}
|
|
|
|
}
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
private String[] getExtension(String filename) {
|
|
|
|
String[] result = new String[2];
|
|
|
|
int ext_index = filename.lastIndexOf('.');
|
|
|
|
result[0] = (ext_index == -1) ? filename : filename.substring(0,ext_index);
|
|
|
|
result[1] = (ext_index == -1) ? "" : filename.substring(ext_index + 1);
|
|
|
|
return result;
|
|
|
|
}
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-09-17 03:00:23 +02:00
|
|
|
protected void internalImportURL(
|
|
|
|
HttpServletRequest request,
|
|
|
|
Project project,
|
|
|
|
ProjectMetadata metadata,
|
|
|
|
Properties options,
|
|
|
|
String urlString) throws Exception {
|
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
URL url = new URL(urlString);
|
|
|
|
URLConnection connection = null;
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-08-06 07:04:25 +02:00
|
|
|
// Try for a URL importer first
|
2010-08-22 03:46:32 +02:00
|
|
|
Importer importer = ImporterRegistry.guessUrlImporter(url);
|
2010-08-06 07:04:25 +02:00
|
|
|
if (importer instanceof UrlImporter) {
|
2010-09-17 03:00:23 +02:00
|
|
|
((UrlImporter) importer).read(url, project, metadata, options);
|
2010-08-06 07:04:25 +02:00
|
|
|
} else {
|
|
|
|
// If we couldn't find one, try opening URL and treating as a stream
|
|
|
|
try {
|
|
|
|
connection = url.openConnection();
|
|
|
|
connection.setConnectTimeout(5000);
|
|
|
|
connection.connect();
|
|
|
|
} catch (Exception e) {
|
|
|
|
throw new Exception("Cannot connect to " + urlString, e);
|
|
|
|
}
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-08-06 07:04:25 +02:00
|
|
|
InputStream inputStream = null;
|
|
|
|
try {
|
|
|
|
inputStream = connection.getInputStream();
|
|
|
|
} catch (Exception e) {
|
|
|
|
throw new Exception("Cannot retrieve content from " + url, e);
|
|
|
|
}
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-08-06 07:04:25 +02:00
|
|
|
try {
|
2010-08-25 19:35:16 +02:00
|
|
|
String contentType = connection.getContentType();
|
|
|
|
int semicolon = contentType.indexOf(';');
|
|
|
|
if (semicolon >= 0) {
|
|
|
|
contentType = contentType.substring(0, semicolon);
|
|
|
|
}
|
|
|
|
|
|
|
|
importer = ImporterRegistry.guessImporter(contentType, url.getPath());
|
2010-08-06 08:15:05 +02:00
|
|
|
|
2010-09-17 03:00:23 +02:00
|
|
|
internalInvokeImporter(project, importer, metadata, options, inputStream, connection.getContentEncoding());
|
2010-08-06 07:04:25 +02:00
|
|
|
} finally {
|
|
|
|
inputStream.close();
|
|
|
|
}
|
2010-05-05 01:24:48 +02:00
|
|
|
}
|
|
|
|
}
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
protected void internalInvokeImporter(
|
2010-09-17 03:00:23 +02:00
|
|
|
Project project,
|
|
|
|
Importer importer,
|
|
|
|
ProjectMetadata metadata,
|
|
|
|
Properties options,
|
|
|
|
InputStream rawInputStream,
|
|
|
|
String encoding
|
2010-05-05 01:24:48 +02:00
|
|
|
) throws Exception {
|
2010-08-06 07:04:25 +02:00
|
|
|
if (importer instanceof ReaderImporter) {
|
2010-05-05 01:24:48 +02:00
|
|
|
|
|
|
|
BufferedInputStream inputStream = new BufferedInputStream(rawInputStream);
|
2010-05-26 15:18:48 +02:00
|
|
|
|
|
|
|
// NOTE(SM): The ICU4J char detection code requires the input stream to support mark/reset.
|
|
|
|
// Unfortunately, not all ServletInputStream implementations are marking, so we need do
|
|
|
|
// this memory-expensive wrapping to make it work. It's far from ideal but I don't have
|
2010-05-05 01:24:48 +02:00
|
|
|
// a more efficient solution.
|
|
|
|
byte[] bytes = new byte[1024 * 4];
|
|
|
|
inputStream.mark(bytes.length);
|
|
|
|
inputStream.read(bytes);
|
|
|
|
inputStream.reset();
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
CharsetDetector detector = new CharsetDetector();
|
|
|
|
detector.setDeclaredEncoding("utf8"); // most of the content on the web is encoded in UTF-8 so start with that
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
Reader reader = null;
|
|
|
|
CharsetMatch[] charsetMatches = detector.setText(bytes).detectAll();
|
|
|
|
for (CharsetMatch charsetMatch : charsetMatches) {
|
|
|
|
try {
|
2010-09-28 02:26:53 +02:00
|
|
|
int confidence = charsetMatch.getConfidence();
|
|
|
|
if (confidence >= 50) {
|
|
|
|
reader = new InputStreamReader(inputStream, charsetMatch.getName());
|
|
|
|
|
|
|
|
options.setProperty("encoding", charsetMatch.getName());
|
|
|
|
options.setProperty("encoding_confidence", Integer.toString(confidence));
|
|
|
|
|
|
|
|
logger.info("Best encoding guess: {} [confidence: {}]", charsetMatch.getName(), charsetMatch.getConfidence());
|
|
|
|
}
|
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
break;
|
|
|
|
} catch (UnsupportedEncodingException e) {
|
|
|
|
// silent
|
|
|
|
}
|
|
|
|
}
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
if (reader == null) { // when all else fails
|
|
|
|
reader = encoding != null ?
|
|
|
|
new InputStreamReader(inputStream, encoding) :
|
|
|
|
new InputStreamReader(inputStream);
|
|
|
|
}
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-09-17 03:00:23 +02:00
|
|
|
((ReaderImporter) importer).read(reader, project, metadata, options);
|
2010-05-05 01:24:48 +02:00
|
|
|
} else {
|
2010-09-17 03:00:23 +02:00
|
|
|
((StreamImporter) importer).read(rawInputStream, project, metadata, options);
|
2010-05-26 15:18:48 +02:00
|
|
|
}
|
2010-05-05 01:24:48 +02:00
|
|
|
}
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
protected void internalInvokeImporter(
|
2010-09-17 03:00:23 +02:00
|
|
|
Project project,
|
|
|
|
ReaderImporter importer,
|
|
|
|
ProjectMetadata metadata,
|
|
|
|
Properties options,
|
|
|
|
Reader reader
|
2010-05-05 01:24:48 +02:00
|
|
|
) throws Exception {
|
2010-09-17 03:00:23 +02:00
|
|
|
importer.read(reader, project, metadata, options);
|
2010-05-05 01:24:48 +02:00
|
|
|
}
|
2010-05-26 15:18:48 +02:00
|
|
|
|
2010-05-05 01:24:48 +02:00
|
|
|
}
|