Increased file upload size limit to 1GB.
Fixed the charset detector to be more robust by trying more than one candidate charset.

git-svn-id: http://google-refine.googlecode.com/svn/trunk@326 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
fd85be7816
commit
ff0049307e
@ -1,12 +1,12 @@
|
|||||||
package com.metaweb.gridworks.commands.edit;
|
package com.metaweb.gridworks.commands.edit;
|
||||||
|
|
||||||
import java.io.ByteArrayInputStream;
|
import java.io.BufferedInputStream;
|
||||||
import java.io.ByteArrayOutputStream;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
import java.io.UnsupportedEncodingException;
|
||||||
import java.net.URL;
|
import java.net.URL;
|
||||||
import java.net.URLConnection;
|
import java.net.URLConnection;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
@ -67,12 +67,7 @@ public class CreateProjectCommand extends Command {
|
|||||||
Project project,
|
Project project,
|
||||||
Properties options
|
Properties options
|
||||||
) throws Exception {
|
) throws Exception {
|
||||||
MultipartParser parser = null;
|
MultipartParser parser = new MultipartParser(request, 1024 * 1024 * 1024);
|
||||||
try {
|
|
||||||
parser = new MultipartParser(request, 20 * 1024 * 1024);
|
|
||||||
} catch (Exception e) {
|
|
||||||
// silent
|
|
||||||
}
|
|
||||||
|
|
||||||
if (parser != null) {
|
if (parser != null) {
|
||||||
Part part = null;
|
Part part = null;
|
||||||
@ -100,24 +95,56 @@ public class CreateProjectCommand extends Command {
|
|||||||
|
|
||||||
if (part.isFile()) {
|
if (part.isFile()) {
|
||||||
FilePart filePart = (FilePart) part;
|
FilePart filePart = (FilePart) part;
|
||||||
|
BufferedInputStream inputStream = new BufferedInputStream(filePart.getInputStream());
|
||||||
|
|
||||||
Importer importer = guessImporter(options, null, filePart.getFileName());
|
Importer importer = guessImporter(options, null, filePart.getFileName());
|
||||||
|
|
||||||
if (importer.takesReader()) {
|
if (importer.takesReader()) {
|
||||||
|
/*
|
||||||
|
* NOTE(SM): The ICU4J char detection code requires the input stream to support mark/reset.
|
||||||
|
* Unfortunately, not all ServletInputStream implementations are marking, so we need do
|
||||||
|
* this memory-expensive wrapping to make it work. It's far from ideal but I don't have
|
||||||
|
* a more efficient solution.
|
||||||
|
*/
|
||||||
|
byte[] bytes = new byte[1024 * 4];
|
||||||
|
{
|
||||||
|
inputStream.mark(bytes.length);
|
||||||
|
inputStream.read(bytes);
|
||||||
|
inputStream.reset();
|
||||||
|
}
|
||||||
|
|
||||||
CharsetDetector detector = new CharsetDetector();
|
CharsetDetector detector = new CharsetDetector();
|
||||||
detector.setDeclaredEncoding("utf8"); // the content on the web is encoded in UTF-8 so assume that
|
detector.setDeclaredEncoding("utf8"); // the content on the web is encoded in UTF-8 so assume that
|
||||||
CharsetMatch charsetMatch = detector.setText(enforceMarking(filePart.getInputStream())).detect();
|
|
||||||
options.setProperty("encoding", charsetMatch.getName());
|
Reader reader = null;
|
||||||
options.setProperty("encoding_confidence", Integer.toString(charsetMatch.getConfidence()));
|
CharsetMatch[] charsetMatches = detector.setText(bytes).detectAll();
|
||||||
Gridworks.log("Best encoding guess: " + charsetMatch.getName() + " [confidence: " + charsetMatch.getConfidence() + "]");
|
for (CharsetMatch charsetMatch : charsetMatches) {
|
||||||
Reader reader = charsetMatch.getReader();
|
try {
|
||||||
|
reader = new InputStreamReader(inputStream, charsetMatch.getName());
|
||||||
|
|
||||||
|
options.setProperty("encoding", charsetMatch.getName());
|
||||||
|
options.setProperty("encoding_confidence", Integer.toString(charsetMatch.getConfidence()));
|
||||||
|
|
||||||
|
Gridworks.log(
|
||||||
|
"Best encoding guess: " +
|
||||||
|
charsetMatch.getName() +
|
||||||
|
" [confidence: " + charsetMatch.getConfidence() + "]");
|
||||||
|
|
||||||
|
break;
|
||||||
|
} catch (UnsupportedEncodingException e) {
|
||||||
|
// silent
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (reader == null) {
|
||||||
|
reader = new InputStreamReader(inputStream); // all else has failed
|
||||||
|
}
|
||||||
try {
|
try {
|
||||||
importer.read(charsetMatch.getReader(), project, options, skip, limit);
|
importer.read(reader, project, options, skip, limit);
|
||||||
} finally {
|
} finally {
|
||||||
reader.close();
|
reader.close();
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
InputStream inputStream = filePart.getInputStream();
|
|
||||||
try {
|
try {
|
||||||
importer.read(inputStream, project, options, skip, limit);
|
importer.read(inputStream, project, options, skip, limit);
|
||||||
} finally {
|
} finally {
|
||||||
@ -224,29 +251,4 @@ public class CreateProjectCommand extends Command {
|
|||||||
|
|
||||||
return new TsvCsvImporter();
|
return new TsvCsvImporter();
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* NOTE(SM): The ICU4J char detection code requires the input stream to support mark/reset. Unfortunately, not
|
|
||||||
* all ServletInputStream implementations are marking, so we need do this memory-expensive wrapping to make
|
|
||||||
* it work. It's far from ideal but I don't have a more efficient solution.
|
|
||||||
*/
|
|
||||||
private static InputStream enforceMarking(InputStream input) throws IOException {
|
|
||||||
if (input.markSupported()) {
|
|
||||||
return input;
|
|
||||||
} else {
|
|
||||||
ByteArrayOutputStream output = new ByteArrayOutputStream(64 * 1024);
|
|
||||||
|
|
||||||
byte[] buffer = new byte[1024 * 4];
|
|
||||||
long count = 0;
|
|
||||||
int n = 0;
|
|
||||||
while (-1 != (n = input.read(buffer))) {
|
|
||||||
output.write(buffer, 0, n);
|
|
||||||
count += n;
|
|
||||||
}
|
|
||||||
input.close();
|
|
||||||
|
|
||||||
return new ByteArrayInputStream(output.toByteArray());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user