Increased file upload size limit to 1GB.

Fixed charset detector to be more robust in trying more than one charset.

git-svn-id: http://google-refine.googlecode.com/svn/trunk@326 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
David Huynh 2010-03-19 19:53:11 +00:00
parent fd85be7816
commit ff0049307e

View File

@ -1,12 +1,12 @@
package com.metaweb.gridworks.commands.edit;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Properties;
@ -67,12 +67,7 @@ public class CreateProjectCommand extends Command {
Project project,
Properties options
) throws Exception {
MultipartParser parser = null;
try {
parser = new MultipartParser(request, 20 * 1024 * 1024);
} catch (Exception e) {
// silent
}
MultipartParser parser = new MultipartParser(request, 1024 * 1024 * 1024);
if (parser != null) {
Part part = null;
@ -100,24 +95,56 @@ public class CreateProjectCommand extends Command {
if (part.isFile()) {
FilePart filePart = (FilePart) part;
BufferedInputStream inputStream = new BufferedInputStream(filePart.getInputStream());
Importer importer = guessImporter(options, null, filePart.getFileName());
if (importer.takesReader()) {
/*
* NOTE(SM): The ICU4J char detection code requires the input stream to support mark/reset.
* Unfortunately, not all ServletInputStream implementations are marking, so we need do
* this memory-expensive wrapping to make it work. It's far from ideal but I don't have
* a more efficient solution.
*/
byte[] bytes = new byte[1024 * 4];
{
inputStream.mark(bytes.length);
inputStream.read(bytes);
inputStream.reset();
}
CharsetDetector detector = new CharsetDetector();
detector.setDeclaredEncoding("utf8"); // the content on the web is encoded in UTF-8 so assume that
CharsetMatch charsetMatch = detector.setText(enforceMarking(filePart.getInputStream())).detect();
options.setProperty("encoding", charsetMatch.getName());
options.setProperty("encoding_confidence", Integer.toString(charsetMatch.getConfidence()));
Gridworks.log("Best encoding guess: " + charsetMatch.getName() + " [confidence: " + charsetMatch.getConfidence() + "]");
Reader reader = charsetMatch.getReader();
Reader reader = null;
CharsetMatch[] charsetMatches = detector.setText(bytes).detectAll();
for (CharsetMatch charsetMatch : charsetMatches) {
try {
reader = new InputStreamReader(inputStream, charsetMatch.getName());
options.setProperty("encoding", charsetMatch.getName());
options.setProperty("encoding_confidence", Integer.toString(charsetMatch.getConfidence()));
Gridworks.log(
"Best encoding guess: " +
charsetMatch.getName() +
" [confidence: " + charsetMatch.getConfidence() + "]");
break;
} catch (UnsupportedEncodingException e) {
// silent
}
}
if (reader == null) {
reader = new InputStreamReader(inputStream); // all else has failed
}
try {
importer.read(charsetMatch.getReader(), project, options, skip, limit);
importer.read(reader, project, options, skip, limit);
} finally {
reader.close();
}
} else {
InputStream inputStream = filePart.getInputStream();
try {
importer.read(inputStream, project, options, skip, limit);
} finally {
@ -224,29 +251,4 @@ public class CreateProjectCommand extends Command {
return new TsvCsvImporter();
}
/*
* NOTE(SM): The ICU4J char detection code requires the input stream to support mark/reset. Unfortunately, not
* all ServletInputStream implementations are marking, so we need do this memory-expensive wrapping to make
* it work. It's far from ideal but I don't have a more efficient solution.
*/
private static InputStream enforceMarking(InputStream input) throws IOException {
if (input.markSupported()) {
return input;
} else {
ByteArrayOutputStream output = new ByteArrayOutputStream(64 * 1024);
byte[] buffer = new byte[1024 * 4];
long count = 0;
int n = 0;
while (-1 != (n = input.read(buffer))) {
output.write(buffer, 0, n);
count += n;
}
input.close();
return new ByteArrayInputStream(output.toByteArray());
}
}
}