Issue 404 - Fix indeterminate behavior in character encoding guesser. Thanks to Paul Makepeace.

git-svn-id: http://google-refine.googlecode.com/svn/trunk@2120 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Tom Morris 2011-06-14 04:29:44 +00:00
parent 5fa5e7b28a
commit 2af22f9485

View File

@@ -1,6 +1,6 @@
/* /*
Copyright 2010, Google Inc. Copyright 2010,2011. Google Inc.
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -447,38 +447,35 @@ public class CreateProjectCommand extends Command {
) throws Exception { ) throws Exception {
if (importer instanceof ReaderImporter) { if (importer instanceof ReaderImporter) {
BufferedInputStream inputStream = new BufferedInputStream(rawInputStream); // NOTE: The ICU4J char detection code requires the input stream to support mark/reset.
InputStream inputStream = rawInputStream;
// NOTE(SM): The ICU4J char detection code requires the input stream to support mark/reset. if (!inputStream.markSupported()) {
// Unfortunately, not all ServletInputStream implementations are marking, so we need do inputStream = new BufferedInputStream(rawInputStream);
// this memory-expensive wrapping to make it work. It's far from ideal but I don't have }
// a more efficient solution.
byte[] bytes = new byte[1024 * 4];
inputStream.mark(bytes.length);
inputStream.read(bytes);
inputStream.reset();
CharsetDetector detector = new CharsetDetector(); CharsetDetector detector = new CharsetDetector();
detector.setDeclaredEncoding("utf8"); // most of the content on the web is encoded in UTF-8 so start with that detector.setDeclaredEncoding("utf8"); // most of the content on the web is encoded in UTF-8 so start with that
options.setProperty("encoding_confidence", "0"); // in case we don't find anything suitable options.setProperty("encoding_confidence", "0"); // in case we don't find anything suitable
InputStreamReader reader = null; InputStreamReader reader = null;
CharsetMatch[] charsetMatches = detector.setText(bytes).detectAll(); CharsetMatch[] charsetMatches = detector.setText(inputStream).detectAll();
if (charsetMatches.length > 0) { for (CharsetMatch charsetMatch : charsetMatches) { // matches are ordered - first is best match
CharsetMatch charsetMatch = charsetMatches[0]; // matches are ordered - first is best match String matchName = charsetMatch.getName();
int confidence = charsetMatch.getConfidence(); int confidence = charsetMatch.getConfidence();
// Threshold was 50. Do we ever want to not use our best guess even if it's low confidence? - tfmorris // Threshold was 50. Do we ever want to not use our best guess even if it's low confidence? - tfmorris
if (confidence >= 20) { if (confidence >= 20) {
logger.info("Encoding guess: {} [confidence: {}]", matchName, confidence);
try { try {
reader = new InputStreamReader(inputStream, charsetMatch.getName()); reader = new InputStreamReader(inputStream, matchName);
} catch (UnsupportedEncodingException e) { } catch (UnsupportedEncodingException e) {
// ignored - we'll fall back to a different reader later logger.debug("Unsupported InputStreamReader charset encoding: {} [confidence: {}]; skipping", matchName, confidence);
continue;
} }
// Encoding will be set later at common exit point // Encoding will be set later at common exit point
options.setProperty("encoding_confidence", Integer.toString(confidence)); options.setProperty("encoding_confidence", Integer.toString(confidence));
logger.info("Best encoding guess: {} [confidence: {}]", charsetMatch.getName(), charsetMatch.getConfidence()); break;
} else { } else {
logger.debug("Poor encoding guess: {} [confidence: {}]", charsetMatch.getName(), charsetMatch.getConfidence()); logger.debug("Poor encoding guess: {} [confidence: {}]; skipping", matchName, confidence);
} }
} }