Issue 404 - Fix indeterminate behavior in character encoding guesser. Thanks to Paul Makepeace.
git-svn-id: http://google-refine.googlecode.com/svn/trunk@2120 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
5fa5e7b28a
commit
2af22f9485
@ -1,6 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
|
|
||||||
Copyright 2010, Google Inc.
|
Copyright 2010,2011. Google Inc.
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
|
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
@ -447,38 +447,35 @@ public class CreateProjectCommand extends Command {
|
|||||||
) throws Exception {
|
) throws Exception {
|
||||||
if (importer instanceof ReaderImporter) {
|
if (importer instanceof ReaderImporter) {
|
||||||
|
|
||||||
BufferedInputStream inputStream = new BufferedInputStream(rawInputStream);
|
// NOTE: The ICU4J char detection code requires the input stream to support mark/reset.
|
||||||
|
InputStream inputStream = rawInputStream;
|
||||||
// NOTE(SM): The ICU4J char detection code requires the input stream to support mark/reset.
|
if (!inputStream.markSupported()) {
|
||||||
// Unfortunately, not all ServletInputStream implementations support marking, so we need to do
|
inputStream = new BufferedInputStream(rawInputStream);
|
||||||
// this memory-expensive wrapping to make it work. It's far from ideal but I don't have
|
}
|
||||||
// a more efficient solution.
|
|
||||||
byte[] bytes = new byte[1024 * 4];
|
|
||||||
inputStream.mark(bytes.length);
|
|
||||||
inputStream.read(bytes);
|
|
||||||
inputStream.reset();
|
|
||||||
|
|
||||||
CharsetDetector detector = new CharsetDetector();
|
CharsetDetector detector = new CharsetDetector();
|
||||||
detector.setDeclaredEncoding("utf8"); // most of the content on the web is encoded in UTF-8 so start with that
|
detector.setDeclaredEncoding("utf8"); // most of the content on the web is encoded in UTF-8 so start with that
|
||||||
options.setProperty("encoding_confidence", "0"); // in case we don't find anything suitable
|
options.setProperty("encoding_confidence", "0"); // in case we don't find anything suitable
|
||||||
|
|
||||||
InputStreamReader reader = null;
|
InputStreamReader reader = null;
|
||||||
CharsetMatch[] charsetMatches = detector.setText(bytes).detectAll();
|
CharsetMatch[] charsetMatches = detector.setText(inputStream).detectAll();
|
||||||
if (charsetMatches.length > 0) {
|
for (CharsetMatch charsetMatch : charsetMatches) { // matches are ordered - first is best match
|
||||||
CharsetMatch charsetMatch = charsetMatches[0]; // matches are ordered - first is best match
|
String matchName = charsetMatch.getName();
|
||||||
int confidence = charsetMatch.getConfidence();
|
int confidence = charsetMatch.getConfidence();
|
||||||
// Threshold was 50. Do we ever want to not use our best guess even if it's low confidence? - tfmorris
|
// Threshold was 50. Do we ever want to not use our best guess even if it's low confidence? - tfmorris
|
||||||
if (confidence >= 20) {
|
if (confidence >= 20) {
|
||||||
|
logger.info("Encoding guess: {} [confidence: {}]", matchName, confidence);
|
||||||
try {
|
try {
|
||||||
reader = new InputStreamReader(inputStream, charsetMatch.getName());
|
reader = new InputStreamReader(inputStream, matchName);
|
||||||
} catch (UnsupportedEncodingException e) {
|
} catch (UnsupportedEncodingException e) {
|
||||||
// ignored - we'll fall back to a different reader later
|
logger.debug("Unsupported InputStreamReader charset encoding: {} [confidence: {}]; skipping", matchName, confidence);
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
// Encoding will be set later at common exit point
|
// Encoding will be set later at common exit point
|
||||||
options.setProperty("encoding_confidence", Integer.toString(confidence));
|
options.setProperty("encoding_confidence", Integer.toString(confidence));
|
||||||
logger.info("Best encoding guess: {} [confidence: {}]", charsetMatch.getName(), charsetMatch.getConfidence());
|
break;
|
||||||
} else {
|
} else {
|
||||||
logger.debug("Poor encoding guess: {} [confidence: {}]", charsetMatch.getName(), charsetMatch.getConfidence());
|
logger.debug("Poor encoding guess: {} [confidence: {}]; skipping", matchName, confidence);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user