Issue 237 - Make sure project's character encoding is always set. Lower minimum confidence threshold for guesser.
git-svn-id: http://google-refine.googlecode.com/svn/trunk@1931 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
d5a742944e
commit
080ec5332e
@@ -448,35 +448,44 @@ public class CreateProjectCommand extends Command {
|
|||||||
|
|
||||||
CharsetDetector detector = new CharsetDetector();
|
CharsetDetector detector = new CharsetDetector();
|
||||||
detector.setDeclaredEncoding("utf8"); // most of the content on the web is encoded in UTF-8 so start with that
|
detector.setDeclaredEncoding("utf8"); // most of the content on the web is encoded in UTF-8 so start with that
|
||||||
|
options.setProperty("encoding_confidence", "0"); // in case we don't find anything suitable
|
||||||
|
|
||||||
Reader reader = null;
|
InputStreamReader reader = null;
|
||||||
CharsetMatch[] charsetMatches = detector.setText(bytes).detectAll();
|
CharsetMatch[] charsetMatches = detector.setText(bytes).detectAll();
|
||||||
for (CharsetMatch charsetMatch : charsetMatches) {
|
if (charsetMatches.length > 0) {
|
||||||
try {
|
CharsetMatch charsetMatch = charsetMatches[0]; // matches are ordered - first is best match
|
||||||
int confidence = charsetMatch.getConfidence();
|
int confidence = charsetMatch.getConfidence();
|
||||||
if (confidence >= 50) {
|
// Threshold was 50. Do we ever want to not use our best guess even if it's low confidence? - tfmorris
|
||||||
|
if (confidence >= 20) {
|
||||||
|
try {
|
||||||
reader = new InputStreamReader(inputStream, charsetMatch.getName());
|
reader = new InputStreamReader(inputStream, charsetMatch.getName());
|
||||||
|
} catch (UnsupportedEncodingException e) {
|
||||||
options.setProperty("encoding", charsetMatch.getName());
|
// ignored - we'll fall back to a different reader later
|
||||||
options.setProperty("encoding_confidence", Integer.toString(confidence));
|
|
||||||
|
|
||||||
logger.info("Best encoding guess: {} [confidence: {}]", charsetMatch.getName(), charsetMatch.getConfidence());
|
|
||||||
}
|
}
|
||||||
|
// Encoding will be set later at common exit point
|
||||||
break;
|
options.setProperty("encoding_confidence", Integer.toString(confidence));
|
||||||
} catch (UnsupportedEncodingException e) {
|
logger.info("Best encoding guess: {} [confidence: {}]", charsetMatch.getName(), charsetMatch.getConfidence());
|
||||||
// silent
|
} else {
|
||||||
|
logger.debug("Poor encoding guess: {} [confidence: {}]", charsetMatch.getName(), charsetMatch.getConfidence());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (reader == null) { // when all else fails
|
if (reader == null) { // when all else fails
|
||||||
reader = encoding != null ?
|
if (encoding != null) {
|
||||||
new InputStreamReader(inputStream, encoding) :
|
reader = new InputStreamReader(inputStream, encoding);
|
||||||
new InputStreamReader(inputStream);
|
} else {
|
||||||
|
reader = new InputStreamReader(inputStream);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
// Get the actual encoding which will be used and save it for project metadata
|
||||||
|
options.setProperty("encoding", reader.getEncoding());
|
||||||
|
|
||||||
((ReaderImporter) importer).read(reader, project, metadata, options);
|
((ReaderImporter) importer).read(reader, project, metadata, options);
|
||||||
} else {
|
} else {
|
||||||
|
// TODO: How do we set character encoding here?
|
||||||
|
// Things won't work right if it's not set, so pick some arbitrary values
|
||||||
|
options.setProperty("encoding", encoding);
|
||||||
|
options.setProperty("encoding_confidence", "0");
|
||||||
((StreamImporter) importer).read(rawInputStream, project, metadata, options);
|
((StreamImporter) importer).read(rawInputStream, project, metadata, options);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user