Fixed Issue 110: Import of single column text file with Postal Codes shows only 1 row with lots of � chars (?).

(by enforcing a minimum confidence threshold of 50 on the encoding guess; a lower-confidence guess is no longer applied)

git-svn-id: http://google-refine.googlecode.com/svn/trunk@1367 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
David Huynh 2010-09-28 00:26:53 +00:00
parent 14d046bb7a
commit 823fe989a4

View File

@ -421,13 +421,16 @@ public class CreateProjectCommand extends Command {
CharsetMatch[] charsetMatches = detector.setText(bytes).detectAll();
for (CharsetMatch charsetMatch : charsetMatches) {
try {
reader = new InputStreamReader(inputStream, charsetMatch.getName());
options.setProperty("encoding", charsetMatch.getName());
options.setProperty("encoding_confidence", Integer.toString(charsetMatch.getConfidence()));
logger.info("Best encoding guess: {} [confidence: {}]", charsetMatch.getName(), charsetMatch.getConfidence());
int confidence = charsetMatch.getConfidence();
if (confidence >= 50) {
reader = new InputStreamReader(inputStream, charsetMatch.getName());
options.setProperty("encoding", charsetMatch.getName());
options.setProperty("encoding_confidence", Integer.toString(confidence));
logger.info("Best encoding guess: {} [confidence: {}]", charsetMatch.getName(), charsetMatch.getConfidence());
}
break;
} catch (UnsupportedEncodingException e) {
// silent