Fixed Issue 110: Import of single column text file with Postal Codes shows only 1 row with lots of � chars (?).
(by enforcing a confidence threshold on the encoding guessing)

git-svn-id: http://google-refine.googlecode.com/svn/trunk@1367 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
14d046bb7a
commit
823fe989a4
@@ -421,13 +421,16 @@ public class CreateProjectCommand extends Command {
|
||||
CharsetMatch[] charsetMatches = detector.setText(bytes).detectAll();
|
||||
for (CharsetMatch charsetMatch : charsetMatches) {
|
||||
try {
|
||||
reader = new InputStreamReader(inputStream, charsetMatch.getName());
|
||||
|
||||
options.setProperty("encoding", charsetMatch.getName());
|
||||
options.setProperty("encoding_confidence", Integer.toString(charsetMatch.getConfidence()));
|
||||
|
||||
logger.info("Best encoding guess: {} [confidence: {}]", charsetMatch.getName(), charsetMatch.getConfidence());
|
||||
|
||||
int confidence = charsetMatch.getConfidence();
|
||||
if (confidence >= 50) {
|
||||
reader = new InputStreamReader(inputStream, charsetMatch.getName());
|
||||
|
||||
options.setProperty("encoding", charsetMatch.getName());
|
||||
options.setProperty("encoding_confidence", Integer.toString(confidence));
|
||||
|
||||
logger.info("Best encoding guess: {} [confidence: {}]", charsetMatch.getName(), charsetMatch.getConfidence());
|
||||
}
|
||||
|
||||
break;
|
||||
} catch (UnsupportedEncodingException e) {
|
||||
// silent
|
||||
|
Loading…
Reference in New Issue
Block a user