Stricter detection of JSON and XML formats on import, by checking the first non-whitespace character.

git-svn-id: http://google-refine.googlecode.com/svn/trunk@2266 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
David Huynh 2011-09-30 01:47:42 +00:00
parent 2deae9d785
commit 7935dfd60e

View File

@ -25,6 +25,9 @@ public class TextFormatGuesser implements FormatGuesser {
int openAngleBrackets = 0; int openAngleBrackets = 0;
int closeAngleBrackets = 0; int closeAngleBrackets = 0;
char firstChar = ' ';
boolean foundFirstChar = false;
char[] chars = new char[4096]; char[] chars = new char[4096];
int c; int c;
while (totalBytes < 64 * 1024 && (c = reader.read(chars)) > 0) { while (totalBytes < 64 * 1024 && (c = reader.read(chars)) > 0) {
@ -34,16 +37,26 @@ public class TextFormatGuesser implements FormatGuesser {
openAngleBrackets += countSubstrings(chunk, "<"); openAngleBrackets += countSubstrings(chunk, "<");
closeAngleBrackets += countSubstrings(chunk, ">"); closeAngleBrackets += countSubstrings(chunk, ">");
if (!foundFirstChar) {
chunk = chunk.trim();
if (chunk.length() > 0) {
firstChar = chunk.charAt(0);
foundFirstChar = true;
}
}
totalBytes += c; totalBytes += c;
} }
if (openBraces >= 5 && closeBraces >= 5) { if (foundFirstChar) {
if ((firstChar == '{' || firstChar == '[') &&
openBraces >= 5 && closeBraces >= 5) {
return "text/json"; return "text/json";
} else if (openAngleBrackets >= 5 && closeAngleBrackets >= 5) { } else if (firstChar == '<' &&
openAngleBrackets >= 5 && closeAngleBrackets >= 5) {
return "text/xml"; return "text/xml";
} else {
return "text/line-based";
} }
}
return "text/line-based";
} finally { } finally {
is.close(); is.close();
} }