From 7935dfd60e8be49e942f919f31f293fc88e1d1a2 Mon Sep 17 00:00:00 2001 From: David Huynh Date: Fri, 30 Sep 2011 01:47:42 +0000 Subject: [PATCH] Stricter detection of json and xml formats on import, by checking for initial nonspace character. git-svn-id: http://google-refine.googlecode.com/svn/trunk@2266 7d457c2a-affb-35e4-300a-418c747d4874 --- .../refine/importers/TextFormatGuesser.java | 25 ++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/main/src/com/google/refine/importers/TextFormatGuesser.java b/main/src/com/google/refine/importers/TextFormatGuesser.java index cddf04c6a..9b559c0ea 100644 --- a/main/src/com/google/refine/importers/TextFormatGuesser.java +++ b/main/src/com/google/refine/importers/TextFormatGuesser.java @@ -25,6 +25,9 @@ public class TextFormatGuesser implements FormatGuesser { int openAngleBrackets = 0; int closeAngleBrackets = 0; + char firstChar = ' '; + boolean foundFirstChar = false; + char[] chars = new char[4096]; int c; while (totalBytes < 64 * 1024 && (c = reader.read(chars)) > 0) { @@ -34,16 +37,26 @@ public class TextFormatGuesser implements FormatGuesser { openAngleBrackets += countSubstrings(chunk, "<"); closeAngleBrackets += countSubstrings(chunk, ">"); + if (!foundFirstChar) { + chunk = chunk.trim(); + if (chunk.length() > 0) { + firstChar = chunk.charAt(0); + foundFirstChar = true; + } + } totalBytes += c; } - if (openBraces >= 5 && closeBraces >= 5) { - return "text/json"; - } else if (openAngleBrackets >= 5 && closeAngleBrackets >= 5) { - return "text/xml"; - } else { - return "text/line-based"; + if (foundFirstChar) { + if ((firstChar == '{' || firstChar == '[') && + openBraces >= 5 && closeBraces >= 5) { + return "text/json"; + } else if (firstChar == '<' && + openAngleBrackets >= 5 && closeAngleBrackets >= 5) { + return "text/xml"; + } } + return "text/line-based"; } finally { is.close(); }