From 28ff2295fd290089b9cb31f1dd84a3435b2230dd Mon Sep 17 00:00:00 2001 From: Tom Morris Date: Thu, 8 Mar 2012 15:53:55 +0000 Subject: [PATCH] Issue 490 - Handle separator guessing for CSVs with quoted fields containing commas git-svn-id: http://google-refine.googlecode.com/svn/trunk@2458 7d457c2a-affb-35e4-300a-418c747d4874 --- .../importers/SeparatorBasedImporter.java | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/main/src/com/google/refine/importers/SeparatorBasedImporter.java b/main/src/com/google/refine/importers/SeparatorBasedImporter.java index 9e994b2b5..cd28d92bf 100644 --- a/main/src/com/google/refine/importers/SeparatorBasedImporter.java +++ b/main/src/com/google/refine/importers/SeparatorBasedImporter.java @@ -146,7 +146,8 @@ public class SeparatorBasedImporter extends TabularImportingParserBase { if (location != null) { File file = new File(job.getRawDataDir(), location); - Separator separator = guessSeparator(file, encoding); + // Quotes are turned on by default, so use that for guessing + Separator separator = guessSeparator(file, encoding, true); if (separator != null) { return StringEscapeUtils.escapeJava(Character.toString(separator.separator)); } @@ -164,8 +165,12 @@ public class SeparatorBasedImporter extends TabularImportingParserBase { double averagePerLine; double stddev; } - + static public Separator guessSeparator(File file, String encoding) { + return guessSeparator(file, encoding, false); // quotes off for backward compatibility + } + + static public Separator guessSeparator(File file, String encoding, boolean handleQuotes) { try { InputStream is = new FileInputStream(file); Reader reader = encoding != null ? new InputStreamReader(is, encoding) : new InputStreamReader(is); @@ -175,14 +180,15 @@ public class SeparatorBasedImporter extends TabularImportingParserBase { List separators = new ArrayList(); Map separatorMap = new HashMap(); - int totalBytes = 0; + int totalChars = 0; int lineCount = 0; + boolean inQuote = false; String s; - while (totalBytes < 64 * 1024 && + while (totalChars < 64 * 1024 && lineCount < 100 && (s = lineNumberReader.readLine()) != null) { - totalBytes += s.length() + 1; // count the new line character + totalChars += s.length() + 1; // count the new line character if (s.length() == 0) { continue; } @@ -190,8 +196,12 @@ public class SeparatorBasedImporter extends TabularImportingParserBase { for (int i = 0; i < s.length(); i++) { char c = s.charAt(i); - if (!Character.isLetterOrDigit(c) && - !"\"' .-".contains(s.subSequence(i, i + 1))) { + if ('"' == c) { + inQuote = !inQuote; + } + if (!Character.isLetterOrDigit(c) + && !"\"' .-".contains(s.subSequence(i, i + 1)) + && (!handleQuotes || !inQuote)) { Separator separator = separatorMap.get(c); if (separator == null) { separator = new Separator(); @@ -214,9 +224,10 @@ public class SeparatorBasedImporter extends TabularImportingParserBase { if (separators.size() > 0) { for (Separator separator : separators) { separator.averagePerLine = separator.totalCount / (double) lineCount; - separator.stddev = Math.sqrt( - separator.totalOfSquaredCount / (double) lineCount - - separator.averagePerLine * separator.averagePerLine); + separator.stddev = Math.sqrt( + (((double)lineCount * separator.totalOfSquaredCount) - (separator.totalCount * separator.totalCount)) + / ((double)lineCount*(lineCount-1)) + ); } Collections.sort(separators, new Comparator() {