Issue 490 - Handle separator guessing for CSVs with quoted fields containing commas
git-svn-id: http://google-refine.googlecode.com/svn/trunk@2458 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
9a680e8307
commit
28ff2295fd
@ -146,7 +146,8 @@ public class SeparatorBasedImporter extends TabularImportingParserBase {
|
|||||||
|
|
||||||
if (location != null) {
|
if (location != null) {
|
||||||
File file = new File(job.getRawDataDir(), location);
|
File file = new File(job.getRawDataDir(), location);
|
||||||
Separator separator = guessSeparator(file, encoding);
|
// Quote handling is turned on by default, so honor quoted fields when guessing the separator
|
||||||
|
Separator separator = guessSeparator(file, encoding, true);
|
||||||
if (separator != null) {
|
if (separator != null) {
|
||||||
return StringEscapeUtils.escapeJava(Character.toString(separator.separator));
|
return StringEscapeUtils.escapeJava(Character.toString(separator.separator));
|
||||||
}
|
}
|
||||||
@ -166,6 +167,10 @@ public class SeparatorBasedImporter extends TabularImportingParserBase {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static public Separator guessSeparator(File file, String encoding) {
|
static public Separator guessSeparator(File file, String encoding) {
|
||||||
|
return guessSeparator(file, encoding, false); // quotes off for backward compatibility
|
||||||
|
}
|
||||||
|
|
||||||
|
static public Separator guessSeparator(File file, String encoding, boolean handleQuotes) {
|
||||||
try {
|
try {
|
||||||
InputStream is = new FileInputStream(file);
|
InputStream is = new FileInputStream(file);
|
||||||
Reader reader = encoding != null ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
|
Reader reader = encoding != null ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
|
||||||
@ -175,14 +180,15 @@ public class SeparatorBasedImporter extends TabularImportingParserBase {
|
|||||||
List<Separator> separators = new ArrayList<SeparatorBasedImporter.Separator>();
|
List<Separator> separators = new ArrayList<SeparatorBasedImporter.Separator>();
|
||||||
Map<Character, Separator> separatorMap = new HashMap<Character, SeparatorBasedImporter.Separator>();
|
Map<Character, Separator> separatorMap = new HashMap<Character, SeparatorBasedImporter.Separator>();
|
||||||
|
|
||||||
int totalBytes = 0;
|
int totalChars = 0;
|
||||||
int lineCount = 0;
|
int lineCount = 0;
|
||||||
|
boolean inQuote = false;
|
||||||
String s;
|
String s;
|
||||||
while (totalBytes < 64 * 1024 &&
|
while (totalChars < 64 * 1024 &&
|
||||||
lineCount < 100 &&
|
lineCount < 100 &&
|
||||||
(s = lineNumberReader.readLine()) != null) {
|
(s = lineNumberReader.readLine()) != null) {
|
||||||
|
|
||||||
totalBytes += s.length() + 1; // count the new line character
|
totalChars += s.length() + 1; // count the new line character
|
||||||
if (s.length() == 0) {
|
if (s.length() == 0) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -190,8 +196,12 @@ public class SeparatorBasedImporter extends TabularImportingParserBase {
|
|||||||
|
|
||||||
for (int i = 0; i < s.length(); i++) {
|
for (int i = 0; i < s.length(); i++) {
|
||||||
char c = s.charAt(i);
|
char c = s.charAt(i);
|
||||||
if (!Character.isLetterOrDigit(c) &&
|
if ('"' == c) {
|
||||||
!"\"' .-".contains(s.subSequence(i, i + 1))) {
|
inQuote = !inQuote;
|
||||||
|
}
|
||||||
|
if (!Character.isLetterOrDigit(c)
|
||||||
|
&& !"\"' .-".contains(s.subSequence(i, i + 1))
|
||||||
|
&& (!handleQuotes || !inQuote)) {
|
||||||
Separator separator = separatorMap.get(c);
|
Separator separator = separatorMap.get(c);
|
||||||
if (separator == null) {
|
if (separator == null) {
|
||||||
separator = new Separator();
|
separator = new Separator();
|
||||||
@ -215,8 +225,9 @@ public class SeparatorBasedImporter extends TabularImportingParserBase {
|
|||||||
for (Separator separator : separators) {
|
for (Separator separator : separators) {
|
||||||
separator.averagePerLine = separator.totalCount / (double) lineCount;
|
separator.averagePerLine = separator.totalCount / (double) lineCount;
|
||||||
separator.stddev = Math.sqrt(
|
separator.stddev = Math.sqrt(
|
||||||
separator.totalOfSquaredCount / (double) lineCount -
|
(((double)lineCount * separator.totalOfSquaredCount) - (separator.totalCount * separator.totalCount))
|
||||||
separator.averagePerLine * separator.averagePerLine);
|
/ ((double)lineCount*(lineCount-1))
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
Collections.sort(separators, new Comparator<Separator>() {
|
Collections.sort(separators, new Comparator<Separator>() {
|
||||||
|
Loading…
Reference in New Issue
Block a user