Handle quoted fields with embedded newlines. Sort separators by score (standard deviation divided by the average count per line) rather than just standard deviation.
This commit is contained in:
Tom Morris 2013-08-02 17:59:09 -04:00
parent f4ff227340
commit d7531bbbd8

View File

@ -168,6 +168,7 @@ public class SeparatorBasedImporter extends TabularImportingParserBase {
return guessSeparator(file, encoding, false); // quotes off for backward compatibility return guessSeparator(file, encoding, false); // quotes off for backward compatibility
} }
// TODO: Move this to the CSV project?
static public Separator guessSeparator(File file, String encoding, boolean handleQuotes) { static public Separator guessSeparator(File file, String encoding, boolean handleQuotes) {
try { try {
InputStream is = new FileInputStream(file); InputStream is = new FileInputStream(file);
@ -190,7 +191,9 @@ public class SeparatorBasedImporter extends TabularImportingParserBase {
if (s.length() == 0) { if (s.length() == 0) {
continue; continue;
} }
lineCount++; if (!inQuote) {
lineCount++;
}
for (int i = 0; i < s.length(); i++) { for (int i = 0; i < s.length(); i++) {
char c = s.charAt(i); char c = s.charAt(i);
@ -212,10 +215,12 @@ public class SeparatorBasedImporter extends TabularImportingParserBase {
} }
} }
for (Separator separator : separators) { if (!inQuote) {
separator.totalCount += separator.currentLineCount; for (Separator separator : separators) {
separator.totalOfSquaredCount += separator.currentLineCount * separator.currentLineCount; separator.totalCount += separator.currentLineCount;
separator.currentLineCount = 0; separator.totalOfSquaredCount += separator.currentLineCount * separator.currentLineCount;
separator.currentLineCount = 0;
}
} }
} }
@ -231,14 +236,16 @@ public class SeparatorBasedImporter extends TabularImportingParserBase {
Collections.sort(separators, new Comparator<Separator>() { Collections.sort(separators, new Comparator<Separator>() {
@Override @Override
public int compare(Separator sep0, Separator sep1) { public int compare(Separator sep0, Separator sep1) {
return Double.compare(sep0.stddev, sep1.stddev); return Double.compare(sep0.stddev / sep0.averagePerLine,
sep1.stddev / sep1.averagePerLine);
} }
}); });
for (Separator separator : separators) {
if (separator.stddev / separator.averagePerLine < 0.1) { Separator separator = separators.get(0);
return separator; if (separator.stddev / separator.averagePerLine < 0.1) {
} return separator;
} }
} }
} finally { } finally {
lineNumberReader.close(); lineNumberReader.close();