Handle quoted fields with embedded newlines. Sort separators by score
(standard deviation divided by average count per line) rather than just standard deviation.
This commit is contained in:
parent
f4ff227340
commit
d7531bbbd8
@ -168,6 +168,7 @@ public class SeparatorBasedImporter extends TabularImportingParserBase {
|
|||||||
return guessSeparator(file, encoding, false); // quotes off for backward compatibility
|
return guessSeparator(file, encoding, false); // quotes off for backward compatibility
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: Move this to the CSV project?
|
||||||
static public Separator guessSeparator(File file, String encoding, boolean handleQuotes) {
|
static public Separator guessSeparator(File file, String encoding, boolean handleQuotes) {
|
||||||
try {
|
try {
|
||||||
InputStream is = new FileInputStream(file);
|
InputStream is = new FileInputStream(file);
|
||||||
@ -190,7 +191,9 @@ public class SeparatorBasedImporter extends TabularImportingParserBase {
|
|||||||
if (s.length() == 0) {
|
if (s.length() == 0) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
lineCount++;
|
if (!inQuote) {
|
||||||
|
lineCount++;
|
||||||
|
}
|
||||||
|
|
||||||
for (int i = 0; i < s.length(); i++) {
|
for (int i = 0; i < s.length(); i++) {
|
||||||
char c = s.charAt(i);
|
char c = s.charAt(i);
|
||||||
@ -212,10 +215,12 @@ public class SeparatorBasedImporter extends TabularImportingParserBase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (Separator separator : separators) {
|
if (!inQuote) {
|
||||||
separator.totalCount += separator.currentLineCount;
|
for (Separator separator : separators) {
|
||||||
separator.totalOfSquaredCount += separator.currentLineCount * separator.currentLineCount;
|
separator.totalCount += separator.currentLineCount;
|
||||||
separator.currentLineCount = 0;
|
separator.totalOfSquaredCount += separator.currentLineCount * separator.currentLineCount;
|
||||||
|
separator.currentLineCount = 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -231,14 +236,16 @@ public class SeparatorBasedImporter extends TabularImportingParserBase {
|
|||||||
Collections.sort(separators, new Comparator<Separator>() {
|
Collections.sort(separators, new Comparator<Separator>() {
|
||||||
@Override
|
@Override
|
||||||
public int compare(Separator sep0, Separator sep1) {
|
public int compare(Separator sep0, Separator sep1) {
|
||||||
return Double.compare(sep0.stddev, sep1.stddev);
|
return Double.compare(sep0.stddev / sep0.averagePerLine,
|
||||||
|
sep1.stddev / sep1.averagePerLine);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
for (Separator separator : separators) {
|
|
||||||
if (separator.stddev / separator.averagePerLine < 0.1) {
|
Separator separator = separators.get(0);
|
||||||
return separator;
|
if (separator.stddev / separator.averagePerLine < 0.1) {
|
||||||
}
|
return separator;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
lineNumberReader.close();
|
lineNumberReader.close();
|
||||||
|
Loading…
Reference in New Issue
Block a user