Don't guess field widths unless we have at least 3 lines

- Investigation of #685 showed that single-line files were being guessed
as fixed-width files.
This commit is contained in:
Tom Morris 2013-03-04 17:47:06 -05:00
parent 6b676f7513
commit 369bfffb2f

View File

@ -1,225 +1,225 @@
package com.google.refine.importers; package com.google.refine.importers;
import java.io.File; import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.io.LineNumberReader; import java.io.LineNumberReader;
import java.io.Reader; import java.io.Reader;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import org.json.JSONArray; import org.json.JSONArray;
import org.json.JSONObject; import org.json.JSONObject;
import com.google.refine.ProjectMetadata; import com.google.refine.ProjectMetadata;
import com.google.refine.importing.ImportingJob; import com.google.refine.importing.ImportingJob;
import com.google.refine.importing.ImportingUtilities; import com.google.refine.importing.ImportingUtilities;
import com.google.refine.model.Project; import com.google.refine.model.Project;
import com.google.refine.util.JSONUtilities; import com.google.refine.util.JSONUtilities;
public class FixedWidthImporter extends TabularImportingParserBase { public class FixedWidthImporter extends TabularImportingParserBase {
/**
 * Creates the fixed-width importer.
 * <p>
 * Passes {@code false} to the {@code TabularImportingParserBase} constructor.
 * NOTE(review): presumably this flag selects Reader-based rather than
 * InputStream-based parsing, which would match {@code parseOneFile} taking a
 * {@code Reader} - confirm against the base class.
 */
public FixedWidthImporter() {
    super(false);
}
@Override @Override
public JSONObject createParserUIInitializationData( public JSONObject createParserUIInitializationData(
ImportingJob job, List<JSONObject> fileRecords, String format) { ImportingJob job, List<JSONObject> fileRecords, String format) {
JSONObject options = super.createParserUIInitializationData(job, fileRecords, format); JSONObject options = super.createParserUIInitializationData(job, fileRecords, format);
JSONArray columnWidths = new JSONArray(); JSONArray columnWidths = new JSONArray();
if (fileRecords.size() > 0) { if (fileRecords.size() > 0) {
JSONObject firstFileRecord = fileRecords.get(0); JSONObject firstFileRecord = fileRecords.get(0);
String encoding = ImportingUtilities.getEncoding(firstFileRecord); String encoding = ImportingUtilities.getEncoding(firstFileRecord);
String location = JSONUtilities.getString(firstFileRecord, "location", null); String location = JSONUtilities.getString(firstFileRecord, "location", null);
if (location != null) { if (location != null) {
File file = new File(job.getRawDataDir(), location); File file = new File(job.getRawDataDir(), location);
int[] columnWidthsA = guessColumnWidths(file, encoding); int[] columnWidthsA = guessColumnWidths(file, encoding);
if (columnWidthsA != null) { if (columnWidthsA != null) {
for (int w : columnWidthsA) { for (int w : columnWidthsA) {
JSONUtilities.append(columnWidths, w); JSONUtilities.append(columnWidths, w);
} }
} }
} }
JSONUtilities.safePut(options, "headerLines", 0); JSONUtilities.safePut(options, "headerLines", 0);
JSONUtilities.safePut(options, "columnWidths", columnWidths); JSONUtilities.safePut(options, "columnWidths", columnWidths);
JSONUtilities.safePut(options, "guessCellValueTypes", true); JSONUtilities.safePut(options, "guessCellValueTypes", true);
} }
return options; return options;
} }
@Override @Override
public void parseOneFile( public void parseOneFile(
Project project, Project project,
ProjectMetadata metadata, ProjectMetadata metadata,
ImportingJob job, ImportingJob job,
String fileSource, String fileSource,
Reader reader, Reader reader,
int limit, int limit,
JSONObject options, JSONObject options,
List<Exception> exceptions List<Exception> exceptions
) { ) {
final int[] columnWidths = JSONUtilities.getIntArray(options, "columnWidths"); final int[] columnWidths = JSONUtilities.getIntArray(options, "columnWidths");
List<Object> retrievedColumnNames = null; List<Object> retrievedColumnNames = null;
if (options.has("columnNames")) { if (options.has("columnNames")) {
String[] strings = JSONUtilities.getStringArray(options, "columnNames"); String[] strings = JSONUtilities.getStringArray(options, "columnNames");
if (strings.length > 0) { if (strings.length > 0) {
retrievedColumnNames = new ArrayList<Object>(); retrievedColumnNames = new ArrayList<Object>();
for (String s : strings) { for (String s : strings) {
s = s.trim(); s = s.trim();
if (!s.isEmpty()) { if (!s.isEmpty()) {
retrievedColumnNames.add(s); retrievedColumnNames.add(s);
} }
} }
if (retrievedColumnNames.size() > 0) { if (retrievedColumnNames.size() > 0) {
JSONUtilities.safePut(options, "headerLines", 1); JSONUtilities.safePut(options, "headerLines", 1);
} else { } else {
retrievedColumnNames = null; retrievedColumnNames = null;
} }
} }
} }
final List<Object> columnNames = retrievedColumnNames; final List<Object> columnNames = retrievedColumnNames;
final LineNumberReader lnReader = new LineNumberReader(reader); final LineNumberReader lnReader = new LineNumberReader(reader);
TableDataReader dataReader = new TableDataReader() { TableDataReader dataReader = new TableDataReader() {
boolean usedColumnNames = false; boolean usedColumnNames = false;
@Override @Override
public List<Object> getNextRowOfCells() throws IOException { public List<Object> getNextRowOfCells() throws IOException {
if (columnNames != null && !usedColumnNames) { if (columnNames != null && !usedColumnNames) {
usedColumnNames = true; usedColumnNames = true;
return columnNames; return columnNames;
} else { } else {
String line = lnReader.readLine(); String line = lnReader.readLine();
if (line == null) { if (line == null) {
return null; return null;
} else { } else {
return getCells(line, columnWidths); return getCells(line, columnWidths);
} }
} }
} }
}; };
TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions); TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions);
} }
/** /**
* Splits the line into columns * Splits the line into columns
* @param line * @param line
* @param lnReader * @param lnReader
* @param splitIntoColumns * @param splitIntoColumns
* @return * @return
*/ */
static private ArrayList<Object> getCells(String line, int[] widths) { static private ArrayList<Object> getCells(String line, int[] widths) {
ArrayList<Object> cells = new ArrayList<Object>(); ArrayList<Object> cells = new ArrayList<Object>();
int columnStartCursor = 0; int columnStartCursor = 0;
int columnEndCursor = 0; int columnEndCursor = 0;
for (int width : widths) { for (int width : widths) {
if (columnStartCursor >= line.length()) { if (columnStartCursor >= line.length()) {
cells.add(null); //FIXME is adding a null cell (to represent no data) OK? cells.add(null); //FIXME is adding a null cell (to represent no data) OK?
continue; continue;
} }
columnEndCursor = columnStartCursor + width; columnEndCursor = columnStartCursor + width;
if (columnEndCursor > line.length()) { if (columnEndCursor > line.length()) {
columnEndCursor = line.length(); columnEndCursor = line.length();
} }
if (columnEndCursor <= columnStartCursor) { if (columnEndCursor <= columnStartCursor) {
cells.add(null); //FIXME is adding a null cell (to represent no data, or a zero width column) OK? cells.add(null); //FIXME is adding a null cell (to represent no data, or a zero width column) OK?
continue; continue;
} }
cells.add(line.substring(columnStartCursor, columnEndCursor)); cells.add(line.substring(columnStartCursor, columnEndCursor));
columnStartCursor = columnEndCursor; columnStartCursor = columnEndCursor;
} }
// Residual text // Residual text
if (columnStartCursor < line.length()) { if (columnStartCursor < line.length()) {
cells.add(line.substring(columnStartCursor)); cells.add(line.substring(columnStartCursor));
} }
return cells; return cells;
} }
static public int[] guessColumnWidths(File file, String encoding) { static public int[] guessColumnWidths(File file, String encoding) {
try { try {
InputStream is = new FileInputStream(file); InputStream is = new FileInputStream(file);
Reader reader = (encoding != null) ? new InputStreamReader(is, encoding) : new InputStreamReader(is); Reader reader = (encoding != null) ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
LineNumberReader lineNumberReader = new LineNumberReader(reader); LineNumberReader lineNumberReader = new LineNumberReader(reader);
try { try {
int[] counts = null; int[] counts = null;
int totalBytes = 0; int totalBytes = 0;
int lineCount = 0; int lineCount = 0;
String s; String s;
while (totalBytes < 64 * 1024 && while (totalBytes < 64 * 1024 &&
lineCount < 100 && lineCount < 100 &&
(s = lineNumberReader.readLine()) != null) { (s = lineNumberReader.readLine()) != null) {
totalBytes += s.length() + 1; // count the new line character totalBytes += s.length() + 1; // count the new line character
if (s.length() == 0) { if (s.length() == 0) {
continue; continue;
} }
lineCount++; lineCount++;
if (counts == null) { if (counts == null) {
counts = new int[s.length()]; counts = new int[s.length()];
for (int c = 0; c < counts.length; c++) { for (int c = 0; c < counts.length; c++) {
counts[c] = 0; counts[c] = 0;
} }
} }
for (int c = 0; c < counts.length && c < s.length(); c++) { for (int c = 0; c < counts.length && c < s.length(); c++) {
char ch = s.charAt(c); char ch = s.charAt(c);
if (ch == ' ') { if (ch == ' ') {
counts[c]++; counts[c]++;
} }
} }
} }
if (counts != null) { if (counts != null && lineCount > 2) {
List<Integer> widths = new ArrayList<Integer>(); List<Integer> widths = new ArrayList<Integer>();
int startIndex = 0; int startIndex = 0;
for (int c = 0; c < counts.length; c++) { for (int c = 0; c < counts.length; c++) {
int count = counts[c]; int count = counts[c];
if (count == lineCount) { if (count == lineCount) {
widths.add(c - startIndex + 1); widths.add(c - startIndex + 1);
startIndex = c + 1; startIndex = c + 1;
} }
} }
for (int i = widths.size() - 2; i >= 0; i--) { for (int i = widths.size() - 2; i >= 0; i--) {
if (widths.get(i) == 1) { if (widths.get(i) == 1) {
widths.set(i + 1, widths.get(i + 1) + 1); widths.set(i + 1, widths.get(i + 1) + 1);
widths.remove(i); widths.remove(i);
} }
} }
int[] widthA = new int[widths.size()]; int[] widthA = new int[widths.size()];
for (int i = 0; i < widthA.length; i++) { for (int i = 0; i < widthA.length; i++) {
widthA[i] = widths.get(i); widthA[i] = widths.get(i);
} }
return widthA; return widthA;
} }
} finally { } finally {
lineNumberReader.close(); lineNumberReader.close();
reader.close(); reader.close();
is.close(); is.close();
} }
} catch (UnsupportedEncodingException e) { } catch (UnsupportedEncodingException e) {
e.printStackTrace(); e.printStackTrace();
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
} }
return null; return null;
} }
} }