Don't guess field widths unless we have at least 3 lines
- Investigation of #685 showed that single-line files were being wrongly guessed as fixed-width files
This commit is contained in:
parent
6b676f7513
commit
369bfffb2f
@ -1,225 +1,225 @@
|
||||
package com.google.refine.importers;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.LineNumberReader;
|
||||
import java.io.Reader;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.json.JSONArray;
|
||||
import org.json.JSONObject;
|
||||
|
||||
import com.google.refine.ProjectMetadata;
|
||||
import com.google.refine.importing.ImportingJob;
|
||||
import com.google.refine.importing.ImportingUtilities;
|
||||
import com.google.refine.model.Project;
|
||||
import com.google.refine.util.JSONUtilities;
|
||||
|
||||
public class FixedWidthImporter extends TabularImportingParserBase {
|
||||
/**
 * Creates the importer, passing {@code false} to the
 * {@code TabularImportingParserBase} constructor.
 */
public FixedWidthImporter() {
    super(false);
}
|
||||
|
||||
@Override
|
||||
public JSONObject createParserUIInitializationData(
|
||||
ImportingJob job, List<JSONObject> fileRecords, String format) {
|
||||
JSONObject options = super.createParserUIInitializationData(job, fileRecords, format);
|
||||
JSONArray columnWidths = new JSONArray();
|
||||
if (fileRecords.size() > 0) {
|
||||
JSONObject firstFileRecord = fileRecords.get(0);
|
||||
String encoding = ImportingUtilities.getEncoding(firstFileRecord);
|
||||
String location = JSONUtilities.getString(firstFileRecord, "location", null);
|
||||
if (location != null) {
|
||||
File file = new File(job.getRawDataDir(), location);
|
||||
int[] columnWidthsA = guessColumnWidths(file, encoding);
|
||||
if (columnWidthsA != null) {
|
||||
for (int w : columnWidthsA) {
|
||||
JSONUtilities.append(columnWidths, w);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
JSONUtilities.safePut(options, "headerLines", 0);
|
||||
JSONUtilities.safePut(options, "columnWidths", columnWidths);
|
||||
JSONUtilities.safePut(options, "guessCellValueTypes", true);
|
||||
}
|
||||
return options;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void parseOneFile(
|
||||
Project project,
|
||||
ProjectMetadata metadata,
|
||||
ImportingJob job,
|
||||
String fileSource,
|
||||
Reader reader,
|
||||
int limit,
|
||||
JSONObject options,
|
||||
List<Exception> exceptions
|
||||
) {
|
||||
final int[] columnWidths = JSONUtilities.getIntArray(options, "columnWidths");
|
||||
|
||||
List<Object> retrievedColumnNames = null;
|
||||
if (options.has("columnNames")) {
|
||||
String[] strings = JSONUtilities.getStringArray(options, "columnNames");
|
||||
if (strings.length > 0) {
|
||||
retrievedColumnNames = new ArrayList<Object>();
|
||||
for (String s : strings) {
|
||||
s = s.trim();
|
||||
if (!s.isEmpty()) {
|
||||
retrievedColumnNames.add(s);
|
||||
}
|
||||
}
|
||||
|
||||
if (retrievedColumnNames.size() > 0) {
|
||||
JSONUtilities.safePut(options, "headerLines", 1);
|
||||
} else {
|
||||
retrievedColumnNames = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final List<Object> columnNames = retrievedColumnNames;
|
||||
final LineNumberReader lnReader = new LineNumberReader(reader);
|
||||
|
||||
TableDataReader dataReader = new TableDataReader() {
|
||||
boolean usedColumnNames = false;
|
||||
|
||||
@Override
|
||||
public List<Object> getNextRowOfCells() throws IOException {
|
||||
if (columnNames != null && !usedColumnNames) {
|
||||
usedColumnNames = true;
|
||||
return columnNames;
|
||||
} else {
|
||||
String line = lnReader.readLine();
|
||||
if (line == null) {
|
||||
return null;
|
||||
} else {
|
||||
return getCells(line, columnWidths);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions);
|
||||
}
|
||||
|
||||
/**
|
||||
* Splits the line into columns
|
||||
* @param line
|
||||
* @param lnReader
|
||||
* @param splitIntoColumns
|
||||
* @return
|
||||
*/
|
||||
static private ArrayList<Object> getCells(String line, int[] widths) {
|
||||
ArrayList<Object> cells = new ArrayList<Object>();
|
||||
|
||||
int columnStartCursor = 0;
|
||||
int columnEndCursor = 0;
|
||||
for (int width : widths) {
|
||||
if (columnStartCursor >= line.length()) {
|
||||
cells.add(null); //FIXME is adding a null cell (to represent no data) OK?
|
||||
continue;
|
||||
}
|
||||
|
||||
columnEndCursor = columnStartCursor + width;
|
||||
|
||||
if (columnEndCursor > line.length()) {
|
||||
columnEndCursor = line.length();
|
||||
}
|
||||
if (columnEndCursor <= columnStartCursor) {
|
||||
cells.add(null); //FIXME is adding a null cell (to represent no data, or a zero width column) OK?
|
||||
continue;
|
||||
}
|
||||
|
||||
cells.add(line.substring(columnStartCursor, columnEndCursor));
|
||||
|
||||
columnStartCursor = columnEndCursor;
|
||||
}
|
||||
|
||||
// Residual text
|
||||
if (columnStartCursor < line.length()) {
|
||||
cells.add(line.substring(columnStartCursor));
|
||||
}
|
||||
return cells;
|
||||
}
|
||||
|
||||
static public int[] guessColumnWidths(File file, String encoding) {
|
||||
try {
|
||||
InputStream is = new FileInputStream(file);
|
||||
Reader reader = (encoding != null) ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
|
||||
LineNumberReader lineNumberReader = new LineNumberReader(reader);
|
||||
|
||||
try {
|
||||
int[] counts = null;
|
||||
int totalBytes = 0;
|
||||
int lineCount = 0;
|
||||
String s;
|
||||
while (totalBytes < 64 * 1024 &&
|
||||
lineCount < 100 &&
|
||||
(s = lineNumberReader.readLine()) != null) {
|
||||
|
||||
totalBytes += s.length() + 1; // count the new line character
|
||||
if (s.length() == 0) {
|
||||
continue;
|
||||
}
|
||||
lineCount++;
|
||||
|
||||
if (counts == null) {
|
||||
counts = new int[s.length()];
|
||||
for (int c = 0; c < counts.length; c++) {
|
||||
counts[c] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
for (int c = 0; c < counts.length && c < s.length(); c++) {
|
||||
char ch = s.charAt(c);
|
||||
if (ch == ' ') {
|
||||
counts[c]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (counts != null) {
|
||||
List<Integer> widths = new ArrayList<Integer>();
|
||||
|
||||
int startIndex = 0;
|
||||
for (int c = 0; c < counts.length; c++) {
|
||||
int count = counts[c];
|
||||
if (count == lineCount) {
|
||||
widths.add(c - startIndex + 1);
|
||||
startIndex = c + 1;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = widths.size() - 2; i >= 0; i--) {
|
||||
if (widths.get(i) == 1) {
|
||||
widths.set(i + 1, widths.get(i + 1) + 1);
|
||||
widths.remove(i);
|
||||
}
|
||||
}
|
||||
|
||||
int[] widthA = new int[widths.size()];
|
||||
for (int i = 0; i < widthA.length; i++) {
|
||||
widthA[i] = widths.get(i);
|
||||
}
|
||||
return widthA;
|
||||
}
|
||||
} finally {
|
||||
lineNumberReader.close();
|
||||
reader.close();
|
||||
is.close();
|
||||
}
|
||||
} catch (UnsupportedEncodingException e) {
|
||||
e.printStackTrace();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
package com.google.refine.importers;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.LineNumberReader;
|
||||
import java.io.Reader;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.json.JSONArray;
|
||||
import org.json.JSONObject;
|
||||
|
||||
import com.google.refine.ProjectMetadata;
|
||||
import com.google.refine.importing.ImportingJob;
|
||||
import com.google.refine.importing.ImportingUtilities;
|
||||
import com.google.refine.model.Project;
|
||||
import com.google.refine.util.JSONUtilities;
|
||||
|
||||
public class FixedWidthImporter extends TabularImportingParserBase {
|
||||
/**
 * Creates the importer, passing {@code false} to the
 * {@code TabularImportingParserBase} constructor.
 */
public FixedWidthImporter() {
    super(false);
}
|
||||
|
||||
@Override
|
||||
public JSONObject createParserUIInitializationData(
|
||||
ImportingJob job, List<JSONObject> fileRecords, String format) {
|
||||
JSONObject options = super.createParserUIInitializationData(job, fileRecords, format);
|
||||
JSONArray columnWidths = new JSONArray();
|
||||
if (fileRecords.size() > 0) {
|
||||
JSONObject firstFileRecord = fileRecords.get(0);
|
||||
String encoding = ImportingUtilities.getEncoding(firstFileRecord);
|
||||
String location = JSONUtilities.getString(firstFileRecord, "location", null);
|
||||
if (location != null) {
|
||||
File file = new File(job.getRawDataDir(), location);
|
||||
int[] columnWidthsA = guessColumnWidths(file, encoding);
|
||||
if (columnWidthsA != null) {
|
||||
for (int w : columnWidthsA) {
|
||||
JSONUtilities.append(columnWidths, w);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
JSONUtilities.safePut(options, "headerLines", 0);
|
||||
JSONUtilities.safePut(options, "columnWidths", columnWidths);
|
||||
JSONUtilities.safePut(options, "guessCellValueTypes", true);
|
||||
}
|
||||
return options;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void parseOneFile(
|
||||
Project project,
|
||||
ProjectMetadata metadata,
|
||||
ImportingJob job,
|
||||
String fileSource,
|
||||
Reader reader,
|
||||
int limit,
|
||||
JSONObject options,
|
||||
List<Exception> exceptions
|
||||
) {
|
||||
final int[] columnWidths = JSONUtilities.getIntArray(options, "columnWidths");
|
||||
|
||||
List<Object> retrievedColumnNames = null;
|
||||
if (options.has("columnNames")) {
|
||||
String[] strings = JSONUtilities.getStringArray(options, "columnNames");
|
||||
if (strings.length > 0) {
|
||||
retrievedColumnNames = new ArrayList<Object>();
|
||||
for (String s : strings) {
|
||||
s = s.trim();
|
||||
if (!s.isEmpty()) {
|
||||
retrievedColumnNames.add(s);
|
||||
}
|
||||
}
|
||||
|
||||
if (retrievedColumnNames.size() > 0) {
|
||||
JSONUtilities.safePut(options, "headerLines", 1);
|
||||
} else {
|
||||
retrievedColumnNames = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final List<Object> columnNames = retrievedColumnNames;
|
||||
final LineNumberReader lnReader = new LineNumberReader(reader);
|
||||
|
||||
TableDataReader dataReader = new TableDataReader() {
|
||||
boolean usedColumnNames = false;
|
||||
|
||||
@Override
|
||||
public List<Object> getNextRowOfCells() throws IOException {
|
||||
if (columnNames != null && !usedColumnNames) {
|
||||
usedColumnNames = true;
|
||||
return columnNames;
|
||||
} else {
|
||||
String line = lnReader.readLine();
|
||||
if (line == null) {
|
||||
return null;
|
||||
} else {
|
||||
return getCells(line, columnWidths);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions);
|
||||
}
|
||||
|
||||
/**
|
||||
* Splits the line into columns
|
||||
* @param line
|
||||
* @param lnReader
|
||||
* @param splitIntoColumns
|
||||
* @return
|
||||
*/
|
||||
static private ArrayList<Object> getCells(String line, int[] widths) {
|
||||
ArrayList<Object> cells = new ArrayList<Object>();
|
||||
|
||||
int columnStartCursor = 0;
|
||||
int columnEndCursor = 0;
|
||||
for (int width : widths) {
|
||||
if (columnStartCursor >= line.length()) {
|
||||
cells.add(null); //FIXME is adding a null cell (to represent no data) OK?
|
||||
continue;
|
||||
}
|
||||
|
||||
columnEndCursor = columnStartCursor + width;
|
||||
|
||||
if (columnEndCursor > line.length()) {
|
||||
columnEndCursor = line.length();
|
||||
}
|
||||
if (columnEndCursor <= columnStartCursor) {
|
||||
cells.add(null); //FIXME is adding a null cell (to represent no data, or a zero width column) OK?
|
||||
continue;
|
||||
}
|
||||
|
||||
cells.add(line.substring(columnStartCursor, columnEndCursor));
|
||||
|
||||
columnStartCursor = columnEndCursor;
|
||||
}
|
||||
|
||||
// Residual text
|
||||
if (columnStartCursor < line.length()) {
|
||||
cells.add(line.substring(columnStartCursor));
|
||||
}
|
||||
return cells;
|
||||
}
|
||||
|
||||
static public int[] guessColumnWidths(File file, String encoding) {
|
||||
try {
|
||||
InputStream is = new FileInputStream(file);
|
||||
Reader reader = (encoding != null) ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
|
||||
LineNumberReader lineNumberReader = new LineNumberReader(reader);
|
||||
|
||||
try {
|
||||
int[] counts = null;
|
||||
int totalBytes = 0;
|
||||
int lineCount = 0;
|
||||
String s;
|
||||
while (totalBytes < 64 * 1024 &&
|
||||
lineCount < 100 &&
|
||||
(s = lineNumberReader.readLine()) != null) {
|
||||
|
||||
totalBytes += s.length() + 1; // count the new line character
|
||||
if (s.length() == 0) {
|
||||
continue;
|
||||
}
|
||||
lineCount++;
|
||||
|
||||
if (counts == null) {
|
||||
counts = new int[s.length()];
|
||||
for (int c = 0; c < counts.length; c++) {
|
||||
counts[c] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
for (int c = 0; c < counts.length && c < s.length(); c++) {
|
||||
char ch = s.charAt(c);
|
||||
if (ch == ' ') {
|
||||
counts[c]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (counts != null && lineCount > 2) {
|
||||
List<Integer> widths = new ArrayList<Integer>();
|
||||
|
||||
int startIndex = 0;
|
||||
for (int c = 0; c < counts.length; c++) {
|
||||
int count = counts[c];
|
||||
if (count == lineCount) {
|
||||
widths.add(c - startIndex + 1);
|
||||
startIndex = c + 1;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = widths.size() - 2; i >= 0; i--) {
|
||||
if (widths.get(i) == 1) {
|
||||
widths.set(i + 1, widths.get(i + 1) + 1);
|
||||
widths.remove(i);
|
||||
}
|
||||
}
|
||||
|
||||
int[] widthA = new int[widths.size()];
|
||||
for (int i = 0; i < widthA.length; i++) {
|
||||
widthA[i] = widths.get(i);
|
||||
}
|
||||
return widthA;
|
||||
}
|
||||
} finally {
|
||||
lineNumberReader.close();
|
||||
reader.close();
|
||||
is.close();
|
||||
}
|
||||
} catch (UnsupportedEncodingException e) {
|
||||
e.printStackTrace();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user