From 369bfffb2f586eee9dc416320ff08d9af55c1832 Mon Sep 17 00:00:00 2001 From: Tom Morris Date: Mon, 4 Mar 2013 17:47:06 -0500 Subject: [PATCH] Don't guess field widths unless we have at least 3 lines - Investigation of #685 showed that single line files were being guessed as fixed field width --- .../refine/importers/FixedWidthImporter.java | 450 +++++++++--------- 1 file changed, 225 insertions(+), 225 deletions(-) diff --git a/main/src/com/google/refine/importers/FixedWidthImporter.java b/main/src/com/google/refine/importers/FixedWidthImporter.java index cc1312202..70ebb13be 100644 --- a/main/src/com/google/refine/importers/FixedWidthImporter.java +++ b/main/src/com/google/refine/importers/FixedWidthImporter.java @@ -1,225 +1,225 @@ -package com.google.refine.importers; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.LineNumberReader; -import java.io.Reader; -import java.io.UnsupportedEncodingException; -import java.util.ArrayList; -import java.util.List; - -import org.json.JSONArray; -import org.json.JSONObject; - -import com.google.refine.ProjectMetadata; -import com.google.refine.importing.ImportingJob; -import com.google.refine.importing.ImportingUtilities; -import com.google.refine.model.Project; -import com.google.refine.util.JSONUtilities; - -public class FixedWidthImporter extends TabularImportingParserBase { - public FixedWidthImporter() { - super(false); - } - - @Override - public JSONObject createParserUIInitializationData( - ImportingJob job, List fileRecords, String format) { - JSONObject options = super.createParserUIInitializationData(job, fileRecords, format); - JSONArray columnWidths = new JSONArray(); - if (fileRecords.size() > 0) { - JSONObject firstFileRecord = fileRecords.get(0); - String encoding = ImportingUtilities.getEncoding(firstFileRecord); - String location = JSONUtilities.getString(firstFileRecord, "location", null); - if (location != null) { - File file = new File(job.getRawDataDir(), location); - int[] columnWidthsA = guessColumnWidths(file, encoding); - if (columnWidthsA != null) { - for (int w : columnWidthsA) { - JSONUtilities.append(columnWidths, w); - } - } - } - - JSONUtilities.safePut(options, "headerLines", 0); - JSONUtilities.safePut(options, "columnWidths", columnWidths); - JSONUtilities.safePut(options, "guessCellValueTypes", true); - } - return options; - } - - @Override - public void parseOneFile( - Project project, - ProjectMetadata metadata, - ImportingJob job, - String fileSource, - Reader reader, - int limit, - JSONObject options, - List exceptions - ) { - final int[] columnWidths = JSONUtilities.getIntArray(options, "columnWidths"); - - List retrievedColumnNames = null; - if (options.has("columnNames")) { - String[] strings = JSONUtilities.getStringArray(options, "columnNames"); - if (strings.length > 0) { - retrievedColumnNames = new ArrayList(); - for (String s : strings) { - s = s.trim(); - if (!s.isEmpty()) { - retrievedColumnNames.add(s); - } - } - - if (retrievedColumnNames.size() > 0) { - JSONUtilities.safePut(options, "headerLines", 1); - } else { - retrievedColumnNames = null; - } - } - } - - final List columnNames = retrievedColumnNames; - final LineNumberReader lnReader = new LineNumberReader(reader); - - TableDataReader dataReader = new TableDataReader() { - boolean usedColumnNames = false; - - @Override - public List getNextRowOfCells() throws IOException { - if (columnNames != null && !usedColumnNames) { - usedColumnNames = true; - return columnNames; - } else { - String line = lnReader.readLine(); - if (line == null) { - return null; - } else { - return getCells(line, columnWidths); - } - } - } - }; - - TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions); - } - - /** - * Splits the line into columns - * @param line - * @param lnReader - * @param splitIntoColumns - * @return - */ - static private ArrayList getCells(String line, int[] widths) { - ArrayList cells = new ArrayList(); - - int columnStartCursor = 0; - int columnEndCursor = 0; - for (int width : widths) { - if (columnStartCursor >= line.length()) { - cells.add(null); //FIXME is adding a null cell (to represent no data) OK? - continue; - } - - columnEndCursor = columnStartCursor + width; - - if (columnEndCursor > line.length()) { - columnEndCursor = line.length(); - } - if (columnEndCursor <= columnStartCursor) { - cells.add(null); //FIXME is adding a null cell (to represent no data, or a zero width column) OK? - continue; - } - - cells.add(line.substring(columnStartCursor, columnEndCursor)); - - columnStartCursor = columnEndCursor; - } - - // Residual text - if (columnStartCursor < line.length()) { - cells.add(line.substring(columnStartCursor)); - } - return cells; - } - - static public int[] guessColumnWidths(File file, String encoding) { - try { - InputStream is = new FileInputStream(file); - Reader reader = (encoding != null) ? new InputStreamReader(is, encoding) : new InputStreamReader(is); - LineNumberReader lineNumberReader = new LineNumberReader(reader); - - try { - int[] counts = null; - int totalBytes = 0; - int lineCount = 0; - String s; - while (totalBytes < 64 * 1024 && - lineCount < 100 && - (s = lineNumberReader.readLine()) != null) { - - totalBytes += s.length() + 1; // count the new line character - if (s.length() == 0) { - continue; - } - lineCount++; - - if (counts == null) { - counts = new int[s.length()]; - for (int c = 0; c < counts.length; c++) { - counts[c] = 0; - } - } - - for (int c = 0; c < counts.length && c < s.length(); c++) { - char ch = s.charAt(c); - if (ch == ' ') { - counts[c]++; - } - } - } - - if (counts != null) { - List widths = new ArrayList(); - - int startIndex = 0; - for (int c = 0; c < counts.length; c++) { - int count = counts[c]; - if (count == lineCount) { - widths.add(c - startIndex + 1); - startIndex = c + 1; - } - } - - for (int i = widths.size() - 2; i >= 0; i--) { - if (widths.get(i) == 1) { - widths.set(i + 1, widths.get(i + 1) + 1); - widths.remove(i); - } - } - - int[] widthA = new int[widths.size()]; - for (int i = 0; i < widthA.length; i++) { - widthA[i] = widths.get(i); - } - return widthA; - } - } finally { - lineNumberReader.close(); - reader.close(); - is.close(); - } - } catch (UnsupportedEncodingException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - return null; - } -} +package com.google.refine.importers; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.LineNumberReader; +import java.io.Reader; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.List; + +import org.json.JSONArray; +import org.json.JSONObject; + +import com.google.refine.ProjectMetadata; +import com.google.refine.importing.ImportingJob; +import com.google.refine.importing.ImportingUtilities; +import com.google.refine.model.Project; +import com.google.refine.util.JSONUtilities; + +public class FixedWidthImporter extends TabularImportingParserBase { + public FixedWidthImporter() { + super(false); + } + + @Override + public JSONObject createParserUIInitializationData( + ImportingJob job, List fileRecords, String format) { + JSONObject options = super.createParserUIInitializationData(job, fileRecords, format); + JSONArray columnWidths = new JSONArray(); + if (fileRecords.size() > 0) { + JSONObject firstFileRecord = fileRecords.get(0); + String encoding = ImportingUtilities.getEncoding(firstFileRecord); + String location = JSONUtilities.getString(firstFileRecord, "location", null); + if (location != null) { + File file = new File(job.getRawDataDir(), location); + int[] columnWidthsA = guessColumnWidths(file, encoding); + if (columnWidthsA != null) { + for (int w : columnWidthsA) { + JSONUtilities.append(columnWidths, w); + } + } + } + + JSONUtilities.safePut(options, "headerLines", 0); + JSONUtilities.safePut(options, "columnWidths", columnWidths); + JSONUtilities.safePut(options, "guessCellValueTypes", true); + } + return options; + } + + @Override + public void parseOneFile( + Project project, + ProjectMetadata metadata, + ImportingJob job, + String fileSource, + Reader reader, + int limit, + JSONObject options, + List exceptions + ) { + final int[] columnWidths = JSONUtilities.getIntArray(options, "columnWidths"); + + List retrievedColumnNames = null; + if (options.has("columnNames")) { + String[] strings = JSONUtilities.getStringArray(options, "columnNames"); + if (strings.length > 0) { + retrievedColumnNames = new ArrayList(); + for (String s : strings) { + s = s.trim(); + if (!s.isEmpty()) { + retrievedColumnNames.add(s); + } + } + + if (retrievedColumnNames.size() > 0) { + JSONUtilities.safePut(options, "headerLines", 1); + } else { + retrievedColumnNames = null; + } + } + } + + final List columnNames = retrievedColumnNames; + final LineNumberReader lnReader = new LineNumberReader(reader); + + TableDataReader dataReader = new TableDataReader() { + boolean usedColumnNames = false; + + @Override + public List getNextRowOfCells() throws IOException { + if (columnNames != null && !usedColumnNames) { + usedColumnNames = true; + return columnNames; + } else { + String line = lnReader.readLine(); + if (line == null) { + return null; + } else { + return getCells(line, columnWidths); + } + } + } + }; + + TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions); + } + + /** + * Splits the line into columns + * @param line + * @param lnReader + * @param splitIntoColumns + * @return + */ + static private ArrayList getCells(String line, int[] widths) { + ArrayList cells = new ArrayList(); + + int columnStartCursor = 0; + int columnEndCursor = 0; + for (int width : widths) { + if (columnStartCursor >= line.length()) { + cells.add(null); //FIXME is adding a null cell (to represent no data) OK? + continue; + } + + columnEndCursor = columnStartCursor + width; + + if (columnEndCursor > line.length()) { + columnEndCursor = line.length(); + } + if (columnEndCursor <= columnStartCursor) { + cells.add(null); //FIXME is adding a null cell (to represent no data, or a zero width column) OK? + continue; + } + + cells.add(line.substring(columnStartCursor, columnEndCursor)); + + columnStartCursor = columnEndCursor; + } + + // Residual text + if (columnStartCursor < line.length()) { + cells.add(line.substring(columnStartCursor)); + } + return cells; + } + + static public int[] guessColumnWidths(File file, String encoding) { + try { + InputStream is = new FileInputStream(file); + Reader reader = (encoding != null) ? new InputStreamReader(is, encoding) : new InputStreamReader(is); + LineNumberReader lineNumberReader = new LineNumberReader(reader); + + try { + int[] counts = null; + int totalBytes = 0; + int lineCount = 0; + String s; + while (totalBytes < 64 * 1024 && + lineCount < 100 && + (s = lineNumberReader.readLine()) != null) { + + totalBytes += s.length() + 1; // count the new line character + if (s.length() == 0) { + continue; + } + lineCount++; + + if (counts == null) { + counts = new int[s.length()]; + for (int c = 0; c < counts.length; c++) { + counts[c] = 0; + } + } + + for (int c = 0; c < counts.length && c < s.length(); c++) { + char ch = s.charAt(c); + if (ch == ' ') { + counts[c]++; + } + } + } + + if (counts != null && lineCount > 2) { + List widths = new ArrayList(); + + int startIndex = 0; + for (int c = 0; c < counts.length; c++) { + int count = counts[c]; + if (count == lineCount) { + widths.add(c - startIndex + 1); + startIndex = c + 1; + } + } + + for (int i = widths.size() - 2; i >= 0; i--) { + if (widths.get(i) == 1) { + widths.set(i + 1, widths.get(i + 1) + 1); + widths.remove(i); + } + } + + int[] widthA = new int[widths.size()]; + for (int i = 0; i < widthA.length; i++) { + widthA[i] = widths.get(i); + } + return widthA; + } + } finally { + lineNumberReader.close(); + reader.close(); + is.close(); + } + } catch (UnsupportedEncodingException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + return null; + } +}