Adding a Fixed Width data importer (Issue 85) and associated tests.

Although this importer is 'wired up', it requires a property "fixed-column-widths" which is not (yet) implemented in the UI. In any case, the ImporterRegistry.guessImporter method will probably select the TsvCsvImporter before the FixedWidthImporter. I suggest that an improvement to the project creation UI and/or the guessImporter method will be required. git-svn-id: http://google-refine.googlecode.com/svn/trunk@1857 7d457c2a-affb-35e4-300a-418c747d4874
2010-11-11 13:15:41 +00:00 · 2010-11-11 13:15:41 +00:00 · 2f564589f5
commit 2f564589f5
parent 4f9dc38066
3 changed files with 367 additions and 1 deletions
--- a/main/src/com/google/refine/importers/FixedWidthImporter.java
+++ b/main/src/com/google/refine/importers/FixedWidthImporter.java
@ -0,0 +1,240 @@
 package com.google.refine.importers;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.LineNumberReader;
 import java.io.Reader;
 import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Properties;
 import java.util.regex.Pattern;
 import javax.servlet.ServletException;
 import org.apache.commons.lang.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.google.refine.ProjectMetadata;
 import com.google.refine.expr.ExpressionUtils;
 import com.google.refine.model.Cell;
 import com.google.refine.model.Project;
 import com.google.refine.model.Row;
 public class FixedWidthImporter implements ReaderImporter, StreamImporter { //TODO this class is almost an exact copy of TsvCsvImporter.  Could we combine the two, or combine common functions into a common abstract supertype?
    final static Logger logger = LoggerFactory.getLogger("FixedWidthImporter");
    @Override
    public boolean canImportData(String contentType, String fileName) {
        // Decides purely on the declared content type; fileName is not consulted.
        // Returns true only for "text/plain" and "text/fixed-width" (any letter case,
        // surrounding whitespace ignored) and false for everything else, including null.
        if (contentType == null) {
            return false;
        }
        // Compare with equalsIgnoreCase rather than lower-casing first: the no-argument
        // toLowerCase() follows the default locale, and under a Turkish locale
        // "TEXT/PLAIN" lower-cases with a dotless i and would never match.
        String type = contentType.trim();
        // Tree-structured types (JSON, XML, RSS, Atom, RDF) need no explicit filter here:
        // anything that is not one of the two accepted types below is rejected.
        return "text/plain".equalsIgnoreCase(type)
            || "text/fixed-width".equalsIgnoreCase(type);  //FIXME Is text/fixed-width a valid contentType?
    }
    @Override
    public void read(InputStream inputStream, Project project,
            ProjectMetadata metadata, Properties options)
            throws ImportException {
        // Adapt the byte stream to a character reader and hand off to the Reader-based overload.
        // NOTE(review): no charset is supplied, so InputStreamReader decodes with the
        // platform default charset - confirm that is what callers expect.
        Reader characterSource = new InputStreamReader(inputStream);
        read(characterSource, project, metadata, options);
    }
    @Override
    public void read(Reader reader, Project project, ProjectMetadata metadata,
            Properties options) throws ImportException {
        // Column layout: the widths string is passed on unparsed (null if the property is unset).
        String columnWidths = options.getProperty("fixed-column-widths");
        boolean splitIntoColumns = ImporterUtilities.getBooleanOption("split-into-columns", options, true);

        // Which lines to drop, treat as headers, skip, or stop at.
        int ignoreLines = ImporterUtilities.getIntegerOption("ignore", options, -1);
        int headerLines = ImporterUtilities.getIntegerOption("header-lines", options, 1);
        int skip = ImporterUtilities.getIntegerOption("skip", options, 0);
        int limit = ImporterUtilities.getIntegerOption("limit", options, -1);

        // Cell value handling.
        boolean guessValueType = ImporterUtilities.getBooleanOption("guess-value-type", options, true);

        // metadata is not used by this importer; all work happens in the line-based overload.
        read(new LineNumberReader(reader), project, columnWidths,
            limit, skip, ignoreLines, headerLines,
            guessValueType, splitIntoColumns);
    }
    /**
     * Parses fixed-width text line by line and adds the resulting rows and columns to the project.
     * <p>
     * Lines are consumed in this order: the first {@code ignoreLines} lines are dropped, blank
     * lines are dropped wherever they occur after that, the next {@code headerLines} non-blank
     * lines supply column names, and every remaining non-blank line is treated as a data row.
     *
     * @param lnReader
     *           LineNumberReader used to read file or string contents
     * @param project
     *           The project into which the parsed data will be added
     * @param sep
     *           A comma separated string of integers giving the number of characters in each
     *           column, e.g. 6,9,5.  If fewer than two widths are given, lines are not split.
     * @param limit
     *           The maximum number of rows of data to import; zero or less means no limit
     * @param skip
     *           The number of initial data rows to skip
     * @param ignoreLines
     *           The number of initial lines within the data source which should be ignored entirely
     * @param headerLines
     *           The number of lines in the data source which describe each column
     * @param guessValueType
     *           Whether the parser should try and guess the type of the value being parsed
     * @param splitIntoColumns
     *           Whether the parser should try and split the data source into columns
     * @throws ImportException
     *           If the column widths cannot be parsed or a line cannot be read
     */
    public void read(LineNumberReader lnReader, Project project,
            String sep, int limit, int skip, int ignoreLines,
            int headerLines, boolean guessValueType, boolean splitIntoColumns) throws ImportException{
        int[] columnWidths = getColumnWidthsFromString( sep );
        if (columnWidths.length < 2) {
            // fewer than two widths: fall back to importing each line as a single cell
            splitIntoColumns = false;
        }

        List<String> columnNames = new ArrayList<String>();
        int rowsWithData = 0;

        try {
            String line;
            while ((line = lnReader.readLine()) != null) {
                if (ignoreLines > 0) {
                    ignoreLines--;
                    continue;
                }
                if (StringUtils.isBlank(line)) {
                    continue;
                }

                List<String> cells = getCells(line, columnWidths, splitIntoColumns);

                if (headerLines > 0) {
                    //column headers
                    headerLines--;
                    for (int c = 0; c < cells.size(); c++) {
                        // getCells yields null for columns that lie beyond the end of a short
                        // line; treat those as blank names instead of failing on trim()
                        String cell = cells.get(c);
                        String name = (cell == null) ? "" : cell.trim();
                        //add column even if cell is blank
                        ImporterUtilities.appendColumnName(columnNames, c, name);
                    }
                    continue;
                }

                //data
                Row row = new Row(columnNames.size());
                if (!cells.isEmpty()) {
                    rowsWithData++;
                }

                if (skip <= 0 || rowsWithData > skip) {
                    //add parsed data to row
                    for (String s : cells) {
                        if (ExpressionUtils.isNonBlankData(s)) {
                            Serializable value = guessValueType ? ImporterUtilities.parseCellValue(s) : s;
                            row.cells.add(new Cell(value, null));
                        } else {
                            // blank or missing data is stored as a null cell
                            row.cells.add(null);
                        }
                    }

                    project.rows.add(row);
                    project.columnModel.setMaxCellIndex(row.cells.size());
                    ImporterUtilities.ensureColumnsInRowExist(columnNames, row);

                    if (limit > 0 && project.rows.size() >= limit) {
                        break;
                    }
                }
            }
        } catch (IOException e) {
            throw new ImportException("The fixed width importer could not read the next line", e);
        }

        ImporterUtilities.setupColumns(project, columnNames);
    }
    /**
     * Cuts one line of text into cells according to the given column widths.
     * <p>
     * Each width consumes that many characters from where the previous column ended.  A column
     * that runs past the end of the line is truncated to what is available; a column that has
     * no characters at all (zero width, or starting at or after the end of the line) is
     * represented by a null entry.  Characters beyond the last column are not returned.
     *
     * @param line
     *           The line of text to split
     * @param widths
     *           The number of characters in each column, in order
     * @param splitIntoColumns
     *           If false, the whole line is returned as a single cell and widths is not used
     * @return one entry per width, or a single entry holding the line when not splitting
     */
    private ArrayList<String> getCells(String line, int[] widths, boolean splitIntoColumns) {
        ArrayList<String> result = new ArrayList<String>();

        if (!splitIntoColumns) {
            result.add(line);
            return result;
        }

        int offset = 0;
        for (int i = 0; i < widths.length; i++) {
            // clamp the end of this column to the end of the line
            int stop = Math.min(offset + widths[i], line.length());
            if (offset < line.length() && stop > offset) {
                result.add(line.substring(offset, stop));
                offset = stop;
            } else {
                //FIXME is adding a null cell (to represent no data, or a zero width column) OK?
                result.add(null);
            }
        }
        return result;
    }
    /**
     * Converts the expected string of comma separated integers into an array of integers.
     * Also performs a basic sanity check on the provided data.  Whitespace around each
     * number is ignored, so "6, 9, 5" is accepted as well as "6,9,5".
     *
     * @param sep
     * A comma separated string of non-negative integers. e.g. 4,2,5,22,19
     * @return
     * The parsed widths, in the order given
     * @throws ImportException
     * If sep is null, if any entry is not an integer, or if any entry is negative
     */
    public int[] getColumnWidthsFromString(String sep) throws ImportException {
        final String formatMessage = "For a fixed column width import, the column widths must be given as a comma separated string of integers.  e.g. 1,3,5,22,19";
        if (sep == null) {
            // the widths option was never supplied; report it as an import problem
            // rather than letting a NullPointerException escape
            throw new ImportException(formatMessage, null);
        }

        String[] splitSep = sep.split(",");
        int[] widths = new int[splitSep.length];
        for (int i = 0; i < splitSep.length; i++) {
            int parsedInt;
            try {
                parsedInt = Integer.parseInt(splitSep[i].trim());
            } catch (NumberFormatException e) {
                throw new ImportException(formatMessage, e);
            }
            if (parsedInt < 0) {
                throw new ImportException("A column cannot have a width of less than zero", null);
            }
            widths[i] = parsedInt;
        }
        return widths;
    }
 }
--- a/main/src/com/google/refine/importers/ImporterRegistry.java
+++ b/main/src/com/google/refine/importers/ImporterRegistry.java
@ -52,7 +52,8 @@ abstract public class ImporterRegistry {
        {"RdfTripleImporter", "com.google.refine.importers.RdfTripleImporter"},
        {"MarcImporter", "com.google.refine.importers.MarcImporter"},
        {"TsvCsvImporter", "com.google.refine.importers.TsvCsvImporter"},
-        {"JsonImporter", "com.google.refine.importers.JsonImporter"}
+        {"JsonImporter", "com.google.refine.importers.JsonImporter"},
        {"FixedWidthImporter", "com.google.refine.importers.FixedWidthImporter"}
    };
    static {
--- a/main/tests/server/src/com/google/refine/tests/importers/FixedWidthImporterTests.java
+++ b/main/tests/server/src/com/google/refine/tests/importers/FixedWidthImporterTests.java
@ -0,0 +1,125 @@
 package com.google.refine.tests.importers;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.times;
 import static org.mockito.Mockito.verify;
 import static org.mockito.Mockito.when;
 import java.io.StringReader;
 import java.util.Properties;
 import org.slf4j.LoggerFactory;
 import org.testng.Assert;
 import org.testng.annotations.AfterMethod;
 import org.testng.annotations.BeforeMethod;
 import org.testng.annotations.BeforeTest;
 import org.testng.annotations.Test;
 import com.google.refine.ProjectMetadata;
 import com.google.refine.importers.FixedWidthImporter;
 import com.google.refine.importers.ImportException;
 import com.google.refine.model.Project;
 import com.google.refine.tests.RefineTest;
 public class FixedWidthImporterTests extends RefineTest {
    @BeforeTest
    public void init() {
        // Name the logger after this test class.  The logger field is not declared in this
        // class; presumably it is inherited from RefineTest.
        logger = LoggerFactory.getLogger(getClass());
    }
    //constants
    String SAMPLE_ROW = "NDB_NoShrt_DescWater";
    String SAMPLE_ROW_WIDTHS = "6,9,5";
    //System Under Test
    FixedWidthImporter SUT = null;
    //mock dependencies
    Project project = null;
    Properties properties = null;
    @BeforeMethod
    public void SetUp(){
        // every test method starts with fresh collaborators and a fresh importer
        properties = mock(Properties.class);
        project = new Project(); //FIXME - should we try and use mock(Project.class); - seems unnecessary complexity
        SUT = new FixedWidthImporter();
    }
    @AfterMethod
    public void TearDown(){
        // drop all per-test state so nothing leaks into the next test method
        properties = null;
        project = null;
        SUT = null;
    }
    //TODO a lot of these tests are very similar to the TsvCsvImporterTests.  It might be possible to overlap them
    @Test
    public void canParseSeparator(){
        // "1,2,3" must parse to the widths 1, 2 and 3 in that order
        int[] widths = null;
        try {
            widths = SUT.getColumnWidthsFromString("1,2,3");
        } catch (ImportException e) {
            Assert.fail(e.getMessage());
        }

        Assert.assertNotNull(widths);
        for (int position = 0; position < 3; position++) {
            Assert.assertEquals(widths[position], position + 1);
        }
    }
    //---------------------read tests------------------------
    @Test
    public void readFixedWidth(){
        // two data lines: one that fills all three columns, one that runs out in the second
        StringReader input = new StringReader(SAMPLE_ROW + "\nTooShort");

        // stub every option this test expects the importer to look up
        whenGetIntegerOption("skip",properties,0);
        whenGetIntegerOption("limit",properties,-1);
        whenGetIntegerOption("header-lines",properties,0);
        whenGetIntegerOption("ignore",properties,0);
        when(properties.getProperty("fixed-column-widths")).thenReturn(SAMPLE_ROW_WIDTHS);

        try {
            SUT.read(input, project, new ProjectMetadata(), properties);
        } catch (Exception e) {
            Assert.fail(e.getMessage());
        }

        Assert.assertEquals(project.rows.size(), 2);

        // first line splits cleanly at 6,9,5
        Assert.assertEquals(project.rows.get(0).cells.size(), 3);
        Assert.assertEquals((String)project.rows.get(0).cells.get(0).value, "NDB_No");
        Assert.assertEquals((String)project.rows.get(0).cells.get(1).value, "Shrt_Desc");
        Assert.assertEquals((String)project.rows.get(0).cells.get(2).value, "Water");

        // second line is truncated in column two and has no data for column three
        Assert.assertEquals(project.rows.get(1).cells.size(), 3);
        Assert.assertEquals((String)project.rows.get(1).cells.get(0).value, "TooSho");
        Assert.assertEquals((String)project.rows.get(1).cells.get(1).value, "rt");
        Assert.assertNull(project.rows.get(1).cells.get(2));

        // each stubbed option must have been consulted exactly once
        verifyGetOption("skip",properties);
        verifyGetOption("limit",properties);
        verifyGetOption("header-lines",properties);
        verifyGetOption("ignore",properties);
        verify(properties, times(1)).getProperty("fixed-column-widths");
    }
    //----helpers----
    public void whenGetBooleanOption(String name, Properties properties, Boolean def){
        // make the mock report the option as present, with def as its string value
        String optionText = def.toString();
        when(properties.getProperty(name)).thenReturn(optionText);
        when(properties.containsKey(name)).thenReturn(true);
    }
    public void whenGetIntegerOption(String name, Properties properties, int def){
        // make the mock report the option as present, with def as its string value
        String optionText = String.valueOf(def);
        when(properties.getProperty(name)).thenReturn(optionText);
        when(properties.containsKey(name)).thenReturn(true);
    }
    public void verifyGetOption(String name, Properties properties){
        // the option must have been both looked up and checked for presence exactly once
        verify(properties, times(1)).getProperty(name);
        verify(properties, times(1)).containsKey(name);
    }
 }