Adding a Fixed Width data importer (Issue 85) and associated tests.

Although this importer is 'wired up', it requires a "fixed-column-widths" property, which is not (yet) implemented in the UI. In any case, the ImporterRegistry.guessImporter method will probably select the TsvCsvImporter before the FixedWidthImporter. An improvement to the project creation UI and/or the guessImporter method will therefore be required before this importer is usable in practice. git-svn-id: http://google-refine.googlecode.com/svn/trunk@1857 7d457c2a-affb-35e4-300a-418c747d4874
2010-11-11 13:15:41 +00:00 · 2010-11-11 13:15:41 +00:00 · 2f564589f5
commit 2f564589f5
parent 4f9dc38066
3 changed files with 367 additions and 1 deletions
--- a/main/src/com/google/refine/importers/FixedWidthImporter.java
+++ b/main/src/com/google/refine/importers/FixedWidthImporter.java
@ -0,0 +1,240 @@
+package com.google.refine.importers;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.LineNumberReader;
+import java.io.Reader;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
+import java.util.regex.Pattern;
+
+import javax.servlet.ServletException;
+
+import org.apache.commons.lang.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.refine.ProjectMetadata;
+import com.google.refine.expr.ExpressionUtils;
+import com.google.refine.model.Cell;
+import com.google.refine.model.Project;
+import com.google.refine.model.Row;
+
+/**
+ * Imports fixed-width text. Each line is cut into columns at the character
+ * widths given by the "fixed-column-widths" option, a comma separated list of
+ * non-negative integers (e.g. "6,9,5").
+ */
+public class FixedWidthImporter implements ReaderImporter, StreamImporter { //TODO this class is almost an exact copy of TsvCsvImporter.  Could we combine the two, or combine common functions into a common abstract supertype?
+
+    final static Logger logger = LoggerFactory.getLogger("FixedWidthImporter");
+
+    /** Separator between the individual widths in the "fixed-column-widths" option; compiled once. */
+    private static final Pattern WIDTH_SEPARATOR = Pattern.compile(",");
+
+    /**
+     * Decides from the content type alone whether this importer applies.
+     *
+     * @param contentType
+     *           MIME type of the data source; may be null
+     * @param fileName
+     *           Name of the data source; not consulted by this importer
+     * @return true only for "text/plain" and "text/fixed-width"; false for a
+     *         null content type and for everything else
+     */
+    @Override
+    public boolean canImportData(String contentType, String fileName) {
+        if (contentType == null) {
+            return false;
+        }
+
+        // MIME types are ASCII; lower-case them independently of the default
+        // locale (a Turkish default locale would otherwise turn 'I' into a dotless 'i').
+        contentType = contentType.toLowerCase(java.util.Locale.ENGLISH).trim();
+
+        //filter out tree structure data
+        if ("application/json".equals(contentType) ||
+                "text/json".equals(contentType) ||
+                "application/xml".equals(contentType) ||
+                "text/xml".equals(contentType) ||
+                "application/rss+xml".equals(contentType) ||
+                "application/atom+xml".equals(contentType) ||
+                "application/rdf+xml".equals(contentType)) { //TODO add more tree data types.
+            return false;
+        }
+
+        return "text/plain".equals(contentType)
+            || "text/fixed-width".equals(contentType);  //FIXME Is text/fixed-width a valid contentType?
+    }
+
+    /**
+     * Wraps the stream in a reader and delegates to
+     * {@link #read(Reader, Project, ProjectMetadata, Properties)}.
+     * NOTE(review): the stream is decoded with the platform default charset;
+     * confirm whether an explicit encoding should be used here.
+     */
+    @Override
+    public void read(InputStream inputStream, Project project,
+            ProjectMetadata metadata, Properties options)
+            throws ImportException {
+        read(new InputStreamReader(inputStream), project, metadata, options);
+    }
+
+    /**
+     * Reads the import options and parses the reader's content into the project.
+     * Options consulted: "split-into-columns" (default true),
+     * "fixed-column-widths", "ignore" (default -1), "header-lines" (default 1),
+     * "limit" (default -1), "skip" (default 0) and "guess-value-type"
+     * (default true).
+     *
+     * @throws ImportException
+     *           if "fixed-column-widths" is missing or malformed, or the
+     *           source cannot be read
+     */
+    @Override
+    public void read(Reader reader, Project project, ProjectMetadata metadata,
+            Properties options) throws ImportException {
+        boolean splitIntoColumns = ImporterUtilities.getBooleanOption("split-into-columns", options, true);
+        String columnWidths = options.getProperty("fixed-column-widths");
+        int ignoreLines = ImporterUtilities.getIntegerOption("ignore", options, -1);
+        int headerLines = ImporterUtilities.getIntegerOption("header-lines", options, 1);
+
+        int limit = ImporterUtilities.getIntegerOption("limit", options, -1);
+        int skip = ImporterUtilities.getIntegerOption("skip", options, 0);
+        boolean guessValueType = ImporterUtilities.getBooleanOption("guess-value-type", options, true);
+
+        LineNumberReader lnReader = new LineNumberReader(reader);
+
+        read(lnReader, project, columnWidths,
+            limit, skip, ignoreLines, headerLines,
+            guessValueType, splitIntoColumns
+        );
+    }
+
+    /**
+     * Parses fixed-width lines into rows and columns of the project.
+     *
+     * @param lnReader
+     *           LineNumberReader used to read file or string contents
+     * @param project
+     *           The project into which the parsed data will be added
+     * @param sep
+     *           A comma separated string of integers which indicate the number of characters in each column
+     * @param limit
+     *           The maximum number of rows of data to import (values of zero or less mean no limit)
+     * @param skip
+     *           The number of initial data rows to skip
+     * @param ignoreLines
+     *           The number of initial lines within the data source which should be ignored entirely
+     * @param headerLines
+     *           The number of lines in the data source which describe each column
+     * @param guessValueType
+     *           Whether the parser should try and guess the type of the value being parsed
+     * @param splitIntoColumns
+     *           Whether the parser should try and split the data source into columns
+     * @throws ImportException
+     *           if the column widths are missing or malformed, or a line cannot be read
+     */
+    public void read(LineNumberReader lnReader, Project project,
+            String sep, int limit, int skip, int ignoreLines,
+            int headerLines, boolean guessValueType, boolean splitIntoColumns) throws ImportException{
+
+        int[] columnWidths = getColumnWidthsFromString(sep);
+
+        // Fewer than two widths: each line is kept whole as a single cell.
+        if (columnWidths.length < 2) {
+            splitIntoColumns = false;
+        }
+
+        List<String> columnNames = new ArrayList<String>();
+        String line = null;
+        int rowsWithData = 0;
+
+        try {
+            while ((line = lnReader.readLine()) != null) {
+                if (ignoreLines > 0) {
+                    ignoreLines--;
+                    continue;
+                } else if (StringUtils.isBlank(line)) {
+                    continue;
+                }
+
+                if (headerLines > 0) {
+                    //column headers
+                    headerLines--;
+
+                    ArrayList<String> cells = getCells(line, columnWidths, splitIntoColumns);
+
+                    for (int c = 0; c < cells.size(); c++) {
+                        // getCells yields null for columns that lie beyond the end of a
+                        // short line (or have zero width); treat those as blank names
+                        // instead of dereferencing null.
+                        String cell = cells.get(c);
+                        String name = (cell == null) ? "" : cell.trim();
+                        //add column even if cell is blank
+                        ImporterUtilities.appendColumnName(columnNames, c, name);
+                    }
+                } else {
+                    //data
+                    Row row = new Row(columnNames.size());
+
+                    ArrayList<String> cells = getCells(line, columnWidths, splitIntoColumns);
+
+                    if (!cells.isEmpty()) {
+                        rowsWithData++;
+                    }
+
+                    if (skip <= 0 || rowsWithData > skip) {
+                        //add parsed data to row; blank or missing cells are stored as null
+                        for (String s : cells) {
+                            if (ExpressionUtils.isNonBlankData(s)) {
+                                Serializable value = guessValueType ? ImporterUtilities.parseCellValue(s) : s;
+                                row.cells.add(new Cell(value, null));
+                            } else {
+                                row.cells.add(null);
+                            }
+                        }
+                        project.rows.add(row);
+                        project.columnModel.setMaxCellIndex(row.cells.size());
+
+                        ImporterUtilities.ensureColumnsInRowExist(columnNames, row);
+
+                        if (limit > 0 && project.rows.size() >= limit) {
+                            break;
+                        }
+                    }
+                }
+            }
+        } catch (IOException e) {
+            throw new ImportException("The fixed width importer could not read the next line", e);
+        }
+
+        ImporterUtilities.setupColumns(project, columnNames);
+    }
+
+    /**
+     * Splits the line into columns.
+     *
+     * @param line
+     *           The line of text to split
+     * @param widths
+     *           The number of characters in each column, in order
+     * @param splitIntoColumns
+     *           When false the whole line is returned as a single cell
+     * @return one entry per width (or a single entry when not splitting); an
+     *         entry is null when its column starts at or beyond the end of the
+     *         line, or has zero width. A column that runs past the end of the
+     *         line is truncated to the characters available.
+     */
+    private ArrayList<String> getCells(String line, int[] widths, boolean splitIntoColumns) {
+        ArrayList<String> cells = new ArrayList<String>();
+        if (!splitIntoColumns) {
+            cells.add(line);
+            return cells;
+        }
+
+        int columnStartCursor = 0;
+        for (int width : widths) {
+            if (columnStartCursor >= line.length()) {
+                cells.add(null); //FIXME is adding a null cell (to represent no data) OK?
+                continue;
+            }
+
+            // Clamp the end of the column to the end of the line.
+            int columnEndCursor = Math.min(columnStartCursor + width, line.length());
+
+            if (columnEndCursor <= columnStartCursor) {
+                cells.add(null); //FIXME is adding a null cell (to represent no data, or a zero width column) OK?
+                continue;
+            }
+
+            cells.add(line.substring(columnStartCursor, columnEndCursor));
+
+            columnStartCursor = columnEndCursor;
+        }
+        return cells;
+    }
+
+    /**
+     * Converts the expected string of comma separated integers into an array of integers.
+     * Also performs a basic sanity check on the provided data. Whitespace around
+     * each width is ignored, so "4, 2, 5" is accepted.
+     *
+     * @param sep
+     *           A comma separated string of integers. e.g. 4,2,5,22,19
+     * @return the parsed widths, in the order given
+     * @throws ImportException
+     *           if sep is null, contains an entry that is not an integer, or
+     *           contains a negative width
+     */
+    public int[] getColumnWidthsFromString(String sep) throws ImportException {
+        if (sep == null) {
+            // The option is not yet settable from the UI, so a missing value is
+            // an expected condition: report it as an import error, not an NPE.
+            throw new ImportException("For a fixed column width import, the column widths must be given as a comma separated string of integers.  e.g. 1,3,5,22,19", null);
+        }
+
+        String[] splitSep = WIDTH_SEPARATOR.split(sep);
+
+        int[] widths = new int[splitSep.length];
+        for (int i = 0; i < splitSep.length; i++) {
+            try {
+                int parsedInt = Integer.parseInt(splitSep[i].trim());
+                if (parsedInt < 0) {
+                    throw new ImportException("A column cannot have a width of less than zero", null);
+                }
+                widths[i] = parsedInt;
+            } catch (NumberFormatException e) {
+                throw new ImportException("For a fixed column width import, the column widths must be given as a comma separated string of integers.  e.g. 1,3,5,22,19", e);
+            }
+        }
+        return widths;
+    }
+
+}
--- a/main/src/com/google/refine/importers/ImporterRegistry.java
+++ b/main/src/com/google/refine/importers/ImporterRegistry.java
@ -52,7 +52,8 @@ abstract public class ImporterRegistry {
        {"RdfTripleImporter", "com.google.refine.importers.RdfTripleImporter"},
        {"MarcImporter", "com.google.refine.importers.MarcImporter"},
        {"TsvCsvImporter", "com.google.refine.importers.TsvCsvImporter"},
-        {"JsonImporter", "com.google.refine.importers.JsonImporter"}
+        {"JsonImporter", "com.google.refine.importers.JsonImporter"},
+        {"FixedWidthImporter", "com.google.refine.importers.FixedWidthImporter"}
    };

    static {
--- a/main/tests/server/src/com/google/refine/tests/importers/FixedWidthImporterTests.java
+++ b/main/tests/server/src/com/google/refine/tests/importers/FixedWidthImporterTests.java
@ -0,0 +1,125 @@
+package com.google.refine.tests.importers;
+
+
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+import java.io.StringReader;
+import java.util.Properties;
+
+import org.slf4j.LoggerFactory;
+import org.testng.Assert;
+import org.testng.annotations.AfterMethod;
+import org.testng.annotations.BeforeMethod;
+import org.testng.annotations.BeforeTest;
+import org.testng.annotations.Test;
+
+import com.google.refine.ProjectMetadata;
+import com.google.refine.importers.FixedWidthImporter;
+import com.google.refine.importers.ImportException;
+import com.google.refine.model.Project;
+import com.google.refine.tests.RefineTest;
+
+/**
+ * Tests for FixedWidthImporter: parsing of the column-width option and
+ * reading of fixed-width lines into a project.
+ */
+public class FixedWidthImporterTests extends RefineTest {
+    @BeforeTest
+    public void init() {
+        logger = LoggerFactory.getLogger(this.getClass());
+    }
+
+    //constants
+    String SAMPLE_ROW = "NDB_NoShrt_DescWater";
+    String SAMPLE_ROW_WIDTHS = "6,9,5";
+
+    //System Under Test
+    FixedWidthImporter SUT = null;
+
+    //mock dependencies
+    Project project = null;
+    Properties properties = null;
+
+
+    /** Creates a fresh importer, a real project and a mocked options object for every test. */
+    @BeforeMethod
+    public void SetUp(){
+        properties = mock(Properties.class);
+        project = new Project(); //FIXME - should we try and use mock(Project.class); - seems unnecessary complexity
+        SUT = new FixedWidthImporter();
+    }
+
+    /** Drops the per-test fixtures. */
+    @AfterMethod
+    public void TearDown(){
+        properties = null;
+        project = null;
+        SUT = null;
+    }
+
+    //TODO a lot of these tests are very similar to the TsvCsvImporterTests.  It might be possible to overlap them
+
+    /** A comma separated list of integers is parsed into the same integers, in order. */
+    @Test
+    public void canParseSeparator(){
+        int[] widths = null;
+        try {
+            widths = SUT.getColumnWidthsFromString("1,2,3");
+        } catch (ImportException e) {
+            Assert.fail(e.getMessage());
+        }
+
+        Assert.assertNotNull(widths);
+        Assert.assertEquals(widths[0], 1);
+        Assert.assertEquals(widths[1], 2);
+        Assert.assertEquals(widths[2], 3);
+    }
+
+    //---------------------read tests------------------------
+    /**
+     * Two lines, no header: the first fills all three columns, the second is
+     * shorter than the declared widths so its last cell must be null.
+     */
+    @Test
+    public void readFixedWidth(){
+        String input = SAMPLE_ROW + "\nTooShort";
+        StringReader reader = new StringReader(input);
+
+        when(properties.getProperty("fixed-column-widths")).thenReturn(SAMPLE_ROW_WIDTHS);
+        whenGetIntegerOption("ignore",properties,0);
+        whenGetIntegerOption("header-lines",properties,0);
+        whenGetIntegerOption("limit",properties,-1);
+        whenGetIntegerOption("skip",properties,0);
+
+        try {
+            SUT.read(reader, project, new ProjectMetadata(), properties);
+        } catch (Exception e) {
+            Assert.fail(e.getMessage());
+        }
+
+        Assert.assertEquals(project.rows.size(), 2);
+
+        // first line: every column fully populated
+        Assert.assertEquals(project.rows.get(0).cells.size(), 3);
+        Assert.assertEquals(cellText(0, 0), "NDB_No");
+        Assert.assertEquals(cellText(0, 1), "Shrt_Desc");
+        Assert.assertEquals(cellText(0, 2), "Water");
+
+        // second line: runs out part-way through the second column
+        Assert.assertEquals(project.rows.get(1).cells.size(), 3);
+        Assert.assertEquals(cellText(1, 0), "TooSho");
+        Assert.assertEquals(cellText(1, 1), "rt");
+        Assert.assertNull(project.rows.get(1).cells.get(2));
+
+        verify(properties, times(1)).getProperty("fixed-column-widths");
+        for (String option : new String[] { "ignore", "header-lines", "limit", "skip" }) {
+            verifyGetOption(option, properties);
+        }
+    }
+
+    //----helpers----
+
+    /** Returns the value of the given cell of the project, cast to String. */
+    private String cellText(int rowIndex, int cellIndex){
+        return (String) project.rows.get(rowIndex).cells.get(cellIndex).value;
+    }
+
+    /** Stubs the mocked options so that the named boolean option is present with the given value. */
+    public void whenGetBooleanOption(String name, Properties properties, Boolean def){
+        when(properties.containsKey(name)).thenReturn(true);
+        when(properties.getProperty(name)).thenReturn(Boolean.toString(def));
+    }
+
+    /** Stubs the mocked options so that the named integer option is present with the given value. */
+    public void whenGetIntegerOption(String name, Properties properties, int def){
+        when(properties.containsKey(name)).thenReturn(true);
+        when(properties.getProperty(name)).thenReturn(String.valueOf(def));
+    }
+
+    /** Verifies that the named option was both looked up and read exactly once. */
+    public void verifyGetOption(String name, Properties properties){
+        verify(properties, times(1)).containsKey(name);
+        verify(properties, times(1)).getProperty(name);
+    }
+}