Adding a Fixed Width data importer (Issue 85) and associated tests.
Although this importer is 'wired up', it requires a property "fixed-column-widths" which is not (yet) implemented in the UI. Note also that the ImporterRegistry.guessImporter method will probably select the TsvCsvImporter before the FixedWidthImporter anyway, so an improvement to the project creation UI and/or the guessImporter method will be required. git-svn-id: http://google-refine.googlecode.com/svn/trunk@1857 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
4f9dc38066
commit
2f564589f5
240
main/src/com/google/refine/importers/FixedWidthImporter.java
Normal file
240
main/src/com/google/refine/importers/FixedWidthImporter.java
Normal file
@ -0,0 +1,240 @@
|
|||||||
|
package com.google.refine.importers;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.io.LineNumberReader;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Properties;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import javax.servlet.ServletException;
|
||||||
|
|
||||||
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.google.refine.ProjectMetadata;
|
||||||
|
import com.google.refine.expr.ExpressionUtils;
|
||||||
|
import com.google.refine.model.Cell;
|
||||||
|
import com.google.refine.model.Project;
|
||||||
|
import com.google.refine.model.Row;
|
||||||
|
|
||||||
|
public class FixedWidthImporter implements ReaderImporter, StreamImporter { //TODO this class is almost an exact copy of TsvCsvImporter. Could we combine the two, or combine common functions into a common abstract supertype?
|
||||||
|
|
||||||
|
final static Logger logger = LoggerFactory.getLogger("FixedWidthImporter");
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean canImportData(String contentType, String fileName) {
|
||||||
|
if (contentType != null) {
|
||||||
|
contentType = contentType.toLowerCase().trim();
|
||||||
|
|
||||||
|
//filter out tree structure data
|
||||||
|
if("application/json".equals(contentType)||
|
||||||
|
"text/json".equals(contentType)||
|
||||||
|
"application/xml".equals(contentType) ||
|
||||||
|
"text/xml".equals(contentType) ||
|
||||||
|
"application/rss+xml".equals(contentType) ||
|
||||||
|
"application/atom+xml".equals(contentType) ||
|
||||||
|
"application/rdf+xml".equals(contentType)) //TODO add more tree data types.
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return
|
||||||
|
"text/plain".equals(contentType)
|
||||||
|
|| "text/fixed-width".equals(contentType); //FIXME Is text/fixed-width a valid contentType?
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void read(InputStream inputStream, Project project,
|
||||||
|
ProjectMetadata metadata, Properties options)
|
||||||
|
throws ImportException {
|
||||||
|
read(new InputStreamReader(inputStream), project, metadata, options);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void read(Reader reader, Project project, ProjectMetadata metadata,
|
||||||
|
Properties options) throws ImportException {
|
||||||
|
boolean splitIntoColumns = ImporterUtilities.getBooleanOption("split-into-columns", options, true);
|
||||||
|
String columnWidths = options.getProperty("fixed-column-widths");
|
||||||
|
int ignoreLines = ImporterUtilities.getIntegerOption("ignore", options, -1);
|
||||||
|
int headerLines = ImporterUtilities.getIntegerOption("header-lines", options, 1);
|
||||||
|
|
||||||
|
int limit = ImporterUtilities.getIntegerOption("limit",options,-1);
|
||||||
|
int skip = ImporterUtilities.getIntegerOption("skip",options,0);
|
||||||
|
boolean guessValueType = ImporterUtilities.getBooleanOption("guess-value-type", options, true);
|
||||||
|
|
||||||
|
LineNumberReader lnReader = new LineNumberReader(reader);
|
||||||
|
|
||||||
|
|
||||||
|
read(lnReader, project, columnWidths,
|
||||||
|
limit, skip, ignoreLines, headerLines,
|
||||||
|
guessValueType, splitIntoColumns
|
||||||
|
);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param lnReader
|
||||||
|
* LineNumberReader used to read file or string contents
|
||||||
|
* @param project
|
||||||
|
* The project into which the parsed data will be added
|
||||||
|
* @param columnWidths
|
||||||
|
* Expects a comma separated string of integers which indicate the number of characters in each line
|
||||||
|
* @param limit
|
||||||
|
* The maximum number of rows of data to import
|
||||||
|
* @param skip
|
||||||
|
* The number of initial data rows to skip
|
||||||
|
* @param ignoreLines
|
||||||
|
* The number of initial lines within the data source which should be ignored entirely
|
||||||
|
* @param headerLines
|
||||||
|
* The number of lines in the data source which describe each column
|
||||||
|
* @param guessValueType
|
||||||
|
* Whether the parser should try and guess the type of the value being parsed
|
||||||
|
* @param splitIntoColumns
|
||||||
|
* Whether the parser should try and split the data source into columns
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
public void read(LineNumberReader lnReader, Project project,
|
||||||
|
String sep, int limit, int skip, int ignoreLines,
|
||||||
|
int headerLines, boolean guessValueType, boolean splitIntoColumns) throws ImportException{
|
||||||
|
|
||||||
|
int[] columnWidths = null;
|
||||||
|
|
||||||
|
columnWidths = getColumnWidthsFromString( sep );
|
||||||
|
|
||||||
|
if(columnWidths.length < 2)
|
||||||
|
splitIntoColumns = false;
|
||||||
|
|
||||||
|
List<String> columnNames = new ArrayList<String>();
|
||||||
|
String line = null;
|
||||||
|
int rowsWithData = 0;
|
||||||
|
|
||||||
|
try {
|
||||||
|
while ((line = lnReader.readLine()) != null) {
|
||||||
|
if (ignoreLines > 0) {
|
||||||
|
ignoreLines--;
|
||||||
|
continue;
|
||||||
|
} else if (StringUtils.isBlank(line)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if (headerLines > 0) {
|
||||||
|
//column headers
|
||||||
|
headerLines--;
|
||||||
|
|
||||||
|
ArrayList<String> cells = getCells(line, columnWidths, splitIntoColumns);
|
||||||
|
|
||||||
|
for (int c = 0; c < cells.size(); c++) {
|
||||||
|
String cell = cells.get(c).trim();
|
||||||
|
//add column even if cell is blank
|
||||||
|
ImporterUtilities.appendColumnName(columnNames, c, cell);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
//data
|
||||||
|
Row row = new Row(columnNames.size());
|
||||||
|
|
||||||
|
ArrayList<String> cells = getCells(line, columnWidths, splitIntoColumns);
|
||||||
|
|
||||||
|
if( cells != null && cells.size() > 0 )
|
||||||
|
rowsWithData++;
|
||||||
|
|
||||||
|
if (skip <=0 || rowsWithData > skip){
|
||||||
|
//add parsed data to row
|
||||||
|
for(String s : cells){
|
||||||
|
if (ExpressionUtils.isNonBlankData(s)) {
|
||||||
|
Serializable value = guessValueType ? ImporterUtilities.parseCellValue(s) : s;
|
||||||
|
row.cells.add(new Cell(value, null));
|
||||||
|
}else{
|
||||||
|
row.cells.add(null);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
project.rows.add(row);
|
||||||
|
project.columnModel.setMaxCellIndex(row.cells.size());
|
||||||
|
|
||||||
|
ImporterUtilities.ensureColumnsInRowExist(columnNames, row);
|
||||||
|
|
||||||
|
if (limit > 0 && project.rows.size() >= limit) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new ImportException("The fixed width importer could not read the next line", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
ImporterUtilities.setupColumns(project, columnNames);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Splits the line into columns
|
||||||
|
* @param line
|
||||||
|
* @param lnReader
|
||||||
|
* @param splitIntoColumns
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
private ArrayList<String> getCells(String line, int[] widths, boolean splitIntoColumns) {
|
||||||
|
ArrayList<String> cells = new ArrayList<String>();
|
||||||
|
if(splitIntoColumns){
|
||||||
|
int columnStartCursor = 0;
|
||||||
|
int columnEndCursor = 0;
|
||||||
|
for(int width : widths){
|
||||||
|
if(columnStartCursor >= line.length()){
|
||||||
|
cells.add(null); //FIXME is adding a null cell (to represent no data) OK?
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
columnEndCursor = columnStartCursor + width;
|
||||||
|
|
||||||
|
if(columnEndCursor > line.length())
|
||||||
|
columnEndCursor = line.length();
|
||||||
|
if(columnEndCursor <= columnStartCursor){
|
||||||
|
cells.add(null); //FIXME is adding a null cell (to represent no data, or a zero width column) OK?
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
cells.add(line.substring(columnStartCursor, columnEndCursor));
|
||||||
|
|
||||||
|
columnStartCursor = columnEndCursor;
|
||||||
|
}
|
||||||
|
}else{
|
||||||
|
cells.add(line);
|
||||||
|
}
|
||||||
|
return cells;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Converts the expected string of comma separated integers into an array of integers.
|
||||||
|
* Also performs a basic sanity check on the provided data.
|
||||||
|
*
|
||||||
|
* @param sep
|
||||||
|
* A comma separated string of integers. e.g. 4,2,5,22,19
|
||||||
|
* @return
|
||||||
|
* @throws ServletException
|
||||||
|
*/
|
||||||
|
public int[] getColumnWidthsFromString(String sep) throws ImportException {
|
||||||
|
String[] splitSep = Pattern.compile(",").split(sep);
|
||||||
|
|
||||||
|
int[] widths = new int[splitSep.length];
|
||||||
|
for(int i = 0; i < splitSep.length; i++){
|
||||||
|
try{
|
||||||
|
int parsedInt = Integer.parseInt(splitSep[i]);
|
||||||
|
if( parsedInt < 0 )
|
||||||
|
throw new ImportException("A column cannot have a width of less than zero", null);
|
||||||
|
widths[i] = parsedInt;
|
||||||
|
}catch(NumberFormatException e){
|
||||||
|
throw new ImportException("For a fixed column width import, the column widths must be given as a comma separated string of integers. e.g. 1,3,5,22,19", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return widths;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -52,7 +52,8 @@ abstract public class ImporterRegistry {
|
|||||||
{"RdfTripleImporter", "com.google.refine.importers.RdfTripleImporter"},
|
{"RdfTripleImporter", "com.google.refine.importers.RdfTripleImporter"},
|
||||||
{"MarcImporter", "com.google.refine.importers.MarcImporter"},
|
{"MarcImporter", "com.google.refine.importers.MarcImporter"},
|
||||||
{"TsvCsvImporter", "com.google.refine.importers.TsvCsvImporter"},
|
{"TsvCsvImporter", "com.google.refine.importers.TsvCsvImporter"},
|
||||||
{"JsonImporter", "com.google.refine.importers.JsonImporter"}
|
{"JsonImporter", "com.google.refine.importers.JsonImporter"},
|
||||||
|
{"FixedWidthImporter", "com.google.refine.importers.FixedWidthImporter"}
|
||||||
};
|
};
|
||||||
|
|
||||||
static {
|
static {
|
||||||
|
@ -0,0 +1,125 @@
|
|||||||
|
package com.google.refine.tests.importers;
|
||||||
|
|
||||||
|
|
||||||
|
import static org.mockito.Mockito.mock;
|
||||||
|
import static org.mockito.Mockito.times;
|
||||||
|
import static org.mockito.Mockito.verify;
|
||||||
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
|
import java.io.StringReader;
|
||||||
|
import java.util.Properties;
|
||||||
|
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
import org.testng.Assert;
|
||||||
|
import org.testng.annotations.AfterMethod;
|
||||||
|
import org.testng.annotations.BeforeMethod;
|
||||||
|
import org.testng.annotations.BeforeTest;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
import com.google.refine.ProjectMetadata;
|
||||||
|
import com.google.refine.importers.FixedWidthImporter;
|
||||||
|
import com.google.refine.importers.ImportException;
|
||||||
|
import com.google.refine.model.Project;
|
||||||
|
import com.google.refine.tests.RefineTest;
|
||||||
|
|
||||||
|
public class FixedWidthImporterTests extends RefineTest {
|
||||||
|
@BeforeTest
|
||||||
|
public void init() {
|
||||||
|
logger = LoggerFactory.getLogger(this.getClass());
|
||||||
|
}
|
||||||
|
|
||||||
|
//constants
|
||||||
|
String SAMPLE_ROW = "NDB_NoShrt_DescWater";
|
||||||
|
String SAMPLE_ROW_WIDTHS = "6,9,5";
|
||||||
|
|
||||||
|
//System Under Test
|
||||||
|
FixedWidthImporter SUT = null;
|
||||||
|
|
||||||
|
//mock dependencies
|
||||||
|
Project project = null;
|
||||||
|
Properties properties = null;
|
||||||
|
|
||||||
|
|
||||||
|
@BeforeMethod
|
||||||
|
public void SetUp(){
|
||||||
|
SUT = new FixedWidthImporter();
|
||||||
|
project = new Project(); //FIXME - should we try and use mock(Project.class); - seems unnecessary complexity
|
||||||
|
properties = mock(Properties.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterMethod
|
||||||
|
public void TearDown(){
|
||||||
|
SUT = null;
|
||||||
|
project = null;
|
||||||
|
properties = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
//TODO a lot of these tests are very similar to the TsvCsvImporterTests. It might be possible to overlap them
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void canParseSeparator(){
|
||||||
|
int[] i = null;
|
||||||
|
try {
|
||||||
|
i = SUT.getColumnWidthsFromString("1,2,3");
|
||||||
|
} catch (ImportException e) {
|
||||||
|
Assert.fail(e.getMessage());
|
||||||
|
}
|
||||||
|
|
||||||
|
Assert.assertNotNull(i);
|
||||||
|
Assert.assertEquals(i[0], 1);
|
||||||
|
Assert.assertEquals(i[1], 2);
|
||||||
|
Assert.assertEquals(i[2], 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
//---------------------read tests------------------------
|
||||||
|
@Test
|
||||||
|
public void readFixedWidth(){
|
||||||
|
StringReader reader = new StringReader(SAMPLE_ROW + "\nTooShort");
|
||||||
|
|
||||||
|
when(properties.getProperty("fixed-column-widths")).thenReturn(SAMPLE_ROW_WIDTHS);
|
||||||
|
whenGetIntegerOption("ignore",properties,0);
|
||||||
|
whenGetIntegerOption("header-lines",properties,0);
|
||||||
|
whenGetIntegerOption("limit",properties,-1);
|
||||||
|
whenGetIntegerOption("skip",properties,0);
|
||||||
|
|
||||||
|
try {
|
||||||
|
SUT.read(reader, project, new ProjectMetadata(), properties);
|
||||||
|
} catch (Exception e) {
|
||||||
|
Assert.fail(e.getMessage());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Assert.assertEquals(project.rows.size(), 2);
|
||||||
|
Assert.assertEquals(project.rows.get(0).cells.size(), 3);
|
||||||
|
Assert.assertEquals((String)project.rows.get(0).cells.get(0).value, "NDB_No");
|
||||||
|
Assert.assertEquals((String)project.rows.get(0).cells.get(1).value, "Shrt_Desc");
|
||||||
|
Assert.assertEquals((String)project.rows.get(0).cells.get(2).value, "Water");
|
||||||
|
Assert.assertEquals(project.rows.get(1).cells.size(), 3);
|
||||||
|
Assert.assertEquals((String)project.rows.get(1).cells.get(0).value, "TooSho");
|
||||||
|
Assert.assertEquals((String)project.rows.get(1).cells.get(1).value, "rt");
|
||||||
|
Assert.assertNull(project.rows.get(1).cells.get(2));
|
||||||
|
|
||||||
|
verify(properties, times(1)).getProperty("fixed-column-widths");
|
||||||
|
verifyGetOption("ignore",properties);
|
||||||
|
verifyGetOption("header-lines",properties);
|
||||||
|
verifyGetOption("limit",properties);
|
||||||
|
verifyGetOption("skip",properties);
|
||||||
|
}
|
||||||
|
|
||||||
|
//----helpers----
|
||||||
|
|
||||||
|
public void whenGetBooleanOption(String name, Properties properties, Boolean def){
|
||||||
|
when(properties.containsKey(name)).thenReturn(true);
|
||||||
|
when(properties.getProperty(name)).thenReturn(Boolean.toString(def));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void whenGetIntegerOption(String name, Properties properties, int def){
|
||||||
|
when(properties.containsKey(name)).thenReturn(true);
|
||||||
|
when(properties.getProperty(name)).thenReturn(Integer.toString(def));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void verifyGetOption(String name, Properties properties){
|
||||||
|
verify(properties, times(1)).containsKey(name);
|
||||||
|
verify(properties, times(1)).getProperty(name);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user