Update split multi-valued cells to support split by regex and split by lengths
This commit is contained in:
parent
6fb7f1f476
commit
cccf1e55c9
@ -33,17 +33,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||||||
|
|
||||||
package com.google.refine.commands.cell;
|
package com.google.refine.commands.cell;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
|
|
||||||
import javax.servlet.ServletException;
|
import javax.servlet.ServletException;
|
||||||
import javax.servlet.http.HttpServletRequest;
|
import javax.servlet.http.HttpServletRequest;
|
||||||
import javax.servlet.http.HttpServletResponse;
|
import javax.servlet.http.HttpServletResponse;
|
||||||
|
|
||||||
|
import org.json.JSONArray;
|
||||||
|
|
||||||
import com.google.refine.commands.Command;
|
import com.google.refine.commands.Command;
|
||||||
import com.google.refine.model.AbstractOperation;
|
import com.google.refine.model.AbstractOperation;
|
||||||
import com.google.refine.model.Project;
|
import com.google.refine.model.Project;
|
||||||
import com.google.refine.operations.cell.MultiValuedCellSplitOperation;
|
import com.google.refine.operations.cell.MultiValuedCellSplitOperation;
|
||||||
|
import com.google.refine.util.ParsingUtilities;
|
||||||
import com.google.refine.process.Process;
|
import com.google.refine.process.Process;
|
||||||
|
|
||||||
public class SplitMultiValueCellsCommand extends Command {
|
public class SplitMultiValueCellsCommand extends Command {
|
||||||
@ -58,11 +61,33 @@ public class SplitMultiValueCellsCommand extends Command {
|
|||||||
String keyColumnName = request.getParameter("keyColumnName");
|
String keyColumnName = request.getParameter("keyColumnName");
|
||||||
String separator = request.getParameter("separator");
|
String separator = request.getParameter("separator");
|
||||||
String mode = request.getParameter("mode");
|
String mode = request.getParameter("mode");
|
||||||
|
Boolean regex = Boolean.parseBoolean(request.getParameter("regex"));
|
||||||
|
|
||||||
AbstractOperation op = new MultiValuedCellSplitOperation(columnName, keyColumnName, separator, mode);
|
if ("separator".equals(mode)) {
|
||||||
Process process = op.createProcess(project, new Properties());
|
AbstractOperation op = new MultiValuedCellSplitOperation(columnName,
|
||||||
|
keyColumnName,
|
||||||
|
separator,
|
||||||
|
regex);
|
||||||
|
Process process = op.createProcess(project, new Properties());
|
||||||
|
|
||||||
performProcessAndRespond(request, response, project, process);
|
performProcessAndRespond(request, response, project, process);
|
||||||
|
} else {
|
||||||
|
String s = request.getParameter("fieldLengths");
|
||||||
|
|
||||||
|
JSONArray a = ParsingUtilities.evaluateJsonStringToArray(s);
|
||||||
|
int[] fieldLengths = new int[a.length()];
|
||||||
|
|
||||||
|
for (int i = 0; i < fieldLengths.length; i++) {
|
||||||
|
fieldLengths[i] = a.getInt(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
AbstractOperation op = new MultiValuedCellSplitOperation(columnName,
|
||||||
|
keyColumnName,
|
||||||
|
fieldLengths);
|
||||||
|
Process process = op.createProcess(project, new Properties());
|
||||||
|
|
||||||
|
performProcessAndRespond(request, response, project, process);
|
||||||
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
respondException(response, e);
|
respondException(response, e);
|
||||||
}
|
}
|
||||||
|
@ -33,9 +33,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||||||
|
|
||||||
package com.google.refine.operations.cell;
|
package com.google.refine.operations.cell;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.io.Serializable;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
import org.json.JSONException;
|
import org.json.JSONException;
|
||||||
@ -50,32 +52,64 @@ import com.google.refine.model.Project;
|
|||||||
import com.google.refine.model.Row;
|
import com.google.refine.model.Row;
|
||||||
import com.google.refine.model.changes.MassRowChange;
|
import com.google.refine.model.changes.MassRowChange;
|
||||||
import com.google.refine.operations.OperationRegistry;
|
import com.google.refine.operations.OperationRegistry;
|
||||||
|
import com.google.refine.util.JSONUtilities;
|
||||||
|
|
||||||
public class MultiValuedCellSplitOperation extends AbstractOperation {
|
public class MultiValuedCellSplitOperation extends AbstractOperation {
|
||||||
final protected String _columnName;
|
final protected String _columnName;
|
||||||
final protected String _keyColumnName;
|
final protected String _keyColumnName;
|
||||||
final protected String _separator;
|
|
||||||
final protected String _mode;
|
final protected String _mode;
|
||||||
|
final protected String _separator;
|
||||||
|
final protected boolean _regex;
|
||||||
|
|
||||||
|
final protected int[] _fieldLengths;
|
||||||
|
|
||||||
static public AbstractOperation reconstruct(Project project, JSONObject obj) throws Exception {
|
static public AbstractOperation reconstruct(Project project, JSONObject obj) throws Exception {
|
||||||
return new MultiValuedCellSplitOperation(
|
String mode = obj.getString("mode");
|
||||||
obj.getString("columnName"),
|
|
||||||
obj.getString("keyColumnName"),
|
if ("separator".equals(mode)) {
|
||||||
obj.getString("separator"),
|
return new MultiValuedCellSplitOperation(
|
||||||
obj.getString("mode")
|
obj.getString("columnName"),
|
||||||
);
|
obj.getString("keyColumnName"),
|
||||||
|
obj.getString("separator"),
|
||||||
|
obj.getBoolean("regex")
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
return new MultiValuedCellSplitOperation(
|
||||||
|
obj.getString("columnName"),
|
||||||
|
obj.getString("keyColumnName"),
|
||||||
|
JSONUtilities.getIntArray(obj, "fieldLengths")
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public MultiValuedCellSplitOperation(
|
public MultiValuedCellSplitOperation(
|
||||||
String columnName,
|
String columnName,
|
||||||
String keyColumnName,
|
String keyColumnName,
|
||||||
String separator,
|
String separator,
|
||||||
String mode
|
boolean regex
|
||||||
) {
|
) {
|
||||||
_columnName = columnName;
|
_columnName = columnName;
|
||||||
_keyColumnName = keyColumnName;
|
_keyColumnName = keyColumnName;
|
||||||
_separator = separator;
|
_separator = separator;
|
||||||
_mode = mode;
|
_mode = "separator";
|
||||||
|
_regex = regex;
|
||||||
|
|
||||||
|
_fieldLengths = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public MultiValuedCellSplitOperation(
|
||||||
|
String columnName,
|
||||||
|
String keyColumnName,
|
||||||
|
int[] fieldLengths
|
||||||
|
) {
|
||||||
|
_columnName = columnName;
|
||||||
|
_keyColumnName = keyColumnName;
|
||||||
|
|
||||||
|
_mode = "lengths";
|
||||||
|
_separator = null;
|
||||||
|
_regex = false;
|
||||||
|
|
||||||
|
_fieldLengths = fieldLengths;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -87,8 +121,17 @@ public class MultiValuedCellSplitOperation extends AbstractOperation {
|
|||||||
writer.key("description"); writer.value("Split multi-valued cells in column " + _columnName);
|
writer.key("description"); writer.value("Split multi-valued cells in column " + _columnName);
|
||||||
writer.key("columnName"); writer.value(_columnName);
|
writer.key("columnName"); writer.value(_columnName);
|
||||||
writer.key("keyColumnName"); writer.value(_keyColumnName);
|
writer.key("keyColumnName"); writer.value(_keyColumnName);
|
||||||
writer.key("separator"); writer.value(_separator);
|
|
||||||
writer.key("mode"); writer.value(_mode);
|
writer.key("mode"); writer.value(_mode);
|
||||||
|
if ("separator".equals(_mode)) {
|
||||||
|
writer.key("separator"); writer.value(_separator);
|
||||||
|
writer.key("regex"); writer.value(_regex);
|
||||||
|
} else {
|
||||||
|
writer.key("fieldLengths"); writer.array();
|
||||||
|
for (int l : _fieldLengths) {
|
||||||
|
writer.value(l);
|
||||||
|
}
|
||||||
|
writer.endArray();
|
||||||
|
}
|
||||||
writer.endObject();
|
writer.endObject();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -124,8 +167,31 @@ public class MultiValuedCellSplitOperation extends AbstractOperation {
|
|||||||
Object value = oldRow.getCellValue(cellIndex);
|
Object value = oldRow.getCellValue(cellIndex);
|
||||||
String s = value instanceof String ? ((String) value) : value.toString();
|
String s = value instanceof String ? ((String) value) : value.toString();
|
||||||
String[] values = null;
|
String[] values = null;
|
||||||
if (_mode.equals("regex")) {
|
if("lengths".equals(_mode)) {
|
||||||
values = s.split(_separator);
|
//do split by lengths
|
||||||
|
if (_fieldLengths.length >= 0 && _fieldLengths[0] > 0) {
|
||||||
|
Object o = _fieldLengths[0];
|
||||||
|
|
||||||
|
values = new String[_fieldLengths.length];
|
||||||
|
|
||||||
|
int lastIndex = 0;
|
||||||
|
|
||||||
|
for (int i = 0; i < _fieldLengths.length; i++) {
|
||||||
|
int thisIndex = lastIndex;
|
||||||
|
|
||||||
|
Object o2 = _fieldLengths[i];
|
||||||
|
if (o2 instanceof Number) {
|
||||||
|
thisIndex = Math.min(s.length(), lastIndex + Math.max(0, ((Number) o2).intValue()));
|
||||||
|
}
|
||||||
|
|
||||||
|
values[i] = s.substring(lastIndex, thisIndex);
|
||||||
|
lastIndex = thisIndex;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (_regex) {
|
||||||
|
Pattern pattern = Pattern.compile(_separator);
|
||||||
|
values = pattern.split(s);
|
||||||
} else {
|
} else {
|
||||||
values = StringUtils.splitByWholeSeparatorPreserveAllTokens(s, _separator);
|
values = StringUtils.splitByWholeSeparatorPreserveAllTokens(s, _separator);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user