Update split multi-valued cells to support split by regex and split by lengths

This commit is contained in:
Owen Stephens 2017-10-22 23:54:18 +01:00
parent 6fb7f1f476
commit cccf1e55c9
2 changed files with 112 additions and 21 deletions

View File

@ -33,17 +33,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package com.google.refine.commands.cell;
import java.io.IOException;
import java.io.IOException;
import java.util.Properties;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.json.JSONArray;
import com.google.refine.commands.Command;
import com.google.refine.model.AbstractOperation;
import com.google.refine.model.Project;
import com.google.refine.operations.cell.MultiValuedCellSplitOperation;
import com.google.refine.util.ParsingUtilities;
import com.google.refine.process.Process;
public class SplitMultiValueCellsCommand extends Command {
@ -58,11 +61,33 @@ public class SplitMultiValueCellsCommand extends Command {
String keyColumnName = request.getParameter("keyColumnName");
String separator = request.getParameter("separator");
String mode = request.getParameter("mode");
Boolean regex = Boolean.parseBoolean(request.getParameter("regex"));
AbstractOperation op = new MultiValuedCellSplitOperation(columnName, keyColumnName, separator, mode);
Process process = op.createProcess(project, new Properties());
if ("separator".equals(mode)) {
AbstractOperation op = new MultiValuedCellSplitOperation(columnName,
keyColumnName,
separator,
regex);
Process process = op.createProcess(project, new Properties());
performProcessAndRespond(request, response, project, process);
performProcessAndRespond(request, response, project, process);
} else {
String s = request.getParameter("fieldLengths");
JSONArray a = ParsingUtilities.evaluateJsonStringToArray(s);
int[] fieldLengths = new int[a.length()];
for (int i = 0; i < fieldLengths.length; i++) {
fieldLengths[i] = a.getInt(i);
}
AbstractOperation op = new MultiValuedCellSplitOperation(columnName,
keyColumnName,
fieldLengths);
Process process = op.createProcess(project, new Properties());
performProcessAndRespond(request, response, project, process);
}
} catch (Exception e) {
respondException(response, e);
}

View File

@ -33,9 +33,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package com.google.refine.operations.cell;
import java.util.ArrayList;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.json.JSONException;
@ -50,32 +52,64 @@ import com.google.refine.model.Project;
import com.google.refine.model.Row;
import com.google.refine.model.changes.MassRowChange;
import com.google.refine.operations.OperationRegistry;
import com.google.refine.util.JSONUtilities;
public class MultiValuedCellSplitOperation extends AbstractOperation {
final protected String _columnName;
final protected String _keyColumnName;
final protected String _separator;
final protected String _mode;
final protected String _separator;
final protected boolean _regex;
final protected int[] _fieldLengths;
static public AbstractOperation reconstruct(Project project, JSONObject obj) throws Exception {
return new MultiValuedCellSplitOperation(
obj.getString("columnName"),
obj.getString("keyColumnName"),
obj.getString("separator"),
obj.getString("mode")
);
String mode = obj.getString("mode");
if ("separator".equals(mode)) {
return new MultiValuedCellSplitOperation(
obj.getString("columnName"),
obj.getString("keyColumnName"),
obj.getString("separator"),
obj.getBoolean("regex")
);
} else {
return new MultiValuedCellSplitOperation(
obj.getString("columnName"),
obj.getString("keyColumnName"),
JSONUtilities.getIntArray(obj, "fieldLengths")
);
}
}
public MultiValuedCellSplitOperation(
String columnName,
String keyColumnName,
String separator,
String mode
String separator,
boolean regex
) {
_columnName = columnName;
_keyColumnName = keyColumnName;
_separator = separator;
_mode = mode;
_mode = "separator";
_regex = regex;
_fieldLengths = null;
}
/**
 * Creates a split operation in "lengths" mode, where cells are cut into
 * fields according to a list of lengths instead of a separator.
 * The separator is left null and regex matching is disabled in this mode.
 *
 * @param columnName    name of the column whose multi-valued cells are split
 * @param keyColumnName name of the key column
 * @param fieldLengths  lengths of the successive fields; copied so that later
 *                      changes to the caller's array do not alter this
 *                      operation (a null array is stored as null)
 */
public MultiValuedCellSplitOperation(
    String columnName,
    String keyColumnName,
    int[] fieldLengths
) {
    _columnName = columnName;
    _keyColumnName = keyColumnName;
    _mode = "lengths";
    _separator = null;
    _regex = false;
    // Defensive copy: the field is final, but the array contents are not.
    _fieldLengths = fieldLengths == null ? null : fieldLengths.clone();
}
@Override
@ -87,8 +121,17 @@ public class MultiValuedCellSplitOperation extends AbstractOperation {
writer.key("description"); writer.value("Split multi-valued cells in column " + _columnName);
writer.key("columnName"); writer.value(_columnName);
writer.key("keyColumnName"); writer.value(_keyColumnName);
writer.key("separator"); writer.value(_separator);
writer.key("mode"); writer.value(_mode);
if ("separator".equals(_mode)) {
writer.key("separator"); writer.value(_separator);
writer.key("regex"); writer.value(_regex);
} else {
writer.key("fieldLengths"); writer.array();
for (int l : _fieldLengths) {
writer.value(l);
}
writer.endArray();
}
writer.endObject();
}
@ -124,8 +167,31 @@ public class MultiValuedCellSplitOperation extends AbstractOperation {
Object value = oldRow.getCellValue(cellIndex);
String s = value instanceof String ? ((String) value) : value.toString();
String[] values = null;
if (_mode.equals("regex")) {
values = s.split(_separator);
if ("lengths".equals(_mode)) {
    // Split by field lengths: field i takes the next _fieldLengths[i]
    // characters of s, clamped to the end of the string.
    // The array must be non-empty before element 0 is read; a test of
    // "length >= 0" is always true and lets an empty array through to an
    // ArrayIndexOutOfBoundsException.
    if (_fieldLengths != null && _fieldLengths.length > 0 && _fieldLengths[0] > 0) {
        values = new String[_fieldLengths.length];
        int lastIndex = 0;
        for (int i = 0; i < _fieldLengths.length; i++) {
            // Negative lengths are treated as zero-width fields.
            int width = Math.max(0, _fieldLengths[i]);
            // Clamp the width to what is left of s before adding, so a very
            // large length cannot overflow int and yield a negative index.
            int thisIndex = lastIndex + Math.min(width, s.length() - lastIndex);
            values[i] = s.substring(lastIndex, thisIndex);
            lastIndex = thisIndex;
        }
    }
    // NOTE(review): values is still null here when there are no field lengths
    // or the first length is not positive - confirm the code that follows
    // this branch tolerates a null array.
}
else if (_regex) {
Pattern pattern = Pattern.compile(_separator);
values = pattern.split(s);
} else {
values = StringUtils.splitByWholeSeparatorPreserveAllTokens(s, _separator);
}