From cccf1e55c98937f37de0a21798eefbd628ff0df8 Mon Sep 17 00:00:00 2001 From: Owen Stephens Date: Sun, 22 Oct 2017 23:54:18 +0100 Subject: [PATCH] Update split multi-valued cells to support split by regex and split by lengths --- .../cell/SplitMultiValueCellsCommand.java | 37 +++++-- .../cell/MultiValuedCellSplitOperation.java | 96 ++++++++++++++++--- 2 files changed, 112 insertions(+), 21 deletions(-) diff --git a/main/src/com/google/refine/commands/cell/SplitMultiValueCellsCommand.java b/main/src/com/google/refine/commands/cell/SplitMultiValueCellsCommand.java index 6cca50637..355fc1406 100644 --- a/main/src/com/google/refine/commands/cell/SplitMultiValueCellsCommand.java +++ b/main/src/com/google/refine/commands/cell/SplitMultiValueCellsCommand.java @@ -33,17 +33,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. package com.google.refine.commands.cell; - import java.io.IOException; +import java.io.IOException; import java.util.Properties; import javax.servlet.ServletException; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; +import org.json.JSONArray; + import com.google.refine.commands.Command; import com.google.refine.model.AbstractOperation; import com.google.refine.model.Project; import com.google.refine.operations.cell.MultiValuedCellSplitOperation; +import com.google.refine.util.ParsingUtilities; import com.google.refine.process.Process; public class SplitMultiValueCellsCommand extends Command { @@ -58,11 +61,33 @@ public class SplitMultiValueCellsCommand extends Command { String keyColumnName = request.getParameter("keyColumnName"); String separator = request.getParameter("separator"); String mode = request.getParameter("mode"); - - AbstractOperation op = new MultiValuedCellSplitOperation(columnName, keyColumnName, separator, mode); - Process process = op.createProcess(project, new Properties()); - - performProcessAndRespond(request, response, project, process); + Boolean regex = 
Boolean.parseBoolean(request.getParameter("regex")); + + if ("separator".equals(mode)) { + AbstractOperation op = new MultiValuedCellSplitOperation(columnName, + keyColumnName, + separator, + regex); + Process process = op.createProcess(project, new Properties()); + + performProcessAndRespond(request, response, project, process); + } else { + String s = request.getParameter("fieldLengths"); + + JSONArray a = ParsingUtilities.evaluateJsonStringToArray(s); + int[] fieldLengths = new int[a.length()]; + + for (int i = 0; i < fieldLengths.length; i++) { + fieldLengths[i] = a.getInt(i); + } + + AbstractOperation op = new MultiValuedCellSplitOperation(columnName, + keyColumnName, + fieldLengths); + Process process = op.createProcess(project, new Properties()); + + performProcessAndRespond(request, response, project, process); + } } catch (Exception e) { respondException(response, e); } diff --git a/main/src/com/google/refine/operations/cell/MultiValuedCellSplitOperation.java b/main/src/com/google/refine/operations/cell/MultiValuedCellSplitOperation.java index 3a100e562..9ef71f09c 100644 --- a/main/src/com/google/refine/operations/cell/MultiValuedCellSplitOperation.java +++ b/main/src/com/google/refine/operations/cell/MultiValuedCellSplitOperation.java @@ -33,9 +33,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
package com.google.refine.operations.cell; - import java.util.ArrayList; +import java.io.Serializable; +import java.util.ArrayList; import java.util.List; import java.util.Properties; +import java.util.regex.Pattern; import org.apache.commons.lang.StringUtils; import org.json.JSONException; @@ -50,32 +52,64 @@ import com.google.refine.model.Project; import com.google.refine.model.Row; import com.google.refine.model.changes.MassRowChange; import com.google.refine.operations.OperationRegistry; +import com.google.refine.util.JSONUtilities; public class MultiValuedCellSplitOperation extends AbstractOperation { final protected String _columnName; final protected String _keyColumnName; - final protected String _separator; final protected String _mode; + final protected String _separator; + final protected boolean _regex; + + final protected int[] _fieldLengths; static public AbstractOperation reconstruct(Project project, JSONObject obj) throws Exception { - return new MultiValuedCellSplitOperation( - obj.getString("columnName"), - obj.getString("keyColumnName"), - obj.getString("separator"), - obj.getString("mode") - ); + String mode = obj.getString("mode"); + + if ("separator".equals(mode)) { + return new MultiValuedCellSplitOperation( + obj.getString("columnName"), + obj.getString("keyColumnName"), + obj.getString("separator"), + obj.getBoolean("regex") + ); + } else { + return new MultiValuedCellSplitOperation( + obj.getString("columnName"), + obj.getString("keyColumnName"), + JSONUtilities.getIntArray(obj, "fieldLengths") + ); + } } public MultiValuedCellSplitOperation( String columnName, String keyColumnName, - String separator, - String mode + String separator, + boolean regex ) { _columnName = columnName; _keyColumnName = keyColumnName; _separator = separator; - _mode = mode; + _mode = "separator"; + _regex = regex; + + _fieldLengths = null; + } + + public MultiValuedCellSplitOperation( + String columnName, + String keyColumnName, + int[] fieldLengths + ) { + 
_columnName = columnName; + _keyColumnName = keyColumnName; + + _mode = "lengths"; + _separator = null; + _regex = false; + + _fieldLengths = fieldLengths; } @Override @@ -87,8 +121,17 @@ public class MultiValuedCellSplitOperation extends AbstractOperation { writer.key("description"); writer.value("Split multi-valued cells in column " + _columnName); writer.key("columnName"); writer.value(_columnName); writer.key("keyColumnName"); writer.value(_keyColumnName); - writer.key("separator"); writer.value(_separator); writer.key("mode"); writer.value(_mode); + if ("separator".equals(_mode)) { + writer.key("separator"); writer.value(_separator); + writer.key("regex"); writer.value(_regex); + } else { + writer.key("fieldLengths"); writer.array(); + for (int l : _fieldLengths) { + writer.value(l); + } + writer.endArray(); + } writer.endObject(); } @@ -110,7 +153,7 @@ public class MultiValuedCellSplitOperation extends AbstractOperation { throw new Exception("No key column named " + _keyColumnName); } int keyCellIndex = keyColumn.getCellIndex(); - + List newRows = new ArrayList(); int oldRowCount = project.rows.size(); @@ -124,8 +167,31 @@ public class MultiValuedCellSplitOperation extends AbstractOperation { Object value = oldRow.getCellValue(cellIndex); String s = value instanceof String ? 
((String) value) : value.toString(); String[] values = null; - if (_mode.equals("regex")) { - values = s.split(_separator); + if("lengths".equals(_mode)) { + //do split by lengths + if (_fieldLengths.length > 0 && _fieldLengths[0] > 0) { + // length is tested first so index 0 is never read from an empty array + + values = new String[_fieldLengths.length]; + + int lastIndex = 0; + + for (int i = 0; i < _fieldLengths.length; i++) { + int thisIndex = lastIndex; + + int fieldLength = _fieldLengths[i]; + if (fieldLength > 0) { + thisIndex = lastIndex + Math.min(s.length() - lastIndex, fieldLength); + } + + values[i] = s.substring(lastIndex, thisIndex); + lastIndex = thisIndex; + } + } + } + else if (_regex) { + Pattern pattern = Pattern.compile(_separator); + values = pattern.split(s); } else { values = StringUtils.splitByWholeSeparatorPreserveAllTokens(s, _separator); }