diff --git a/main/src/com/google/refine/commands/cell/SplitMultiValueCellsCommand.java b/main/src/com/google/refine/commands/cell/SplitMultiValueCellsCommand.java index 6cca50637..355fc1406 100644 --- a/main/src/com/google/refine/commands/cell/SplitMultiValueCellsCommand.java +++ b/main/src/com/google/refine/commands/cell/SplitMultiValueCellsCommand.java @@ -33,17 +33,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. package com.google.refine.commands.cell; - import java.io.IOException; +import java.io.IOException; import java.util.Properties; import javax.servlet.ServletException; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; +import org.json.JSONArray; + import com.google.refine.commands.Command; import com.google.refine.model.AbstractOperation; import com.google.refine.model.Project; import com.google.refine.operations.cell.MultiValuedCellSplitOperation; +import com.google.refine.util.ParsingUtilities; import com.google.refine.process.Process; public class SplitMultiValueCellsCommand extends Command { @@ -58,11 +61,33 @@ public class SplitMultiValueCellsCommand extends Command { String keyColumnName = request.getParameter("keyColumnName"); String separator = request.getParameter("separator"); String mode = request.getParameter("mode"); - - AbstractOperation op = new MultiValuedCellSplitOperation(columnName, keyColumnName, separator, mode); - Process process = op.createProcess(project, new Properties()); - - performProcessAndRespond(request, response, project, process); + boolean regex = Boolean.parseBoolean(request.getParameter("regex")); + /* "separator" mode splits on a literal or regex separator; every other mode is read as fixed field lengths sent as a JSON array string. */ + if ("separator".equals(mode)) { + AbstractOperation op = new MultiValuedCellSplitOperation(columnName, + keyColumnName, + separator, + regex); + Process process = op.createProcess(project, new Properties()); + + performProcessAndRespond(request, response, project, process); + } else { + String s = request.getParameter("fieldLengths"); + + JSONArray a = 
ParsingUtilities.evaluateJsonStringToArray(s); + int[] fieldLengths = new int[a.length()]; + + for (int i = 0; i < fieldLengths.length; i++) { + fieldLengths[i] = a.getInt(i); + } + + AbstractOperation op = new MultiValuedCellSplitOperation(columnName, + keyColumnName, + fieldLengths); + Process process = op.createProcess(project, new Properties()); + + performProcessAndRespond(request, response, project, process); + } } catch (Exception e) { respondException(response, e); } diff --git a/main/src/com/google/refine/operations/cell/MultiValuedCellSplitOperation.java b/main/src/com/google/refine/operations/cell/MultiValuedCellSplitOperation.java index 3a100e562..9b614f60e 100644 --- a/main/src/com/google/refine/operations/cell/MultiValuedCellSplitOperation.java +++ b/main/src/com/google/refine/operations/cell/MultiValuedCellSplitOperation.java @@ -33,9 +33,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. package com.google.refine.operations.cell; - import java.util.ArrayList; +import java.util.ArrayList; import java.util.List; import java.util.Properties; +import java.util.regex.Pattern; import org.apache.commons.lang.StringUtils; import org.json.JSONException; @@ -50,32 +51,64 @@ import com.google.refine.model.Project; import com.google.refine.model.Row; import com.google.refine.model.changes.MassRowChange; import com.google.refine.operations.OperationRegistry; +import com.google.refine.util.JSONUtilities; public class MultiValuedCellSplitOperation extends AbstractOperation { final protected String _columnName; final protected String _keyColumnName; - final protected String _separator; final protected String _mode; + final protected String _separator; + final protected boolean _regex; + /* Field widths used in "lengths" mode; null when the operation is built in "separator" mode. */ + final protected int[] _fieldLengths; static public AbstractOperation reconstruct(Project project, JSONObject obj) throws Exception { - return new MultiValuedCellSplitOperation( - obj.getString("columnName"), - obj.getString("keyColumnName"), - 
obj.getString("separator"), - obj.getString("mode") - ); + String mode = obj.getString("mode"); + /* The format replaced by this change stored mode "plain" or "regex" together with a separator, so anything other than "lengths" is read as a separator split. */ + if (!"lengths".equals(mode)) { + return new MultiValuedCellSplitOperation( + obj.getString("columnName"), + obj.getString("keyColumnName"), + obj.getString("separator"), + "regex".equals(mode) || obj.optBoolean("regex", false) + ); + } else { + return new MultiValuedCellSplitOperation( + obj.getString("columnName"), + obj.getString("keyColumnName"), + JSONUtilities.getIntArray(obj, "fieldLengths") + ); + } } public MultiValuedCellSplitOperation( String columnName, String keyColumnName, - String separator, - String mode + String separator, + boolean regex ) { _columnName = columnName; _keyColumnName = keyColumnName; _separator = separator; - _mode = mode; + _mode = "separator"; + _regex = regex; + + _fieldLengths = null; + } + /** Creates a split at fixed field lengths ("lengths" mode); no separator is used. */ + public MultiValuedCellSplitOperation( + String columnName, + String keyColumnName, + int[] fieldLengths + ) { + _columnName = columnName; + _keyColumnName = keyColumnName; + + _mode = "lengths"; + _separator = null; + _regex = false; + + _fieldLengths = fieldLengths; } @Override @@ -87,8 +120,17 @@ public class MultiValuedCellSplitOperation extends AbstractOperation { writer.key("description"); writer.value("Split multi-valued cells in column " + _columnName); writer.key("columnName"); writer.value(_columnName); writer.key("keyColumnName"); writer.value(_keyColumnName); - writer.key("separator"); writer.value(_separator); writer.key("mode"); writer.value(_mode); + if ("separator".equals(_mode)) { + writer.key("separator"); writer.value(_separator); + writer.key("regex"); writer.value(_regex); + } else { + writer.key("fieldLengths"); writer.array(); + for (int l : _fieldLengths) { + writer.value(l); + } + writer.endArray(); + } writer.endObject(); } @@ -110,7 +152,7 @@ public class MultiValuedCellSplitOperation extends AbstractOperation { throw new Exception("No key column named " + _keyColumnName); } int keyCellIndex = keyColumn.getCellIndex(); - + List newRows = new 
ArrayList(); int oldRowCount = project.rows.size(); @@ -124,8 +166,28 @@ public class MultiValuedCellSplitOperation extends AbstractOperation { Object value = oldRow.getCellValue(cellIndex); String s = value instanceof String ? ((String) value) : value.toString(); String[] values = null; - if (_mode.equals("regex")) { - values = s.split(_separator); + if("lengths".equals(_mode)) { + if (_fieldLengths.length > 0 && _fieldLengths[0] > 0) { /* test for an empty list before reading element 0. NOTE(review): when this guard fails values stays null - confirm the code after this hunk tolerates that. */ + values = new String[_fieldLengths.length]; + + int lastIndex = 0; + + for (int i = 0; i < _fieldLengths.length; i++) { + int thisIndex = lastIndex; + + Object o = _fieldLengths[i]; + if (o instanceof Number) { + thisIndex = Math.min(s.length(), lastIndex + Math.max(0, ((Number) o).intValue())); /* clamp to the end of the string so short values yield empty trailing fields */ + } + + values[i] = s.substring(lastIndex, thisIndex); + lastIndex = thisIndex; + } + } + } + else if (_regex) { + Pattern pattern = Pattern.compile(_separator); + values = pattern.split(s); } else { values = StringUtils.splitByWholeSeparatorPreserveAllTokens(s, _separator); } @@ -138,14 +200,14 @@ public class MultiValuedCellSplitOperation extends AbstractOperation { // First value goes into the same row { Row firstNewRow = oldRow.dup(); - firstNewRow.setCell(cellIndex, new Cell(values[0].trim(), null)); + firstNewRow.setCell(cellIndex, new Cell(values[0], null)); newRows.add(firstNewRow); } int r2 = r + 1; for (int v = 1; v < values.length; v++) { - Cell newCell = new Cell(values[v].trim(), null); + Cell newCell = new Cell(values[v], null); if (r2 < project.rows.size()) { Row oldRow2 = project.rows.get(r2); diff --git a/main/tests/server/src/com/google/refine/tests/model/KeyValueColumnizeTests.java b/main/tests/server/src/com/google/refine/tests/operations/cell/KeyValueColumnizeTests.java similarity index 100% rename from main/tests/server/src/com/google/refine/tests/model/KeyValueColumnizeTests.java rename to main/tests/server/src/com/google/refine/tests/operations/cell/KeyValueColumnizeTests.java diff --git 
a/main/tests/server/src/com/google/refine/tests/operations/cell/SplitMultiValuedCellsTests.java b/main/tests/server/src/com/google/refine/tests/operations/cell/SplitMultiValuedCellsTests.java new file mode 100644 index 000000000..312899ae6 --- /dev/null +++ b/main/tests/server/src/com/google/refine/tests/operations/cell/SplitMultiValuedCellsTests.java @@ -0,0 +1,223 @@ +/* + +Copyright 2010, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +package com.google.refine.tests.operations.cell; + +import static org.mockito.Mockito.mock; + +import java.io.File; +import java.io.IOException; +import java.io.StringReader; +import java.util.Properties; +import java.util.List; +import java.util.ArrayList; + +import org.json.JSONException; +import org.json.JSONObject; +import org.slf4j.LoggerFactory; +import org.testng.Assert; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.Test; + +import com.google.refine.ProjectManager; +import com.google.refine.ProjectMetadata; +import com.google.refine.RefineServlet; +import com.google.refine.importers.SeparatorBasedImporter; +import com.google.refine.importing.ImportingJob; +import com.google.refine.importing.ImportingManager; +import com.google.refine.io.FileProjectManager; +import com.google.refine.model.AbstractOperation; +import com.google.refine.model.ModelException; +import com.google.refine.model.Project; +import com.google.refine.process.Process; +import com.google.refine.operations.cell.MultiValuedCellSplitOperation; +import com.google.refine.tests.RefineServletStub; +import com.google.refine.tests.RefineTest; +import com.google.refine.tests.util.TestUtils; + +/** Checks MultiValuedCellSplitOperation when splitting by a literal separator, by a regular expression, and by fixed field lengths. */ +public class SplitMultiValuedCellsTests extends RefineTest { + // dependencies + private RefineServlet servlet; + private Project project; + private ProjectMetadata pm; + private JSONObject options; + private ImportingJob job; + private SeparatorBasedImporter importer; + + + @Override + @BeforeTest + public void init() { + logger = LoggerFactory.getLogger(this.getClass()); + } + + @BeforeMethod + public void SetUp() throws JSONException, IOException, ModelException { + servlet = new RefineServletStub(); + File dir = TestUtils.createTempDirectory("openrefine-test-workspace-dir"); + FileProjectManager.initialize(dir); + project = new Project(); + pm = new ProjectMetadata(); + 
pm.setName("SplitMultiValuedCells test"); + ProjectManager.singleton.registerProject(project, pm); + options = mock(JSONObject.class); + + ImportingManager.initialize(servlet); + job = ImportingManager.createJob(); + importer = new SeparatorBasedImporter(); + } + + @AfterMethod + public void TearDown() { + ImportingManager.disposeJob(job.id); + ProjectManager.singleton.deleteProject(project.id); + job = null; + project = null; + pm = null; + options = null; + } + + /** + * Test to demonstrate the intended behaviour of the function, for issue #1268 + * https://github.com/OpenRefine/OpenRefine/issues/1268 + */ + + @Test + public void testSplitMultiValuedCellsTextSeparator() throws Exception { + String csv = "Key,Value\n" + + "Record_1,one:two;three four\n"; + prepareOptions(",", 10, 0, 0, 1, false, false); + List exceptions = new ArrayList(); + importer.parseOneFile(project, pm, job, "filesource", new StringReader(csv), -1, options, exceptions); + project.update(); + ProjectManager.singleton.registerProject(project, pm); + + AbstractOperation op = new MultiValuedCellSplitOperation( + "Value", + "Key", + ":", + false); + Process process = op.createProcess(project, new Properties()); + process.performImmediate(); + + int keyCol = project.columnModel.getColumnByName("Key").getCellIndex(); + int valueCol = project.columnModel.getColumnByName("Value").getCellIndex(); + + Assert.assertEquals(project.rows.get(0).getCellValue(keyCol), "Record_1"); + Assert.assertEquals(project.rows.get(0).getCellValue(valueCol), "one"); + Assert.assertEquals(project.rows.get(1).getCellValue(keyCol), null); + Assert.assertEquals(project.rows.get(1).getCellValue(valueCol), "two;three four"); + } + + @Test + public void testSplitMultiValuedCellsRegExSeparator() throws Exception { + String csv = "Key,Value\n" + + "Record_1,one:two;three four\n"; + prepareOptions(",", 10, 0, 0, 1, false, false); + List exceptions = new ArrayList(); + importer.parseOneFile(project, pm, job, "filesource", new 
StringReader(csv), -1, options, exceptions); + project.update(); + ProjectManager.singleton.registerProject(project, pm); + + AbstractOperation op = new MultiValuedCellSplitOperation( + "Value", + "Key", + "\\W", + true); + Process process = op.createProcess(project, new Properties()); + process.performImmediate(); + + int keyCol = project.columnModel.getColumnByName("Key").getCellIndex(); + int valueCol = project.columnModel.getColumnByName("Value").getCellIndex(); + + Assert.assertEquals(project.rows.get(0).getCellValue(keyCol), "Record_1"); + Assert.assertEquals(project.rows.get(0).getCellValue(valueCol), "one"); + Assert.assertEquals(project.rows.get(1).getCellValue(keyCol), null); + Assert.assertEquals(project.rows.get(1).getCellValue(valueCol), "two"); + Assert.assertEquals(project.rows.get(2).getCellValue(keyCol), null); + Assert.assertEquals(project.rows.get(2).getCellValue(valueCol), "three"); + Assert.assertEquals(project.rows.get(3).getCellValue(keyCol), null); + Assert.assertEquals(project.rows.get(3).getCellValue(valueCol), "four"); + } + + @Test + public void testSplitMultiValuedCellsLengths() throws Exception { + String csv = "Key,Value\n" + + "Record_1,one:two;three four\n"; + prepareOptions(",", 10, 0, 0, 1, false, false); + List exceptions = new ArrayList(); + importer.parseOneFile(project, pm, job, "filesource", new StringReader(csv), -1, options, exceptions); + project.update(); + ProjectManager.singleton.registerProject(project, pm); + int[] lengths = {4,4,6,4}; + + AbstractOperation op = new MultiValuedCellSplitOperation( + "Value", + "Key", + lengths); + Process process = op.createProcess(project, new Properties()); + process.performImmediate(); + + int keyCol = project.columnModel.getColumnByName("Key").getCellIndex(); + int valueCol = project.columnModel.getColumnByName("Value").getCellIndex(); + + Assert.assertEquals(project.rows.get(0).getCellValue(keyCol), "Record_1"); + Assert.assertEquals(project.rows.get(0).getCellValue(valueCol), 
"one:"); + Assert.assertEquals(project.rows.get(1).getCellValue(keyCol), null); + Assert.assertEquals(project.rows.get(1).getCellValue(valueCol), "two;"); + Assert.assertEquals(project.rows.get(2).getCellValue(keyCol), null); + Assert.assertEquals(project.rows.get(2).getCellValue(valueCol), "three "); + Assert.assertEquals(project.rows.get(3).getCellValue(keyCol), null); + Assert.assertEquals(project.rows.get(3).getCellValue(valueCol), "four"); + } + + private void prepareOptions( + String sep, int limit, int skip, int ignoreLines, + int headerLines, boolean guessValueType, boolean ignoreQuotes) { + + whenGetStringOption("separator", options, sep); + whenGetIntegerOption("limit", options, limit); + whenGetIntegerOption("skipDataLines", options, skip); + whenGetIntegerOption("ignoreLines", options, ignoreLines); + whenGetIntegerOption("headerLines", options, headerLines); + whenGetBooleanOption("guessCellValueTypes", options, guessValueType); + whenGetBooleanOption("processQuotes", options, !ignoreQuotes); + whenGetBooleanOption("storeBlankCellsAsNulls", options, true); + } + + +} + diff --git a/main/tests/server/src/com/google/refine/operations/cell/TransposeTests.java b/main/tests/server/src/com/google/refine/tests/operations/cell/TransposeTests.java similarity index 100% rename from main/tests/server/src/com/google/refine/operations/cell/TransposeTests.java rename to main/tests/server/src/com/google/refine/tests/operations/cell/TransposeTests.java diff --git a/main/webapp/modules/core/langs/translation-en.json b/main/webapp/modules/core/langs/translation-en.json index c62863a34..df7845879 100644 --- a/main/webapp/modules/core/langs/translation-en.json +++ b/main/webapp/modules/core/langs/translation-en.json @@ -525,6 +525,7 @@ "split-col": "Split column", "several-col": "into several columns", "how-split": "How to Split Column", + "how-split-cells": "How to split multi-valued cells", "by-sep": "by separator", "separator": "Separator", "reg-exp": "regular 
expression", diff --git a/main/webapp/modules/core/scripts/views/data-table/menu-edit-cells.js b/main/webapp/modules/core/scripts/views/data-table/menu-edit-cells.js index 9ea92d8ee..4c77f889f 100644 --- a/main/webapp/modules/core/scripts/views/data-table/menu-edit-cells.js +++ b/main/webapp/modules/core/scripts/views/data-table/menu-edit-cells.js @@ -125,7 +125,7 @@ DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) { { columnName: column.name, keyColumnName: theProject.columnModel.keyColumnName, - separator: separator + separator }, null, { rowsChanged: true } @@ -134,20 +134,76 @@ DataTableColumnHeaderUI.extendMenu(function(column, columnHeaderUI, menu) { }; var doSplitMultiValueCells = function() { - var separator = window.prompt($.i18n._('core-views')["what-separator"], ","); - if (separator !== null) { + /* Shows a dialog asking how to split (by separator or by field lengths), then posts the "split-multi-value-cells" process with the chosen settings. */ + var frame = $(DOM.loadHTML("core", "scripts/views/data-table/split-multi-valued-cells-dialog.html")); + var elmts = DOM.bind(frame); + elmts.dialogHeader.text($.i18n._('core-views')["split-cells"]); + + elmts.or_views_howSplit.text($.i18n._('core-views')["how-split-cells"]); + elmts.or_views_bySep.text($.i18n._('core-views')["by-sep"]); + elmts.or_views_separator.text($.i18n._('core-views')["separator"]); + elmts.or_views_regExp.text($.i18n._('core-views')["reg-exp"]); + + elmts.or_views_fieldLen.text($.i18n._('core-views')["field-len"]); + elmts.or_views_listInt.text($.i18n._('core-views')["list-int"]); + + elmts.okButton.html($.i18n._('core-buttons')["ok"]); + elmts.cancelButton.text($.i18n._('core-buttons')["cancel"]); + + var level = DialogSystem.showDialog(frame); + var dismiss = function() { DialogSystem.dismissUntil(level - 1); }; + + elmts.cancelButton.click(dismiss); + elmts.okButton.click(function() { + var mode = $("input[name='split-by-mode']:checked")[0].value; + var config = { + columnName: column.name, + keyColumnName: theProject.columnModel.keyColumnName, + mode /* NOTE(review): shorthand property syntax needs an ES2015-capable browser - confirm the supported targets */ + }; + if (mode === "separator") { + config.separator = 
elmts.separatorInput[0].value; + if (!(config.separator)) { + alert($.i18n._('core-views')["specify-sep"]); + return; + } + + config.regex = elmts.regexInput[0].checked; + + } else { /* any other mode: the textarea content is wrapped in brackets and parsed as a JSON array of lengths */ + var s = "[" + elmts.lengthsTextarea[0].value + "]"; + try { + var a = JSON.parse(s); + + var lengths = []; + $.each(a, function(i,n) { + if (typeof n == "number") { /* entries that are not numbers are dropped */ + lengths.push(n); + } + }); + + if (lengths.length === 0) { + alert($.i18n._('core-views')["warning-no-length"]); + return; + } + + config.fieldLengths = JSON.stringify(lengths); + + } catch (e) { + alert($.i18n._('core-views')["warning-format"]); + return; + } + } + Refine.postCoreProcess( "split-multi-value-cells", - { - columnName: column.name, - keyColumnName: theProject.columnModel.keyColumnName, - separator: separator, - mode: "plain" - }, + config, null, { rowsChanged: true } ); - } + + dismiss(); + }); }; MenuSystem.appendTo(menu, [ "core/edit-cells" ], [ diff --git a/main/webapp/modules/core/scripts/views/data-table/split-multi-valued-cells-dialog.html b/main/webapp/modules/core/scripts/views/data-table/split-multi-valued-cells-dialog.html new file mode 100644 index 000000000..c68744865 --- /dev/null +++ b/main/webapp/modules/core/scripts/views/data-table/split-multi-valued-cells-dialog.html @@ -0,0 +1,40 @@ +
+
+
+
+
+ +
+
+ + + + + + + + + + + + + + + + + + + + + +

+ +
+
+
+ +
+
\ No newline at end of file