diff --git a/main/src/com/google/refine/importers/WikitextImporter.java b/main/src/com/google/refine/importers/WikitextImporter.java index a6a848380..3fd9e3315 100644 --- a/main/src/com/google/refine/importers/WikitextImporter.java +++ b/main/src/com/google/refine/importers/WikitextImporter.java @@ -25,6 +25,11 @@ import org.sweble.wikitext.parser.nodes.WtTable; import org.sweble.wikitext.parser.nodes.WtTableHeader; import org.sweble.wikitext.parser.nodes.WtTableRow; import org.sweble.wikitext.parser.nodes.WtTableCell; +import org.sweble.wikitext.parser.nodes.WtTableCaption; +import org.sweble.wikitext.parser.nodes.WtXmlAttributes; +import org.sweble.wikitext.parser.nodes.WtXmlAttribute; +import org.sweble.wikitext.parser.nodes.WtName; +import org.sweble.wikitext.parser.nodes.WtValue; import org.sweble.wikitext.parser.nodes.WtParsedWikitextPage; import org.sweble.wikitext.parser.nodes.WtBody; @@ -58,25 +63,61 @@ public class WikitextImporter extends TabularImportingParserBase { JSONObject options = super.createParserUIInitializationData(job, fileRecords, format); JSONUtilities.safePut(options, "guessCellValueTypes", false); + JSONUtilities.safePut(options, "blankSpanningCells", true); return options; } + private class SpanningCell { + public String value; + public int colspan; + public int rowspan; + public int row; + public int col; + + SpanningCell(String value, int row, int col, int rowspan, int colspan) { + this.value = value; + this.row = row; + this.col = col; + this.rowspan = rowspan; + this.colspan = colspan; + } + } + public class WikitextTableVisitor extends AstVisitor { + public String caption; public List header; public List> rows; private List currentRow; - private StringBuilder currentStringBuilder; + + private boolean blankSpanningCells; + + private int rowId; + private List spanningCells; + private StringBuilder cellStringBuilder; + private StringBuilder xmlAttrStringBuilder; + private String currentXmlAttr; private String currentInternalLink; private String currentExternalLink; + private int colspan; + private int rowspan; + private int spanningCellIdx; - public WikitextTableVisitor() { + public WikitextTableVisitor(boolean blankSpanningCells) { + this.blankSpanningCells = blankSpanningCells; + caption = null; header = new ArrayList(); rows = new ArrayList>(); - currentStringBuilder = null; + spanningCells = new ArrayList(); + cellStringBuilder = null; + xmlAttrStringBuilder = null; currentInternalLink = null; currentExternalLink = null; + colspan = 0; + rowspan = 0; + rowId = -1; + spanningCellIdx = 0; } @Override @@ -86,6 +127,7 @@ public class WikitextImporter extends TabularImportingParserBase { public void visit(WtNode e) { // Ignore other nodes + // System.out.println(e.getNodeName()); } public void visit(WtParsedWikitextPage e) { @@ -101,16 +143,34 @@ public class WikitextImporter extends TabularImportingParserBase { } public void visit(WtTableHeader e) { - header.add(renderAsString(e)); + String columnName = renderCellAsString(e); + header.add(columnName); + // For the header, we ignore rowspan and manually add cells for colspan + if (colspan > 1) { + for (int i = 0; i < colspan-1; i++) { + header.add(columnName); + } + } + } + + public void visit(WtTableCaption e) { + caption = renderCellAsString(e); } public void visit(WtTableRow e) { if (currentRow == null) { + if (rowId == -1) { + // no header was found, start on the first row + rowId = 0; + } currentRow = new ArrayList(); + spanningCellIdx = 0; + addSpanningCells(); iterate(e); if(currentRow.size() > 0) { rows.add(currentRow); + rowId++; } currentRow = null; } @@ -119,30 +179,104 @@ public class WikitextImporter extends TabularImportingParserBase { public void visit(WtTableCell e) { if (currentRow != null) { - currentRow.add(renderAsString(e)); + rowspan = 1; + colspan = 1; + String value = renderCellAsString(e); + currentRow.add(value); + if (colspan > 1 || rowspan > 1) { + SpanningCell spanningCell = new SpanningCell( + value, rowId, currentRow.size()-1, rowspan, colspan); + spanningCells.add(spanningCellIdx, spanningCell); + } + + addSpanningCells(); } } - public String renderAsString(WtNode e) { - currentStringBuilder = new StringBuilder(); + private SpanningCell spanningCell() { + return spanningCells.get(spanningCellIdx); + } + + private void addSpanningCells() { + while (spanningCellIdx < spanningCells.size() && + currentRow.size() >= spanningCell().col) { + // Add blank cells to represent the current spanning cell + SpanningCell cell = spanningCell(); + if (cell.row + cell.rowspan >= rowId + 1) { + while(currentRow.size() < cell.col + cell.colspan) { + if (blankSpanningCells) { + currentRow.add(null); + } else { + currentRow.add(cell.value); + } + } + } + // Check if this spanning cell has been fully represented + if(cell.row + cell.rowspan <= rowId + 1) { + spanningCells.remove(spanningCellIdx); + } else { + spanningCellIdx++; + } + } + } + + public void visit(WtXmlAttributes e) { iterate(e); - String value = currentStringBuilder.toString().trim(); - currentStringBuilder = null; + } + + public void visit(WtXmlAttribute e) { + if (currentXmlAttr == null) { + xmlAttrStringBuilder = new StringBuilder(); + iterate(e); + try { + int attrValue = Integer.parseInt(xmlAttrStringBuilder.toString()); + if (currentXmlAttr.equals("colspan")) { + colspan = attrValue; + } else if (currentXmlAttr.equals("rowspan")) { + rowspan = attrValue; + } + } catch (NumberFormatException _) { + ; + } + currentXmlAttr = null; + xmlAttrStringBuilder = null; + } + } + + public void visit(WtName e) { + currentXmlAttr = e.getAsString(); + } + + public void visit(WtValue e) { + iterate(e); + } + + public String renderCellAsString(WtNode e) { + cellStringBuilder = new StringBuilder(); + iterate(e); + String value = cellStringBuilder.toString(); + if (value == null) { + value = ""; + } + value = value.trim(); + cellStringBuilder = null; return value; } public void visit(WtText text) { - if (currentStringBuilder != null) { - currentStringBuilder.append(text.getContent()); + if (xmlAttrStringBuilder != null) { + xmlAttrStringBuilder.append(text.getContent()); + } else if (cellStringBuilder != null) { + cellStringBuilder.append(text.getContent()); } } public void visit(WtNoLinkTitle e) { if (currentInternalLink != null) { - currentStringBuilder.append(currentInternalLink); + cellStringBuilder.append(currentInternalLink); } else if (currentExternalLink != null) { - currentStringBuilder.append(currentExternalLink); + cellStringBuilder.append(currentExternalLink); } } @@ -239,13 +373,20 @@ public class WikitextImporter extends TabularImportingParserBase { parsedArticle = (WtParsedWikitextPage) parser.parseArticle(ppw, title); // Compile the retrieved page - final WikitextTableVisitor vs = new WikitextTableVisitor(); + boolean blankSpanningCells = JSONUtilities.getBoolean(options, "blankSpanningCells", true); + final WikitextTableVisitor vs = new WikitextTableVisitor(blankSpanningCells); vs.go(parsedArticle); TableDataReader dataReader = new WikiTableDataReader(vs); JSONUtilities.safePut(options, "headerLines", 1); + // Set metadata + if (vs.caption != null && vs.caption.length() > 0) { + metadata.setName(vs.caption); + // TODO this does not seem to do anything - maybe we need to pass it to OpenRefine in some other way? + } + TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions); } catch (IOException e1) { e1.printStackTrace(); diff --git a/main/tests/server/src/com/google/refine/tests/importers/WikitextImporterTests.java b/main/tests/server/src/com/google/refine/tests/importers/WikitextImporterTests.java index 200fbf94b..8f6542ae6 100644 --- a/main/tests/server/src/com/google/refine/tests/importers/WikitextImporterTests.java +++ b/main/tests/server/src/com/google/refine/tests/importers/WikitextImporterTests.java @@ -83,7 +83,7 @@ public class WikitextImporterTests extends ImporterTest { + "|-\n" + "|}\n"; try { - prepareOptions(0, 0, 0, true); + prepareOptions(0, true, true); parse(input); } catch (Exception e) { Assert.fail("Parsing failed", e); @@ -110,7 +110,7 @@ public class WikitextImporterTests extends ImporterTest { +"|}\n"; try { - prepareOptions(0, 0, 0, true); + prepareOptions(0, true, true); parse(input); } catch (Exception e) { Assert.fail("Parsing failed", e); @@ -146,7 +146,7 @@ public class WikitextImporterTests extends ImporterTest { +"|}\n"; try { - prepareOptions(-1, 0, -1, true); + prepareOptions(-1, true, true); parse(input); } catch (Exception e) { Assert.fail("Parsing failed", e); @@ -160,6 +160,35 @@ public class WikitextImporterTests extends ImporterTest { Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://www.eurofound.europa.eu/"); } + @Test + public void readTableWithSpanningCells() { + // inspired from https://www.mediawiki.org/wiki/Help:Tables + String input = "{| class=\"wikitable\"\n" + +"!colspan=\"6\"|Shopping List\n" + +"|-\n" + +"|Bread & Butter\n" + +"|Pie\n" + +"|Buns\n" + +"|rowspan=\"2\"|Danish\n" + +"|colspan=\"2\"|Croissant\n" + +"|-\n" + +"|Cheese\n" + +"|colspan=\"2\"|Ice cream\n" + +"|Butter\n" + +"|Yogurt\n" + +"|}\n"; + + try { + prepareOptions(-1, true, true); + parse(input); + } catch (Exception e) { + Assert.fail("Parsing failed", e); + } + Assert.assertEquals(project.columnModel.columns.size(), 6); + Assert.assertEquals(project.rows.get(1).cells.get(2), null); + Assert.assertEquals(project.rows.get(1).cells.get(3), null); + Assert.assertEquals(project.rows.get(1).cells.get(4).value, "Butter"); + } //--helpers-- private void parse(String wikitext) { @@ -167,26 +196,22 @@ public class WikitextImporterTests extends ImporterTest { } private void prepareOptions( - int limit, int skip, int ignoreLines, + int limit, boolean blankSpanningCells, boolean guessValueType) { whenGetIntegerOption("limit", options, limit); - whenGetIntegerOption("skipDataLines", options, skip); - whenGetIntegerOption("ignoreLines", options, ignoreLines); - whenGetIntegerOption("headerLines", options, 1); whenGetBooleanOption("guessCellValueTypes", options, guessValueType); + whenGetBooleanOption("blankSpanningCells", options, blankSpanningCells); whenGetBooleanOption("storeBlankCellsAsNulls", options, true); + whenGetIntegerOption("headerLines", options, 1); } private void verifyOptions() { try { - verify(options, times(1)).getString("separator"); verify(options, times(1)).getInt("limit"); - verify(options, times(1)).getInt("skipDataLines"); - verify(options, times(1)).getInt("ignoreLines"); verify(options, times(1)).getBoolean("guessCellValueTypes"); - verify(options, times(1)).getBoolean("processQuotes"); verify(options, times(1)).getBoolean("storeBlankCellsAsNulls"); + verify(options, times(1)).getBoolean("blankSpanningCells"); } catch (JSONException e) { Assert.fail("JSON exception",e); } diff --git a/main/webapp/modules/core/MOD-INF/controller.js b/main/webapp/modules/core/MOD-INF/controller.js index 3d624313f..ab4433840 100644 --- a/main/webapp/modules/core/MOD-INF/controller.js +++ b/main/webapp/modules/core/MOD-INF/controller.js @@ -212,7 +212,7 @@ function registerImporting() { IM.registerFormat("text/xml/rdf", "RDF/XML files", "RdfTriplesParserUI", new Packages.com.google.refine.importers.RdfXmlTripleImporter()); IM.registerFormat("text/json", "JSON files", "JsonParserUI", new Packages.com.google.refine.importers.JsonImporter()); IM.registerFormat("text/marc", "MARC files", "XmlParserUI", new Packages.com.google.refine.importers.MarcImporter()); - IM.registerFormat("text/wiki", "Wikitext files", "WikitextParserUI", new Packages.com.google.refine.importers.WikitextImporter()); + IM.registerFormat("text/wiki", "Wikitext", "WikitextParserUI", new Packages.com.google.refine.importers.WikitextImporter()); IM.registerFormat("binary", "Binary files"); // generic format, no parser to handle it diff --git a/main/webapp/modules/core/langs/translation-en.json b/main/webapp/modules/core/langs/translation-en.json index ae836fb8d..3e7b64dd8 100644 --- a/main/webapp/modules/core/langs/translation-en.json +++ b/main/webapp/modules/core/langs/translation-en.json @@ -113,6 +113,7 @@ "parse-cell": "Parse cell text into
numbers, dates, ...", "store-blank": "Store blank rows", "store-nulls": "Store blank cells as nulls", + "blank-spanning-cells": "Pad cells spanning over multiple rows or columns with nulls", "store-source": "Store file source
(file names, URLs)
in each row", "preserve-empty": "Preserve empty strings", "trim": "Trim leading & trailing whitespace from strings", diff --git a/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.html b/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.html new file mode 100644 index 000000000..b7d1daf6d --- /dev/null +++ b/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.html @@ -0,0 +1,25 @@ +
+ + + + + + + + + + + + + + + + + + + + + + +
+
 
diff --git a/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.js b/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.js index a775e334f..9be1574de 100644 --- a/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.js +++ b/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.js @@ -62,20 +62,8 @@ Refine.WikitextParserUI.prototype.confirmReadyToCreateProject = function() { Refine.WikitextParserUI.prototype.getOptions = function() { var options = { - encoding: $.trim(this._optionContainerElmts.encodingInput[0].value) }; - switch (this._optionContainer.find("input[name='column-separator']:checked")[0].value) { - case 'comma': - options.separator = ","; - break; - case 'tab': - options.separator = "\\t"; - break; - default: - options.separator = this._optionContainerElmts.columnSeparatorInput[0].value; - } - var parseIntDefault = function(s, def) { try { var n = parseInt(s,10); @@ -87,30 +75,15 @@ Refine.WikitextParserUI.prototype.getOptions = function() { } return def; }; - if (this._optionContainerElmts.ignoreCheckbox[0].checked) { - options.ignoreLines = parseIntDefault(this._optionContainerElmts.ignoreInput[0].value, -1); - } else { - options.ignoreLines = -1; - } - if (this._optionContainerElmts.headerLinesCheckbox[0].checked) { - options.headerLines = parseIntDefault(this._optionContainerElmts.headerLinesInput[0].value, 0); - } else { - options.headerLines = 0; - } - if (this._optionContainerElmts.skipCheckbox[0].checked) { - options.skipDataLines = parseIntDefault(this._optionContainerElmts.skipInput[0].value, 0); - } else { - options.skipDataLines = 0; - } if (this._optionContainerElmts.limitCheckbox[0].checked) { options.limit = parseIntDefault(this._optionContainerElmts.limitInput[0].value, -1); } else { options.limit = -1; } options.storeBlankRows = this._optionContainerElmts.storeBlankRowsCheckbox[0].checked; + options.blankSpanningCells = this._optionContainerElmts.blankSpanningCellsCheckbox[0].checked; options.guessCellValueTypes = this._optionContainerElmts.guessCellValueTypesCheckbox[0].checked; - options.processQuotes = this._optionContainerElmts.processQuoteMarksCheckbox[0].checked; options.storeBlankCellsAsNulls = this._optionContainerElmts.storeBlankCellsAsNullsCheckbox[0].checked; options.includeFileSources = this._optionContainerElmts.includeFileSourcesCheckbox[0].checked; @@ -120,35 +93,23 @@ Refine.WikitextParserUI.prototype.getOptions = function() { Refine.WikitextParserUI.prototype._initialize = function() { var self = this; - console.log('wikitext ui initialize'); this._optionContainer.unbind().empty().html( - DOM.loadHTML("core", "scripts/index/parser-interfaces/separator-based-parser-ui.html")); + DOM.loadHTML("core", "scripts/index/parser-interfaces/wikitext-parser-ui.html")); this._optionContainerElmts = DOM.bind(this._optionContainer); this._optionContainerElmts.previewButton.click(function() { self._updatePreview(); }); this._optionContainerElmts.previewButton.html($.i18n._('core-buttons')["update-preview"]); - $('#or-import-encoding').html($.i18n._('core-index-import')["char-encoding"]); - $('#or-import-colsep').html($.i18n._('core-index-parser')["col-separated-by"]); - $('#or-import-commas').html($.i18n._('core-index-parser')["commas"]); - $('#or-import-tabs').html($.i18n._('core-index-parser')["tabs"]); - $('#or-import-custom').html($.i18n._('core-index-parser')["custom"]); - $('#or-import-escape').html($.i18n._('core-index-parser')["escape"]); - $('#or-import-ignore').text($.i18n._('core-index-parser')["ignore-first"]); - $('#or-import-lines').text($.i18n._('core-index-parser')["lines-beg"]); - $('#or-import-parse').text($.i18n._('core-index-parser')["parse-next"]); - $('#or-import-header').text($.i18n._('core-index-parser')["lines-header"]); - $('#or-import-discard').text($.i18n._('core-index-parser')["discard-initial"]); - $('#or-import-rows').text($.i18n._('core-index-parser')["rows-data"]); $('#or-import-load').text($.i18n._('core-index-parser')["load-at-most"]); $('#or-import-rows2').text($.i18n._('core-index-parser')["rows-data"]); $('#or-import-parseCell').html($.i18n._('core-index-parser')["parse-cell"]); - $('#or-import-quote').html($.i18n._('core-index-parser')["quotation-mark"]); + $('#or-import-blankSpanningCells').text($.i18n._('core-index-parser')["blank-spanning-cells"]); $('#or-import-blank').text($.i18n._('core-index-parser')["store-blank"]); $('#or-import-null').text($.i18n._('core-index-parser')["store-nulls"]); $('#or-import-source').html($.i18n._('core-index-parser')["store-source"]); +/* this._optionContainerElmts.encodingInput .attr('value', this._config.encoding || '') .click(function() { @@ -156,29 +117,18 @@ Refine.WikitextParserUI.prototype._initialize = function() { self._updatePreview(); }); }); +*/ + console.log(this._config); - var columnSeparatorValue = (this._config.separator == ",") ? 'comma' : - ((this._config.separator == "\\t") ? 'tab' : 'custom'); - this._optionContainer.find( - "input[name='column-separator'][value='" + columnSeparatorValue + "']").prop("checked", true); - this._optionContainerElmts.columnSeparatorInput[0].value = this._config.separator; - - if (this._config.ignoreLines > 0) { - this._optionContainerElmts.ignoreCheckbox.prop("checked", true); - this._optionContainerElmts.ignoreInput[0].value = this._config.ignoreLines.toString(); - } - if (this._config.headerLines > 0) { - this._optionContainerElmts.headerLinesCheckbox.prop("checked", true); - this._optionContainerElmts.headerLinesInput[0].value = this._config.headerLines.toString(); - } - if (this._config.limit > 0) { + if (this._config.limit > 0) { this._optionContainerElmts.limitCheckbox.prop("checked", true); this._optionContainerElmts.limitInput[0].value = this._config.limit.toString(); } - if (this._config.skipDataLines > 0) { - this._optionContainerElmts.skipCheckbox.prop("checked", true); - this._optionContainerElmts.skipInput.value[0].value = this._config.skipDataLines.toString(); + + if (this._config.blankSpanningCells) { + this._optionContainerElmts.blankSpanningCellsCheckbox.prop("checked", true); } + if (this._config.storeBlankRows) { this._optionContainerElmts.storeBlankRowsCheckbox.prop("checked", true); } @@ -186,9 +136,6 @@ Refine.WikitextParserUI.prototype._initialize = function() { if (this._config.guessCellValueTypes) { this._optionContainerElmts.guessCellValueTypesCheckbox.prop("checked", true); } - if (this._config.processQuotes) { - this._optionContainerElmts.processQuoteMarksCheckbox.prop("checked", true); - } if (this._config.storeBlankCellsAsNulls) { this._optionContainerElmts.storeBlankCellsAsNullsCheckbox.prop("checked", true); @@ -205,7 +152,6 @@ Refine.WikitextParserUI.prototype._initialize = function() { }; Refine.WikitextParserUI.prototype._scheduleUpdatePreview = function() { - console.log('scheduleUpdatePreview'); if (this._timerID !== null) { window.clearTimeout(this._timerID); this._timerID = null; @@ -222,10 +168,8 @@ Refine.WikitextParserUI.prototype._updatePreview = function() { var self = this; this._progressContainer.show(); - console.log('updatePreview'); this._controller.updateFormatAndOptions(this.getOptions(), function(result) { - console.log(result.status); if (result.status == "ok") { self._controller.getPreviewData(function(projectData) { self._progressContainer.hide();