diff --git a/main/src/com/google/refine/importers/WikitextImporter.java b/main/src/com/google/refine/importers/WikitextImporter.java index 568430d93..ab624b946 100644 --- a/main/src/com/google/refine/importers/WikitextImporter.java +++ b/main/src/com/google/refine/importers/WikitextImporter.java @@ -29,6 +29,7 @@ import org.sweble.wikitext.parser.nodes.WtTemplateArguments; import org.sweble.wikitext.parser.nodes.WtText; import org.sweble.wikitext.parser.nodes.WtInternalLink; import org.sweble.wikitext.parser.nodes.WtExternalLink; +import org.sweble.wikitext.parser.nodes.WtImageLink; import org.sweble.wikitext.parser.nodes.WtLinkTitle; import org.sweble.wikitext.parser.nodes.WtLinkTitle.WtNoLinkTitle; import org.sweble.wikitext.parser.nodes.WtUrl; @@ -129,7 +130,6 @@ public class WikitextImporter extends TabularImportingParserBase { public class WikitextTableVisitor extends AstVisitor { public String caption; - public List header; public List> rows; public List> references; public List wikilinkedCells; @@ -163,7 +163,6 @@ public class WikitextImporter extends TabularImportingParserBase { this.blankSpanningCells = blankSpanningCells; this.includeRawTemplates = includeRawTemplates; caption = null; - header = new ArrayList(); rows = new ArrayList>(); references = new ArrayList>(); wikilinkedCells = new ArrayList(); @@ -178,7 +177,7 @@ public class WikitextImporter extends TabularImportingParserBase { currentReferenceName = null; colspan = 0; rowspan = 0; - rowId = -1; + rowId = 0; spanningCellIdx = 0; internalLinksInCell = new ArrayList(); namedReferences = new HashMap(); @@ -202,77 +201,80 @@ public class WikitextImporter extends TabularImportingParserBase { iterate(e); } - public void visit(WtTableHeader e) { - String columnName = renderCellAsString(e); - header.add(columnName); - // For the header, we ignore rowspan and manually add cells for colspan - if (colspan > 1) { - for (int i = 0; i < colspan-1; i++) { - header.add(columnName); - } - } - } - public void visit(WtTableCaption e) { caption = renderCellAsString(e); } public void visit(WtTableRow e) { - if (currentRow == null) { - if (rowId == -1) { - // no header was found, start on the first row - rowId = 0; - } - currentRow = new ArrayList(); - currentRowReferences = new ArrayList(); - spanningCellIdx = 0; - addSpanningCells(); - iterate(e); - if(currentRow.size() > 0) { - rows.add(currentRow); - references.add(currentRowReferences); - rowId++; - } - currentRow = null; + if (currentRow != null) { + finishRow(); } + startRow(); + iterate(e); + finishRow(); + } + + private void startRow() { + currentRow = new ArrayList(); + currentRowReferences = new ArrayList(); + spanningCellIdx = 0; + addSpanningCells(); + } + + private void finishRow() { + if(currentRow.size() > 0) { + rows.add(currentRow); + references.add(currentRowReferences); + rowId++; + } + currentRow = null; } public void visit(WtTableCell e) { - if (currentRow != null) { - rowspan = 1; - colspan = 1; - internalLinksInCell.clear(); - currentReference = null; - currentReferenceName = null; - - String value = renderCellAsString(e); - - int colId = currentRow.size(); - - // Add the cell to the row we are currently building - currentRow.add(value); - currentRowReferences.add(currentReference); - - // Reconcile it if we found exactly one link in the cell - String reconciled = null; - if (internalLinksInCell.size() == 1) { - reconciled = internalLinksInCell.get(0); - wikilinkedCells.add(new WikilinkedCell(reconciled, rowId, colId)); - } - - // Mark it as spanning if we found the tags - if (colspan > 1 || rowspan > 1) { - SpanningCell spanningCell = new SpanningCell( - value, reconciled, currentReference, - rowId, colId, rowspan, colspan); - spanningCells.add(spanningCellIdx, spanningCell); - } - - // Add all spanning cells that need to be inserted after this one. - addSpanningCells(); + addCell(e); + } + + public void visit(WtTableHeader e) { + addCell(e); + } + + public void addCell(WtNode e) { + if (currentRow == null) { + startRow(); } + rowspan = 1; + colspan = 1; + internalLinksInCell.clear(); + currentReference = null; + currentReferenceName = null; + + String value = renderCellAsString(e); + + int colId = currentRow.size(); + + // Add the cell to the row we are currently building + currentRow.add(value); + currentRowReferences.add(currentReference); + + // Reconcile it if we found exactly one link in the cell + String reconciled = null; + if (internalLinksInCell.size() == 1) { + reconciled = internalLinksInCell.get(0); + wikilinkedCells.add(new WikilinkedCell(reconciled, rowId, colId)); + } + + // Mark it as spanning if we found the tags + if (colspan > 1 || rowspan > 1) { + SpanningCell spanningCell = new SpanningCell( + value, reconciled, currentReference, + rowId, colId, rowspan, colspan); + spanningCells.add(spanningCellIdx, spanningCell); + } + + // Add all spanning cells that need to be inserted after this one. + addSpanningCells(); } public String renderCellAsString(WtNode e) { @@ -403,28 +405,25 @@ public class WikitextImporter extends TabularImportingParserBase { } public void visit(WtXmlAttribute e) { - if (currentXmlAttr == null) { - xmlAttrStringBuilder = new StringBuilder(); - iterate(e); - try { - if ("colspan".equals(currentXmlAttr)) { - colspan = Integer.parseInt(xmlAttrStringBuilder.toString()); - } else if ("rowspan".equals(currentXmlAttr)) { - rowspan = Integer.parseInt(xmlAttrStringBuilder.toString()); - } else if ("name".equals(currentXmlAttr)) { - currentReferenceName = xmlAttrStringBuilder.toString(); - } - } catch (NumberFormatException nfe) { + xmlAttrStringBuilder = new StringBuilder(); + iterate(e); + try { + if ("colspan".equals(currentXmlAttr)) { + colspan = Integer.parseInt(xmlAttrStringBuilder.toString()); + } else if ("rowspan".equals(currentXmlAttr)) { + rowspan = Integer.parseInt(xmlAttrStringBuilder.toString()); + } else if ("name".equals(currentXmlAttr)) { + currentReferenceName = xmlAttrStringBuilder.toString(); } - currentXmlAttr = null; - xmlAttrStringBuilder = null; + } catch (NumberFormatException nfe) { } + currentXmlAttr = null; + xmlAttrStringBuilder = null; } public void visit(WtName e) { try { - currentXmlAttr = e.getAsString(); - + currentXmlAttr = e.getAsString(); } catch (UnsupportedOperationException soe) { currentXmlAttr = null; } @@ -507,6 +506,14 @@ public class WikitextImporter extends TabularImportingParserBase { iterate(e.getValue()); } + public void visit(WtImageLink e) { + if(includeRawTemplates) { + writeText("[["); + writeText(e.getTarget().getAsString()); + writeText("]]"); + } + } + /* Content blocks */ public void visit(WtParsedWikitextPage e) { @@ -537,7 +544,7 @@ public class WikitextImporter extends TabularImportingParserBase { } public class WikiTableDataReader implements TableDataReader { - private int currentRow = -1; + private int currentRow = 0; private WikitextTableVisitor visitor = null; private List> reconList = null; private List columnReconciled = null; @@ -545,7 +552,7 @@ public class WikitextImporter extends TabularImportingParserBase { public WikiTableDataReader(WikitextTableVisitor visitor, boolean references) { this.visitor = visitor; - currentRow = -1; + currentRow = 0; reconList = null; if (references) { @@ -569,9 +576,7 @@ public class WikitextImporter extends TabularImportingParserBase { List row = null; List origRow = null; List refRow = null; - if (currentRow == -1) { - origRow = this.visitor.header; - } else if(currentRow < this.visitor.rows.size()) { + if(currentRow < this.visitor.rows.size()) { origRow = this.visitor.rows.get(currentRow); refRow = this.visitor.references.get(currentRow); } @@ -583,10 +588,15 @@ public class WikitextImporter extends TabularImportingParserBase { if (currentRow >= 0 && reconList != null) { recon = reconList.get(currentRow).get(i); } - row.add(new Cell(origRow.get(i), recon)); + String value = origRow.get(i); + if (value != null) { + row.add(new Cell(value, recon)); + } else { + row.add(null); + } - // if we should add reference colums… - if (columnReferenced != null && columnReferenced.get(i)) { + // if we should add reference columns… + if (columnReferenced != null && i < columnReferenced.size() && columnReferenced.get(i)) { String refValue = null; // for headers if(currentRow == -1) { @@ -594,7 +604,11 @@ public class WikitextImporter extends TabularImportingParserBase { } else { refValue = refRow.get(i); } - row.add(new Cell(refValue, null)); + if (refValue != null) { + row.add(new Cell(refValue, null)); + } else { + row.add(null); + } } } } @@ -705,8 +719,6 @@ public class WikitextImporter extends TabularImportingParserBase { dataReader.reconcileToQids(wikiUrl, cfg); } - JSONUtilities.safePut(options, "headerLines", 1); - // Set metadata if (vs.caption != null && vs.caption.length() > 0) { metadata.setName(vs.caption); diff --git a/main/tests/server/src/com/google/refine/tests/importers/WikitextImporterTests.java b/main/tests/server/src/com/google/refine/tests/importers/WikitextImporterTests.java index 1686e89dc..a9120820a 100644 --- a/main/tests/server/src/com/google/refine/tests/importers/WikitextImporterTests.java +++ b/main/tests/server/src/com/google/refine/tests/importers/WikitextImporterTests.java @@ -80,10 +80,10 @@ public class WikitextImporterTests extends ImporterTest { + "|-\n" + "|}\n"; try { - prepareOptions(0, true, true, null); - parse(input); + prepareOptions(0, 0, true, true, null); + parse(input); } catch (Exception e) { - Assert.fail("Parsing failed", e); + Assert.fail("Parsing failed", e); } Assert.assertEquals(project.columnModel.columns.size(), 3); Assert.assertEquals(project.rows.size(), 2); @@ -93,6 +93,35 @@ public class WikitextImporterTests extends ImporterTest { Assert.assertEquals(project.rows.get(1).cells.get(2).value, "f"); } + /** + * Issue #1448 + * https://github.com/OpenRefine/OpenRefine/issues/1448 + */ + @Test + public void readTableWithMisplacedHeaders() { + String input = "\n" + + "{|\n" + + "|-\n" + + "| a || b
2 || c \n" + + "|-\n" + + "| d\n" + + "! e\n" + + "| f
\n" + + "|-\n" + + "|}\n"; + try { + prepareOptions(0, 0, true, true, null); + parse(input); + } catch (Exception e) { + Assert.fail("Parsing failed", e); + } + Assert.assertEquals(project.columnModel.columns.size(), 3); + Assert.assertEquals(project.rows.size(), 2); + Assert.assertEquals(project.rows.get(0).cells.size(), 3); + Assert.assertEquals(project.rows.get(1).cells.get(1).value, "e"); + Assert.assertEquals(project.rows.get(1).cells.get(2).value, "f"); + } + @Test public void readTableWithLinks() { // Data credits: Wikipedia contributors, https://de.wikipedia.org/w/index.php?title=Agenturen_der_Europäischen_Union&action=edit @@ -108,10 +137,10 @@ public class WikitextImporterTests extends ImporterTest { +"|}\n"; try { - prepareOptions(0, true, true, "https://de.wikipedia.org/wiki/"); - parse(input); + prepareOptions(0, 0, true, true, "https://de.wikipedia.org/wiki/"); + parse(input); } catch (Exception e) { - Assert.fail("Parsing failed", e); + Assert.fail("Parsing failed", e); } Assert.assertEquals(project.columnModel.columns.size(), 3); Assert.assertEquals(project.rows.size(), 3); @@ -153,10 +182,10 @@ public class WikitextImporterTests extends ImporterTest { +"|}\n"; try { - prepareOptions(-1, true, true, null); - parse(input); + prepareOptions(-1, 1, true, true, null); + parse(input); } catch (Exception e) { - Assert.fail("Parsing failed", e); + Assert.fail("Parsing failed", e); } Assert.assertEquals(project.columnModel.columns.size(), 7); Assert.assertEquals(project.rows.get(0).cells.get(0).value, "Europäisches Zentrum für die Förderung der Berufsbildung"); @@ -186,14 +215,14 @@ public class WikitextImporterTests extends ImporterTest { +"|}\n"; try { - prepareOptions(-1, true, true, null); - parse(input); + prepareOptions(-1, 1, true, true, null); + parse(input); } catch (Exception e) { - Assert.fail("Parsing failed", e); + Assert.fail("Parsing failed", e); } Assert.assertEquals(project.columnModel.columns.size(), 6); - Assert.assertNull(project.rows.get(1).cells.get(2).value); - Assert.assertNull(project.rows.get(1).cells.get(3).value); + Assert.assertNull(project.rows.get(1).cells.get(2)); + Assert.assertNull(project.rows.get(1).cells.get(3)); Assert.assertEquals(project.rows.get(1).cells.get(4).value, "Butter"); } @@ -212,10 +241,10 @@ public class WikitextImporterTests extends ImporterTest { +"|}\n"; try { - prepareOptions(-1, true, true, null); - parse(input); + prepareOptions(-1, 1, true, true, null); + parse(input); } catch (Exception e) { - Assert.fail("Parsing failed", e); + Assert.fail("Parsing failed", e); } Assert.assertEquals(project.columnModel.columns.size(), 5); Assert.assertEquals(project.rows.get(0).cells.get(1).value, "b"); @@ -240,10 +269,10 @@ public class WikitextImporterTests extends ImporterTest { +"|}\n"; try { - prepareOptions(-1, true, true, null); - parse(input); + prepareOptions(-1, 1, true, true, null); + parse(input); } catch (Exception e) { - Assert.fail("Parsing failed", e); + Assert.fail("Parsing failed", e); } Assert.assertEquals(project.columnModel.columns.size(), 5); Assert.assertEquals(project.rows.get(0).cells.get(1).value, "b"); @@ -252,6 +281,34 @@ public class WikitextImporterTests extends ImporterTest { Assert.assertEquals(project.rows.get(1).cells.get(4).value, "http://gnu.org"); Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://microsoft.com/"); } + + /** + * Include templates and image filenames + */ + @Test + public void readTableWithTemplates() { + String input = "\n" + + "{|\n" + + "|-\n" + + "| {{free to read}} || b || c \n" + + "|-\n" + + "| d\n" + + "| [[File:My logo.svg|70px]]\n" + + "| f
\n" + + "|-\n" + + "|}\n"; + try { + prepareOptions(0, 0, true, true, null); + parse(input); + } catch (Exception e) { + Assert.fail("Parsing failed", e); + } + Assert.assertEquals(project.columnModel.columns.size(), 3); + Assert.assertEquals(project.rows.size(), 2); + Assert.assertEquals(project.rows.get(0).cells.size(), 3); + Assert.assertEquals(project.rows.get(0).cells.get(0).value, "{{free to read}}"); + Assert.assertEquals(project.rows.get(1).cells.get(1).value, "[[File:My logo.svg]]"); + } //--helpers-- @@ -260,16 +317,17 @@ public class WikitextImporterTests extends ImporterTest { } private void prepareOptions( - int limit, boolean blankSpanningCells, + int limit, int headerLines, boolean blankSpanningCells, boolean guessValueType, String wikiUrl) { whenGetIntegerOption("limit", options, limit); + whenGetIntegerOption("headerLines", options, headerLines); whenGetBooleanOption("guessCellValueTypes", options, guessValueType); whenGetBooleanOption("blankSpanningCells", options, blankSpanningCells); whenGetBooleanOption("storeBlankCellsAsNulls", options, true); whenGetBooleanOption("parseReferences", options, true); + whenGetBooleanOption("includeRawTemplates", options, true); whenGetStringOption("wikiUrl", options, wikiUrl); - whenGetIntegerOption("headerLines", options, 1); whenGetStringOption("reconService", options, "https://tools.wmflabs.org/openrefine-wikidata/en/api"); } } diff --git a/main/webapp/modules/core/langs/translation-en.json b/main/webapp/modules/core/langs/translation-en.json index 98047ee75..c2b43aa1a 100644 --- a/main/webapp/modules/core/langs/translation-en.json +++ b/main/webapp/modules/core/langs/translation-en.json @@ -138,7 +138,7 @@ "store-blank": "Store blank rows", "store-nulls": "Store blank cells as nulls", "blank-spanning-cells": "Pad cells spanning over multiple rows or columns with nulls", - "include-raw-templates": "Include templates as raw wikicode", + "include-raw-templates": "Include templates and images as raw wikicode", "parse-references": "Extract references in additional columns", "wiki-base-url": "Reconcile to wiki with base URL:", "invalid-wikitext": "No table could be parsed. Are you sure this is a valid wiki table?", diff --git a/main/webapp/modules/core/langs/translation-fr.json b/main/webapp/modules/core/langs/translation-fr.json index 6a1df732b..beb2b750b 100644 --- a/main/webapp/modules/core/langs/translation-fr.json +++ b/main/webapp/modules/core/langs/translation-fr.json @@ -133,7 +133,7 @@ "store-nulls": "Analyser les cellules vides comme nulles", "lines-into-row": "lignes comme une seule", "custom": "autre", - "include-raw-templates": "Inclure les modèles an tant que wikicode brut", + "include-raw-templates": "Inclure les modèles et images comme du wikicode brut", "quotation-mark": "Des guillemets sont utilisés
pour délimiter les cellules qui contiennent
des séparateurs de colonne", "invalid-wikitext": "Aucun tableau n'a pu être extrait. Êtes-vous sûr·e que c'est un wiki-tableau valide ?", "json-parser": "Cliquer sur le premier nœud JSON { } correspondant à la première ligne à charger.", diff --git a/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.html b/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.html index 5c4191a00..f4330e275 100644 --- a/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.html +++ b/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.html @@ -3,6 +3,11 @@ + + + + + diff --git a/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.js b/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.js index 2a7081d29..bc53fb1bf 100644 --- a/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.js +++ b/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.js @@ -85,6 +85,12 @@ Refine.WikitextParserUI.prototype.getOptions = function() { } else { options.limit = -1; } + if (this._optionContainerElmts.headerLinesCheckbox[0].checked) { + options.headerLines = parseIntDefault(this._optionContainerElmts.headerLinesInput[0].value, 1); + } else { + options.headerLines = -1; + } + options.storeBlankRows = this._optionContainerElmts.storeBlankRowsCheckbox[0].checked; options.blankSpanningCells = this._optionContainerElmts.blankSpanningCellsCheckbox[0].checked; options.includeRawTemplates = this._optionContainerElmts.includeRawTemplatesCheckbox[0].checked; @@ -111,6 +117,8 @@ Refine.WikitextParserUI.prototype._initialize = function() { this._optionContainerElmts.previewButton.html($.i18n._('core-buttons')["update-preview"]); $('#or-import-wiki-base-url').text($.i18n._('core-index-parser')["wiki-base-url"]); + $('#or-import-parse').text($.i18n._('core-index-parser')["parse-next"]); + $('#or-import-header').text($.i18n._('core-index-parser')["lines-header"]); $('#or-import-load').text($.i18n._('core-index-parser')["load-at-most"]); $('#or-import-rows2').text($.i18n._('core-index-parser')["rows-data"]); $('#or-import-parseCell').html($.i18n._('core-index-parser')["parse-cell"]); @@ -142,6 +150,11 @@ Refine.WikitextParserUI.prototype._initialize = function() { this._optionContainerElmts.limitInput[0].value = this._config.limit.toString(); } + if (this._config.headerLines > 0) { + this._optionContainerElmts.headerLinesCheckbox.prop("checked", true); + this._optionContainerElmts.headerLinesInput[0].value = this._config.headerLines.toString(); + } + if (this._config.blankSpanningCells) { this._optionContainerElmts.blankSpanningCellsCheckbox.prop("checked", true); }