Merge pull request #1450 from OpenRefine/issue1448

Make Wikitext importer more robust
This commit is contained in:
Owen Stephens 2018-02-07 17:41:23 +00:00 committed by GitHub
commit 0d04a25cf1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 201 additions and 113 deletions

View File

@ -29,6 +29,7 @@ import org.sweble.wikitext.parser.nodes.WtTemplateArguments;
import org.sweble.wikitext.parser.nodes.WtText; import org.sweble.wikitext.parser.nodes.WtText;
import org.sweble.wikitext.parser.nodes.WtInternalLink; import org.sweble.wikitext.parser.nodes.WtInternalLink;
import org.sweble.wikitext.parser.nodes.WtExternalLink; import org.sweble.wikitext.parser.nodes.WtExternalLink;
import org.sweble.wikitext.parser.nodes.WtImageLink;
import org.sweble.wikitext.parser.nodes.WtLinkTitle; import org.sweble.wikitext.parser.nodes.WtLinkTitle;
import org.sweble.wikitext.parser.nodes.WtLinkTitle.WtNoLinkTitle; import org.sweble.wikitext.parser.nodes.WtLinkTitle.WtNoLinkTitle;
import org.sweble.wikitext.parser.nodes.WtUrl; import org.sweble.wikitext.parser.nodes.WtUrl;
@ -129,7 +130,6 @@ public class WikitextImporter extends TabularImportingParserBase {
public class WikitextTableVisitor extends AstVisitor<WtNode> { public class WikitextTableVisitor extends AstVisitor<WtNode> {
public String caption; public String caption;
public List<String> header;
public List<List<String>> rows; public List<List<String>> rows;
public List<List<String>> references; public List<List<String>> references;
public List<WikilinkedCell> wikilinkedCells; public List<WikilinkedCell> wikilinkedCells;
@ -163,7 +163,6 @@ public class WikitextImporter extends TabularImportingParserBase {
this.blankSpanningCells = blankSpanningCells; this.blankSpanningCells = blankSpanningCells;
this.includeRawTemplates = includeRawTemplates; this.includeRawTemplates = includeRawTemplates;
caption = null; caption = null;
header = new ArrayList<String>();
rows = new ArrayList<List<String>>(); rows = new ArrayList<List<String>>();
references = new ArrayList<List<String>>(); references = new ArrayList<List<String>>();
wikilinkedCells = new ArrayList<WikilinkedCell>(); wikilinkedCells = new ArrayList<WikilinkedCell>();
@ -178,7 +177,7 @@ public class WikitextImporter extends TabularImportingParserBase {
currentReferenceName = null; currentReferenceName = null;
colspan = 0; colspan = 0;
rowspan = 0; rowspan = 0;
rowId = -1; rowId = 0;
spanningCellIdx = 0; spanningCellIdx = 0;
internalLinksInCell = new ArrayList<String>(); internalLinksInCell = new ArrayList<String>();
namedReferences = new HashMap<String, String>(); namedReferences = new HashMap<String, String>();
@ -202,33 +201,28 @@ public class WikitextImporter extends TabularImportingParserBase {
iterate(e); iterate(e);
} }
public void visit(WtTableHeader e) {
String columnName = renderCellAsString(e);
header.add(columnName);
// For the header, we ignore rowspan and manually add cells for colspan
if (colspan > 1) {
for (int i = 0; i < colspan-1; i++) {
header.add(columnName);
}
}
}
public void visit(WtTableCaption e) { public void visit(WtTableCaption e) {
caption = renderCellAsString(e); caption = renderCellAsString(e);
} }
public void visit(WtTableRow e) public void visit(WtTableRow e)
{ {
if (currentRow == null) { if (currentRow != null) {
if (rowId == -1) { finishRow();
// no header was found, start on the first row
rowId = 0;
} }
startRow();
iterate(e);
finishRow();
}
private void startRow() {
currentRow = new ArrayList<String>(); currentRow = new ArrayList<String>();
currentRowReferences = new ArrayList<String>(); currentRowReferences = new ArrayList<String>();
spanningCellIdx = 0; spanningCellIdx = 0;
addSpanningCells(); addSpanningCells();
iterate(e); }
private void finishRow() {
if(currentRow.size() > 0) { if(currentRow.size() > 0) {
rows.add(currentRow); rows.add(currentRow);
references.add(currentRowReferences); references.add(currentRowReferences);
@ -236,11 +230,20 @@ public class WikitextImporter extends TabularImportingParserBase {
} }
currentRow = null; currentRow = null;
} }
}
public void visit(WtTableCell e) public void visit(WtTableCell e)
{ {
if (currentRow != null) { addCell(e);
}
public void visit(WtTableHeader e) {
addCell(e);
}
public void addCell(WtNode e) {
if (currentRow == null) {
startRow();
}
rowspan = 1; rowspan = 1;
colspan = 1; colspan = 1;
internalLinksInCell.clear(); internalLinksInCell.clear();
@ -273,7 +276,6 @@ public class WikitextImporter extends TabularImportingParserBase {
// Add all spanning cells that need to be inserted after this one. // Add all spanning cells that need to be inserted after this one.
addSpanningCells(); addSpanningCells();
} }
}
public String renderCellAsString(WtNode e) { public String renderCellAsString(WtNode e) {
cellStringBuilder = new StringBuilder(); cellStringBuilder = new StringBuilder();
@ -403,7 +405,6 @@ public class WikitextImporter extends TabularImportingParserBase {
} }
public void visit(WtXmlAttribute e) { public void visit(WtXmlAttribute e) {
if (currentXmlAttr == null) {
xmlAttrStringBuilder = new StringBuilder(); xmlAttrStringBuilder = new StringBuilder();
iterate(e); iterate(e);
try { try {
@ -419,12 +420,10 @@ public class WikitextImporter extends TabularImportingParserBase {
currentXmlAttr = null; currentXmlAttr = null;
xmlAttrStringBuilder = null; xmlAttrStringBuilder = null;
} }
}
public void visit(WtName e) { public void visit(WtName e) {
try { try {
currentXmlAttr = e.getAsString(); currentXmlAttr = e.getAsString();
} catch (UnsupportedOperationException soe) { } catch (UnsupportedOperationException soe) {
currentXmlAttr = null; currentXmlAttr = null;
} }
@ -507,6 +506,14 @@ public class WikitextImporter extends TabularImportingParserBase {
iterate(e.getValue()); iterate(e.getValue());
} }
public void visit(WtImageLink e) {
if(includeRawTemplates) {
writeText("[[");
writeText(e.getTarget().getAsString());
writeText("]]");
}
}
/* Content blocks */ /* Content blocks */
public void visit(WtParsedWikitextPage e) { public void visit(WtParsedWikitextPage e) {
@ -537,7 +544,7 @@ public class WikitextImporter extends TabularImportingParserBase {
} }
public class WikiTableDataReader implements TableDataReader { public class WikiTableDataReader implements TableDataReader {
private int currentRow = -1; private int currentRow = 0;
private WikitextTableVisitor visitor = null; private WikitextTableVisitor visitor = null;
private List<List<Recon>> reconList = null; private List<List<Recon>> reconList = null;
private List<Boolean> columnReconciled = null; private List<Boolean> columnReconciled = null;
@ -545,7 +552,7 @@ public class WikitextImporter extends TabularImportingParserBase {
public WikiTableDataReader(WikitextTableVisitor visitor, boolean references) { public WikiTableDataReader(WikitextTableVisitor visitor, boolean references) {
this.visitor = visitor; this.visitor = visitor;
currentRow = -1; currentRow = 0;
reconList = null; reconList = null;
if (references) { if (references) {
@ -569,9 +576,7 @@ public class WikitextImporter extends TabularImportingParserBase {
List<Object> row = null; List<Object> row = null;
List<String> origRow = null; List<String> origRow = null;
List<String> refRow = null; List<String> refRow = null;
if (currentRow == -1) { if(currentRow < this.visitor.rows.size()) {
origRow = this.visitor.header;
} else if(currentRow < this.visitor.rows.size()) {
origRow = this.visitor.rows.get(currentRow); origRow = this.visitor.rows.get(currentRow);
refRow = this.visitor.references.get(currentRow); refRow = this.visitor.references.get(currentRow);
} }
@ -583,10 +588,15 @@ public class WikitextImporter extends TabularImportingParserBase {
if (currentRow >= 0 && reconList != null) { if (currentRow >= 0 && reconList != null) {
recon = reconList.get(currentRow).get(i); recon = reconList.get(currentRow).get(i);
} }
row.add(new Cell(origRow.get(i), recon)); String value = origRow.get(i);
if (value != null) {
row.add(new Cell(value, recon));
} else {
row.add(null);
}
// if we should add reference colums // if we should add reference columns
if (columnReferenced != null && columnReferenced.get(i)) { if (columnReferenced != null && i < columnReferenced.size() && columnReferenced.get(i)) {
String refValue = null; String refValue = null;
// for headers // for headers
if(currentRow == -1) { if(currentRow == -1) {
@ -594,7 +604,11 @@ public class WikitextImporter extends TabularImportingParserBase {
} else { } else {
refValue = refRow.get(i); refValue = refRow.get(i);
} }
if (refValue != null) {
row.add(new Cell(refValue, null)); row.add(new Cell(refValue, null));
} else {
row.add(null);
}
} }
} }
} }
@ -705,8 +719,6 @@ public class WikitextImporter extends TabularImportingParserBase {
dataReader.reconcileToQids(wikiUrl, cfg); dataReader.reconcileToQids(wikiUrl, cfg);
} }
JSONUtilities.safePut(options, "headerLines", 1);
// Set metadata // Set metadata
if (vs.caption != null && vs.caption.length() > 0) { if (vs.caption != null && vs.caption.length() > 0) {
metadata.setName(vs.caption); metadata.setName(vs.caption);

View File

@ -80,7 +80,7 @@ public class WikitextImporterTests extends ImporterTest {
+ "|-\n" + "|-\n"
+ "|}\n"; + "|}\n";
try { try {
prepareOptions(0, true, true, null); prepareOptions(0, 0, true, true, null);
parse(input); parse(input);
} catch (Exception e) { } catch (Exception e) {
Assert.fail("Parsing failed", e); Assert.fail("Parsing failed", e);
@ -93,6 +93,35 @@ public class WikitextImporterTests extends ImporterTest {
Assert.assertEquals(project.rows.get(1).cells.get(2).value, "f"); Assert.assertEquals(project.rows.get(1).cells.get(2).value, "f");
} }
/**
* Issue #1448
* https://github.com/OpenRefine/OpenRefine/issues/1448
*/
@Test
public void readTableWithMisplacedHeaders() {
String input = "\n"
+ "{|\n"
+ "|-\n"
+ "| a || b<br/>2 || c \n"
+ "|-\n"
+ "| d\n"
+ "! e\n"
+ "| f<br>\n"
+ "|-\n"
+ "|}\n";
try {
prepareOptions(0, 0, true, true, null);
parse(input);
} catch (Exception e) {
Assert.fail("Parsing failed", e);
}
Assert.assertEquals(project.columnModel.columns.size(), 3);
Assert.assertEquals(project.rows.size(), 2);
Assert.assertEquals(project.rows.get(0).cells.size(), 3);
Assert.assertEquals(project.rows.get(1).cells.get(1).value, "e");
Assert.assertEquals(project.rows.get(1).cells.get(2).value, "f");
}
@Test @Test
public void readTableWithLinks() { public void readTableWithLinks() {
// Data credits: Wikipedia contributors, https://de.wikipedia.org/w/index.php?title=Agenturen_der_Europäischen_Union&action=edit // Data credits: Wikipedia contributors, https://de.wikipedia.org/w/index.php?title=Agenturen_der_Europäischen_Union&action=edit
@ -108,7 +137,7 @@ public class WikitextImporterTests extends ImporterTest {
+"|}\n"; +"|}\n";
try { try {
prepareOptions(0, true, true, "https://de.wikipedia.org/wiki/"); prepareOptions(0, 0, true, true, "https://de.wikipedia.org/wiki/");
parse(input); parse(input);
} catch (Exception e) { } catch (Exception e) {
Assert.fail("Parsing failed", e); Assert.fail("Parsing failed", e);
@ -153,7 +182,7 @@ public class WikitextImporterTests extends ImporterTest {
+"|}\n"; +"|}\n";
try { try {
prepareOptions(-1, true, true, null); prepareOptions(-1, 1, true, true, null);
parse(input); parse(input);
} catch (Exception e) { } catch (Exception e) {
Assert.fail("Parsing failed", e); Assert.fail("Parsing failed", e);
@ -186,14 +215,14 @@ public class WikitextImporterTests extends ImporterTest {
+"|}\n"; +"|}\n";
try { try {
prepareOptions(-1, true, true, null); prepareOptions(-1, 1, true, true, null);
parse(input); parse(input);
} catch (Exception e) { } catch (Exception e) {
Assert.fail("Parsing failed", e); Assert.fail("Parsing failed", e);
} }
Assert.assertEquals(project.columnModel.columns.size(), 6); Assert.assertEquals(project.columnModel.columns.size(), 6);
Assert.assertNull(project.rows.get(1).cells.get(2).value); Assert.assertNull(project.rows.get(1).cells.get(2));
Assert.assertNull(project.rows.get(1).cells.get(3).value); Assert.assertNull(project.rows.get(1).cells.get(3));
Assert.assertEquals(project.rows.get(1).cells.get(4).value, "Butter"); Assert.assertEquals(project.rows.get(1).cells.get(4).value, "Butter");
} }
@ -212,7 +241,7 @@ public class WikitextImporterTests extends ImporterTest {
+"|}\n"; +"|}\n";
try { try {
prepareOptions(-1, true, true, null); prepareOptions(-1, 1, true, true, null);
parse(input); parse(input);
} catch (Exception e) { } catch (Exception e) {
Assert.fail("Parsing failed", e); Assert.fail("Parsing failed", e);
@ -240,7 +269,7 @@ public class WikitextImporterTests extends ImporterTest {
+"|}\n"; +"|}\n";
try { try {
prepareOptions(-1, true, true, null); prepareOptions(-1, 1, true, true, null);
parse(input); parse(input);
} catch (Exception e) { } catch (Exception e) {
Assert.fail("Parsing failed", e); Assert.fail("Parsing failed", e);
@ -253,6 +282,34 @@ public class WikitextImporterTests extends ImporterTest {
Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://microsoft.com/"); Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://microsoft.com/");
} }
/**
* Include templates and image filenames
*/
@Test
public void readTableWithTemplates() {
String input = "\n"
+ "{|\n"
+ "|-\n"
+ "| {{free to read}} || b || c \n"
+ "|-\n"
+ "| d\n"
+ "| [[File:My logo.svg|70px]]\n"
+ "| f<br>\n"
+ "|-\n"
+ "|}\n";
try {
prepareOptions(0, 0, true, true, null);
parse(input);
} catch (Exception e) {
Assert.fail("Parsing failed", e);
}
Assert.assertEquals(project.columnModel.columns.size(), 3);
Assert.assertEquals(project.rows.size(), 2);
Assert.assertEquals(project.rows.get(0).cells.size(), 3);
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "{{free to read}}");
Assert.assertEquals(project.rows.get(1).cells.get(1).value, "[[File:My logo.svg]]");
}
//--helpers-- //--helpers--
private void parse(String wikitext) { private void parse(String wikitext) {
@ -260,16 +317,17 @@ public class WikitextImporterTests extends ImporterTest {
} }
private void prepareOptions( private void prepareOptions(
int limit, boolean blankSpanningCells, int limit, int headerLines, boolean blankSpanningCells,
boolean guessValueType, String wikiUrl) { boolean guessValueType, String wikiUrl) {
whenGetIntegerOption("limit", options, limit); whenGetIntegerOption("limit", options, limit);
whenGetIntegerOption("headerLines", options, headerLines);
whenGetBooleanOption("guessCellValueTypes", options, guessValueType); whenGetBooleanOption("guessCellValueTypes", options, guessValueType);
whenGetBooleanOption("blankSpanningCells", options, blankSpanningCells); whenGetBooleanOption("blankSpanningCells", options, blankSpanningCells);
whenGetBooleanOption("storeBlankCellsAsNulls", options, true); whenGetBooleanOption("storeBlankCellsAsNulls", options, true);
whenGetBooleanOption("parseReferences", options, true); whenGetBooleanOption("parseReferences", options, true);
whenGetBooleanOption("includeRawTemplates", options, true);
whenGetStringOption("wikiUrl", options, wikiUrl); whenGetStringOption("wikiUrl", options, wikiUrl);
whenGetIntegerOption("headerLines", options, 1);
whenGetStringOption("reconService", options, "https://tools.wmflabs.org/openrefine-wikidata/en/api"); whenGetStringOption("reconService", options, "https://tools.wmflabs.org/openrefine-wikidata/en/api");
} }
} }

View File

@ -138,7 +138,7 @@
"store-blank": "Store blank rows", "store-blank": "Store blank rows",
"store-nulls": "Store blank cells as nulls", "store-nulls": "Store blank cells as nulls",
"blank-spanning-cells": "Pad cells spanning over multiple rows or columns with nulls", "blank-spanning-cells": "Pad cells spanning over multiple rows or columns with nulls",
"include-raw-templates": "Include templates as raw wikicode", "include-raw-templates": "Include templates and images as raw wikicode",
"parse-references": "Extract references in additional columns", "parse-references": "Extract references in additional columns",
"wiki-base-url": "Reconcile to wiki with base URL:", "wiki-base-url": "Reconcile to wiki with base URL:",
"invalid-wikitext": "No table could be parsed. Are you sure this is a valid wiki table?", "invalid-wikitext": "No table could be parsed. Are you sure this is a valid wiki table?",

View File

@ -133,7 +133,7 @@
"store-nulls": "Analyser les cellules vides comme nulles", "store-nulls": "Analyser les cellules vides comme nulles",
"lines-into-row": "lignes comme une seule", "lines-into-row": "lignes comme une seule",
"custom": "autre", "custom": "autre",
"include-raw-templates": "Inclure les modèles an tant que wikicode brut", "include-raw-templates": "Inclure les modèles et images comme du wikicode brut",
"quotation-mark": "Des guillemets sont utilisés<br/>pour délimiter les cellules qui contiennent<br/>des séparateurs de colonne", "quotation-mark": "Des guillemets sont utilisés<br/>pour délimiter les cellules qui contiennent<br/>des séparateurs de colonne",
"invalid-wikitext": "Aucun tableau n'a pu être extrait. Êtes-vous sûr·e que c'est un wiki-tableau valide ?", "invalid-wikitext": "Aucun tableau n'a pu être extrait. Êtes-vous sûr·e que c'est un wiki-tableau valide ?",
"json-parser": "Cliquer sur le premier nœud JSON { } correspondant à la première ligne à charger.", "json-parser": "Cliquer sur le premier nœud JSON { } correspondant à la première ligne à charger.",

View File

@ -3,6 +3,11 @@
<tr><td width="1%"><input type="checkbox" bind="wikiCheckbox" id="$reconcileWiki" /></td><td><label for="$reconcileWiki" id="or-import-wiki-base-url"></label> <tr><td width="1%"><input type="checkbox" bind="wikiCheckbox" id="$reconcileWiki" /></td><td><label for="$reconcileWiki" id="or-import-wiki-base-url"></label>
<input bind="wikiUrlInput" type="text" class="lightweight" size="30" id="$wikiUrl" /></td></tr> <input bind="wikiUrlInput" type="text" class="lightweight" size="30" id="$wikiUrl" /></td></tr>
<tr><td width="1%"><input type="checkbox" bind="headerLinesCheckbox" id="$headers" /></td>
<td><label for="$headers" id="or-import-parse"></label>
<input bind="headerLinesInput" type="text" class="lightweight" size="2" value="1" />
<label for="$headers" id="or-import-header"></label></td></tr>
<tr><td width="1%"><input type="checkbox" bind="limitCheckbox" id="$limit" /></td> <tr><td width="1%"><input type="checkbox" bind="limitCheckbox" id="$limit" /></td>
<td><label for="$limit" id="or-import-load"></label> <td><label for="$limit" id="or-import-load"></label>
<input bind="limitInput" type="text" class="lightweight" size="2" value="0" /> <input bind="limitInput" type="text" class="lightweight" size="2" value="0" />

View File

@ -85,6 +85,12 @@ Refine.WikitextParserUI.prototype.getOptions = function() {
} else { } else {
options.limit = -1; options.limit = -1;
} }
if (this._optionContainerElmts.headerLinesCheckbox[0].checked) {
options.headerLines = parseIntDefault(this._optionContainerElmts.headerLinesInput[0].value, 1);
} else {
options.headerLines = -1;
}
options.storeBlankRows = this._optionContainerElmts.storeBlankRowsCheckbox[0].checked; options.storeBlankRows = this._optionContainerElmts.storeBlankRowsCheckbox[0].checked;
options.blankSpanningCells = this._optionContainerElmts.blankSpanningCellsCheckbox[0].checked; options.blankSpanningCells = this._optionContainerElmts.blankSpanningCellsCheckbox[0].checked;
options.includeRawTemplates = this._optionContainerElmts.includeRawTemplatesCheckbox[0].checked; options.includeRawTemplates = this._optionContainerElmts.includeRawTemplatesCheckbox[0].checked;
@ -111,6 +117,8 @@ Refine.WikitextParserUI.prototype._initialize = function() {
this._optionContainerElmts.previewButton.html($.i18n._('core-buttons')["update-preview"]); this._optionContainerElmts.previewButton.html($.i18n._('core-buttons')["update-preview"]);
$('#or-import-wiki-base-url').text($.i18n._('core-index-parser')["wiki-base-url"]); $('#or-import-wiki-base-url').text($.i18n._('core-index-parser')["wiki-base-url"]);
$('#or-import-parse').text($.i18n._('core-index-parser')["parse-next"]);
$('#or-import-header').text($.i18n._('core-index-parser')["lines-header"]);
$('#or-import-load').text($.i18n._('core-index-parser')["load-at-most"]); $('#or-import-load').text($.i18n._('core-index-parser')["load-at-most"]);
$('#or-import-rows2').text($.i18n._('core-index-parser')["rows-data"]); $('#or-import-rows2').text($.i18n._('core-index-parser')["rows-data"]);
$('#or-import-parseCell').html($.i18n._('core-index-parser')["parse-cell"]); $('#or-import-parseCell').html($.i18n._('core-index-parser')["parse-cell"]);
@ -142,6 +150,11 @@ Refine.WikitextParserUI.prototype._initialize = function() {
this._optionContainerElmts.limitInput[0].value = this._config.limit.toString(); this._optionContainerElmts.limitInput[0].value = this._config.limit.toString();
} }
if (this._config.headerLines > 0) {
this._optionContainerElmts.headerLinesCheckbox.prop("checked", true);
this._optionContainerElmts.headerLinesInput[0].value = this._config.headerLines.toString();
}
if (this._config.blankSpanningCells) { if (this._config.blankSpanningCells) {
this._optionContainerElmts.blankSpanningCellsCheckbox.prop("checked", true); this._optionContainerElmts.blankSpanningCellsCheckbox.prop("checked", true);
} }