Add support for colspan and rowspan in Wikitext

This commit is contained in:
Antonin Delpeuch 2017-08-15 11:28:43 +01:00
parent 73f7fdc036
commit aa4517ba58
6 changed files with 229 additions and 93 deletions

View File

@ -25,6 +25,11 @@ import org.sweble.wikitext.parser.nodes.WtTable;
import org.sweble.wikitext.parser.nodes.WtTableHeader;
import org.sweble.wikitext.parser.nodes.WtTableRow;
import org.sweble.wikitext.parser.nodes.WtTableCell;
import org.sweble.wikitext.parser.nodes.WtTableCaption;
import org.sweble.wikitext.parser.nodes.WtXmlAttributes;
import org.sweble.wikitext.parser.nodes.WtXmlAttribute;
import org.sweble.wikitext.parser.nodes.WtName;
import org.sweble.wikitext.parser.nodes.WtValue;
import org.sweble.wikitext.parser.nodes.WtParsedWikitextPage;
import org.sweble.wikitext.parser.nodes.WtBody;
@ -58,25 +63,61 @@ public class WikitextImporter extends TabularImportingParserBase {
JSONObject options = super.createParserUIInitializationData(job, fileRecords, format);
JSONUtilities.safePut(options, "guessCellValueTypes", false);
JSONUtilities.safePut(options, "blankSpanningCells", true);
return options;
}
private class SpanningCell {
public String value;
public int colspan;
public int rowspan;
public int row;
public int col;
SpanningCell(String value, int row, int col, int rowspan, int colspan) {
this.value = value;
this.row = row;
this.col = col;
this.rowspan = rowspan;
this.colspan = colspan;
}
}
public class WikitextTableVisitor extends AstVisitor<WtNode> {
public String caption;
public List<String> header;
public List<List<String>> rows;
private List<String> currentRow;
private StringBuilder currentStringBuilder;
private boolean blankSpanningCells;
private int rowId;
private List<SpanningCell> spanningCells;
private StringBuilder cellStringBuilder;
private StringBuilder xmlAttrStringBuilder;
private String currentXmlAttr;
private String currentInternalLink;
private String currentExternalLink;
private int colspan;
private int rowspan;
private int spanningCellIdx;
public WikitextTableVisitor() {
public WikitextTableVisitor(boolean blankSpanningCells) {
this.blankSpanningCells = blankSpanningCells;
caption = null;
header = new ArrayList<String>();
rows = new ArrayList<List<String>>();
currentStringBuilder = null;
spanningCells = new ArrayList<SpanningCell>();
cellStringBuilder = null;
xmlAttrStringBuilder = null;
currentInternalLink = null;
currentExternalLink = null;
colspan = 0;
rowspan = 0;
rowId = -1;
spanningCellIdx = 0;
}
@Override
@ -86,6 +127,7 @@ public class WikitextImporter extends TabularImportingParserBase {
/**
 * Catch-all overload for generic nodes. It intentionally does nothing, so
 * nodes that reach it contribute nothing to the extracted table.
 */
public void visit(WtNode e) {
// Ignore other nodes
// Debugging aid: uncomment the next line to print the name of each ignored node.
// System.out.println(e.getNodeName());
}
public void visit(WtParsedWikitextPage e) {
@ -101,16 +143,34 @@ public class WikitextImporter extends TabularImportingParserBase {
}
public void visit(WtTableHeader e) {
header.add(renderAsString(e));
String columnName = renderCellAsString(e);
header.add(columnName);
// For the header, we ignore rowspan and manually add cells for colspan
if (colspan > 1) {
for (int i = 0; i < colspan-1; i++) {
header.add(columnName);
}
}
}
/**
 * Stores the rendered text of the table caption in {@code caption}.
 */
public void visit(WtTableCaption e) {
    final String renderedCaption = renderCellAsString(e);
    this.caption = renderedCaption;
}
public void visit(WtTableRow e)
{
if (currentRow == null) {
if (rowId == -1) {
// no header was found, start on the first row
rowId = 0;
}
currentRow = new ArrayList<String>();
spanningCellIdx = 0;
addSpanningCells();
iterate(e);
if(currentRow.size() > 0) {
rows.add(currentRow);
rowId++;
}
currentRow = null;
}
@ -119,30 +179,104 @@ public class WikitextImporter extends TabularImportingParserBase {
public void visit(WtTableCell e)
{
if (currentRow != null) {
currentRow.add(renderAsString(e));
rowspan = 1;
colspan = 1;
String value = renderCellAsString(e);
currentRow.add(value);
if (colspan > 1 || rowspan > 1) {
SpanningCell spanningCell = new SpanningCell(
value, rowId, currentRow.size()-1, rowspan, colspan);
spanningCells.add(spanningCellIdx, spanningCell);
}
addSpanningCells();
}
}
public String renderAsString(WtNode e) {
currentStringBuilder = new StringBuilder();
/**
 * Returns the pending spanning cell that {@code spanningCellIdx} currently
 * points at. The caller must make sure the index is within bounds.
 */
private SpanningCell spanningCell() {
    final int cursor = spanningCellIdx;
    return spanningCells.get(cursor);
}
/**
 * Pads the row being built with filler cells for every pending spanning
 * cell whose first column has been reached.
 *
 * Pending cells are consumed in list order starting at
 * {@code spanningCellIdx}. The filler is {@code null} when
 * {@code blankSpanningCells} is set, otherwise the text of the spanning
 * cell. A pending cell is dropped once the current row is the last one it
 * covers; otherwise the cursor moves past it so that it is kept for the
 * following rows.
 */
private void addSpanningCells() {
    while (spanningCellIdx < spanningCells.size()) {
        SpanningCell pending = spanningCells.get(spanningCellIdx);
        if (currentRow.size() < pending.col) {
            // The row has not grown up to this cell's first column yet.
            break;
        }

        // Index of the first row that is no longer covered by the cell.
        int endRow = pending.row + pending.rowspan;
        int nextRow = rowId + 1;

        if (endRow >= nextRow) {
            // The cell still covers the current row: fill up to its last column.
            int targetSize = pending.col + pending.colspan;
            String filler = blankSpanningCells ? null : pending.value;
            while (currentRow.size() < targetSize) {
                currentRow.add(filler);
            }
        }

        if (endRow <= nextRow) {
            // Fully represented: forget it. The cursor now points at the next one.
            spanningCells.remove(spanningCellIdx);
        } else {
            spanningCellIdx++;
        }
    }
}
public void visit(WtXmlAttributes e) {
iterate(e);
String value = currentStringBuilder.toString().trim();
currentStringBuilder = null;
}
public void visit(WtXmlAttribute e) {
if (currentXmlAttr == null) {
xmlAttrStringBuilder = new StringBuilder();
iterate(e);
try {
int attrValue = Integer.parseInt(xmlAttrStringBuilder.toString());
if (currentXmlAttr.equals("colspan")) {
colspan = attrValue;
} else if (currentXmlAttr.equals("rowspan")) {
rowspan = attrValue;
}
} catch (NumberFormatException _) {
;
}
currentXmlAttr = null;
xmlAttrStringBuilder = null;
}
}
/**
 * Records the name of the XML attribute that is currently being read.
 *
 * The name is only stored while an attribute is in progress, that is while
 * {@code xmlAttrStringBuilder} is set. The attribute handler is the only
 * place that clears {@code currentXmlAttr}, and it skips attributes while
 * the field is non-null, so a name stored at any other time would disable
 * attribute parsing for the rest of the document.
 */
public void visit(WtName e) {
    if (xmlAttrStringBuilder == null) {
        return;
    }
    try {
        currentXmlAttr = e.getAsString();
    } catch (UnsupportedOperationException ignored) {
        // NOTE(review): Sweble appears to throw this for names that are not
        // plain text (unresolved names) - confirm against the WtName Javadoc.
        // The empty string matches neither "colspan" nor "rowspan" and still
        // marks the attribute as being in progress.
        currentXmlAttr = "";
    }
}
/**
 * Hands the children of an attribute value to the visitor.
 * NOTE(review): presumably this is how the text of the value reaches the
 * text handler - confirm against AstVisitor.iterate.
 */
public void visit(WtValue e) {
iterate(e);
}
/**
 * Renders the content of a cell, header or caption node as plain text.
 *
 * A fresh builder is installed in {@code cellStringBuilder} while the
 * children of the node are visited; the handlers that append to that field
 * fill it. The builder that was active before the call is put back
 * afterwards (normally {@code null}), so a nested call cannot leave an
 * outer call without a builder, and a failure while visiting does not
 * leave a stale builder behind.
 *
 * @param e the node whose content should be rendered
 * @return the collected text with leading and trailing whitespace removed,
 *         never {@code null}
 */
public String renderCellAsString(WtNode e) {
    StringBuilder previous = cellStringBuilder;
    StringBuilder collected = new StringBuilder();
    cellStringBuilder = collected;
    try {
        iterate(e);
        // StringBuilder.toString() never returns null, so no null check is needed.
        return collected.toString().trim();
    } finally {
        cellStringBuilder = previous;
    }
}
public void visit(WtText text) {
if (currentStringBuilder != null) {
currentStringBuilder.append(text.getContent());
if (xmlAttrStringBuilder != null) {
xmlAttrStringBuilder.append(text.getContent());
} else if (cellStringBuilder != null) {
cellStringBuilder.append(text.getContent());
}
}
public void visit(WtNoLinkTitle e) {
if (currentInternalLink != null) {
currentStringBuilder.append(currentInternalLink);
cellStringBuilder.append(currentInternalLink);
} else if (currentExternalLink != null) {
currentStringBuilder.append(currentExternalLink);
cellStringBuilder.append(currentExternalLink);
}
}
@ -239,13 +373,20 @@ public class WikitextImporter extends TabularImportingParserBase {
parsedArticle = (WtParsedWikitextPage) parser.parseArticle(ppw, title);
// Compile the retrieved page
final WikitextTableVisitor vs = new WikitextTableVisitor();
boolean blankSpanningCells = JSONUtilities.getBoolean(options, "blankSpanningCells", true);
final WikitextTableVisitor vs = new WikitextTableVisitor(blankSpanningCells);
vs.go(parsedArticle);
TableDataReader dataReader = new WikiTableDataReader(vs);
JSONUtilities.safePut(options, "headerLines", 1);
// Set metadata
if (vs.caption != null && vs.caption.length() > 0) {
metadata.setName(vs.caption);
// TODO this does not seem to do anything - maybe we need to pass it to OpenRefine in some other way?
}
TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions);
} catch (IOException e1) {
e1.printStackTrace();

View File

@ -83,7 +83,7 @@ public class WikitextImporterTests extends ImporterTest {
+ "|-\n"
+ "|}\n";
try {
prepareOptions(0, 0, 0, true);
prepareOptions(0, true, true);
parse(input);
} catch (Exception e) {
Assert.fail("Parsing failed", e);
@ -110,7 +110,7 @@ public class WikitextImporterTests extends ImporterTest {
+"|}\n";
try {
prepareOptions(0, 0, 0, true);
prepareOptions(0, true, true);
parse(input);
} catch (Exception e) {
Assert.fail("Parsing failed", e);
@ -146,7 +146,7 @@ public class WikitextImporterTests extends ImporterTest {
+"|}\n";
try {
prepareOptions(-1, 0, -1, true);
prepareOptions(-1, true, true);
parse(input);
} catch (Exception e) {
Assert.fail("Parsing failed", e);
@ -160,6 +160,35 @@ public class WikitextImporterTests extends ImporterTest {
Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://www.eurofound.europa.eu/");
}
/**
 * Imports a table containing colspan and rowspan cells with blanking of
 * spanned cells enabled, then checks the column count and the padding of
 * the second data row.
 */
@Test
public void readTableWithSpanningCells() {
    // inspired from https://www.mediawiki.org/wiki/Help:Tables
    String input = "{| class=\"wikitable\"\n"
        + "!colspan=\"6\"|Shopping List\n"
        + "|-\n"
        + "|Bread & Butter\n"
        + "|Pie\n"
        + "|Buns\n"
        + "|rowspan=\"2\"|Danish\n"
        + "|colspan=\"2\"|Croissant\n"
        + "|-\n"
        + "|Cheese\n"
        + "|colspan=\"2\"|Ice cream\n"
        + "|Butter\n"
        + "|Yogurt\n"
        + "|}\n";

    try {
        prepareOptions(-1, true, true);
        parse(input);
    } catch (Exception e) {
        Assert.fail("Parsing failed", e);
    }

    final int secondRow = 1;
    Assert.assertEquals(project.columnModel.columns.size(), 6);
    // Columns 2 and 3 of the second row are padding for spanning cells.
    Assert.assertNull(project.rows.get(secondRow).cells.get(2));
    Assert.assertNull(project.rows.get(secondRow).cells.get(3));
    Assert.assertEquals(project.rows.get(secondRow).cells.get(4).value, "Butter");
}
//--helpers--
private void parse(String wikitext) {
@ -167,26 +196,22 @@ public class WikitextImporterTests extends ImporterTest {
}
private void prepareOptions(
int limit, int skip, int ignoreLines,
int limit, boolean blankSpanningCells,
boolean guessValueType) {
whenGetIntegerOption("limit", options, limit);
whenGetIntegerOption("skipDataLines", options, skip);
whenGetIntegerOption("ignoreLines", options, ignoreLines);
whenGetIntegerOption("headerLines", options, 1);
whenGetBooleanOption("guessCellValueTypes", options, guessValueType);
whenGetBooleanOption("blankSpanningCells", options, blankSpanningCells);
whenGetBooleanOption("storeBlankCellsAsNulls", options, true);
whenGetIntegerOption("headerLines", options, 1);
}
private void verifyOptions() {
try {
verify(options, times(1)).getString("separator");
verify(options, times(1)).getInt("limit");
verify(options, times(1)).getInt("skipDataLines");
verify(options, times(1)).getInt("ignoreLines");
verify(options, times(1)).getBoolean("guessCellValueTypes");
verify(options, times(1)).getBoolean("processQuotes");
verify(options, times(1)).getBoolean("storeBlankCellsAsNulls");
verify(options, times(1)).getBoolean("blankSpanningCells");
} catch (JSONException e) {
Assert.fail("JSON exception",e);
}

View File

@ -212,7 +212,7 @@ function registerImporting() {
IM.registerFormat("text/xml/rdf", "RDF/XML files", "RdfTriplesParserUI", new Packages.com.google.refine.importers.RdfXmlTripleImporter());
IM.registerFormat("text/json", "JSON files", "JsonParserUI", new Packages.com.google.refine.importers.JsonImporter());
IM.registerFormat("text/marc", "MARC files", "XmlParserUI", new Packages.com.google.refine.importers.MarcImporter());
IM.registerFormat("text/wiki", "Wikitext files", "WikitextParserUI", new Packages.com.google.refine.importers.WikitextImporter());
IM.registerFormat("text/wiki", "Wikitext", "WikitextParserUI", new Packages.com.google.refine.importers.WikitextImporter());
IM.registerFormat("binary", "Binary files"); // generic format, no parser to handle it

View File

@ -113,6 +113,7 @@
"parse-cell": "Parse cell text into<br/>numbers, dates, ...",
"store-blank": "Store blank rows",
"store-nulls": "Store blank cells as nulls",
"blank-spanning-cells": "Pad cells spanning over multiple rows or columns with nulls",
"store-source": "Store file source <br/>(file names, URLs)<br/>in each row",
"preserve-empty": "Preserve empty strings",
"trim": "Trim leading &amp; trailing whitespace from strings",

View File

@ -0,0 +1,25 @@
<div class="grid-layout layout-tightest"><table>
<tr><td colspan="2" id="or-import-colsep"></td></tr>
<tr><td width="1%"><input type="checkbox" bind="limitCheckbox" id="$limit" /></td>
<td><label for="$limit" id="or-import-load"></label></td>
<td><input bind="limitInput" type="text" class="lightweight" size="2" value="0" />
<label for="$limit" id="or-import-rows2"></label></td></tr>
<tr><td width="1%"><input type="checkbox" bind="guessCellValueTypesCheckbox" id="$guess" /></td>
<td><label for="$guess" id="or-import-parseCell"></label></td></tr>
<tr><td width="1%"><input type="checkbox" bind="blankSpanningCellsCheckbox" id="$blank-spanning-cells" /></td>
<td><label for="$blank-spanning-cells" id="or-import-blankSpanningCells"></label></td></tr>
<tr><td width="1%"><input type="checkbox" bind="storeBlankRowsCheckbox" id="$store-blank-rows" /></td>
<td colspan="2"><label for="$store-blank-rows" id="or-import-blank"></label></td></tr>
<tr><td width="1%"><input type="checkbox" bind="storeBlankCellsAsNullsCheckbox" id="$store-blank-cells" /></td>
<td colspan="2"><label for="$store-blank-cells" id="or-import-null"></label></td></tr>
<tr><td width="1%"><input type="checkbox" bind="includeFileSourcesCheckbox" id="$include-file-sources" /></td>
<td><label for="$include-file-sources" id="or-import-source"></label></td></tr>
<tr>
<td style="text-align: right;">&nbsp;</td>
<td width="1%"><button class="button" bind="previewButton"></button></td>
</tr>
</table></div>

View File

@ -62,20 +62,8 @@ Refine.WikitextParserUI.prototype.confirmReadyToCreateProject = function() {
Refine.WikitextParserUI.prototype.getOptions = function() {
var options = {
encoding: $.trim(this._optionContainerElmts.encodingInput[0].value)
};
switch (this._optionContainer.find("input[name='column-separator']:checked")[0].value) {
case 'comma':
options.separator = ",";
break;
case 'tab':
options.separator = "\\t";
break;
default:
options.separator = this._optionContainerElmts.columnSeparatorInput[0].value;
}
var parseIntDefault = function(s, def) {
try {
var n = parseInt(s,10);
@ -87,30 +75,15 @@ Refine.WikitextParserUI.prototype.getOptions = function() {
}
return def;
};
if (this._optionContainerElmts.ignoreCheckbox[0].checked) {
options.ignoreLines = parseIntDefault(this._optionContainerElmts.ignoreInput[0].value, -1);
} else {
options.ignoreLines = -1;
}
if (this._optionContainerElmts.headerLinesCheckbox[0].checked) {
options.headerLines = parseIntDefault(this._optionContainerElmts.headerLinesInput[0].value, 0);
} else {
options.headerLines = 0;
}
if (this._optionContainerElmts.skipCheckbox[0].checked) {
options.skipDataLines = parseIntDefault(this._optionContainerElmts.skipInput[0].value, 0);
} else {
options.skipDataLines = 0;
}
if (this._optionContainerElmts.limitCheckbox[0].checked) {
options.limit = parseIntDefault(this._optionContainerElmts.limitInput[0].value, -1);
} else {
options.limit = -1;
}
options.storeBlankRows = this._optionContainerElmts.storeBlankRowsCheckbox[0].checked;
options.blankSpanningCells = this._optionContainerElmts.blankSpanningCellsCheckbox[0].checked;
options.guessCellValueTypes = this._optionContainerElmts.guessCellValueTypesCheckbox[0].checked;
options.processQuotes = this._optionContainerElmts.processQuoteMarksCheckbox[0].checked;
options.storeBlankCellsAsNulls = this._optionContainerElmts.storeBlankCellsAsNullsCheckbox[0].checked;
options.includeFileSources = this._optionContainerElmts.includeFileSourcesCheckbox[0].checked;
@ -120,35 +93,23 @@ Refine.WikitextParserUI.prototype.getOptions = function() {
Refine.WikitextParserUI.prototype._initialize = function() {
var self = this;
console.log('wikitext ui initialize');
this._optionContainer.unbind().empty().html(
DOM.loadHTML("core", "scripts/index/parser-interfaces/separator-based-parser-ui.html"));
DOM.loadHTML("core", "scripts/index/parser-interfaces/wikitext-parser-ui.html"));
this._optionContainerElmts = DOM.bind(this._optionContainer);
this._optionContainerElmts.previewButton.click(function() { self._updatePreview(); });
this._optionContainerElmts.previewButton.html($.i18n._('core-buttons')["update-preview"]);
$('#or-import-encoding').html($.i18n._('core-index-import')["char-encoding"]);
$('#or-import-colsep').html($.i18n._('core-index-parser')["col-separated-by"]);
$('#or-import-commas').html($.i18n._('core-index-parser')["commas"]);
$('#or-import-tabs').html($.i18n._('core-index-parser')["tabs"]);
$('#or-import-custom').html($.i18n._('core-index-parser')["custom"]);
$('#or-import-escape').html($.i18n._('core-index-parser')["escape"]);
$('#or-import-ignore').text($.i18n._('core-index-parser')["ignore-first"]);
$('#or-import-lines').text($.i18n._('core-index-parser')["lines-beg"]);
$('#or-import-parse').text($.i18n._('core-index-parser')["parse-next"]);
$('#or-import-header').text($.i18n._('core-index-parser')["lines-header"]);
$('#or-import-discard').text($.i18n._('core-index-parser')["discard-initial"]);
$('#or-import-rows').text($.i18n._('core-index-parser')["rows-data"]);
$('#or-import-load').text($.i18n._('core-index-parser')["load-at-most"]);
$('#or-import-rows2').text($.i18n._('core-index-parser')["rows-data"]);
$('#or-import-parseCell').html($.i18n._('core-index-parser')["parse-cell"]);
$('#or-import-quote').html($.i18n._('core-index-parser')["quotation-mark"]);
$('#or-import-blankSpanningCells').text($.i18n._('core-index-parser')["blank-spanning-cells"]);
$('#or-import-blank').text($.i18n._('core-index-parser')["store-blank"]);
$('#or-import-null').text($.i18n._('core-index-parser')["store-nulls"]);
$('#or-import-source').html($.i18n._('core-index-parser')["store-source"]);
/*
this._optionContainerElmts.encodingInput
.attr('value', this._config.encoding || '')
.click(function() {
@ -156,29 +117,18 @@ Refine.WikitextParserUI.prototype._initialize = function() {
self._updatePreview();
});
});
*/
console.log(this._config);
var columnSeparatorValue = (this._config.separator == ",") ? 'comma' :
((this._config.separator == "\\t") ? 'tab' : 'custom');
this._optionContainer.find(
"input[name='column-separator'][value='" + columnSeparatorValue + "']").prop("checked", true);
this._optionContainerElmts.columnSeparatorInput[0].value = this._config.separator;
if (this._config.ignoreLines > 0) {
this._optionContainerElmts.ignoreCheckbox.prop("checked", true);
this._optionContainerElmts.ignoreInput[0].value = this._config.ignoreLines.toString();
}
if (this._config.headerLines > 0) {
this._optionContainerElmts.headerLinesCheckbox.prop("checked", true);
this._optionContainerElmts.headerLinesInput[0].value = this._config.headerLines.toString();
}
if (this._config.limit > 0) {
if (this._config.limit > 0) {
this._optionContainerElmts.limitCheckbox.prop("checked", true);
this._optionContainerElmts.limitInput[0].value = this._config.limit.toString();
}
if (this._config.skipDataLines > 0) {
this._optionContainerElmts.skipCheckbox.prop("checked", true);
this._optionContainerElmts.skipInput.value[0].value = this._config.skipDataLines.toString();
if (this._config.blankSpanningCells) {
this._optionContainerElmts.blankSpanningCellsCheckbox.prop("checked", true);
}
if (this._config.storeBlankRows) {
this._optionContainerElmts.storeBlankRowsCheckbox.prop("checked", true);
}
@ -186,9 +136,6 @@ Refine.WikitextParserUI.prototype._initialize = function() {
if (this._config.guessCellValueTypes) {
this._optionContainerElmts.guessCellValueTypesCheckbox.prop("checked", true);
}
if (this._config.processQuotes) {
this._optionContainerElmts.processQuoteMarksCheckbox.prop("checked", true);
}
if (this._config.storeBlankCellsAsNulls) {
this._optionContainerElmts.storeBlankCellsAsNullsCheckbox.prop("checked", true);
@ -205,7 +152,6 @@ Refine.WikitextParserUI.prototype._initialize = function() {
};
Refine.WikitextParserUI.prototype._scheduleUpdatePreview = function() {
console.log('scheduleUpdatePreview');
if (this._timerID !== null) {
window.clearTimeout(this._timerID);
this._timerID = null;
@ -222,10 +168,8 @@ Refine.WikitextParserUI.prototype._updatePreview = function() {
var self = this;
this._progressContainer.show();
console.log('updatePreview');
this._controller.updateFormatAndOptions(this.getOptions(), function(result) {
console.log(result.status);
if (result.status == "ok") {
self._controller.getPreviewData(function(projectData) {
self._progressContainer.hide();