Parse headers just like other cells in Wikitable importer
Parsing header cells as ordinary cells makes the column names a bit less meaningful, but it is necessary to handle cases where headers are inserted anywhere in the table.
This commit is contained in:
parent
3f56a4eee9
commit
843641ca74
@ -29,6 +29,7 @@ import org.sweble.wikitext.parser.nodes.WtTemplateArguments;
|
||||
import org.sweble.wikitext.parser.nodes.WtText;
|
||||
import org.sweble.wikitext.parser.nodes.WtInternalLink;
|
||||
import org.sweble.wikitext.parser.nodes.WtExternalLink;
|
||||
import org.sweble.wikitext.parser.nodes.WtHeading;
|
||||
import org.sweble.wikitext.parser.nodes.WtLinkTitle;
|
||||
import org.sweble.wikitext.parser.nodes.WtLinkTitle.WtNoLinkTitle;
|
||||
import org.sweble.wikitext.parser.nodes.WtUrl;
|
||||
@ -129,7 +130,6 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
public class WikitextTableVisitor extends AstVisitor<WtNode> {
|
||||
|
||||
public String caption;
|
||||
public List<String> header;
|
||||
public List<List<String>> rows;
|
||||
public List<List<String>> references;
|
||||
public List<WikilinkedCell> wikilinkedCells;
|
||||
@ -163,7 +163,6 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
this.blankSpanningCells = blankSpanningCells;
|
||||
this.includeRawTemplates = includeRawTemplates;
|
||||
caption = null;
|
||||
header = new ArrayList<String>();
|
||||
rows = new ArrayList<List<String>>();
|
||||
references = new ArrayList<List<String>>();
|
||||
wikilinkedCells = new ArrayList<WikilinkedCell>();
|
||||
@ -178,7 +177,7 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
currentReferenceName = null;
|
||||
colspan = 0;
|
||||
rowspan = 0;
|
||||
rowId = -1;
|
||||
rowId = 0;
|
||||
spanningCellIdx = 0;
|
||||
internalLinksInCell = new ArrayList<String>();
|
||||
namedReferences = new HashMap<String, String>();
|
||||
@ -202,77 +201,80 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
iterate(e);
|
||||
}
|
||||
|
||||
public void visit(WtTableHeader e) {
|
||||
String columnName = renderCellAsString(e);
|
||||
header.add(columnName);
|
||||
// For the header, we ignore rowspan and manually add cells for colspan
|
||||
if (colspan > 1) {
|
||||
for (int i = 0; i < colspan-1; i++) {
|
||||
header.add(columnName);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void visit(WtTableCaption e) {
|
||||
caption = renderCellAsString(e);
|
||||
}
|
||||
|
||||
public void visit(WtTableRow e)
|
||||
{
|
||||
if (currentRow == null) {
|
||||
if (rowId == -1) {
|
||||
// no header was found, start on the first row
|
||||
rowId = 0;
|
||||
}
|
||||
currentRow = new ArrayList<String>();
|
||||
currentRowReferences = new ArrayList<String>();
|
||||
spanningCellIdx = 0;
|
||||
addSpanningCells();
|
||||
iterate(e);
|
||||
if(currentRow.size() > 0) {
|
||||
rows.add(currentRow);
|
||||
references.add(currentRowReferences);
|
||||
rowId++;
|
||||
}
|
||||
currentRow = null;
|
||||
if (currentRow != null) {
|
||||
finishRow();
|
||||
}
|
||||
startRow();
|
||||
iterate(e);
|
||||
finishRow();
|
||||
}
|
||||
|
||||
private void startRow() {
|
||||
currentRow = new ArrayList<String>();
|
||||
currentRowReferences = new ArrayList<String>();
|
||||
spanningCellIdx = 0;
|
||||
addSpanningCells();
|
||||
}
|
||||
|
||||
private void finishRow() {
|
||||
if(currentRow.size() > 0) {
|
||||
rows.add(currentRow);
|
||||
references.add(currentRowReferences);
|
||||
rowId++;
|
||||
}
|
||||
currentRow = null;
|
||||
}
|
||||
|
||||
public void visit(WtTableCell e)
|
||||
{
|
||||
if (currentRow != null) {
|
||||
rowspan = 1;
|
||||
colspan = 1;
|
||||
internalLinksInCell.clear();
|
||||
currentReference = null;
|
||||
currentReferenceName = null;
|
||||
|
||||
String value = renderCellAsString(e);
|
||||
|
||||
int colId = currentRow.size();
|
||||
|
||||
// Add the cell to the row we are currently building
|
||||
currentRow.add(value);
|
||||
currentRowReferences.add(currentReference);
|
||||
|
||||
// Reconcile it if we found exactly one link in the cell
|
||||
String reconciled = null;
|
||||
if (internalLinksInCell.size() == 1) {
|
||||
reconciled = internalLinksInCell.get(0);
|
||||
wikilinkedCells.add(new WikilinkedCell(reconciled, rowId, colId));
|
||||
}
|
||||
|
||||
// Mark it as spanning if we found the tags
|
||||
if (colspan > 1 || rowspan > 1) {
|
||||
SpanningCell spanningCell = new SpanningCell(
|
||||
value, reconciled, currentReference,
|
||||
rowId, colId, rowspan, colspan);
|
||||
spanningCells.add(spanningCellIdx, spanningCell);
|
||||
}
|
||||
|
||||
// Add all spanning cells that need to be inserted after this one.
|
||||
addSpanningCells();
|
||||
addCell(e);
|
||||
}
|
||||
|
||||
public void visit(WtTableHeader e) {
|
||||
addCell(e);
|
||||
}
|
||||
|
||||
public void addCell(WtNode e) {
|
||||
if (currentRow == null) {
|
||||
startRow();
|
||||
}
|
||||
rowspan = 1;
|
||||
colspan = 1;
|
||||
internalLinksInCell.clear();
|
||||
currentReference = null;
|
||||
currentReferenceName = null;
|
||||
|
||||
String value = renderCellAsString(e);
|
||||
|
||||
int colId = currentRow.size();
|
||||
|
||||
// Add the cell to the row we are currently building
|
||||
currentRow.add(value);
|
||||
currentRowReferences.add(currentReference);
|
||||
|
||||
// Reconcile it if we found exactly one link in the cell
|
||||
String reconciled = null;
|
||||
if (internalLinksInCell.size() == 1) {
|
||||
reconciled = internalLinksInCell.get(0);
|
||||
wikilinkedCells.add(new WikilinkedCell(reconciled, rowId, colId));
|
||||
}
|
||||
|
||||
// Mark it as spanning if we found the tags
|
||||
if (colspan > 1 || rowspan > 1) {
|
||||
SpanningCell spanningCell = new SpanningCell(
|
||||
value, reconciled, currentReference,
|
||||
rowId, colId, rowspan, colspan);
|
||||
spanningCells.add(spanningCellIdx, spanningCell);
|
||||
}
|
||||
|
||||
// Add all spanning cells that need to be inserted after this one.
|
||||
addSpanningCells();
|
||||
}
|
||||
|
||||
public String renderCellAsString(WtNode e) {
|
||||
@ -403,22 +405,20 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
}
|
||||
|
||||
public void visit(WtXmlAttribute e) {
|
||||
if (currentXmlAttr == null) {
|
||||
xmlAttrStringBuilder = new StringBuilder();
|
||||
iterate(e);
|
||||
try {
|
||||
if ("colspan".equals(currentXmlAttr)) {
|
||||
colspan = Integer.parseInt(xmlAttrStringBuilder.toString());
|
||||
} else if ("rowspan".equals(currentXmlAttr)) {
|
||||
rowspan = Integer.parseInt(xmlAttrStringBuilder.toString());
|
||||
} else if ("name".equals(currentXmlAttr)) {
|
||||
currentReferenceName = xmlAttrStringBuilder.toString();
|
||||
}
|
||||
} catch (NumberFormatException nfe) {
|
||||
xmlAttrStringBuilder = new StringBuilder();
|
||||
iterate(e);
|
||||
try {
|
||||
if ("colspan".equals(currentXmlAttr)) {
|
||||
colspan = Integer.parseInt(xmlAttrStringBuilder.toString());
|
||||
} else if ("rowspan".equals(currentXmlAttr)) {
|
||||
rowspan = Integer.parseInt(xmlAttrStringBuilder.toString());
|
||||
} else if ("name".equals(currentXmlAttr)) {
|
||||
currentReferenceName = xmlAttrStringBuilder.toString();
|
||||
}
|
||||
currentXmlAttr = null;
|
||||
xmlAttrStringBuilder = null;
|
||||
} catch (NumberFormatException nfe) {
|
||||
}
|
||||
currentXmlAttr = null;
|
||||
xmlAttrStringBuilder = null;
|
||||
}
|
||||
|
||||
public void visit(WtName e) {
|
||||
@ -537,7 +537,7 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
}
|
||||
|
||||
public class WikiTableDataReader implements TableDataReader {
|
||||
private int currentRow = -1;
|
||||
private int currentRow = 0;
|
||||
private WikitextTableVisitor visitor = null;
|
||||
private List<List<Recon>> reconList = null;
|
||||
private List<Boolean> columnReconciled = null;
|
||||
@ -545,7 +545,7 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
|
||||
public WikiTableDataReader(WikitextTableVisitor visitor, boolean references) {
|
||||
this.visitor = visitor;
|
||||
currentRow = -1;
|
||||
currentRow = 0;
|
||||
reconList = null;
|
||||
|
||||
if (references) {
|
||||
@ -569,9 +569,7 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
List<Object> row = null;
|
||||
List<String> origRow = null;
|
||||
List<String> refRow = null;
|
||||
if (currentRow == -1) {
|
||||
origRow = this.visitor.header;
|
||||
} else if(currentRow < this.visitor.rows.size()) {
|
||||
if(currentRow < this.visitor.rows.size()) {
|
||||
origRow = this.visitor.rows.get(currentRow);
|
||||
refRow = this.visitor.references.get(currentRow);
|
||||
}
|
||||
@ -583,10 +581,15 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
if (currentRow >= 0 && reconList != null) {
|
||||
recon = reconList.get(currentRow).get(i);
|
||||
}
|
||||
row.add(new Cell(origRow.get(i), recon));
|
||||
String value = origRow.get(i);
|
||||
if (value != null) {
|
||||
row.add(new Cell(value, recon));
|
||||
} else {
|
||||
row.add(null);
|
||||
}
|
||||
|
||||
// if we should add reference colums…
|
||||
if (columnReferenced != null && columnReferenced.get(i)) {
|
||||
// if we should add reference columns…
|
||||
if (columnReferenced != null && i < columnReferenced.size() && columnReferenced.get(i)) {
|
||||
String refValue = null;
|
||||
// for headers
|
||||
if(currentRow == -1) {
|
||||
@ -594,7 +597,11 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
} else {
|
||||
refValue = refRow.get(i);
|
||||
}
|
||||
row.add(new Cell(refValue, null));
|
||||
if (refValue != null) {
|
||||
row.add(new Cell(refValue, null));
|
||||
} else {
|
||||
row.add(null);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -705,8 +712,6 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
dataReader.reconcileToQids(wikiUrl, cfg);
|
||||
}
|
||||
|
||||
JSONUtilities.safePut(options, "headerLines", 1);
|
||||
|
||||
// Set metadata
|
||||
if (vs.caption != null && vs.caption.length() > 0) {
|
||||
metadata.setName(vs.caption);
|
||||
|
@ -80,10 +80,10 @@ public class WikitextImporterTests extends ImporterTest {
|
||||
+ "|-\n"
|
||||
+ "|}\n";
|
||||
try {
|
||||
prepareOptions(0, true, true, null);
|
||||
parse(input);
|
||||
prepareOptions(0, 0, true, true, null);
|
||||
parse(input);
|
||||
} catch (Exception e) {
|
||||
Assert.fail("Parsing failed", e);
|
||||
Assert.fail("Parsing failed", e);
|
||||
}
|
||||
Assert.assertEquals(project.columnModel.columns.size(), 3);
|
||||
Assert.assertEquals(project.rows.size(), 2);
|
||||
@ -93,6 +93,35 @@ public class WikitextImporterTests extends ImporterTest {
|
||||
Assert.assertEquals(project.rows.get(1).cells.get(2).value, "f");
|
||||
}
|
||||
|
||||
/**
|
||||
* Issue #1448
|
||||
* https://github.com/OpenRefine/OpenRefine/issues/1448
|
||||
*/
|
||||
@Test
|
||||
public void readTableWithMisplacedHeaders() {
|
||||
String input = "\n"
|
||||
+ "{|\n"
|
||||
+ "|-\n"
|
||||
+ "| a || b<br/>2 || c \n"
|
||||
+ "|-\n"
|
||||
+ "| d\n"
|
||||
+ "! e\n"
|
||||
+ "| f<br>\n"
|
||||
+ "|-\n"
|
||||
+ "|}\n";
|
||||
try {
|
||||
prepareOptions(0, 0, true, true, null);
|
||||
parse(input);
|
||||
} catch (Exception e) {
|
||||
Assert.fail("Parsing failed", e);
|
||||
}
|
||||
Assert.assertEquals(project.columnModel.columns.size(), 3);
|
||||
Assert.assertEquals(project.rows.size(), 2);
|
||||
Assert.assertEquals(project.rows.get(0).cells.size(), 3);
|
||||
Assert.assertEquals(project.rows.get(1).cells.get(1).value, "e");
|
||||
Assert.assertEquals(project.rows.get(1).cells.get(2).value, "f");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void readTableWithLinks() {
|
||||
// Data credits: Wikipedia contributors, https://de.wikipedia.org/w/index.php?title=Agenturen_der_Europäischen_Union&action=edit
|
||||
@ -108,10 +137,10 @@ public class WikitextImporterTests extends ImporterTest {
|
||||
+"|}\n";
|
||||
|
||||
try {
|
||||
prepareOptions(0, true, true, "https://de.wikipedia.org/wiki/");
|
||||
parse(input);
|
||||
prepareOptions(0, 0, true, true, "https://de.wikipedia.org/wiki/");
|
||||
parse(input);
|
||||
} catch (Exception e) {
|
||||
Assert.fail("Parsing failed", e);
|
||||
Assert.fail("Parsing failed", e);
|
||||
}
|
||||
Assert.assertEquals(project.columnModel.columns.size(), 3);
|
||||
Assert.assertEquals(project.rows.size(), 3);
|
||||
@ -153,10 +182,10 @@ public class WikitextImporterTests extends ImporterTest {
|
||||
+"|}\n";
|
||||
|
||||
try {
|
||||
prepareOptions(-1, true, true, null);
|
||||
parse(input);
|
||||
prepareOptions(-1, 1, true, true, null);
|
||||
parse(input);
|
||||
} catch (Exception e) {
|
||||
Assert.fail("Parsing failed", e);
|
||||
Assert.fail("Parsing failed", e);
|
||||
}
|
||||
Assert.assertEquals(project.columnModel.columns.size(), 7);
|
||||
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "Europäisches Zentrum für die Förderung der Berufsbildung");
|
||||
@ -186,14 +215,14 @@ public class WikitextImporterTests extends ImporterTest {
|
||||
+"|}\n";
|
||||
|
||||
try {
|
||||
prepareOptions(-1, true, true, null);
|
||||
parse(input);
|
||||
prepareOptions(-1, 1, true, true, null);
|
||||
parse(input);
|
||||
} catch (Exception e) {
|
||||
Assert.fail("Parsing failed", e);
|
||||
Assert.fail("Parsing failed", e);
|
||||
}
|
||||
Assert.assertEquals(project.columnModel.columns.size(), 6);
|
||||
Assert.assertNull(project.rows.get(1).cells.get(2).value);
|
||||
Assert.assertNull(project.rows.get(1).cells.get(3).value);
|
||||
Assert.assertNull(project.rows.get(1).cells.get(2));
|
||||
Assert.assertNull(project.rows.get(1).cells.get(3));
|
||||
Assert.assertEquals(project.rows.get(1).cells.get(4).value, "Butter");
|
||||
}
|
||||
|
||||
@ -212,10 +241,10 @@ public class WikitextImporterTests extends ImporterTest {
|
||||
+"|}\n";
|
||||
|
||||
try {
|
||||
prepareOptions(-1, true, true, null);
|
||||
parse(input);
|
||||
prepareOptions(-1, 1, true, true, null);
|
||||
parse(input);
|
||||
} catch (Exception e) {
|
||||
Assert.fail("Parsing failed", e);
|
||||
Assert.fail("Parsing failed", e);
|
||||
}
|
||||
Assert.assertEquals(project.columnModel.columns.size(), 5);
|
||||
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "b");
|
||||
@ -240,10 +269,10 @@ public class WikitextImporterTests extends ImporterTest {
|
||||
+"|}\n";
|
||||
|
||||
try {
|
||||
prepareOptions(-1, true, true, null);
|
||||
parse(input);
|
||||
prepareOptions(-1, 1, true, true, null);
|
||||
parse(input);
|
||||
} catch (Exception e) {
|
||||
Assert.fail("Parsing failed", e);
|
||||
Assert.fail("Parsing failed", e);
|
||||
}
|
||||
Assert.assertEquals(project.columnModel.columns.size(), 5);
|
||||
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "b");
|
||||
@ -260,16 +289,16 @@ public class WikitextImporterTests extends ImporterTest {
|
||||
}
|
||||
|
||||
private void prepareOptions(
|
||||
int limit, boolean blankSpanningCells,
|
||||
int limit, int headerLines, boolean blankSpanningCells,
|
||||
boolean guessValueType, String wikiUrl) {
|
||||
|
||||
whenGetIntegerOption("limit", options, limit);
|
||||
whenGetIntegerOption("headerLines", options, headerLines);
|
||||
whenGetBooleanOption("guessCellValueTypes", options, guessValueType);
|
||||
whenGetBooleanOption("blankSpanningCells", options, blankSpanningCells);
|
||||
whenGetBooleanOption("storeBlankCellsAsNulls", options, true);
|
||||
whenGetBooleanOption("parseReferences", options, true);
|
||||
whenGetStringOption("wikiUrl", options, wikiUrl);
|
||||
whenGetIntegerOption("headerLines", options, 1);
|
||||
whenGetStringOption("reconService", options, "https://tools.wmflabs.org/openrefine-wikidata/en/api");
|
||||
}
|
||||
}
|
||||
|
@ -3,6 +3,11 @@
|
||||
|
||||
<tr><td width="1%"><input type="checkbox" bind="wikiCheckbox" id="$reconcileWiki" /></td><td><label for="$reconcileWiki" id="or-import-wiki-base-url"></label>
|
||||
<input bind="wikiUrlInput" type="text" class="lightweight" size="30" id="$wikiUrl" /></td></tr>
|
||||
|
||||
<tr><td width="1%"><input type="checkbox" bind="headerLinesCheckbox" id="$headers" /></td>
|
||||
<td><label for="$headers" id="or-import-parse"></label>
|
||||
<input bind="headerLinesInput" type="text" class="lightweight" size="2" value="1" />
|
||||
<label for="$headers" id="or-import-header"></label></td></tr>
|
||||
<tr><td width="1%"><input type="checkbox" bind="limitCheckbox" id="$limit" /></td>
|
||||
<td><label for="$limit" id="or-import-load"></label>
|
||||
<input bind="limitInput" type="text" class="lightweight" size="2" value="0" />
|
||||
|
@ -85,6 +85,12 @@ Refine.WikitextParserUI.prototype.getOptions = function() {
|
||||
} else {
|
||||
options.limit = -1;
|
||||
}
|
||||
if (this._optionContainerElmts.headerLinesCheckbox[0].checked) {
|
||||
options.headerLines = parseIntDefault(this._optionContainerElmts.headerLinesInput[0].value, 1);
|
||||
} else {
|
||||
options.headerLines = -1;
|
||||
}
|
||||
|
||||
options.storeBlankRows = this._optionContainerElmts.storeBlankRowsCheckbox[0].checked;
|
||||
options.blankSpanningCells = this._optionContainerElmts.blankSpanningCellsCheckbox[0].checked;
|
||||
options.includeRawTemplates = this._optionContainerElmts.includeRawTemplatesCheckbox[0].checked;
|
||||
@ -111,6 +117,8 @@ Refine.WikitextParserUI.prototype._initialize = function() {
|
||||
this._optionContainerElmts.previewButton.html($.i18n._('core-buttons')["update-preview"]);
|
||||
|
||||
$('#or-import-wiki-base-url').text($.i18n._('core-index-parser')["wiki-base-url"]);
|
||||
$('#or-import-parse').text($.i18n._('core-index-parser')["parse-next"]);
|
||||
$('#or-import-header').text($.i18n._('core-index-parser')["lines-header"]);
|
||||
$('#or-import-load').text($.i18n._('core-index-parser')["load-at-most"]);
|
||||
$('#or-import-rows2').text($.i18n._('core-index-parser')["rows-data"]);
|
||||
$('#or-import-parseCell').html($.i18n._('core-index-parser')["parse-cell"]);
|
||||
@ -142,6 +150,11 @@ Refine.WikitextParserUI.prototype._initialize = function() {
|
||||
this._optionContainerElmts.limitInput[0].value = this._config.limit.toString();
|
||||
}
|
||||
|
||||
if (this._config.headerLines > 0) {
|
||||
this._optionContainerElmts.headerLinesCheckbox.prop("checked", true);
|
||||
this._optionContainerElmts.headerLinesInput[0].value = this._config.headerLines.toString();
|
||||
}
|
||||
|
||||
if (this._config.blankSpanningCells) {
|
||||
this._optionContainerElmts.blankSpanningCellsCheckbox.prop("checked", true);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user