Better error reporting and testing for Wikitext import

This commit is contained in:
Antonin Delpeuch 2017-08-16 10:30:51 +01:00
parent e47fb3f2a6
commit 637e69db9d
6 changed files with 163 additions and 58 deletions

View File

@ -15,7 +15,10 @@ import de.fau.cs.osr.ptk.common.AstVisitor;
import org.sweble.wikitext.parser.ParserConfig; import org.sweble.wikitext.parser.ParserConfig;
import org.sweble.wikitext.parser.utils.SimpleParserConfig; import org.sweble.wikitext.parser.utils.SimpleParserConfig;
import org.sweble.wikitext.parser.WikitextParser; import org.sweble.wikitext.parser.WikitextParser;
import org.sweble.wikitext.parser.nodes.WtBold;
import org.sweble.wikitext.parser.nodes.WtItalics;
import org.sweble.wikitext.parser.nodes.WtNode; import org.sweble.wikitext.parser.nodes.WtNode;
import org.sweble.wikitext.parser.nodes.WtSection;
import org.sweble.wikitext.parser.nodes.WtText; import org.sweble.wikitext.parser.nodes.WtText;
import org.sweble.wikitext.parser.nodes.WtInternalLink; import org.sweble.wikitext.parser.nodes.WtInternalLink;
import org.sweble.wikitext.parser.nodes.WtExternalLink; import org.sweble.wikitext.parser.nodes.WtExternalLink;
@ -156,18 +159,14 @@ public class WikitextImporter extends TabularImportingParserBase {
return super.before(node); return super.before(node);
} }
/* Default handler */
public void visit(WtNode e) { public void visit(WtNode e) {
// Ignore other nodes // Ignore other nodes
// System.out.println(e.getNodeName()); // System.out.println(e.getNodeName());
} }
public void visit(WtParsedWikitextPage e) { /* Table handling */
iterate(e);
}
public void visit(WtBody e) {
iterate(e);
}
public void visit(WtTable e) { public void visit(WtTable e) {
iterate(e); iterate(e);
@ -239,6 +238,28 @@ public class WikitextImporter extends TabularImportingParserBase {
} }
} }
/**
 * Renders the subtree rooted at {@code e} as a single trimmed string.
 *
 * A fresh {@code cellStringBuilder} is installed for the duration of the
 * traversal so that the text-producing {@code visit} overloads append into it.
 *
 * @param e the node whose textual content should be collected
 * @return the collected text with leading and trailing whitespace removed;
 *         never {@code null} (empty string when nothing was appended)
 */
public String renderCellAsString(WtNode e) {
    cellStringBuilder = new StringBuilder();
    try {
        iterate(e);
        // StringBuilder.toString() never returns null, so no null check is needed.
        return cellStringBuilder.toString().trim();
    } finally {
        // Always drop the buffer, even if the traversal throws, so that later
        // visit(WtText) calls do not append into a stale cell buffer.
        cellStringBuilder = null;
    }
}
/**
 * Routes plain text to whichever buffer is currently active.
 *
 * The XML attribute buffer takes precedence over the cell buffer; when
 * neither buffer is set, the text is dropped.
 */
public void visit(WtText text) {
    if (xmlAttrStringBuilder != null) {
        xmlAttrStringBuilder.append(text.getContent());
        return;
    }
    if (cellStringBuilder != null) {
        cellStringBuilder.append(text.getContent());
    }
}
/* Spanning cell helpers */
private SpanningCell spanningCell() { private SpanningCell spanningCell() {
return spanningCells.get(spanningCellIdx); return spanningCells.get(spanningCellIdx);
} }
@ -269,6 +290,8 @@ public class WikitextImporter extends TabularImportingParserBase {
} }
} }
/* XML attributes: used to read colspan and rowspan */
public void visit(WtXmlAttributes e) { public void visit(WtXmlAttributes e) {
iterate(e); iterate(e);
} }
@ -299,39 +322,9 @@ public class WikitextImporter extends TabularImportingParserBase {
iterate(e); iterate(e);
} }
public String renderCellAsString(WtNode e) { /* Link management */
cellStringBuilder = new StringBuilder();
iterate(e);
String value = cellStringBuilder.toString();
if (value == null) {
value = "";
}
value = value.trim();
cellStringBuilder = null;
return value;
}
public void visit(WtText text) {
if (xmlAttrStringBuilder != null) {
xmlAttrStringBuilder.append(text.getContent());
} else if (cellStringBuilder != null) {
cellStringBuilder.append(text.getContent());
}
}
public void visit(WtNoLinkTitle e) {
if (currentInternalLink != null) {
cellStringBuilder.append(currentInternalLink);
} else if (currentExternalLink != null) {
cellStringBuilder.append(currentExternalLink);
}
}
public void visit(WtLinkTitle e) {
iterate(e);
}
public void visit(WtInternalLink e) { public void visit(WtInternalLink e) {
currentInternalLink = e.getTarget().getAsString(); currentInternalLink = e.getTarget().getAsString();
internalLinksInCell.add(currentInternalLink); internalLinksInCell.add(currentInternalLink);
@ -341,9 +334,60 @@ public class WikitextImporter extends TabularImportingParserBase {
public void visit(WtExternalLink e) { public void visit(WtExternalLink e) {
WtUrl url = e.getTarget(); WtUrl url = e.getTarget();
currentExternalLink = url.getProtocol() + ":" + url.getPath(); String externalLink = url.getProtocol() + ":" + url.getPath();
if (cellStringBuilder != null) {
if(rowId >= 0) {
// We are inside the table: all hyperlinks
// should be converted to their URLs regardless of
// their label.
cellStringBuilder.append(externalLink);
} else {
// We are in the header: keep the labels instead
currentExternalLink = externalLink;
iterate(e);
currentExternalLink = null;
}
}
}
/**
 * Handles a link that has no explicit label by writing the link target
 * itself into the cell buffer.
 *
 * Nothing is written when no cell is being rendered. The internal link
 * target is preferred over the external one when both are set.
 */
public void visit(WtNoLinkTitle e) {
    if (cellStringBuilder == null) {
        return;
    }
    if (currentInternalLink != null) {
        cellStringBuilder.append(currentInternalLink);
        return;
    }
    if (currentExternalLink != null) {
        cellStringBuilder.append(currentExternalLink);
    }
}
/**
 * Explicit link label: hands the node to {@code iterate(e)} so its contents
 * reach the other {@code visit} overloads (e.g. text is appended by
 * {@code visit(WtText)}).
 * NOTE(review): assumes iterate() dispatches each child node - confirm against AstVisitor.
 */
public void visit(WtLinkTitle e) {
iterate(e);
}
/**
 * Intentionally empty: the URL of an external link is read directly from
 * {@code e.getTarget()} in {@code visit(WtExternalLink)}.
 */
public void visit(WtUrl e) {
// Already handled in visit(WtExternalLink); this overload is declared only for clarity.
}
/* Content blocks */
/**
 * Page root: hands the node to {@code iterate(e)} so nested content is visited.
 * NOTE(review): assumes iterate() dispatches each child node - confirm against AstVisitor.
 */
public void visit(WtParsedWikitextPage e) {
iterate(e);
}
/**
 * Section (content under a heading): hands the node to {@code iterate(e)} so
 * that tables placed below a heading are still visited.
 * NOTE(review): presumably, without this overload the node would reach the
 * default visit(WtNode) handler, which ignores it - confirm.
 */
public void visit(WtSection e) {
iterate(e);
}
/**
 * Body container: hands the node to {@code iterate(e)} so nested content is visited.
 * NOTE(review): assumes iterate() dispatches each child node - confirm against AstVisitor.
 */
public void visit(WtBody e) {
iterate(e);
}
/**
 * Italic markup: hands the node to {@code iterate(e)} so the wrapped text is
 * still collected; the formatting itself adds nothing to the output.
 * NOTE(review): assumes iterate() dispatches each child node - confirm against AstVisitor.
 */
public void visit(WtItalics e) {
iterate(e);
}
public void visit(WtBold e) {
iterate(e); iterate(e);
currentExternalLink = null;
} }
@Override @Override
@ -402,9 +446,11 @@ public class WikitextImporter extends TabularImportingParserBase {
List<Recon> recons = new ArrayList<Recon>(rowSize); List<Recon> recons = new ArrayList<Recon>(rowSize);
for (int j = 0; j < rowSize; j++) { for (int j = 0; j < rowSize; j++) {
recons.add(null); recons.add(null);
if (i == 0)
columnReconciled.add(false);
} }
reconList.add(recons); reconList.add(recons);
columnReconciled.add(false);
} }
int batchSize = 50; int batchSize = 50;

View File

@ -49,7 +49,7 @@ import org.testng.annotations.Test;
import com.google.refine.importers.WikitextImporter; import com.google.refine.importers.WikitextImporter;
public class WikitextImporterTests extends ImporterTest { public class WikitextImporterTests extends ImporterTest {
//System Under Test
private WikitextImporter importer = null; private WikitextImporter importer = null;
@Override @Override
@ -83,7 +83,7 @@ public class WikitextImporterTests extends ImporterTest {
+ "|-\n" + "|-\n"
+ "|}\n"; + "|}\n";
try { try {
prepareOptions(0, true, true); prepareOptions(0, true, true, null);
parse(input); parse(input);
} catch (Exception e) { } catch (Exception e) {
Assert.fail("Parsing failed", e); Assert.fail("Parsing failed", e);
@ -101,7 +101,7 @@ public class WikitextImporterTests extends ImporterTest {
String input = "\n" String input = "\n"
+"{|\n" +"{|\n"
+"|-\n" +"|-\n"
+"| [[Europäisches Zentrum für die Förderung der Berufsbildung|Cedefop]] || Cedefop || [http://www.cedefop.europa.eu/]\n" +"| [[Europäisches Zentrum für die Förderung der Berufsbildung|Cedefop]] || Cedefop || http://www.cedefop.europa.eu/\n"
+"|-\n" +"|-\n"
+"| [[Europäische Stiftung zur Verbesserung der Lebens- und Arbeitsbedingungen]] || EUROFOUND || [http://www.eurofound.europa.eu/]\n" +"| [[Europäische Stiftung zur Verbesserung der Lebens- und Arbeitsbedingungen]] || EUROFOUND || [http://www.eurofound.europa.eu/]\n"
+"|-\n" +"|-\n"
@ -110,7 +110,7 @@ public class WikitextImporterTests extends ImporterTest {
+"|}\n"; +"|}\n";
try { try {
prepareOptions(0, true, true); prepareOptions(0, true, true, "https://de.wikipedia.org/wiki/");
parse(input); parse(input);
} catch (Exception e) { } catch (Exception e) {
Assert.fail("Parsing failed", e); Assert.fail("Parsing failed", e);
@ -118,16 +118,25 @@ public class WikitextImporterTests extends ImporterTest {
Assert.assertEquals(project.columnModel.columns.size(), 3); Assert.assertEquals(project.columnModel.columns.size(), 3);
Assert.assertEquals(project.rows.size(), 3); Assert.assertEquals(project.rows.size(), 3);
Assert.assertEquals(project.rows.get(0).cells.size(), 3); Assert.assertEquals(project.rows.get(0).cells.size(), 3);
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "Cedefop");
// Reconciled cells
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "Cedefop");
Assert.assertEquals(project.rows.get(0).cells.get(1).recon, null);
Assert.assertEquals(project.rows.get(2).cells.get(0).value, "Europäische Beobachtungsstelle für Drogen und Drogensucht"); Assert.assertEquals(project.rows.get(2).cells.get(0).value, "Europäische Beobachtungsstelle für Drogen und Drogensucht");
Assert.assertEquals(project.rows.get(2).cells.get(0).recon.getBestCandidate().id, "Q1377256");
// various ways to input external links
Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://www.eurofound.europa.eu/"); Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://www.eurofound.europa.eu/");
Assert.assertEquals(project.rows.get(2).cells.get(2).value, "europa.eu"); Assert.assertEquals(project.rows.get(2).cells.get(2).value, "http://www.emcdda.europa.eu/");
// Assert.assertEquals(project.rows.get(0).cells.get(2).value, "http://www.cedefop.europa.eu/");
// unfortunately the above does not seem to be supported by the parser (parsed as blank instead)
} }
@Test @Test
public void readStyledTableWithHeader() { public void readStyledTableWithHeader() {
// Data credits: Wikipedia contributors, https://de.wikipedia.org/w/index.php?title=Agenturen_der_Europäischen_Union&action=edit // Data credits: Wikipedia contributors, https://de.wikipedia.org/w/index.php?title=Agenturen_der_Europäischen_Union&action=edit
String input = "\n" String input = "\n"
+"==Agenturen==\n"
+"{| class=\"wikitable sortable\"\n" +"{| class=\"wikitable sortable\"\n"
+"! style=\"text-align:left; width: 60em\" | Offizieller Name\n" +"! style=\"text-align:left; width: 60em\" | Offizieller Name\n"
+"! style=\"text-align:left; width: 9em\" | Abkürzung\n" +"! style=\"text-align:left; width: 9em\" | Abkürzung\n"
@ -137,27 +146,27 @@ public class WikitextImporterTests extends ImporterTest {
+"! style=\"text-align:left; width: 6em\" | Gründung\n" +"! style=\"text-align:left; width: 6em\" | Gründung\n"
+"! style=\"text-align:left; width: 50em\" | Anmerkungen\n" +"! style=\"text-align:left; width: 50em\" | Anmerkungen\n"
+"|-\n" +"|-\n"
+"| [[Europäisches Zentrum für die Förderung der Berufsbildung]] || Cedefop || [http://www.cedefop.europa.eu/] || [[Thessaloniki]] || {{Griechenland}} || 1975 ||\n" +"| [[Europäisches Zentrum für die Förderung der Berufsbildung]] || '''Cedefop''' || [http://www.cedefop.europa.eu/] || [[Thessaloniki]] || {{Griechenland}} || 1975 ||\n"
+"|-\n" +"|-\n"
+"| [[Europäische Stiftung zur Verbesserung der Lebens- und Arbeitsbedingungen]] || EUROFOUND || [http://www.eurofound.europa.eu/] || [[Dublin]] || {{Irland}} || 1975 ||\n" +"| [[Europäische Stiftung zur Verbesserung der Lebens- und Arbeitsbedingungen]] || ''EUROFOUND'' || [http://www.eurofound.europa.eu/] || [[Dublin]] || {{Irland}} || 1975 ||\n"
+"|-\n" +"|-\n"
+"| [[Europäische Beobachtungsstelle für Drogen und Drogensucht]] || EMCDDA || [http://www.emcdda.europa.eu/] || [[Lissabon]] || {{Portugal}} || 1993 ||\n" +"| [[Europäische Beobachtungsstelle für Drogen und Drogensucht]] || EMCDDA || [http://www.emcdda.europa.eu/] || [[Lissabon]] || {{Portugal}} || 1993 ||\n"
+"|-\n" +"|-\n"
+"|}\n"; +"|}\n";
try { try {
prepareOptions(-1, true, true); prepareOptions(-1, true, true, null);
parse(input); parse(input);
} catch (Exception e) { } catch (Exception e) {
Assert.fail("Parsing failed", e); Assert.fail("Parsing failed", e);
} }
Assert.assertEquals(project.columnModel.columns.size(), 7); Assert.assertEquals(project.columnModel.columns.size(), 7);
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "Europäisches Zentrum für die Förderung der Berufsbildung"); Assert.assertEquals(project.rows.get(0).cells.get(0).value, "Europäisches Zentrum für die Förderung der Berufsbildung");
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "Cedefop");
Assert.assertEquals(project.rows.get(1).cells.get(1).value, "EUROFOUND");
Assert.assertEquals(project.columnModel.columns.get(0).getName(), "Offizieller Name"); Assert.assertEquals(project.columnModel.columns.get(0).getName(), "Offizieller Name");
Assert.assertEquals(project.columnModel.columns.get(6).getName(), "Anmerkungen"); Assert.assertEquals(project.columnModel.columns.get(6).getName(), "Anmerkungen");
Assert.assertEquals(project.rows.get(0).cells.size(), 7); Assert.assertEquals(project.rows.get(0).cells.size(), 7);
Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://www.eurofound.europa.eu/");
} }
@Test @Test
@ -179,7 +188,7 @@ public class WikitextImporterTests extends ImporterTest {
+"|}\n"; +"|}\n";
try { try {
prepareOptions(-1, true, true); prepareOptions(-1, true, true, null);
parse(input); parse(input);
} catch (Exception e) { } catch (Exception e) {
Assert.fail("Parsing failed", e); Assert.fail("Parsing failed", e);
@ -197,12 +206,13 @@ public class WikitextImporterTests extends ImporterTest {
private void prepareOptions( private void prepareOptions(
int limit, boolean blankSpanningCells, int limit, boolean blankSpanningCells,
boolean guessValueType) { boolean guessValueType, String wikiUrl) {
whenGetIntegerOption("limit", options, limit); whenGetIntegerOption("limit", options, limit);
whenGetBooleanOption("guessCellValueTypes", options, guessValueType); whenGetBooleanOption("guessCellValueTypes", options, guessValueType);
whenGetBooleanOption("blankSpanningCells", options, blankSpanningCells); whenGetBooleanOption("blankSpanningCells", options, blankSpanningCells);
whenGetBooleanOption("storeBlankCellsAsNulls", options, true); whenGetBooleanOption("storeBlankCellsAsNulls", options, true);
whenGetStringOption("wikiUrl", options, wikiUrl);
whenGetIntegerOption("headerLines", options, 1); whenGetIntegerOption("headerLines", options, 1);
whenGetStringOption("reconService", options, "https://tools.wmflabs.org/openrefine-wikidata/en/api"); whenGetStringOption("reconService", options, "https://tools.wmflabs.org/openrefine-wikidata/en/api");
} }

View File

@ -377,7 +377,8 @@ function init() {
"styles/views/extend-data-preview-dialog.less", "styles/views/extend-data-preview-dialog.less",
"styles/index/fixed-width-parser-ui.less", "styles/index/fixed-width-parser-ui.less",
"styles/index/xml-parser-ui.less", "styles/index/xml-parser-ui.less",
"styles/index/json-parser-ui.less" "styles/index/json-parser-ui.less",
"styles/index/wikitext-parser-ui.less",
] ]
); );

View File

@ -115,6 +115,7 @@
"store-nulls": "Store blank cells as nulls", "store-nulls": "Store blank cells as nulls",
"blank-spanning-cells": "Pad cells spanning over multiple rows or columns with nulls", "blank-spanning-cells": "Pad cells spanning over multiple rows or columns with nulls",
"wiki-base-url": "Reconcile to wiki with base URL:", "wiki-base-url": "Reconcile to wiki with base URL:",
"invalid-wikitext": "No table could be parsed. Are you sure this is a valid wiki table?",
"store-source": "Store file source <br/>(file names, URLs)<br/>in each row", "store-source": "Store file source <br/>(file names, URLs)<br/>in each row",
"preserve-empty": "Preserve empty strings", "preserve-empty": "Preserve empty strings",
"trim": "Trim leading &amp; trailing whitespace from strings", "trim": "Trim leading &amp; trailing whitespace from strings",

View File

@ -186,8 +186,13 @@ Refine.WikitextParserUI.prototype._updatePreview = function() {
if (result.status == "ok") { if (result.status == "ok") {
self._controller.getPreviewData(function(projectData) { self._controller.getPreviewData(function(projectData) {
self._progressContainer.hide(); self._progressContainer.hide();
var container = self._dataContainer.unbind().empty();
new Refine.PreviewTable(projectData, self._dataContainer.unbind().empty()); if (projectData.rowModel.rows.length === 0) {
$('<div>').addClass("wikitext-parser-ui-message")
.text($.i18n._('core-index-parser')["invalid-wikitext"]).appendTo(container);
} else {
new Refine.PreviewTable(projectData, container);
}
}); });
} }
}); });

View File

@ -0,0 +1,42 @@
/*
Copyright 2011, Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
@import-less url("../theme.less");
/* Message box for the Wikitext parser UI: light grey panel with large, muted text. */
.wikitext-parser-ui-message {
background: #eee;
font-size: 150%;
color: #666;
padding: 20px;
}