Better error reporting and testing for Wikitext import

This commit is contained in:
Antonin Delpeuch 2017-08-16 10:30:51 +01:00
parent e47fb3f2a6
commit 637e69db9d
6 changed files with 163 additions and 58 deletions

View File

@ -15,7 +15,10 @@ import de.fau.cs.osr.ptk.common.AstVisitor;
import org.sweble.wikitext.parser.ParserConfig;
import org.sweble.wikitext.parser.utils.SimpleParserConfig;
import org.sweble.wikitext.parser.WikitextParser;
import org.sweble.wikitext.parser.nodes.WtBold;
import org.sweble.wikitext.parser.nodes.WtItalics;
import org.sweble.wikitext.parser.nodes.WtNode;
import org.sweble.wikitext.parser.nodes.WtSection;
import org.sweble.wikitext.parser.nodes.WtText;
import org.sweble.wikitext.parser.nodes.WtInternalLink;
import org.sweble.wikitext.parser.nodes.WtExternalLink;
@ -156,18 +159,14 @@ public class WikitextImporter extends TabularImportingParserBase {
return super.before(node);
}
/* Default handler */
/**
 * Fallback handler for generic nodes. The body is intentionally empty, so any
 * node that reaches this method is ignored.
 * NOTE(review): presumably this is selected by the visitor dispatch when no more
 * specific visit method exists for the node type -- confirm against AstVisitor.
 *
 * @param e the node being ignored
 */
public void visit(WtNode e) {
// Ignore other nodes
// System.out.println(e.getNodeName());
}
/**
 * Handles the page node by delegating to iterate(e) so traversal continues
 * into it (presumably visiting its children -- confirm against AstVisitor).
 *
 * @param e the parsed page node
 */
public void visit(WtParsedWikitextPage e) {
iterate(e);
}
/**
 * Handles a body node by delegating to iterate(e) so traversal continues
 * into it (presumably visiting its children -- confirm against AstVisitor).
 *
 * @param e the body node
 */
public void visit(WtBody e) {
iterate(e);
}
/* Table handling */
public void visit(WtTable e) {
iterate(e);
@ -239,6 +238,28 @@ public class WikitextImporter extends TabularImportingParserBase {
}
}
/**
 * Renders the content under the given node as a single trimmed string.
 *
 * A fresh cellStringBuilder is installed, the node is traversed with iterate(e)
 * so that the visit methods can append to the builder, and the accumulated text
 * is returned with leading and trailing whitespace removed.
 *
 * @param e the node to render
 * @return the trimmed text collected during traversal; never null
 */
public String renderCellAsString(WtNode e) {
    cellStringBuilder = new StringBuilder();
    try {
        iterate(e);
        // StringBuilder.toString() never returns null, so no null check is needed.
        return cellStringBuilder.toString().trim();
    } finally {
        // Always clear the builder, even if traversal throws, so that later
        // visits do not append to a stale buffer.
        cellStringBuilder = null;
    }
}
/**
 * Appends the content of a text node to whichever buffer is currently active.
 * The XML attribute buffer takes precedence over the cell buffer; when neither
 * is set the text is dropped.
 *
 * @param text the text node being visited
 */
public void visit(WtText text) {
    if (xmlAttrStringBuilder != null) {
        xmlAttrStringBuilder.append(text.getContent());
        return;
    }
    if (cellStringBuilder != null) {
        cellStringBuilder.append(text.getContent());
    }
}
/* Spanning cell helpers */
/**
 * Returns the entry of spanningCells at the current index spanningCellIdx.
 *
 * @return the spanning cell at position spanningCellIdx
 */
private SpanningCell spanningCell() {
return spanningCells.get(spanningCellIdx);
}
@ -269,6 +290,8 @@ public class WikitextImporter extends TabularImportingParserBase {
}
}
/* XML attributes: useful for colspan and rowspan */
/**
 * Handles an attribute list by delegating to iterate(e) so traversal continues
 * into it (presumably visiting each attribute -- confirm against AstVisitor).
 *
 * @param e the attribute list node
 */
public void visit(WtXmlAttributes e) {
iterate(e);
}
@ -299,39 +322,9 @@ public class WikitextImporter extends TabularImportingParserBase {
iterate(e);
}
/**
 * Renders the content under the given node as a single trimmed string.
 *
 * A fresh cellStringBuilder is installed, the node is traversed with iterate(e)
 * so that the visit methods can append to the builder, and the accumulated text
 * is returned with leading and trailing whitespace removed.
 *
 * @param e the node to render
 * @return the trimmed text collected during traversal; never null
 */
public String renderCellAsString(WtNode e) {
    cellStringBuilder = new StringBuilder();
    try {
        iterate(e);
        // StringBuilder.toString() never returns null, so no null check is needed.
        return cellStringBuilder.toString().trim();
    } finally {
        // Always clear the builder, even if traversal throws, so that later
        // visits do not append to a stale buffer.
        cellStringBuilder = null;
    }
}
/* Link management */
/**
 * Appends the content of a text node to whichever buffer is currently active.
 * The XML attribute buffer takes precedence over the cell buffer; when neither
 * is set the text is dropped.
 *
 * @param text the text node being visited
 */
public void visit(WtText text) {
    if (xmlAttrStringBuilder != null) {
        xmlAttrStringBuilder.append(text.getContent());
        return;
    }
    if (cellStringBuilder != null) {
        cellStringBuilder.append(text.getContent());
    }
}
/**
 * Handles a link that has no explicit title by writing the link target itself
 * into the cell buffer. The internal link target is preferred; the external
 * link is used only when no internal link is active.
 *
 * Nothing is written when no cell is being rendered, which avoids a
 * NullPointerException on cellStringBuilder.
 *
 * @param e the "no title" marker node
 */
public void visit(WtNoLinkTitle e) {
    if (cellStringBuilder != null) {
        if (currentInternalLink != null) {
            cellStringBuilder.append(currentInternalLink);
        } else if (currentExternalLink != null) {
            cellStringBuilder.append(currentExternalLink);
        }
    }
}
/**
 * Handles an explicit link title by delegating to iterate(e) so traversal
 * continues into the title (presumably so its text nodes reach visit(WtText)
 * -- confirm against AstVisitor).
 *
 * @param e the link title node
 */
public void visit(WtLinkTitle e) {
iterate(e);
}
public void visit(WtInternalLink e) {
currentInternalLink = e.getTarget().getAsString();
internalLinksInCell.add(currentInternalLink);
@ -341,9 +334,60 @@ public class WikitextImporter extends TabularImportingParserBase {
/**
 * Handles an external link while a cell is being rendered.
 *
 * The URL is rebuilt as protocol + ":" + path. Inside the table body
 * (rowId >= 0) the URL itself is written to the cell, whatever the link label
 * is. In the header the link is traversed instead, with currentExternalLink
 * set only for the duration of that traversal, so the label is kept.
 *
 * currentExternalLink is deliberately not touched on the other paths, so no
 * stale link target survives this call.
 *
 * @param e the external link node
 */
public void visit(WtExternalLink e) {
    WtUrl url = e.getTarget();
    String externalLink = url.getProtocol() + ":" + url.getPath();
    if (cellStringBuilder != null) {
        if (rowId >= 0) {
            // We are inside the table: all hyperlinks
            // should be converted to their URLs regardless of
            // their label.
            cellStringBuilder.append(externalLink);
        } else {
            // We are in the header: keep the labels instead
            currentExternalLink = externalLink;
            iterate(e);
            currentExternalLink = null;
        }
    }
}
/**
 * Handles a link that has no explicit title by writing the link target itself
 * into the cell buffer. Does nothing when no cell is being rendered. The
 * internal link target is preferred over the external one.
 *
 * @param e the "no title" marker node
 */
public void visit(WtNoLinkTitle e) {
    if (cellStringBuilder == null) {
        return;
    }
    if (currentInternalLink != null) {
        cellStringBuilder.append(currentInternalLink);
    } else if (currentExternalLink != null) {
        cellStringBuilder.append(currentExternalLink);
    }
}
/**
 * Handles an explicit link title by delegating to iterate(e) so traversal
 * continues into the title (presumably so its text nodes reach visit(WtText)
 * -- confirm against AstVisitor).
 *
 * @param e the link title node
 */
public void visit(WtLinkTitle e) {
iterate(e);
}
/**
 * Intentionally empty: the URL of an external link is read directly from the
 * link node in visit(WtExternalLink), so nothing is done when the URL node
 * itself is visited.
 *
 * @param e the URL node
 */
public void visit(WtUrl e) {
// already handled, in WtExternalLink, added here for clarity
}
/* Content blocks */
/**
 * Handles the page node by delegating to iterate(e) so traversal continues
 * into it (presumably visiting its children -- confirm against AstVisitor).
 *
 * @param e the parsed page node
 */
public void visit(WtParsedWikitextPage e) {
iterate(e);
}
/**
 * Handles a section node by delegating to iterate(e) so traversal continues
 * into it (presumably so content nested under a heading is still reached --
 * confirm against AstVisitor).
 *
 * @param e the section node
 */
public void visit(WtSection e) {
iterate(e);
}
/**
 * Handles a body node by delegating to iterate(e) so traversal continues
 * into it (presumably visiting its children -- confirm against AstVisitor).
 *
 * @param e the body node
 */
public void visit(WtBody e) {
iterate(e);
}
/**
 * Handles italic markup by delegating to iterate(e) so traversal continues
 * into it (presumably so the enclosed text is rendered without the markup --
 * confirm against AstVisitor).
 *
 * @param e the italics node
 */
public void visit(WtItalics e) {
iterate(e);
}
/**
 * Handles bold markup by delegating to iterate(e) so traversal continues into
 * it, in the same way as visit(WtItalics).
 *
 * Link state (currentExternalLink) is owned by the link handlers and is not
 * modified here.
 *
 * @param e the bold node
 */
public void visit(WtBold e) {
    iterate(e);
}
@Override
@ -402,9 +446,11 @@ public class WikitextImporter extends TabularImportingParserBase {
List<Recon> recons = new ArrayList<Recon>(rowSize);
for (int j = 0; j < rowSize; j++) {
recons.add(null);
if (i == 0)
columnReconciled.add(false);
}
reconList.add(recons);
columnReconciled.add(false);
}
int batchSize = 50;

View File

@ -49,7 +49,7 @@ import org.testng.annotations.Test;
import com.google.refine.importers.WikitextImporter;
public class WikitextImporterTests extends ImporterTest {
//System Under Test
private WikitextImporter importer = null;
@Override
@ -83,7 +83,7 @@ public class WikitextImporterTests extends ImporterTest {
+ "|-\n"
+ "|}\n";
try {
prepareOptions(0, true, true);
prepareOptions(0, true, true, null);
parse(input);
} catch (Exception e) {
Assert.fail("Parsing failed", e);
@ -101,7 +101,7 @@ public class WikitextImporterTests extends ImporterTest {
String input = "\n"
+"{|\n"
+"|-\n"
+"| [[Europäisches Zentrum für die Förderung der Berufsbildung|Cedefop]] || Cedefop || [http://www.cedefop.europa.eu/]\n"
+"| [[Europäisches Zentrum für die Förderung der Berufsbildung|Cedefop]] || Cedefop || http://www.cedefop.europa.eu/\n"
+"|-\n"
+"| [[Europäische Stiftung zur Verbesserung der Lebens- und Arbeitsbedingungen]] || EUROFOUND || [http://www.eurofound.europa.eu/]\n"
+"|-\n"
@ -110,7 +110,7 @@ public class WikitextImporterTests extends ImporterTest {
+"|}\n";
try {
prepareOptions(0, true, true);
prepareOptions(0, true, true, "https://de.wikipedia.org/wiki/");
parse(input);
} catch (Exception e) {
Assert.fail("Parsing failed", e);
@ -118,16 +118,25 @@ public class WikitextImporterTests extends ImporterTest {
Assert.assertEquals(project.columnModel.columns.size(), 3);
Assert.assertEquals(project.rows.size(), 3);
Assert.assertEquals(project.rows.get(0).cells.size(), 3);
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "Cedefop");
// Reconciled cells
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "Cedefop");
Assert.assertEquals(project.rows.get(0).cells.get(1).recon, null);
Assert.assertEquals(project.rows.get(2).cells.get(0).value, "Europäische Beobachtungsstelle für Drogen und Drogensucht");
Assert.assertEquals(project.rows.get(2).cells.get(0).recon.getBestCandidate().id, "Q1377256");
// various ways to input external links
Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://www.eurofound.europa.eu/");
Assert.assertEquals(project.rows.get(2).cells.get(2).value, "europa.eu");
Assert.assertEquals(project.rows.get(2).cells.get(2).value, "http://www.emcdda.europa.eu/");
// Assert.assertEquals(project.rows.get(0).cells.get(2).value, "http://www.cedefop.europa.eu/");
// unfortunately the above does not seem to be supported by the parser (parsed as blank instead)
}
@Test
public void readStyledTableWithHeader() {
// Data credits: Wikipedia contributors, https://de.wikipedia.org/w/index.php?title=Agenturen_der_Europäischen_Union&action=edit
String input = "\n"
+"==Agenturen==\n"
+"{| class=\"wikitable sortable\"\n"
+"! style=\"text-align:left; width: 60em\" | Offizieller Name\n"
+"! style=\"text-align:left; width: 9em\" | Abkürzung\n"
@ -137,27 +146,27 @@ public class WikitextImporterTests extends ImporterTest {
+"! style=\"text-align:left; width: 6em\" | Gründung\n"
+"! style=\"text-align:left; width: 50em\" | Anmerkungen\n"
+"|-\n"
+"| [[Europäisches Zentrum für die Förderung der Berufsbildung]] || Cedefop || [http://www.cedefop.europa.eu/] || [[Thessaloniki]] || {{Griechenland}} || 1975 ||\n"
+"| [[Europäisches Zentrum für die Förderung der Berufsbildung]] || '''Cedefop''' || [http://www.cedefop.europa.eu/] || [[Thessaloniki]] || {{Griechenland}} || 1975 ||\n"
+"|-\n"
+"| [[Europäische Stiftung zur Verbesserung der Lebens- und Arbeitsbedingungen]] || EUROFOUND || [http://www.eurofound.europa.eu/] || [[Dublin]] || {{Irland}} || 1975 ||\n"
+"| [[Europäische Stiftung zur Verbesserung der Lebens- und Arbeitsbedingungen]] || ''EUROFOUND'' || [http://www.eurofound.europa.eu/] || [[Dublin]] || {{Irland}} || 1975 ||\n"
+"|-\n"
+"| [[Europäische Beobachtungsstelle für Drogen und Drogensucht]] || EMCDDA || [http://www.emcdda.europa.eu/] || [[Lissabon]] || {{Portugal}} || 1993 ||\n"
+"|-\n"
+"|}\n";
try {
prepareOptions(-1, true, true);
prepareOptions(-1, true, true, null);
parse(input);
} catch (Exception e) {
Assert.fail("Parsing failed", e);
}
Assert.assertEquals(project.columnModel.columns.size(), 7);
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "Europäisches Zentrum für die Förderung der Berufsbildung");
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "Cedefop");
Assert.assertEquals(project.rows.get(1).cells.get(1).value, "EUROFOUND");
Assert.assertEquals(project.columnModel.columns.get(0).getName(), "Offizieller Name");
Assert.assertEquals(project.columnModel.columns.get(6).getName(), "Anmerkungen");
Assert.assertEquals(project.rows.get(0).cells.size(), 7);
Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://www.eurofound.europa.eu/");
}
@Test
@ -179,7 +188,7 @@ public class WikitextImporterTests extends ImporterTest {
+"|}\n";
try {
prepareOptions(-1, true, true);
prepareOptions(-1, true, true, null);
parse(input);
} catch (Exception e) {
Assert.fail("Parsing failed", e);
@ -197,12 +206,13 @@ public class WikitextImporterTests extends ImporterTest {
private void prepareOptions(
int limit, boolean blankSpanningCells,
boolean guessValueType) {
boolean guessValueType, String wikiUrl) {
whenGetIntegerOption("limit", options, limit);
whenGetBooleanOption("guessCellValueTypes", options, guessValueType);
whenGetBooleanOption("blankSpanningCells", options, blankSpanningCells);
whenGetBooleanOption("storeBlankCellsAsNulls", options, true);
whenGetStringOption("wikiUrl", options, wikiUrl);
whenGetIntegerOption("headerLines", options, 1);
whenGetStringOption("reconService", options, "https://tools.wmflabs.org/openrefine-wikidata/en/api");
}

View File

@ -377,7 +377,8 @@ function init() {
"styles/views/extend-data-preview-dialog.less",
"styles/index/fixed-width-parser-ui.less",
"styles/index/xml-parser-ui.less",
"styles/index/json-parser-ui.less"
"styles/index/json-parser-ui.less",
"styles/index/wikitext-parser-ui.less",
]
);

View File

@ -115,6 +115,7 @@
"store-nulls": "Store blank cells as nulls",
"blank-spanning-cells": "Pad cells spanning over multiple rows or columns with nulls",
"wiki-base-url": "Reconcile to wiki with base URL:",
"invalid-wikitext": "No table could be parsed. Are you sure this is a valid wiki table?",
"store-source": "Store file source <br/>(file names, URLs)<br/>in each row",
"preserve-empty": "Preserve empty strings",
"trim": "Trim leading &amp; trailing whitespace from strings",

View File

@ -186,8 +186,13 @@ Refine.WikitextParserUI.prototype._updatePreview = function() {
if (result.status == "ok") {
self._controller.getPreviewData(function(projectData) {
self._progressContainer.hide();
new Refine.PreviewTable(projectData, self._dataContainer.unbind().empty());
var container = self._dataContainer.unbind().empty();
if (projectData.rowModel.rows.length === 0) {
$('<div>').addClass("wikitext-parser-ui-message")
.text($.i18n._('core-index-parser')["invalid-wikitext"]).appendTo(container);
} else {
new Refine.PreviewTable(projectData, container);
}
});
}
});

View File

@ -0,0 +1,42 @@
/*
Copyright 2011, Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
@import-less url("../theme.less");
/* Box for a message shown in the wikitext parser UI: grey text on a light grey
   background, in larger type. */
.wikitext-parser-ui-message {
background: #eee;
font-size: 150%;
color: #666;
padding: 20px;
}