Better error reporting and testing for Wikitext import
This commit is contained in:
parent
e47fb3f2a6
commit
637e69db9d
@ -15,7 +15,10 @@ import de.fau.cs.osr.ptk.common.AstVisitor;
|
||||
import org.sweble.wikitext.parser.ParserConfig;
|
||||
import org.sweble.wikitext.parser.utils.SimpleParserConfig;
|
||||
import org.sweble.wikitext.parser.WikitextParser;
|
||||
import org.sweble.wikitext.parser.nodes.WtBold;
|
||||
import org.sweble.wikitext.parser.nodes.WtItalics;
|
||||
import org.sweble.wikitext.parser.nodes.WtNode;
|
||||
import org.sweble.wikitext.parser.nodes.WtSection;
|
||||
import org.sweble.wikitext.parser.nodes.WtText;
|
||||
import org.sweble.wikitext.parser.nodes.WtInternalLink;
|
||||
import org.sweble.wikitext.parser.nodes.WtExternalLink;
|
||||
@ -156,18 +159,14 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
return super.before(node);
|
||||
}
|
||||
|
||||
/* Default handler */
|
||||
|
||||
public void visit(WtNode e) {
|
||||
// Ignore other nodes
|
||||
// System.out.println(e.getNodeName());
|
||||
}
|
||||
|
||||
public void visit(WtParsedWikitextPage e) {
|
||||
iterate(e);
|
||||
}
|
||||
|
||||
public void visit(WtBody e) {
|
||||
iterate(e);
|
||||
}
|
||||
/* Table handling */
|
||||
|
||||
public void visit(WtTable e) {
|
||||
iterate(e);
|
||||
@ -239,6 +238,28 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
}
|
||||
}
|
||||
|
||||
public String renderCellAsString(WtNode e) {
|
||||
cellStringBuilder = new StringBuilder();
|
||||
iterate(e);
|
||||
String value = cellStringBuilder.toString();
|
||||
if (value == null) {
|
||||
value = "";
|
||||
}
|
||||
value = value.trim();
|
||||
cellStringBuilder = null;
|
||||
return value;
|
||||
}
|
||||
|
||||
public void visit(WtText text) {
|
||||
if (xmlAttrStringBuilder != null) {
|
||||
xmlAttrStringBuilder.append(text.getContent());
|
||||
} else if (cellStringBuilder != null) {
|
||||
cellStringBuilder.append(text.getContent());
|
||||
}
|
||||
}
|
||||
|
||||
/* Spanning cell helpers */
|
||||
|
||||
private SpanningCell spanningCell() {
|
||||
return spanningCells.get(spanningCellIdx);
|
||||
}
|
||||
@ -269,6 +290,8 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
}
|
||||
}
|
||||
|
||||
/* XML attributes : useful for colspan and rowspan */
|
||||
|
||||
public void visit(WtXmlAttributes e) {
|
||||
iterate(e);
|
||||
}
|
||||
@ -299,39 +322,9 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
iterate(e);
|
||||
}
|
||||
|
||||
public String renderCellAsString(WtNode e) {
|
||||
cellStringBuilder = new StringBuilder();
|
||||
iterate(e);
|
||||
String value = cellStringBuilder.toString();
|
||||
if (value == null) {
|
||||
value = "";
|
||||
}
|
||||
value = value.trim();
|
||||
cellStringBuilder = null;
|
||||
return value;
|
||||
}
|
||||
/* Link management */
|
||||
|
||||
|
||||
public void visit(WtText text) {
|
||||
if (xmlAttrStringBuilder != null) {
|
||||
xmlAttrStringBuilder.append(text.getContent());
|
||||
} else if (cellStringBuilder != null) {
|
||||
cellStringBuilder.append(text.getContent());
|
||||
}
|
||||
}
|
||||
|
||||
public void visit(WtNoLinkTitle e) {
|
||||
if (currentInternalLink != null) {
|
||||
cellStringBuilder.append(currentInternalLink);
|
||||
} else if (currentExternalLink != null) {
|
||||
cellStringBuilder.append(currentExternalLink);
|
||||
}
|
||||
}
|
||||
|
||||
public void visit(WtLinkTitle e) {
|
||||
iterate(e);
|
||||
}
|
||||
|
||||
public void visit(WtInternalLink e) {
|
||||
currentInternalLink = e.getTarget().getAsString();
|
||||
internalLinksInCell.add(currentInternalLink);
|
||||
@ -341,9 +334,60 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
|
||||
public void visit(WtExternalLink e) {
|
||||
WtUrl url = e.getTarget();
|
||||
currentExternalLink = url.getProtocol() + ":" + url.getPath();
|
||||
String externalLink = url.getProtocol() + ":" + url.getPath();
|
||||
if (cellStringBuilder != null) {
|
||||
if(rowId >= 0) {
|
||||
// We are inside the table: all hyperlinks
|
||||
// should be converted to their URLs regardless of
|
||||
// their label.
|
||||
cellStringBuilder.append(externalLink);
|
||||
} else {
|
||||
// We are in the header: keep the labels instead
|
||||
currentExternalLink = externalLink;
|
||||
iterate(e);
|
||||
currentExternalLink = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void visit(WtNoLinkTitle e) {
|
||||
if (cellStringBuilder != null) {
|
||||
if (currentInternalLink != null) {
|
||||
cellStringBuilder.append(currentInternalLink);
|
||||
} else if (currentExternalLink != null) {
|
||||
cellStringBuilder.append(currentExternalLink);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void visit(WtLinkTitle e) {
|
||||
iterate(e);
|
||||
}
|
||||
|
||||
public void visit(WtUrl e) {
|
||||
// already handled, in WtExternalLink, added here for clarity
|
||||
}
|
||||
|
||||
/* Content blocks */
|
||||
|
||||
public void visit(WtParsedWikitextPage e) {
|
||||
iterate(e);
|
||||
}
|
||||
|
||||
public void visit(WtSection e) {
|
||||
iterate(e);
|
||||
}
|
||||
|
||||
public void visit(WtBody e) {
|
||||
iterate(e);
|
||||
}
|
||||
|
||||
public void visit(WtItalics e) {
|
||||
iterate(e);
|
||||
}
|
||||
|
||||
public void visit(WtBold e) {
|
||||
iterate(e);
|
||||
currentExternalLink = null;
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -402,9 +446,11 @@ public class WikitextImporter extends TabularImportingParserBase {
|
||||
List<Recon> recons = new ArrayList<Recon>(rowSize);
|
||||
for (int j = 0; j < rowSize; j++) {
|
||||
recons.add(null);
|
||||
if (i == 0)
|
||||
columnReconciled.add(false);
|
||||
}
|
||||
reconList.add(recons);
|
||||
columnReconciled.add(false);
|
||||
|
||||
}
|
||||
|
||||
int batchSize = 50;
|
||||
|
@ -49,7 +49,7 @@ import org.testng.annotations.Test;
|
||||
import com.google.refine.importers.WikitextImporter;
|
||||
|
||||
public class WikitextImporterTests extends ImporterTest {
|
||||
//System Under Test
|
||||
|
||||
private WikitextImporter importer = null;
|
||||
|
||||
@Override
|
||||
@ -83,7 +83,7 @@ public class WikitextImporterTests extends ImporterTest {
|
||||
+ "|-\n"
|
||||
+ "|}\n";
|
||||
try {
|
||||
prepareOptions(0, true, true);
|
||||
prepareOptions(0, true, true, null);
|
||||
parse(input);
|
||||
} catch (Exception e) {
|
||||
Assert.fail("Parsing failed", e);
|
||||
@ -101,7 +101,7 @@ public class WikitextImporterTests extends ImporterTest {
|
||||
String input = "\n"
|
||||
+"{|\n"
|
||||
+"|-\n"
|
||||
+"| [[Europäisches Zentrum für die Förderung der Berufsbildung|Cedefop]] || Cedefop || [http://www.cedefop.europa.eu/]\n"
|
||||
+"| [[Europäisches Zentrum für die Förderung der Berufsbildung|Cedefop]] || Cedefop || http://www.cedefop.europa.eu/\n"
|
||||
+"|-\n"
|
||||
+"| [[Europäische Stiftung zur Verbesserung der Lebens- und Arbeitsbedingungen]] || EUROFOUND || [http://www.eurofound.europa.eu/]\n"
|
||||
+"|-\n"
|
||||
@ -110,7 +110,7 @@ public class WikitextImporterTests extends ImporterTest {
|
||||
+"|}\n";
|
||||
|
||||
try {
|
||||
prepareOptions(0, true, true);
|
||||
prepareOptions(0, true, true, "https://de.wikipedia.org/wiki/");
|
||||
parse(input);
|
||||
} catch (Exception e) {
|
||||
Assert.fail("Parsing failed", e);
|
||||
@ -118,16 +118,25 @@ public class WikitextImporterTests extends ImporterTest {
|
||||
Assert.assertEquals(project.columnModel.columns.size(), 3);
|
||||
Assert.assertEquals(project.rows.size(), 3);
|
||||
Assert.assertEquals(project.rows.get(0).cells.size(), 3);
|
||||
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "Cedefop");
|
||||
|
||||
// Reconciled cells
|
||||
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "Cedefop");
|
||||
Assert.assertEquals(project.rows.get(0).cells.get(1).recon, null);
|
||||
Assert.assertEquals(project.rows.get(2).cells.get(0).value, "Europäische Beobachtungsstelle für Drogen und Drogensucht");
|
||||
Assert.assertEquals(project.rows.get(2).cells.get(0).recon.getBestCandidate().id, "Q1377256");
|
||||
|
||||
// various ways to input external links
|
||||
Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://www.eurofound.europa.eu/");
|
||||
Assert.assertEquals(project.rows.get(2).cells.get(2).value, "europa.eu");
|
||||
Assert.assertEquals(project.rows.get(2).cells.get(2).value, "http://www.emcdda.europa.eu/");
|
||||
// Assert.assertEquals(project.rows.get(0).cells.get(2).value, "http://www.cedefop.europa.eu/");
|
||||
// unfortunately the above does not seem to be supported by the parser (parsed as blank instead)
|
||||
}
|
||||
|
||||
@Test
|
||||
public void readStyledTableWithHeader() {
|
||||
// Data credits: Wikipedia contributors, https://de.wikipedia.org/w/index.php?title=Agenturen_der_Europäischen_Union&action=edit
|
||||
String input = "\n"
|
||||
+"==Agenturen==\n"
|
||||
+"{| class=\"wikitable sortable\"\n"
|
||||
+"! style=\"text-align:left; width: 60em\" | Offizieller Name\n"
|
||||
+"! style=\"text-align:left; width: 9em\" | Abkürzung\n"
|
||||
@ -137,27 +146,27 @@ public class WikitextImporterTests extends ImporterTest {
|
||||
+"! style=\"text-align:left; width: 6em\" | Gründung\n"
|
||||
+"! style=\"text-align:left; width: 50em\" | Anmerkungen\n"
|
||||
+"|-\n"
|
||||
+"| [[Europäisches Zentrum für die Förderung der Berufsbildung]] || Cedefop || [http://www.cedefop.europa.eu/] || [[Thessaloniki]] || {{Griechenland}} || 1975 ||\n"
|
||||
+"| [[Europäisches Zentrum für die Förderung der Berufsbildung]] || '''Cedefop''' || [http://www.cedefop.europa.eu/] || [[Thessaloniki]] || {{Griechenland}} || 1975 ||\n"
|
||||
+"|-\n"
|
||||
+"| [[Europäische Stiftung zur Verbesserung der Lebens- und Arbeitsbedingungen]] || EUROFOUND || [http://www.eurofound.europa.eu/] || [[Dublin]] || {{Irland}} || 1975 ||\n"
|
||||
+"| [[Europäische Stiftung zur Verbesserung der Lebens- und Arbeitsbedingungen]] || ''EUROFOUND'' || [http://www.eurofound.europa.eu/] || [[Dublin]] || {{Irland}} || 1975 ||\n"
|
||||
+"|-\n"
|
||||
+"| [[Europäische Beobachtungsstelle für Drogen und Drogensucht]] || EMCDDA || [http://www.emcdda.europa.eu/] || [[Lissabon]] || {{Portugal}} || 1993 ||\n"
|
||||
+"|-\n"
|
||||
+"|}\n";
|
||||
|
||||
try {
|
||||
prepareOptions(-1, true, true);
|
||||
prepareOptions(-1, true, true, null);
|
||||
parse(input);
|
||||
} catch (Exception e) {
|
||||
Assert.fail("Parsing failed", e);
|
||||
}
|
||||
Assert.assertEquals(project.columnModel.columns.size(), 7);
|
||||
Assert.assertEquals(project.rows.get(0).cells.get(0).value, "Europäisches Zentrum für die Förderung der Berufsbildung");
|
||||
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "Cedefop");
|
||||
Assert.assertEquals(project.rows.get(1).cells.get(1).value, "EUROFOUND");
|
||||
Assert.assertEquals(project.columnModel.columns.get(0).getName(), "Offizieller Name");
|
||||
Assert.assertEquals(project.columnModel.columns.get(6).getName(), "Anmerkungen");
|
||||
Assert.assertEquals(project.rows.get(0).cells.size(), 7);
|
||||
|
||||
Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://www.eurofound.europa.eu/");
|
||||
}
|
||||
|
||||
@Test
|
||||
@ -179,7 +188,7 @@ public class WikitextImporterTests extends ImporterTest {
|
||||
+"|}\n";
|
||||
|
||||
try {
|
||||
prepareOptions(-1, true, true);
|
||||
prepareOptions(-1, true, true, null);
|
||||
parse(input);
|
||||
} catch (Exception e) {
|
||||
Assert.fail("Parsing failed", e);
|
||||
@ -197,12 +206,13 @@ public class WikitextImporterTests extends ImporterTest {
|
||||
|
||||
private void prepareOptions(
|
||||
int limit, boolean blankSpanningCells,
|
||||
boolean guessValueType) {
|
||||
boolean guessValueType, String wikiUrl) {
|
||||
|
||||
whenGetIntegerOption("limit", options, limit);
|
||||
whenGetBooleanOption("guessCellValueTypes", options, guessValueType);
|
||||
whenGetBooleanOption("blankSpanningCells", options, blankSpanningCells);
|
||||
whenGetBooleanOption("storeBlankCellsAsNulls", options, true);
|
||||
whenGetStringOption("wikiUrl", options, wikiUrl);
|
||||
whenGetIntegerOption("headerLines", options, 1);
|
||||
whenGetStringOption("reconService", options, "https://tools.wmflabs.org/openrefine-wikidata/en/api");
|
||||
}
|
||||
|
@ -377,7 +377,8 @@ function init() {
|
||||
"styles/views/extend-data-preview-dialog.less",
|
||||
"styles/index/fixed-width-parser-ui.less",
|
||||
"styles/index/xml-parser-ui.less",
|
||||
"styles/index/json-parser-ui.less"
|
||||
"styles/index/json-parser-ui.less",
|
||||
"styles/index/wikitext-parser-ui.less",
|
||||
]
|
||||
);
|
||||
|
||||
|
@ -115,6 +115,7 @@
|
||||
"store-nulls": "Store blank cells as nulls",
|
||||
"blank-spanning-cells": "Pad cells spanning over multiple rows or columns with nulls",
|
||||
"wiki-base-url": "Reconcile to wiki with base URL:",
|
||||
"invalid-wikitext": "No table could be parsed. Are you sure this is a valid wiki table?",
|
||||
"store-source": "Store file source <br/>(file names, URLs)<br/>in each row",
|
||||
"preserve-empty": "Preserve empty strings",
|
||||
"trim": "Trim leading & trailing whitespace from strings",
|
||||
|
@ -186,8 +186,13 @@ Refine.WikitextParserUI.prototype._updatePreview = function() {
|
||||
if (result.status == "ok") {
|
||||
self._controller.getPreviewData(function(projectData) {
|
||||
self._progressContainer.hide();
|
||||
|
||||
new Refine.PreviewTable(projectData, self._dataContainer.unbind().empty());
|
||||
var container = self._dataContainer.unbind().empty();
|
||||
if (projectData.rowModel.rows.length === 0) {
|
||||
$('<div>').addClass("wikitext-parser-ui-message")
|
||||
.text($.i18n._('core-index-parser')["invalid-wikitext"]).appendTo(container);
|
||||
} else {
|
||||
new Refine.PreviewTable(projectData, container);
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
|
@ -0,0 +1,42 @@
|
||||
/*
|
||||
|
||||
Copyright 2011, Google Inc.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Google Inc. nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
@import-less url("../theme.less");
|
||||
|
||||
/* Message box placed in the preview area by the wikitext parser UI
   (used there for the "invalid-wikitext" text when the preview has no rows). */
.wikitext-parser-ui-message {
|
||||
background: #eee;
|
||||
font-size: 150%;
|
||||
color: #666;
|
||||
padding: 20px;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user