diff --git a/main/src/com/google/refine/importers/WikitextImporter.java b/main/src/com/google/refine/importers/WikitextImporter.java index c36774c01..1e3c63a34 100644 --- a/main/src/com/google/refine/importers/WikitextImporter.java +++ b/main/src/com/google/refine/importers/WikitextImporter.java @@ -15,7 +15,10 @@ import de.fau.cs.osr.ptk.common.AstVisitor; import org.sweble.wikitext.parser.ParserConfig; import org.sweble.wikitext.parser.utils.SimpleParserConfig; import org.sweble.wikitext.parser.WikitextParser; +import org.sweble.wikitext.parser.nodes.WtBold; +import org.sweble.wikitext.parser.nodes.WtItalics; import org.sweble.wikitext.parser.nodes.WtNode; +import org.sweble.wikitext.parser.nodes.WtSection; import org.sweble.wikitext.parser.nodes.WtText; import org.sweble.wikitext.parser.nodes.WtInternalLink; import org.sweble.wikitext.parser.nodes.WtExternalLink; @@ -156,18 +159,14 @@ public class WikitextImporter extends TabularImportingParserBase { return super.before(node); } + /* Default handler */ + public void visit(WtNode e) { // Ignore other nodes // System.out.println(e.getNodeName()); } - public void visit(WtParsedWikitextPage e) { - iterate(e); - } - - public void visit(WtBody e) { - iterate(e); - } + /* Table handling */ public void visit(WtTable e) { iterate(e); @@ -239,6 +238,28 @@ public class WikitextImporter extends TabularImportingParserBase { } } + public String renderCellAsString(WtNode e) { + cellStringBuilder = new StringBuilder(); + iterate(e); + String value = cellStringBuilder.toString(); + if (value == null) { + value = ""; + } + value = value.trim(); + cellStringBuilder = null; + return value; + } + + public void visit(WtText text) { + if (xmlAttrStringBuilder != null) { + xmlAttrStringBuilder.append(text.getContent()); + } else if (cellStringBuilder != null) { + cellStringBuilder.append(text.getContent()); + } + } + + /* Spanning cell helpers */ + private SpanningCell spanningCell() { return spanningCells.get(spanningCellIdx); } @@ -269,6 +290,8 @@ public class WikitextImporter extends TabularImportingParserBase { } } + /* XML attributes : useful for colspan and rowspan */ + public void visit(WtXmlAttributes e) { iterate(e); } @@ -299,39 +322,9 @@ public class WikitextImporter extends TabularImportingParserBase { iterate(e); } - public String renderCellAsString(WtNode e) { - cellStringBuilder = new StringBuilder(); - iterate(e); - String value = cellStringBuilder.toString(); - if (value == null) { - value = ""; - } - value = value.trim(); - cellStringBuilder = null; - return value; - } + /* Link management */ - public void visit(WtText text) { - if (xmlAttrStringBuilder != null) { - xmlAttrStringBuilder.append(text.getContent()); - } else if (cellStringBuilder != null) { - cellStringBuilder.append(text.getContent()); - } - } - - public void visit(WtNoLinkTitle e) { - if (currentInternalLink != null) { - cellStringBuilder.append(currentInternalLink); - } else if (currentExternalLink != null) { - cellStringBuilder.append(currentExternalLink); - } - } - - public void visit(WtLinkTitle e) { - iterate(e); - } - public void visit(WtInternalLink e) { currentInternalLink = e.getTarget().getAsString(); internalLinksInCell.add(currentInternalLink); @@ -341,9 +334,60 @@ public class WikitextImporter extends TabularImportingParserBase { public void visit(WtExternalLink e) { WtUrl url = e.getTarget(); - currentExternalLink = url.getProtocol() + ":" + url.getPath(); + String externalLink = url.getProtocol() + ":" + url.getPath(); + if (cellStringBuilder != null) { + if(rowId >= 0) { + // We are inside the table: all hyperlinks + // should be converted to their URLs regardless of + // their label. + cellStringBuilder.append(externalLink); + } else { + // We are in the header: keep the labels instead + currentExternalLink = externalLink; + iterate(e); + currentExternalLink = null; + } + } + } + + public void visit(WtNoLinkTitle e) { + if (cellStringBuilder != null) { + if (currentInternalLink != null) { + cellStringBuilder.append(currentInternalLink); + } else if (currentExternalLink != null) { + cellStringBuilder.append(currentExternalLink); + } + } + } + + public void visit(WtLinkTitle e) { + iterate(e); + } + + public void visit(WtUrl e) { + // already handled, in WtExternalLink, added here for clarity + } + + /* Content blocks */ + + public void visit(WtParsedWikitextPage e) { + iterate(e); + } + + public void visit(WtSection e) { + iterate(e); + } + + public void visit(WtBody e) { + iterate(e); + } + + public void visit(WtItalics e) { + iterate(e); + } + + public void visit(WtBold e) { iterate(e); - currentExternalLink = null; } @Override @@ -402,9 +446,11 @@ public class WikitextImporter extends TabularImportingParserBase { List recons = new ArrayList(rowSize); for (int j = 0; j < rowSize; j++) { recons.add(null); + if (i == 0) + columnReconciled.add(false); } reconList.add(recons); - columnReconciled.add(false); + } int batchSize = 50; diff --git a/main/tests/server/src/com/google/refine/tests/importers/WikitextImporterTests.java b/main/tests/server/src/com/google/refine/tests/importers/WikitextImporterTests.java index 82b5ff7b0..9515a1ab0 100644 --- a/main/tests/server/src/com/google/refine/tests/importers/WikitextImporterTests.java +++ b/main/tests/server/src/com/google/refine/tests/importers/WikitextImporterTests.java @@ -49,7 +49,7 @@ import org.testng.annotations.Test; import com.google.refine.importers.WikitextImporter; public class WikitextImporterTests extends ImporterTest { - //System Under Test + private WikitextImporter importer = null; @Override @@ -83,7 +83,7 @@ public class WikitextImporterTests extends ImporterTest { + "|-\n" + "|}\n"; try { - prepareOptions(0, true, true); + prepareOptions(0, true, true, null); parse(input); } catch (Exception e) { Assert.fail("Parsing failed", e); @@ -101,7 +101,7 @@ public class WikitextImporterTests extends ImporterTest { String input = "\n" +"{|\n" +"|-\n" - +"| [[Europäisches Zentrum für die Förderung der Berufsbildung|Cedefop]] || Cedefop || [http://www.cedefop.europa.eu/]\n" + +"| [[Europäisches Zentrum für die Förderung der Berufsbildung|Cedefop]] || Cedefop || http://www.cedefop.europa.eu/\n" +"|-\n" +"| [[Europäische Stiftung zur Verbesserung der Lebens- und Arbeitsbedingungen]] || EUROFOUND || [http://www.eurofound.europa.eu/]\n" +"|-\n" @@ -110,7 +110,7 @@ public class WikitextImporterTests extends ImporterTest { +"|}\n"; try { - prepareOptions(0, true, true); + prepareOptions(0, true, true, "https://de.wikipedia.org/wiki/"); parse(input); } catch (Exception e) { Assert.fail("Parsing failed", e); @@ -118,16 +118,25 @@ public class WikitextImporterTests extends ImporterTest { Assert.assertEquals(project.columnModel.columns.size(), 3); Assert.assertEquals(project.rows.size(), 3); Assert.assertEquals(project.rows.get(0).cells.size(), 3); - Assert.assertEquals(project.rows.get(0).cells.get(0).value, "Cedefop"); + + // Reconciled cells + Assert.assertEquals(project.rows.get(0).cells.get(1).value, "Cedefop"); + Assert.assertEquals(project.rows.get(0).cells.get(1).recon, null); Assert.assertEquals(project.rows.get(2).cells.get(0).value, "Europäische Beobachtungsstelle für Drogen und Drogensucht"); + Assert.assertEquals(project.rows.get(2).cells.get(0).recon.getBestCandidate().id, "Q1377256"); + + // various ways to input external links Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://www.eurofound.europa.eu/"); - Assert.assertEquals(project.rows.get(2).cells.get(2).value, "europa.eu"); + Assert.assertEquals(project.rows.get(2).cells.get(2).value, "http://www.emcdda.europa.eu/"); + // Assert.assertEquals(project.rows.get(0).cells.get(2).value, "http://www.cedefop.europa.eu/"); + // unfortunately the above does not seem to be supported by the parser (parsed as blank instead) } @Test public void readStyledTableWithHeader() { // Data credits: Wikipedia contributors, https://de.wikipedia.org/w/index.php?title=Agenturen_der_Europäischen_Union&action=edit String input = "\n" + +"==Agenturen==\n" +"{| class=\"wikitable sortable\"\n" +"! style=\"text-align:left; width: 60em\" | Offizieller Name\n" +"! style=\"text-align:left; width: 9em\" | Abkürzung\n" @@ -137,27 +146,27 @@ public class WikitextImporterTests extends ImporterTest { +"! style=\"text-align:left; width: 6em\" | Gründung\n" +"! style=\"text-align:left; width: 50em\" | Anmerkungen\n" +"|-\n" - +"| [[Europäisches Zentrum für die Förderung der Berufsbildung]] || Cedefop || [http://www.cedefop.europa.eu/] || [[Thessaloniki]] || {{Griechenland}} || 1975 ||\n" + +"| [[Europäisches Zentrum für die Förderung der Berufsbildung]] || '''Cedefop''' || [http://www.cedefop.europa.eu/] || [[Thessaloniki]] || {{Griechenland}} || 1975 ||\n" +"|-\n" - +"| [[Europäische Stiftung zur Verbesserung der Lebens- und Arbeitsbedingungen]] || EUROFOUND || [http://www.eurofound.europa.eu/] || [[Dublin]] || {{Irland}} || 1975 ||\n" + +"| [[Europäische Stiftung zur Verbesserung der Lebens- und Arbeitsbedingungen]] || ''EUROFOUND'' || [http://www.eurofound.europa.eu/] || [[Dublin]] || {{Irland}} || 1975 ||\n" +"|-\n" +"| [[Europäische Beobachtungsstelle für Drogen und Drogensucht]] || EMCDDA || [http://www.emcdda.europa.eu/] || [[Lissabon]] || {{Portugal}} || 1993 ||\n" +"|-\n" +"|}\n"; try { - prepareOptions(-1, true, true); + prepareOptions(-1, true, true, null); parse(input); } catch (Exception e) { Assert.fail("Parsing failed", e); } Assert.assertEquals(project.columnModel.columns.size(), 7); Assert.assertEquals(project.rows.get(0).cells.get(0).value, "Europäisches Zentrum für die Förderung der Berufsbildung"); + Assert.assertEquals(project.rows.get(0).cells.get(1).value, "Cedefop"); + Assert.assertEquals(project.rows.get(1).cells.get(1).value, "EUROFOUND"); Assert.assertEquals(project.columnModel.columns.get(0).getName(), "Offizieller Name"); Assert.assertEquals(project.columnModel.columns.get(6).getName(), "Anmerkungen"); - Assert.assertEquals(project.rows.get(0).cells.size(), 7); - - Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://www.eurofound.europa.eu/"); + Assert.assertEquals(project.rows.get(0).cells.size(), 7); } @Test @@ -179,7 +188,7 @@ public class WikitextImporterTests extends ImporterTest { +"|}\n"; try { - prepareOptions(-1, true, true); + prepareOptions(-1, true, true, null); parse(input); } catch (Exception e) { Assert.fail("Parsing failed", e); @@ -197,12 +206,13 @@ public class WikitextImporterTests extends ImporterTest { private void prepareOptions( int limit, boolean blankSpanningCells, - boolean guessValueType) { + boolean guessValueType, String wikiUrl) { whenGetIntegerOption("limit", options, limit); whenGetBooleanOption("guessCellValueTypes", options, guessValueType); whenGetBooleanOption("blankSpanningCells", options, blankSpanningCells); whenGetBooleanOption("storeBlankCellsAsNulls", options, true); + whenGetStringOption("wikiUrl", options, wikiUrl); whenGetIntegerOption("headerLines", options, 1); whenGetStringOption("reconService", options, "https://tools.wmflabs.org/openrefine-wikidata/en/api"); } diff --git a/main/webapp/modules/core/MOD-INF/controller.js b/main/webapp/modules/core/MOD-INF/controller.js index ca9d45c0f..e1120e992 100644 --- a/main/webapp/modules/core/MOD-INF/controller.js +++ b/main/webapp/modules/core/MOD-INF/controller.js @@ -377,7 +377,8 @@ function init() { "styles/views/extend-data-preview-dialog.less", "styles/index/fixed-width-parser-ui.less", "styles/index/xml-parser-ui.less", - "styles/index/json-parser-ui.less" + "styles/index/json-parser-ui.less", + "styles/index/wikitext-parser-ui.less", ] ); diff --git a/main/webapp/modules/core/langs/translation-en.json b/main/webapp/modules/core/langs/translation-en.json index b71bdad62..7c5f13ed7 100644 --- a/main/webapp/modules/core/langs/translation-en.json +++ b/main/webapp/modules/core/langs/translation-en.json @@ -115,6 +115,7 @@ "store-nulls": "Store blank cells as nulls", "blank-spanning-cells": "Pad cells spanning over multiple rows or columns with nulls", "wiki-base-url": "Reconcile to wiki with base URL:", + "invalid-wikitext": "No table could be parsed. Are you sure this is a valid wiki table?", "store-source": "Store file source
(file names, URLs)
in each row", "preserve-empty": "Preserve empty strings", "trim": "Trim leading & trailing whitespace from strings", diff --git a/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.js b/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.js index 902845717..105096da9 100644 --- a/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.js +++ b/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.js @@ -186,8 +186,13 @@ Refine.WikitextParserUI.prototype._updatePreview = function() { if (result.status == "ok") { self._controller.getPreviewData(function(projectData) { self._progressContainer.hide(); - - new Refine.PreviewTable(projectData, self._dataContainer.unbind().empty()); + var container = self._dataContainer.unbind().empty(); + if (projectData.rowModel.rows.length === 0) { + $('
').addClass("wikitext-parser-ui-message") + .text($.i18n._('core-index-parser')["invalid-wikitext"]).appendTo(container); + } else { + new Refine.PreviewTable(projectData, container); + } }); } }); diff --git a/main/webapp/modules/core/styles/index/wikitext-parser-ui.less b/main/webapp/modules/core/styles/index/wikitext-parser-ui.less new file mode 100644 index 000000000..03fb3ebf6 --- /dev/null +++ b/main/webapp/modules/core/styles/index/wikitext-parser-ui.less @@ -0,0 +1,42 @@ +/* + +Copyright 2011, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +@import-less url("../theme.less"); + +.wikitext-parser-ui-message { + background: #eee; + font-size: 150%; + color: #666; + padding: 20px; + } +