diff --git a/main/src/com/google/refine/importers/WikitextImporter.java b/main/src/com/google/refine/importers/WikitextImporter.java index f1f202a70..93203aa0f 100644 --- a/main/src/com/google/refine/importers/WikitextImporter.java +++ b/main/src/com/google/refine/importers/WikitextImporter.java @@ -3,7 +3,11 @@ package com.google.refine.importers; import java.io.IOException; import java.io.Reader; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.json.JSONObject; import com.google.common.io.CharStreams; @@ -17,6 +21,8 @@ import org.sweble.wikitext.parser.nodes.WtItalics; import org.sweble.wikitext.parser.nodes.WtNewline; import org.sweble.wikitext.parser.nodes.WtNode; import org.sweble.wikitext.parser.nodes.WtSection; +import org.sweble.wikitext.parser.nodes.WtTagExtension; +import org.sweble.wikitext.parser.nodes.WtTagExtensionBody; import org.sweble.wikitext.parser.nodes.WtTemplate; import org.sweble.wikitext.parser.nodes.WtTemplateArgument; import org.sweble.wikitext.parser.nodes.WtTemplateArguments; @@ -44,7 +50,6 @@ import org.sweble.wikitext.parser.nodes.WtXmlStartTag; import org.sweble.wikitext.parser.WikitextEncodingValidator; import org.sweble.wikitext.parser.WikitextPreprocessor; import org.sweble.wikitext.parser.encval.ValidatedWikitext; -import org.sweble.wikitext.parser.nodes.WtParsedWikitextPage; import org.sweble.wikitext.parser.nodes.WtPreproWikitextPage; import org.sweble.wikitext.parser.parser.PreprocessorToParserTransformer; import org.sweble.wikitext.parser.preprocessor.PreprocessedWikitext; @@ -80,6 +85,7 @@ public class WikitextImporter extends TabularImportingParserBase { JSONUtilities.safePut(options, "blankSpanningCells", true); JSONUtilities.safePut(options, "includeRawTemplates", false); JSONUtilities.safePut(options, "wikiUrl", "https://en.wikipedia.org/wiki/"); + JSONUtilities.safePut(options, "parseReferences", true); return options; } @@ -87,14 +93,16 @@ public class WikitextImporter extends TabularImportingParserBase { private class SpanningCell { public String value; public String reconciled; + public String reference; public int colspan; public int rowspan; public int row; public int col; - SpanningCell(String value, String reconciled, int row, int col, int rowspan, int colspan) { + SpanningCell(String value, String reconciled, String reference, int row, int col, int rowspan, int colspan) { this.value = value; this.reconciled = reconciled; + this.reference = reference; this.row = row; this.col = col; this.rowspan = rowspan; @@ -123,8 +131,12 @@ public class WikitextImporter extends TabularImportingParserBase { public String caption; public List header; public List> rows; + public List> references; public List wikilinkedCells; + private List currentRow; + private List currentRowReferences; + private Map namedReferences; private boolean blankSpanningCells; private boolean includeRawTemplates; @@ -136,28 +148,40 @@ public class WikitextImporter extends TabularImportingParserBase { private String currentXmlAttr; private String currentInternalLink; private String currentExternalLink; + private String lastExternalLink; + private String currentReference; + private String currentReferenceName; private int colspan; private int rowspan; private int spanningCellIdx; private List internalLinksInCell; + private final Pattern urlPattern = Pattern.compile("\\b(https?|ftp)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", + Pattern.CASE_INSENSITIVE); + public WikitextTableVisitor(boolean blankSpanningCells, boolean includeRawTemplates) { this.blankSpanningCells = blankSpanningCells; this.includeRawTemplates = includeRawTemplates; caption = null; header = new ArrayList(); rows = new ArrayList>(); + references = new ArrayList>(); wikilinkedCells = new ArrayList(); spanningCells = new ArrayList(); cellStringBuilder = null; xmlAttrStringBuilder = null; + currentRowReferences = null; currentInternalLink = null; currentExternalLink = null; + lastExternalLink = null; + currentReference = null; + currentReferenceName = null; colspan = 0; rowspan = 0; rowId = -1; spanningCellIdx = 0; internalLinksInCell = new ArrayList(); + namedReferences = new HashMap(); } @Override @@ -169,7 +193,7 @@ public class WikitextImporter extends TabularImportingParserBase { public void visit(WtNode e) { // Ignore other nodes - System.out.println(e.getNodeName()); + // System.out.println(e.getNodeName()); } /* Table handling */ @@ -201,11 +225,13 @@ public class WikitextImporter extends TabularImportingParserBase { rowId = 0; } currentRow = new ArrayList(); + currentRowReferences = new ArrayList(); spanningCellIdx = 0; addSpanningCells(); iterate(e); if(currentRow.size() > 0) { rows.add(currentRow); + references.add(currentRowReferences); rowId++; } currentRow = null; @@ -218,12 +244,16 @@ public class WikitextImporter extends TabularImportingParserBase { rowspan = 1; colspan = 1; internalLinksInCell.clear(); + currentReference = null; + currentReferenceName = null; + String value = renderCellAsString(e); int colId = currentRow.size(); // Add the cell to the row we are currently building currentRow.add(value); + currentRowReferences.add(currentReference); // Reconcile it if we found exactly one link in the cell String reconciled = null; @@ -235,7 +265,8 @@ public class WikitextImporter extends TabularImportingParserBase { // Mark it as spanning if we found the tags if (colspan > 1 || rowspan > 1) { SpanningCell spanningCell = new SpanningCell( - value, reconciled, rowId, colId, rowspan, colspan); + value, reconciled, currentReference, + rowId, colId, rowspan, colspan); spanningCells.add(spanningCellIdx, spanningCell); } @@ -282,11 +313,52 @@ public class WikitextImporter extends TabularImportingParserBase { } } + public void visit(WtTagExtension tag) { + if ("ref".equals(tag.getName())) { + lastExternalLink = null; + currentReferenceName = null; + + iterate(tag); + + // load any reference parsed earlier + if (currentReferenceName != null) { + currentReference = namedReferences.get(currentReferenceName); + } else { + currentReferenceName = ""; + } + // update with any new link found in the body of the reference + if (lastExternalLink != null) { + currentReference = lastExternalLink; + } + + // store the reference for later use + if (currentReference != null && currentReferenceName != "") { + namedReferences.put(currentReferenceName, currentReference); + } + } + } + + public void visit(WtTagExtensionBody body) { + /* + * Here, the content of the tag is not parsed further, it's just a String. + * So we have to resort to string matching. + * https://github.com/sweble/sweble-wikitext/issues/67 + */ + String contents = body.getContent(); + Matcher matcher = urlPattern.matcher(contents); + while(matcher.find()) { + lastExternalLink = contents.substring(matcher.start(), matcher.end()); + } + } + public void writeText(String text) { - if (xmlAttrStringBuilder != null) { - xmlAttrStringBuilder.append(text); - } else if (cellStringBuilder != null) { - cellStringBuilder.append(text); + // do not render text that is inside + if (currentReferenceName == null) { + if (xmlAttrStringBuilder != null) { + xmlAttrStringBuilder.append(text); + } else if (cellStringBuilder != null) { + cellStringBuilder.append(text); + } } } @@ -305,8 +377,10 @@ public class WikitextImporter extends TabularImportingParserBase { while(currentRow.size() < cell.col + cell.colspan) { if (blankSpanningCells) { currentRow.add(null); + currentRowReferences.add(null); } else { currentRow.add(cell.value); + currentRowReferences.add(cell.reference); if (cell.reconciled != null) { wikilinkedCells.add(new WikilinkedCell(cell.reconciled, rowId, currentRow.size()-1)); } @@ -322,7 +396,7 @@ public class WikitextImporter extends TabularImportingParserBase { } } - /* XML attributes : useful for colspan and rowspan */ + /* XML attributes : useful for colspan and rowspan, and reference names */ public void visit(WtXmlAttributes e) { iterate(e); @@ -333,11 +407,12 @@ public class WikitextImporter extends TabularImportingParserBase { xmlAttrStringBuilder = new StringBuilder(); iterate(e); try { - int attrValue = Integer.parseInt(xmlAttrStringBuilder.toString()); if ("colspan".equals(currentXmlAttr)) { - colspan = attrValue; + colspan = Integer.parseInt(xmlAttrStringBuilder.toString()); } else if ("rowspan".equals(currentXmlAttr)) { - rowspan = attrValue; + rowspan = Integer.parseInt(xmlAttrStringBuilder.toString()); + } else if ("name".equals(currentXmlAttr)) { + currentReferenceName = xmlAttrStringBuilder.toString(); } } catch (NumberFormatException _) { } @@ -349,6 +424,7 @@ public class WikitextImporter extends TabularImportingParserBase { public void visit(WtName e) { try { currentXmlAttr = e.getAsString(); + } catch (UnsupportedOperationException _) { currentXmlAttr = null; } @@ -383,6 +459,7 @@ public class WikitextImporter extends TabularImportingParserBase { currentExternalLink = null; } } + lastExternalLink = externalLink; } public void visit(WtNoLinkTitle e) { @@ -406,7 +483,8 @@ public class WikitextImporter extends TabularImportingParserBase { /* Templates */ public void visit(WtTemplate e) { - if (includeRawTemplates) { + // only render templates if we are told to do so or inside a reference + if (includeRawTemplates || currentReferenceName != null) { writeText("{{"+e.getName().getAsString()); WtTemplateArguments args = e.getArgs(); for (int i = 0; i != args.size(); i++) { @@ -418,10 +496,13 @@ public class WikitextImporter extends TabularImportingParserBase { } public void visit(WtTemplateArgument e) { - writeText("|"); - if(e.hasName()) { - writeText(e.getName().getAsString()); - writeText("="); + // do not render templates that are inside a reference + if (currentReferenceName == null) { + writeText("|"); + if(e.hasName()) { + writeText(e.getName().getAsString()); + writeText("="); + } } iterate(e.getValue()); } @@ -460,21 +541,39 @@ public class WikitextImporter extends TabularImportingParserBase { private WikitextTableVisitor visitor = null; private List> reconList = null; private List columnReconciled = null; + private List columnReferenced = null; - public WikiTableDataReader(WikitextTableVisitor visitor) { + public WikiTableDataReader(WikitextTableVisitor visitor, boolean references) { this.visitor = visitor; currentRow = -1; reconList = null; + + if (references) { + // Check which column had references + columnReferenced = new ArrayList(); + for (List row : this.visitor.references) { + for (int i = 0; i != row.size(); i++) { + while (i >= columnReferenced.size()) { + columnReferenced.add(false); + } + if (row.get(i) != null) { + columnReferenced.set(i, true); + } + } + } + } } @Override public List getNextRowOfCells() throws IOException { List row = null; List origRow = null; + List refRow = null; if (currentRow == -1) { origRow = this.visitor.header; } else if(currentRow < this.visitor.rows.size()) { origRow = this.visitor.rows.get(currentRow); + refRow = this.visitor.references.get(currentRow); } if (origRow != null) { @@ -485,6 +584,18 @@ public class WikitextImporter extends TabularImportingParserBase { recon = reconList.get(currentRow).get(i); } row.add(new Cell(origRow.get(i), recon)); + + // if we should add reference colums… + if (columnReferenced != null && columnReferenced.get(i)) { + String refValue = null; + // for headers + if(currentRow == -1) { + refValue = origRow.get(i)+"_ref"; + } else { + refValue = refRow.get(i); + } + row.add(new Cell(refValue, null)); + } } } currentRow++; @@ -577,10 +688,11 @@ public class WikitextImporter extends TabularImportingParserBase { // Compile the retrieved page boolean blankSpanningCells = JSONUtilities.getBoolean(options, "blankSpanningCells", true); boolean includeRawTemplates = JSONUtilities.getBoolean(options, "includeRawTemplates", false); + boolean parseReferences = JSONUtilities.getBoolean(options, "parseReferences", true); final WikitextTableVisitor vs = new WikitextTableVisitor(blankSpanningCells, includeRawTemplates); vs.go(parsedArticle); - WikiTableDataReader dataReader = new WikiTableDataReader(vs); + WikiTableDataReader dataReader = new WikiTableDataReader(vs, parseReferences); // Reconcile if needed String wikiUrl = JSONUtilities.getString(options, "wikiUrl", null); diff --git a/main/tests/server/src/com/google/refine/tests/importers/WikitextImporterTests.java b/main/tests/server/src/com/google/refine/tests/importers/WikitextImporterTests.java index 97853bfc6..ec0173189 100644 --- a/main/tests/server/src/com/google/refine/tests/importers/WikitextImporterTests.java +++ b/main/tests/server/src/com/google/refine/tests/importers/WikitextImporterTests.java @@ -196,6 +196,34 @@ public class WikitextImporterTests extends ImporterTest { Assert.assertNull(project.rows.get(1).cells.get(3).value); Assert.assertEquals(project.rows.get(1).cells.get(4).value, "Butter"); } + + @Test + public void readTableWithReferences() { + // inspired from https://www.mediawiki.org/wiki/Help:Tables + String input = "{|\n" + +"! price\n" + +"! fruit\n" + +"! merchant\n" + +"|-\n" + +"| a || b See [http://gnu.org here] || c or http://microsoft.com/ \n" + +"|-\n" + +"| d || e || f \n" + +"|-\n" + +"|}\n"; + + try { + prepareOptions(-1, true, true, null); + parse(input); + } catch (Exception e) { + Assert.fail("Parsing failed", e); + } + Assert.assertEquals(project.columnModel.columns.size(), 5); + Assert.assertEquals(project.rows.get(0).cells.get(1).value, "b"); + Assert.assertEquals(project.rows.get(0).cells.get(2).value, "http://gnu.org"); + Assert.assertEquals(project.rows.get(0).cells.get(4).value, "http://microsoft.com/"); + Assert.assertEquals(project.rows.get(1).cells.get(4).value, "http://gnu.org"); + Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://microsoft.com/"); + } //--helpers-- private void parse(String wikitext) { @@ -210,6 +238,7 @@ public class WikitextImporterTests extends ImporterTest { whenGetBooleanOption("guessCellValueTypes", options, guessValueType); whenGetBooleanOption("blankSpanningCells", options, blankSpanningCells); whenGetBooleanOption("storeBlankCellsAsNulls", options, true); + whenGetBooleanOption("parseReferences", options, true); whenGetStringOption("wikiUrl", options, wikiUrl); whenGetIntegerOption("headerLines", options, 1); whenGetStringOption("reconService", options, "https://tools.wmflabs.org/openrefine-wikidata/en/api"); diff --git a/main/webapp/modules/core/langs/translation-en.json b/main/webapp/modules/core/langs/translation-en.json index 8d8448f27..dc0c725dc 100644 --- a/main/webapp/modules/core/langs/translation-en.json +++ b/main/webapp/modules/core/langs/translation-en.json @@ -115,6 +115,7 @@ "store-nulls": "Store blank cells as nulls", "blank-spanning-cells": "Pad cells spanning over multiple rows or columns with nulls", "include-raw-templates": "Include templates as raw wikicode", + "parse-references": "Extract references in additional columns", "wiki-base-url": "Reconcile to wiki with base URL:", "invalid-wikitext": "No table could be parsed. Are you sure this is a valid wiki table?", "store-source": "Store file source
(file names, URLs)
in each row", diff --git a/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.html b/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.html index ff41f7841..5f5477126 100644 --- a/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.html +++ b/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.html @@ -15,6 +15,8 @@ + + diff --git a/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.js b/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.js index 1b30198a7..2a7081d29 100644 --- a/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.js +++ b/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.js @@ -88,6 +88,7 @@ Refine.WikitextParserUI.prototype.getOptions = function() { options.storeBlankRows = this._optionContainerElmts.storeBlankRowsCheckbox[0].checked; options.blankSpanningCells = this._optionContainerElmts.blankSpanningCellsCheckbox[0].checked; options.includeRawTemplates = this._optionContainerElmts.includeRawTemplatesCheckbox[0].checked; + options.parseReferences = this._optionContainerElmts.parseReferencesCheckbox[0].checked; options.guessCellValueTypes = this._optionContainerElmts.guessCellValueTypesCheckbox[0].checked; @@ -115,6 +116,7 @@ Refine.WikitextParserUI.prototype._initialize = function() { $('#or-import-parseCell').html($.i18n._('core-index-parser')["parse-cell"]); $('#or-import-blankSpanningCells').text($.i18n._('core-index-parser')["blank-spanning-cells"]); $('#or-import-includeRawTemplates').text($.i18n._('core-index-parser')["include-raw-templates"]); + $('#or-import-parseReferences').text($.i18n._('core-index-parser')["parse-references"]); $('#or-import-blank').text($.i18n._('core-index-parser')["store-blank"]); $('#or-import-null').text($.i18n._('core-index-parser')["store-nulls"]); $('#or-import-source').html($.i18n._('core-index-parser')["store-source"]); @@ -148,6 +150,10 @@ Refine.WikitextParserUI.prototype._initialize = function() { this._optionContainerElmts.includeRawTemplatesCheckbox.prop("checked", true); } + if (this._config.parseReferences) { + this._optionContainerElmts.parseReferencesCheckbox.prop("checked", true); + } + if (this._config.storeBlankRows) { this._optionContainerElmts.storeBlankRowsCheckbox.prop("checked", true); }