Merge pull request #1254 from OpenRefine/issue1243
Add support for references in Wikitable importer.
This commit is contained in:
commit
49b48f58bb
@ -3,7 +3,11 @@ package com.google.refine.importers;
|
|||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import org.json.JSONObject;
|
import org.json.JSONObject;
|
||||||
import com.google.common.io.CharStreams;
|
import com.google.common.io.CharStreams;
|
||||||
@ -17,6 +21,8 @@ import org.sweble.wikitext.parser.nodes.WtItalics;
|
|||||||
import org.sweble.wikitext.parser.nodes.WtNewline;
|
import org.sweble.wikitext.parser.nodes.WtNewline;
|
||||||
import org.sweble.wikitext.parser.nodes.WtNode;
|
import org.sweble.wikitext.parser.nodes.WtNode;
|
||||||
import org.sweble.wikitext.parser.nodes.WtSection;
|
import org.sweble.wikitext.parser.nodes.WtSection;
|
||||||
|
import org.sweble.wikitext.parser.nodes.WtTagExtension;
|
||||||
|
import org.sweble.wikitext.parser.nodes.WtTagExtensionBody;
|
||||||
import org.sweble.wikitext.parser.nodes.WtTemplate;
|
import org.sweble.wikitext.parser.nodes.WtTemplate;
|
||||||
import org.sweble.wikitext.parser.nodes.WtTemplateArgument;
|
import org.sweble.wikitext.parser.nodes.WtTemplateArgument;
|
||||||
import org.sweble.wikitext.parser.nodes.WtTemplateArguments;
|
import org.sweble.wikitext.parser.nodes.WtTemplateArguments;
|
||||||
@ -44,7 +50,6 @@ import org.sweble.wikitext.parser.nodes.WtXmlStartTag;
|
|||||||
import org.sweble.wikitext.parser.WikitextEncodingValidator;
|
import org.sweble.wikitext.parser.WikitextEncodingValidator;
|
||||||
import org.sweble.wikitext.parser.WikitextPreprocessor;
|
import org.sweble.wikitext.parser.WikitextPreprocessor;
|
||||||
import org.sweble.wikitext.parser.encval.ValidatedWikitext;
|
import org.sweble.wikitext.parser.encval.ValidatedWikitext;
|
||||||
import org.sweble.wikitext.parser.nodes.WtParsedWikitextPage;
|
|
||||||
import org.sweble.wikitext.parser.nodes.WtPreproWikitextPage;
|
import org.sweble.wikitext.parser.nodes.WtPreproWikitextPage;
|
||||||
import org.sweble.wikitext.parser.parser.PreprocessorToParserTransformer;
|
import org.sweble.wikitext.parser.parser.PreprocessorToParserTransformer;
|
||||||
import org.sweble.wikitext.parser.preprocessor.PreprocessedWikitext;
|
import org.sweble.wikitext.parser.preprocessor.PreprocessedWikitext;
|
||||||
@ -80,6 +85,7 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
JSONUtilities.safePut(options, "blankSpanningCells", true);
|
JSONUtilities.safePut(options, "blankSpanningCells", true);
|
||||||
JSONUtilities.safePut(options, "includeRawTemplates", false);
|
JSONUtilities.safePut(options, "includeRawTemplates", false);
|
||||||
JSONUtilities.safePut(options, "wikiUrl", "https://en.wikipedia.org/wiki/");
|
JSONUtilities.safePut(options, "wikiUrl", "https://en.wikipedia.org/wiki/");
|
||||||
|
JSONUtilities.safePut(options, "parseReferences", true);
|
||||||
|
|
||||||
return options;
|
return options;
|
||||||
}
|
}
|
||||||
@ -87,14 +93,16 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
private class SpanningCell {
|
private class SpanningCell {
|
||||||
public String value;
|
public String value;
|
||||||
public String reconciled;
|
public String reconciled;
|
||||||
|
public String reference;
|
||||||
public int colspan;
|
public int colspan;
|
||||||
public int rowspan;
|
public int rowspan;
|
||||||
public int row;
|
public int row;
|
||||||
public int col;
|
public int col;
|
||||||
|
|
||||||
SpanningCell(String value, String reconciled, int row, int col, int rowspan, int colspan) {
|
SpanningCell(String value, String reconciled, String reference, int row, int col, int rowspan, int colspan) {
|
||||||
this.value = value;
|
this.value = value;
|
||||||
this.reconciled = reconciled;
|
this.reconciled = reconciled;
|
||||||
|
this.reference = reference;
|
||||||
this.row = row;
|
this.row = row;
|
||||||
this.col = col;
|
this.col = col;
|
||||||
this.rowspan = rowspan;
|
this.rowspan = rowspan;
|
||||||
@ -123,8 +131,12 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
public String caption;
|
public String caption;
|
||||||
public List<String> header;
|
public List<String> header;
|
||||||
public List<List<String>> rows;
|
public List<List<String>> rows;
|
||||||
|
public List<List<String>> references;
|
||||||
public List<WikilinkedCell> wikilinkedCells;
|
public List<WikilinkedCell> wikilinkedCells;
|
||||||
|
|
||||||
private List<String> currentRow;
|
private List<String> currentRow;
|
||||||
|
private List<String> currentRowReferences;
|
||||||
|
private Map<String, String> namedReferences;
|
||||||
|
|
||||||
private boolean blankSpanningCells;
|
private boolean blankSpanningCells;
|
||||||
private boolean includeRawTemplates;
|
private boolean includeRawTemplates;
|
||||||
@ -136,28 +148,40 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
private String currentXmlAttr;
|
private String currentXmlAttr;
|
||||||
private String currentInternalLink;
|
private String currentInternalLink;
|
||||||
private String currentExternalLink;
|
private String currentExternalLink;
|
||||||
|
private String lastExternalLink;
|
||||||
|
private String currentReference;
|
||||||
|
private String currentReferenceName;
|
||||||
private int colspan;
|
private int colspan;
|
||||||
private int rowspan;
|
private int rowspan;
|
||||||
private int spanningCellIdx;
|
private int spanningCellIdx;
|
||||||
private List<String> internalLinksInCell;
|
private List<String> internalLinksInCell;
|
||||||
|
|
||||||
|
private final Pattern urlPattern = Pattern.compile("\\b(https?|ftp)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]",
|
||||||
|
Pattern.CASE_INSENSITIVE);
|
||||||
|
|
||||||
public WikitextTableVisitor(boolean blankSpanningCells, boolean includeRawTemplates) {
|
public WikitextTableVisitor(boolean blankSpanningCells, boolean includeRawTemplates) {
|
||||||
this.blankSpanningCells = blankSpanningCells;
|
this.blankSpanningCells = blankSpanningCells;
|
||||||
this.includeRawTemplates = includeRawTemplates;
|
this.includeRawTemplates = includeRawTemplates;
|
||||||
caption = null;
|
caption = null;
|
||||||
header = new ArrayList<String>();
|
header = new ArrayList<String>();
|
||||||
rows = new ArrayList<List<String>>();
|
rows = new ArrayList<List<String>>();
|
||||||
|
references = new ArrayList<List<String>>();
|
||||||
wikilinkedCells = new ArrayList<WikilinkedCell>();
|
wikilinkedCells = new ArrayList<WikilinkedCell>();
|
||||||
spanningCells = new ArrayList<SpanningCell>();
|
spanningCells = new ArrayList<SpanningCell>();
|
||||||
cellStringBuilder = null;
|
cellStringBuilder = null;
|
||||||
xmlAttrStringBuilder = null;
|
xmlAttrStringBuilder = null;
|
||||||
|
currentRowReferences = null;
|
||||||
currentInternalLink = null;
|
currentInternalLink = null;
|
||||||
currentExternalLink = null;
|
currentExternalLink = null;
|
||||||
|
lastExternalLink = null;
|
||||||
|
currentReference = null;
|
||||||
|
currentReferenceName = null;
|
||||||
colspan = 0;
|
colspan = 0;
|
||||||
rowspan = 0;
|
rowspan = 0;
|
||||||
rowId = -1;
|
rowId = -1;
|
||||||
spanningCellIdx = 0;
|
spanningCellIdx = 0;
|
||||||
internalLinksInCell = new ArrayList<String>();
|
internalLinksInCell = new ArrayList<String>();
|
||||||
|
namedReferences = new HashMap<String, String>();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -169,7 +193,7 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
|
|
||||||
public void visit(WtNode e) {
|
public void visit(WtNode e) {
|
||||||
// Ignore other nodes
|
// Ignore other nodes
|
||||||
System.out.println(e.getNodeName());
|
// System.out.println(e.getNodeName());
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Table handling */
|
/* Table handling */
|
||||||
@ -201,11 +225,13 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
rowId = 0;
|
rowId = 0;
|
||||||
}
|
}
|
||||||
currentRow = new ArrayList<String>();
|
currentRow = new ArrayList<String>();
|
||||||
|
currentRowReferences = new ArrayList<String>();
|
||||||
spanningCellIdx = 0;
|
spanningCellIdx = 0;
|
||||||
addSpanningCells();
|
addSpanningCells();
|
||||||
iterate(e);
|
iterate(e);
|
||||||
if(currentRow.size() > 0) {
|
if(currentRow.size() > 0) {
|
||||||
rows.add(currentRow);
|
rows.add(currentRow);
|
||||||
|
references.add(currentRowReferences);
|
||||||
rowId++;
|
rowId++;
|
||||||
}
|
}
|
||||||
currentRow = null;
|
currentRow = null;
|
||||||
@ -218,12 +244,16 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
rowspan = 1;
|
rowspan = 1;
|
||||||
colspan = 1;
|
colspan = 1;
|
||||||
internalLinksInCell.clear();
|
internalLinksInCell.clear();
|
||||||
|
currentReference = null;
|
||||||
|
currentReferenceName = null;
|
||||||
|
|
||||||
String value = renderCellAsString(e);
|
String value = renderCellAsString(e);
|
||||||
|
|
||||||
int colId = currentRow.size();
|
int colId = currentRow.size();
|
||||||
|
|
||||||
// Add the cell to the row we are currently building
|
// Add the cell to the row we are currently building
|
||||||
currentRow.add(value);
|
currentRow.add(value);
|
||||||
|
currentRowReferences.add(currentReference);
|
||||||
|
|
||||||
// Reconcile it if we found exactly one link in the cell
|
// Reconcile it if we found exactly one link in the cell
|
||||||
String reconciled = null;
|
String reconciled = null;
|
||||||
@ -235,7 +265,8 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
// Mark it as spanning if we found the tags
|
// Mark it as spanning if we found the tags
|
||||||
if (colspan > 1 || rowspan > 1) {
|
if (colspan > 1 || rowspan > 1) {
|
||||||
SpanningCell spanningCell = new SpanningCell(
|
SpanningCell spanningCell = new SpanningCell(
|
||||||
value, reconciled, rowId, colId, rowspan, colspan);
|
value, reconciled, currentReference,
|
||||||
|
rowId, colId, rowspan, colspan);
|
||||||
spanningCells.add(spanningCellIdx, spanningCell);
|
spanningCells.add(spanningCellIdx, spanningCell);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -282,11 +313,52 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void visit(WtTagExtension tag) {
|
||||||
|
if ("ref".equals(tag.getName())) {
|
||||||
|
lastExternalLink = null;
|
||||||
|
currentReferenceName = null;
|
||||||
|
|
||||||
|
iterate(tag);
|
||||||
|
|
||||||
|
// load any reference parsed earlier
|
||||||
|
if (currentReferenceName != null) {
|
||||||
|
currentReference = namedReferences.get(currentReferenceName);
|
||||||
|
} else {
|
||||||
|
currentReferenceName = "";
|
||||||
|
}
|
||||||
|
// update with any new link found in the body of the reference
|
||||||
|
if (lastExternalLink != null) {
|
||||||
|
currentReference = lastExternalLink;
|
||||||
|
}
|
||||||
|
|
||||||
|
// store the reference for later use
|
||||||
|
if (currentReference != null && ! "".equals(currentReferenceName)) {
|
||||||
|
namedReferences.put(currentReferenceName, currentReference);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void visit(WtTagExtensionBody body) {
|
||||||
|
/*
|
||||||
|
* Here, the content of the <ref> tag is not parsed further, it's just a String.
|
||||||
|
* So we have to resort to string matching.
|
||||||
|
* https://github.com/sweble/sweble-wikitext/issues/67
|
||||||
|
*/
|
||||||
|
String contents = body.getContent();
|
||||||
|
Matcher matcher = urlPattern.matcher(contents);
|
||||||
|
while(matcher.find()) {
|
||||||
|
lastExternalLink = contents.substring(matcher.start(), matcher.end());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public void writeText(String text) {
|
public void writeText(String text) {
|
||||||
if (xmlAttrStringBuilder != null) {
|
// do not render text that is inside <ref></ref>
|
||||||
xmlAttrStringBuilder.append(text);
|
if (currentReferenceName == null) {
|
||||||
} else if (cellStringBuilder != null) {
|
if (xmlAttrStringBuilder != null) {
|
||||||
cellStringBuilder.append(text);
|
xmlAttrStringBuilder.append(text);
|
||||||
|
} else if (cellStringBuilder != null) {
|
||||||
|
cellStringBuilder.append(text);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -305,8 +377,10 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
while(currentRow.size() < cell.col + cell.colspan) {
|
while(currentRow.size() < cell.col + cell.colspan) {
|
||||||
if (blankSpanningCells) {
|
if (blankSpanningCells) {
|
||||||
currentRow.add(null);
|
currentRow.add(null);
|
||||||
|
currentRowReferences.add(null);
|
||||||
} else {
|
} else {
|
||||||
currentRow.add(cell.value);
|
currentRow.add(cell.value);
|
||||||
|
currentRowReferences.add(cell.reference);
|
||||||
if (cell.reconciled != null) {
|
if (cell.reconciled != null) {
|
||||||
wikilinkedCells.add(new WikilinkedCell(cell.reconciled, rowId, currentRow.size()-1));
|
wikilinkedCells.add(new WikilinkedCell(cell.reconciled, rowId, currentRow.size()-1));
|
||||||
}
|
}
|
||||||
@ -322,7 +396,7 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* XML attributes : useful for colspan and rowspan */
|
/* XML attributes : useful for colspan and rowspan, and reference names */
|
||||||
|
|
||||||
public void visit(WtXmlAttributes e) {
|
public void visit(WtXmlAttributes e) {
|
||||||
iterate(e);
|
iterate(e);
|
||||||
@ -333,11 +407,12 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
xmlAttrStringBuilder = new StringBuilder();
|
xmlAttrStringBuilder = new StringBuilder();
|
||||||
iterate(e);
|
iterate(e);
|
||||||
try {
|
try {
|
||||||
int attrValue = Integer.parseInt(xmlAttrStringBuilder.toString());
|
|
||||||
if ("colspan".equals(currentXmlAttr)) {
|
if ("colspan".equals(currentXmlAttr)) {
|
||||||
colspan = attrValue;
|
colspan = Integer.parseInt(xmlAttrStringBuilder.toString());
|
||||||
} else if ("rowspan".equals(currentXmlAttr)) {
|
} else if ("rowspan".equals(currentXmlAttr)) {
|
||||||
rowspan = attrValue;
|
rowspan = Integer.parseInt(xmlAttrStringBuilder.toString());
|
||||||
|
} else if ("name".equals(currentXmlAttr)) {
|
||||||
|
currentReferenceName = xmlAttrStringBuilder.toString();
|
||||||
}
|
}
|
||||||
} catch (NumberFormatException _) {
|
} catch (NumberFormatException _) {
|
||||||
}
|
}
|
||||||
@ -349,6 +424,7 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
public void visit(WtName e) {
|
public void visit(WtName e) {
|
||||||
try {
|
try {
|
||||||
currentXmlAttr = e.getAsString();
|
currentXmlAttr = e.getAsString();
|
||||||
|
|
||||||
} catch (UnsupportedOperationException _) {
|
} catch (UnsupportedOperationException _) {
|
||||||
currentXmlAttr = null;
|
currentXmlAttr = null;
|
||||||
}
|
}
|
||||||
@ -383,6 +459,7 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
currentExternalLink = null;
|
currentExternalLink = null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
lastExternalLink = externalLink;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void visit(WtNoLinkTitle e) {
|
public void visit(WtNoLinkTitle e) {
|
||||||
@ -406,7 +483,8 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
/* Templates */
|
/* Templates */
|
||||||
|
|
||||||
public void visit(WtTemplate e) {
|
public void visit(WtTemplate e) {
|
||||||
if (includeRawTemplates) {
|
// only render templates if we are told to do so or inside a reference
|
||||||
|
if (includeRawTemplates || currentReferenceName != null) {
|
||||||
writeText("{{"+e.getName().getAsString());
|
writeText("{{"+e.getName().getAsString());
|
||||||
WtTemplateArguments args = e.getArgs();
|
WtTemplateArguments args = e.getArgs();
|
||||||
for (int i = 0; i != args.size(); i++) {
|
for (int i = 0; i != args.size(); i++) {
|
||||||
@ -418,10 +496,13 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public void visit(WtTemplateArgument e) {
|
public void visit(WtTemplateArgument e) {
|
||||||
writeText("|");
|
// do not render templates that are inside a reference
|
||||||
if(e.hasName()) {
|
if (currentReferenceName == null) {
|
||||||
writeText(e.getName().getAsString());
|
writeText("|");
|
||||||
writeText("=");
|
if(e.hasName()) {
|
||||||
|
writeText(e.getName().getAsString());
|
||||||
|
writeText("=");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
iterate(e.getValue());
|
iterate(e.getValue());
|
||||||
}
|
}
|
||||||
@ -460,21 +541,39 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
private WikitextTableVisitor visitor = null;
|
private WikitextTableVisitor visitor = null;
|
||||||
private List<List<Recon>> reconList = null;
|
private List<List<Recon>> reconList = null;
|
||||||
private List<Boolean> columnReconciled = null;
|
private List<Boolean> columnReconciled = null;
|
||||||
|
private List<Boolean> columnReferenced = null;
|
||||||
|
|
||||||
public WikiTableDataReader(WikitextTableVisitor visitor) {
|
public WikiTableDataReader(WikitextTableVisitor visitor, boolean references) {
|
||||||
this.visitor = visitor;
|
this.visitor = visitor;
|
||||||
currentRow = -1;
|
currentRow = -1;
|
||||||
reconList = null;
|
reconList = null;
|
||||||
|
|
||||||
|
if (references) {
|
||||||
|
// Check which columns had references
|
||||||
|
columnReferenced = new ArrayList<Boolean>();
|
||||||
|
for (List<String> row : this.visitor.references) {
|
||||||
|
for (int i = 0; i != row.size(); i++) {
|
||||||
|
while (i >= columnReferenced.size()) {
|
||||||
|
columnReferenced.add(false);
|
||||||
|
}
|
||||||
|
if (row.get(i) != null) {
|
||||||
|
columnReferenced.set(i, true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<Object> getNextRowOfCells() throws IOException {
|
public List<Object> getNextRowOfCells() throws IOException {
|
||||||
List<Object> row = null;
|
List<Object> row = null;
|
||||||
List<String> origRow = null;
|
List<String> origRow = null;
|
||||||
|
List<String> refRow = null;
|
||||||
if (currentRow == -1) {
|
if (currentRow == -1) {
|
||||||
origRow = this.visitor.header;
|
origRow = this.visitor.header;
|
||||||
} else if(currentRow < this.visitor.rows.size()) {
|
} else if(currentRow < this.visitor.rows.size()) {
|
||||||
origRow = this.visitor.rows.get(currentRow);
|
origRow = this.visitor.rows.get(currentRow);
|
||||||
|
refRow = this.visitor.references.get(currentRow);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (origRow != null) {
|
if (origRow != null) {
|
||||||
@ -485,6 +584,18 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
recon = reconList.get(currentRow).get(i);
|
recon = reconList.get(currentRow).get(i);
|
||||||
}
|
}
|
||||||
row.add(new Cell(origRow.get(i), recon));
|
row.add(new Cell(origRow.get(i), recon));
|
||||||
|
|
||||||
|
// if we should add reference columns…
|
||||||
|
if (columnReferenced != null && columnReferenced.get(i)) {
|
||||||
|
String refValue = null;
|
||||||
|
// for headers
|
||||||
|
if(currentRow == -1) {
|
||||||
|
refValue = origRow.get(i)+"_ref";
|
||||||
|
} else {
|
||||||
|
refValue = refRow.get(i);
|
||||||
|
}
|
||||||
|
row.add(new Cell(refValue, null));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
currentRow++;
|
currentRow++;
|
||||||
@ -577,10 +688,11 @@ public class WikitextImporter extends TabularImportingParserBase {
|
|||||||
// Compile the retrieved page
|
// Compile the retrieved page
|
||||||
boolean blankSpanningCells = JSONUtilities.getBoolean(options, "blankSpanningCells", true);
|
boolean blankSpanningCells = JSONUtilities.getBoolean(options, "blankSpanningCells", true);
|
||||||
boolean includeRawTemplates = JSONUtilities.getBoolean(options, "includeRawTemplates", false);
|
boolean includeRawTemplates = JSONUtilities.getBoolean(options, "includeRawTemplates", false);
|
||||||
|
boolean parseReferences = JSONUtilities.getBoolean(options, "parseReferences", true);
|
||||||
final WikitextTableVisitor vs = new WikitextTableVisitor(blankSpanningCells, includeRawTemplates);
|
final WikitextTableVisitor vs = new WikitextTableVisitor(blankSpanningCells, includeRawTemplates);
|
||||||
vs.go(parsedArticle);
|
vs.go(parsedArticle);
|
||||||
|
|
||||||
WikiTableDataReader dataReader = new WikiTableDataReader(vs);
|
WikiTableDataReader dataReader = new WikiTableDataReader(vs, parseReferences);
|
||||||
|
|
||||||
// Reconcile if needed
|
// Reconcile if needed
|
||||||
String wikiUrl = JSONUtilities.getString(options, "wikiUrl", null);
|
String wikiUrl = JSONUtilities.getString(options, "wikiUrl", null);
|
||||||
|
@ -196,6 +196,34 @@ public class WikitextImporterTests extends ImporterTest {
|
|||||||
Assert.assertNull(project.rows.get(1).cells.get(3).value);
|
Assert.assertNull(project.rows.get(1).cells.get(3).value);
|
||||||
Assert.assertEquals(project.rows.get(1).cells.get(4).value, "Butter");
|
Assert.assertEquals(project.rows.get(1).cells.get(4).value, "Butter");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void readTableWithReferences() {
|
||||||
|
// inspired by https://www.mediawiki.org/wiki/Help:Tables
|
||||||
|
String input = "{|\n"
|
||||||
|
+"! price\n"
|
||||||
|
+"! fruit\n"
|
||||||
|
+"! merchant\n"
|
||||||
|
+"|-\n"
|
||||||
|
+"| a || b <ref name=\"myref\"> See [http://gnu.org here]</ref> || c <ref name=\"ms\"> or http://microsoft.com/ </ref>\n"
|
||||||
|
+"|-\n"
|
||||||
|
+"| d || e <ref name=\"ms\"/>|| f <ref name=\"myref\" />\n"
|
||||||
|
+"|-\n"
|
||||||
|
+"|}\n";
|
||||||
|
|
||||||
|
try {
|
||||||
|
prepareOptions(-1, true, true, null);
|
||||||
|
parse(input);
|
||||||
|
} catch (Exception e) {
|
||||||
|
Assert.fail("Parsing failed", e);
|
||||||
|
}
|
||||||
|
Assert.assertEquals(project.columnModel.columns.size(), 5);
|
||||||
|
Assert.assertEquals(project.rows.get(0).cells.get(1).value, "b");
|
||||||
|
Assert.assertEquals(project.rows.get(0).cells.get(2).value, "http://gnu.org");
|
||||||
|
Assert.assertEquals(project.rows.get(0).cells.get(4).value, "http://microsoft.com/");
|
||||||
|
Assert.assertEquals(project.rows.get(1).cells.get(4).value, "http://gnu.org");
|
||||||
|
Assert.assertEquals(project.rows.get(1).cells.get(2).value, "http://microsoft.com/");
|
||||||
|
}
|
||||||
//--helpers--
|
//--helpers--
|
||||||
|
|
||||||
private void parse(String wikitext) {
|
private void parse(String wikitext) {
|
||||||
@ -210,6 +238,7 @@ public class WikitextImporterTests extends ImporterTest {
|
|||||||
whenGetBooleanOption("guessCellValueTypes", options, guessValueType);
|
whenGetBooleanOption("guessCellValueTypes", options, guessValueType);
|
||||||
whenGetBooleanOption("blankSpanningCells", options, blankSpanningCells);
|
whenGetBooleanOption("blankSpanningCells", options, blankSpanningCells);
|
||||||
whenGetBooleanOption("storeBlankCellsAsNulls", options, true);
|
whenGetBooleanOption("storeBlankCellsAsNulls", options, true);
|
||||||
|
whenGetBooleanOption("parseReferences", options, true);
|
||||||
whenGetStringOption("wikiUrl", options, wikiUrl);
|
whenGetStringOption("wikiUrl", options, wikiUrl);
|
||||||
whenGetIntegerOption("headerLines", options, 1);
|
whenGetIntegerOption("headerLines", options, 1);
|
||||||
whenGetStringOption("reconService", options, "https://tools.wmflabs.org/openrefine-wikidata/en/api");
|
whenGetStringOption("reconService", options, "https://tools.wmflabs.org/openrefine-wikidata/en/api");
|
||||||
|
@ -115,6 +115,7 @@
|
|||||||
"store-nulls": "Store blank cells as nulls",
|
"store-nulls": "Store blank cells as nulls",
|
||||||
"blank-spanning-cells": "Pad cells spanning over multiple rows or columns with nulls",
|
"blank-spanning-cells": "Pad cells spanning over multiple rows or columns with nulls",
|
||||||
"include-raw-templates": "Include templates as raw wikicode",
|
"include-raw-templates": "Include templates as raw wikicode",
|
||||||
|
"parse-references": "Extract references in additional columns",
|
||||||
"wiki-base-url": "Reconcile to wiki with base URL:",
|
"wiki-base-url": "Reconcile to wiki with base URL:",
|
||||||
"invalid-wikitext": "No table could be parsed. Are you sure this is a valid wiki table?",
|
"invalid-wikitext": "No table could be parsed. Are you sure this is a valid wiki table?",
|
||||||
"store-source": "Store file source <br/>(file names, URLs)<br/>in each row",
|
"store-source": "Store file source <br/>(file names, URLs)<br/>in each row",
|
||||||
|
@ -15,6 +15,8 @@
|
|||||||
<td colspan="2"><label for="$store-blank-rows" id="or-import-blank"></label></td></tr>
|
<td colspan="2"><label for="$store-blank-rows" id="or-import-blank"></label></td></tr>
|
||||||
<tr><td width="1%"><input type="checkbox" bind="includeRawTemplatesCheckbox" id="$include-raw-templates" /></td>
|
<tr><td width="1%"><input type="checkbox" bind="includeRawTemplatesCheckbox" id="$include-raw-templates" /></td>
|
||||||
<td colspan="2"><label for="$include-raw-templates" id="or-import-includeRawTemplates"></label></td></tr>
|
<td colspan="2"><label for="$include-raw-templates" id="or-import-includeRawTemplates"></label></td></tr>
|
||||||
|
<tr><td width="1%"><input type="checkbox" bind="parseReferencesCheckbox" id="$parse-references" /></td>
|
||||||
|
<td colspan="2"><label for="$parse-references" id="or-import-parseReferences"></label></td></tr>
|
||||||
|
|
||||||
<tr><td width="1%"><input type="checkbox" bind="storeBlankCellsAsNullsCheckbox" id="$store-blank-cells" /></td>
|
<tr><td width="1%"><input type="checkbox" bind="storeBlankCellsAsNullsCheckbox" id="$store-blank-cells" /></td>
|
||||||
<td colspan="2"><label for="$store-blank-cells" id="or-import-null"></label></td></tr>
|
<td colspan="2"><label for="$store-blank-cells" id="or-import-null"></label></td></tr>
|
||||||
|
@ -88,6 +88,7 @@ Refine.WikitextParserUI.prototype.getOptions = function() {
|
|||||||
options.storeBlankRows = this._optionContainerElmts.storeBlankRowsCheckbox[0].checked;
|
options.storeBlankRows = this._optionContainerElmts.storeBlankRowsCheckbox[0].checked;
|
||||||
options.blankSpanningCells = this._optionContainerElmts.blankSpanningCellsCheckbox[0].checked;
|
options.blankSpanningCells = this._optionContainerElmts.blankSpanningCellsCheckbox[0].checked;
|
||||||
options.includeRawTemplates = this._optionContainerElmts.includeRawTemplatesCheckbox[0].checked;
|
options.includeRawTemplates = this._optionContainerElmts.includeRawTemplatesCheckbox[0].checked;
|
||||||
|
options.parseReferences = this._optionContainerElmts.parseReferencesCheckbox[0].checked;
|
||||||
|
|
||||||
options.guessCellValueTypes = this._optionContainerElmts.guessCellValueTypesCheckbox[0].checked;
|
options.guessCellValueTypes = this._optionContainerElmts.guessCellValueTypesCheckbox[0].checked;
|
||||||
|
|
||||||
@ -115,6 +116,7 @@ Refine.WikitextParserUI.prototype._initialize = function() {
|
|||||||
$('#or-import-parseCell').html($.i18n._('core-index-parser')["parse-cell"]);
|
$('#or-import-parseCell').html($.i18n._('core-index-parser')["parse-cell"]);
|
||||||
$('#or-import-blankSpanningCells').text($.i18n._('core-index-parser')["blank-spanning-cells"]);
|
$('#or-import-blankSpanningCells').text($.i18n._('core-index-parser')["blank-spanning-cells"]);
|
||||||
$('#or-import-includeRawTemplates').text($.i18n._('core-index-parser')["include-raw-templates"]);
|
$('#or-import-includeRawTemplates').text($.i18n._('core-index-parser')["include-raw-templates"]);
|
||||||
|
$('#or-import-parseReferences').text($.i18n._('core-index-parser')["parse-references"]);
|
||||||
$('#or-import-blank').text($.i18n._('core-index-parser')["store-blank"]);
|
$('#or-import-blank').text($.i18n._('core-index-parser')["store-blank"]);
|
||||||
$('#or-import-null').text($.i18n._('core-index-parser')["store-nulls"]);
|
$('#or-import-null').text($.i18n._('core-index-parser')["store-nulls"]);
|
||||||
$('#or-import-source').html($.i18n._('core-index-parser')["store-source"]);
|
$('#or-import-source').html($.i18n._('core-index-parser')["store-source"]);
|
||||||
@ -148,6 +150,10 @@ Refine.WikitextParserUI.prototype._initialize = function() {
|
|||||||
this._optionContainerElmts.includeRawTemplatesCheckbox.prop("checked", true);
|
this._optionContainerElmts.includeRawTemplatesCheckbox.prop("checked", true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (this._config.parseReferences) {
|
||||||
|
this._optionContainerElmts.parseReferencesCheckbox.prop("checked", true);
|
||||||
|
}
|
||||||
|
|
||||||
if (this._config.storeBlankRows) {
|
if (this._config.storeBlankRows) {
|
||||||
this._optionContainerElmts.storeBlankRowsCheckbox.prop("checked", true);
|
this._optionContainerElmts.storeBlankRowsCheckbox.prop("checked", true);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user