diff --git a/main/src/com/google/refine/importers/WikitextImporter.java b/main/src/com/google/refine/importers/WikitextImporter.java index 3fd9e3315..06d6262ed 100644 --- a/main/src/com/google/refine/importers/WikitextImporter.java +++ b/main/src/com/google/refine/importers/WikitextImporter.java @@ -4,6 +4,7 @@ import java.io.IOException; import java.io.Reader; import java.util.ArrayList; import java.util.List; +import java.util.Map; import org.json.JSONObject; import org.slf4j.Logger; @@ -45,9 +46,15 @@ import xtc.parser.ParseException; import com.google.refine.ProjectMetadata; import com.google.refine.importing.ImportingJob; +import com.google.refine.model.Cell; import com.google.refine.model.Project; +import com.google.refine.model.Recon; +import com.google.refine.model.ReconCandidate; +import com.google.refine.model.ReconStats; +import com.google.refine.model.recon.StandardReconConfig.ColumnDetail; import com.google.refine.util.JSONUtilities; - +import com.google.refine.model.recon.StandardReconConfig; +import com.google.refine.model.recon.ReconJob; public class WikitextImporter extends TabularImportingParserBase { @@ -64,19 +71,22 @@ public class WikitextImporter extends TabularImportingParserBase { JSONUtilities.safePut(options, "guessCellValueTypes", false); JSONUtilities.safePut(options, "blankSpanningCells", true); + JSONUtilities.safePut(options, "wikiUrl", "https://en.wikipedia.org/wiki/"); return options; } private class SpanningCell { public String value; + public String reconciled; public int colspan; public int rowspan; public int row; public int col; - SpanningCell(String value, int row, int col, int rowspan, int colspan) { + SpanningCell(String value, String reconciled, int row, int col, int rowspan, int colspan) { this.value = value; + this.reconciled = reconciled; this.row = row; this.col = col; this.rowspan = rowspan; @@ -84,11 +94,28 @@ public class WikitextImporter extends TabularImportingParserBase { } } + private class WikilinkedCell { + public String internalLink; + public int row; + public int col; + + WikilinkedCell(String internalLink, int row, int col) { + this.internalLink = internalLink; + this.row = row; + this.col = col; + } + + public String toURL(String wikiBaseUrl) { + return wikiBaseUrl + internalLink; + } + } + public class WikitextTableVisitor extends AstVisitor { public String caption; public List header; public List> rows; + public List wikilinkedCells; private List currentRow; private boolean blankSpanningCells; @@ -103,12 +130,14 @@ public class WikitextImporter extends TabularImportingParserBase { private int colspan; private int rowspan; private int spanningCellIdx; + private List internalLinksInCell; public WikitextTableVisitor(boolean blankSpanningCells) { this.blankSpanningCells = blankSpanningCells; caption = null; header = new ArrayList(); rows = new ArrayList>(); + wikilinkedCells = new ArrayList(); spanningCells = new ArrayList(); cellStringBuilder = null; xmlAttrStringBuilder = null; @@ -118,6 +147,7 @@ public class WikitextImporter extends TabularImportingParserBase { rowspan = 0; rowId = -1; spanningCellIdx = 0; + internalLinksInCell = new ArrayList(); } @Override @@ -181,14 +211,29 @@ public class WikitextImporter extends TabularImportingParserBase { if (currentRow != null) { rowspan = 1; colspan = 1; + internalLinksInCell.clear(); String value = renderCellAsString(e); + + int colId = currentRow.size(); + + // Add the cell to the row we are currently building currentRow.add(value); + + // Reconcile it if we found exactly one link in the cell + String reconciled = null; + if (internalLinksInCell.size() == 1) { + reconciled = internalLinksInCell.get(0); + wikilinkedCells.add(new WikilinkedCell(reconciled, rowId, colId)); + } + + // Mark it as spanning if we found the tags if (colspan > 1 || rowspan > 1) { SpanningCell spanningCell = new SpanningCell( - value, rowId, currentRow.size()-1, rowspan, colspan); + value, reconciled, rowId, colId, rowspan, colspan); spanningCells.add(spanningCellIdx, spanningCell); } + // Add all spanning cells that need to be inserted after this one. addSpanningCells(); } } @@ -208,6 +253,9 @@ public class WikitextImporter extends TabularImportingParserBase { currentRow.add(null); } else { currentRow.add(cell.value); + if (cell.reconciled != null) { + wikilinkedCells.add(new WikilinkedCell(cell.reconciled, rowId, currentRow.size()-1)); + } } } } @@ -230,13 +278,12 @@ public class WikitextImporter extends TabularImportingParserBase { iterate(e); try { int attrValue = Integer.parseInt(xmlAttrStringBuilder.toString()); - if (currentXmlAttr.equals("colspan")) { + if ("colspan".equals(currentXmlAttr)) { colspan = attrValue; - } else if (currentXmlAttr.equals("rowspan")) { + } else if ("rowspan".equals(currentXmlAttr)) { rowspan = attrValue; } } catch (NumberFormatException _) { - ; } currentXmlAttr = null; xmlAttrStringBuilder = null; @@ -286,6 +333,7 @@ public class WikitextImporter extends TabularImportingParserBase { public void visit(WtInternalLink e) { currentInternalLink = e.getTarget().getAsString(); + internalLinksInCell.add(currentInternalLink); iterate(e); currentInternalLink = null; } @@ -307,10 +355,13 @@ public class WikitextImporter extends TabularImportingParserBase { public class WikiTableDataReader implements TableDataReader { private int currentRow = -1; private WikitextTableVisitor visitor = null; + private List> reconList = null; + private List columnReconciled = null; public WikiTableDataReader(WikitextTableVisitor visitor) { this.visitor = visitor; currentRow = -1; + reconList = null; } @Override @@ -322,16 +373,74 @@ public class WikitextImporter extends TabularImportingParserBase { } else if(currentRow < this.visitor.rows.size()) { origRow = this.visitor.rows.get(currentRow); } - currentRow++; if (origRow != null) { row = new ArrayList(); for (int i = 0; i < origRow.size(); i++) { - row.add(origRow.get(i)); + Recon recon = null; + if (currentRow >= 0 && reconList != null) { + recon = reconList.get(currentRow).get(i); + } + row.add(new Cell(origRow.get(i), recon)); } } + currentRow++; return row; } + + private void reconcileToQids(String wikiBaseUrl) { + if("null".equals(wikiBaseUrl)) { + return; // TODO: more thorough URL validation instead + } + + // Init the list of recons + reconList = new ArrayList>(); + columnReconciled = new ArrayList(); + for (int i = 0; i < this.visitor.rows.size(); i++) { + int rowSize = this.visitor.rows.get(i).size(); + List recons = new ArrayList(rowSize); + for (int j = 0; j < rowSize; j++) { + recons.add(null); + } + reconList.add(recons); + columnReconciled.add(false); + } + + // Wikidata reconciliation endpoint, hardcoded because the user might not have it in its services + StandardReconConfig cfg = new StandardReconConfig( + "https://tools.wmflabs.org/openrefine-wikidata/en/api", + "http://www.wikidata.org/entity/", + "http://www.wikidata.org/prop/direct/", + "", + "entity", + true, + new ArrayList(), + 1 + ); + + int batchSize = 50; + int i = 0; + int totalSize = this.visitor.wikilinkedCells.size(); + while (i < totalSize) { + List jobs = new ArrayList(); + int batchStart = i; + while (i < batchStart + batchSize && i < totalSize) { + WikilinkedCell cell = this.visitor.wikilinkedCells.get(i); + jobs.add(cfg.createSimpleJob(cell.toURL(wikiBaseUrl))); + i++; + } + + List recons = cfg.batchRecon(jobs, 0); + for (int j = batchStart; j < batchStart + batchSize && j < totalSize; j++) { + WikilinkedCell cell = this.visitor.wikilinkedCells.get(j); + Recon recon = recons.get(j - batchStart); + if (recon != null) { + reconList.get(cell.row).set(cell.col, recon); + columnReconciled.set(cell.col, true); + } + } + } + } } @Override @@ -377,7 +486,13 @@ public class WikitextImporter extends TabularImportingParserBase { final WikitextTableVisitor vs = new WikitextTableVisitor(blankSpanningCells); vs.go(parsedArticle); - TableDataReader dataReader = new WikiTableDataReader(vs); + WikiTableDataReader dataReader = new WikiTableDataReader(vs); + + // Reconcile if needed + String wikiUrl = JSONUtilities.getString(options, "wikiUrl", null); + if (wikiUrl != null) { + dataReader.reconcileToQids(wikiUrl); + } JSONUtilities.safePut(options, "headerLines", 1); @@ -386,8 +501,17 @@ public class WikitextImporter extends TabularImportingParserBase { metadata.setName(vs.caption); // TODO this does not seem to do anything - maybe we need to pass it to OpenRefine in some other way? } - + TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions); + + // Add reconciliation statistics + if (dataReader.columnReconciled != null) { + for(int i = 0; i != dataReader.columnReconciled.size(); i++) { + if (dataReader.columnReconciled.get(i)) { + project.columnModel.columns.get(i).setReconStats(ReconStats.create(project, i)); + } + } + } } catch (IOException e1) { e1.printStackTrace(); } catch (ParseException e1) { @@ -395,4 +519,6 @@ public class WikitextImporter extends TabularImportingParserBase { e1.printStackTrace(); } } + + } diff --git a/main/src/com/google/refine/model/recon/StandardReconConfig.java b/main/src/com/google/refine/model/recon/StandardReconConfig.java index d51c9d123..f04764723 100644 --- a/main/src/com/google/refine/model/recon/StandardReconConfig.java +++ b/main/src/com/google/refine/model/recon/StandardReconConfig.java @@ -232,6 +232,30 @@ public class StandardReconConfig extends ReconConfig { public String getBriefDescription(Project project, String columnName) { return "Reconcile cells in column " + columnName + " to type " + typeID; } + + public ReconJob createSimpleJob(String query) { + /* Same as createJob, but for simpler queries + * without any properties. This is much easier + * to generate as there is no need for a Project, + * Row and Cell: this means the job can be created + * outside the usual context of reconciliation (e.g. + * in an importer). + */ + StandardReconJob job = new StandardReconJob(); + try { + StringWriter stringWriter = new StringWriter(); + JSONWriter jsonWriter = new JSONWriter(stringWriter); + jsonWriter.object(); + jsonWriter.key("query"); + jsonWriter.value(query); + jsonWriter.endObject(); + job.text = query; + job.code = stringWriter.toString(); + return job; + } catch (JSONException _) { + return null; + } + } @Override public ReconJob createJob(Project project, int rowIndex, Row row, diff --git a/main/webapp/modules/core/langs/translation-en.json b/main/webapp/modules/core/langs/translation-en.json index 3e7b64dd8..b71bdad62 100644 --- a/main/webapp/modules/core/langs/translation-en.json +++ b/main/webapp/modules/core/langs/translation-en.json @@ -114,6 +114,7 @@ "store-blank": "Store blank rows", "store-nulls": "Store blank cells as nulls", "blank-spanning-cells": "Pad cells spanning over multiple rows or columns with nulls", + "wiki-base-url": "Reconcile to wiki with base URL:", "store-source": "Store file source
(file names, URLs)
in each row", "preserve-empty": "Preserve empty strings", "trim": "Trim leading & trailing whitespace from strings", diff --git a/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.html b/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.html index b7d1daf6d..6b4fee39a 100644 --- a/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.html +++ b/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.html @@ -1,9 +1,11 @@ -
+
+ - - @@ -19,7 +21,7 @@ - - + +
+
+ +
 
diff --git a/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.js b/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.js index 9be1574de..0bf9b7dc9 100644 --- a/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.js +++ b/main/webapp/modules/core/scripts/index/parser-interfaces/wikitext-parser-ui.js @@ -75,6 +75,11 @@ Refine.WikitextParserUI.prototype.getOptions = function() { } return def; }; + if (this._optionContainerElmts.wikiCheckbox[0].checked) { + options.wikiUrl = this._optionContainerElmts.wikiUrlInput[0].value; + } else { + options.wikiUrl = null; + } if (this._optionContainerElmts.limitCheckbox[0].checked) { options.limit = parseIntDefault(this._optionContainerElmts.limitInput[0].value, -1); } else { @@ -101,6 +106,7 @@ Refine.WikitextParserUI.prototype._initialize = function() { this._optionContainerElmts.previewButton.html($.i18n._('core-buttons')["update-preview"]); + $('#or-import-wiki-base-url').text($.i18n._('core-index-parser')["wiki-base-url"]); $('#or-import-load').text($.i18n._('core-index-parser')["load-at-most"]); $('#or-import-rows2').text($.i18n._('core-index-parser')["rows-data"]); $('#or-import-parseCell').html($.i18n._('core-index-parser')["parse-cell"]); @@ -118,8 +124,13 @@ Refine.WikitextParserUI.prototype._initialize = function() { }); }); */ - console.log(this._config); + var wikiUrl = this._config.wikiUrl.toString(); + if (wikiUrl != null) { + this._optionContainerElmts.wikiUrlInput[0].value = wikiUrl; + this._optionContainerElmts.wikiCheckbox.prop("checked", true); + } + if (this._config.limit > 0) { this._optionContainerElmts.limitCheckbox.prop("checked", true); this._optionContainerElmts.limitInput[0].value = this._config.limit.toString();