Support reconciliation via sitelinks.

Wikilinks are automatically reconciled at import time.

Related to #56.
This commit is contained in:
Antonin Delpeuch 2017-08-15 20:17:24 +01:00
parent aa4517ba58
commit 86dc240335
5 changed files with 180 additions and 16 deletions

View File

@ -4,6 +4,7 @@ import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map;
import org.json.JSONObject; import org.json.JSONObject;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -45,9 +46,15 @@ import xtc.parser.ParseException;
import com.google.refine.ProjectMetadata; import com.google.refine.ProjectMetadata;
import com.google.refine.importing.ImportingJob; import com.google.refine.importing.ImportingJob;
import com.google.refine.model.Cell;
import com.google.refine.model.Project; import com.google.refine.model.Project;
import com.google.refine.model.Recon;
import com.google.refine.model.ReconCandidate;
import com.google.refine.model.ReconStats;
import com.google.refine.model.recon.StandardReconConfig.ColumnDetail;
import com.google.refine.util.JSONUtilities; import com.google.refine.util.JSONUtilities;
import com.google.refine.model.recon.StandardReconConfig;
import com.google.refine.model.recon.ReconJob;
public class WikitextImporter extends TabularImportingParserBase { public class WikitextImporter extends TabularImportingParserBase {
@ -64,19 +71,22 @@ public class WikitextImporter extends TabularImportingParserBase {
JSONUtilities.safePut(options, "guessCellValueTypes", false); JSONUtilities.safePut(options, "guessCellValueTypes", false);
JSONUtilities.safePut(options, "blankSpanningCells", true); JSONUtilities.safePut(options, "blankSpanningCells", true);
JSONUtilities.safePut(options, "wikiUrl", "https://en.wikipedia.org/wiki/");
return options; return options;
} }
private class SpanningCell { private class SpanningCell {
public String value; public String value;
public String reconciled;
public int colspan; public int colspan;
public int rowspan; public int rowspan;
public int row; public int row;
public int col; public int col;
SpanningCell(String value, int row, int col, int rowspan, int colspan) { SpanningCell(String value, String reconciled, int row, int col, int rowspan, int colspan) {
this.value = value; this.value = value;
this.reconciled = reconciled;
this.row = row; this.row = row;
this.col = col; this.col = col;
this.rowspan = rowspan; this.rowspan = rowspan;
@ -84,11 +94,28 @@ public class WikitextImporter extends TabularImportingParserBase {
} }
} }
private class WikilinkedCell {
public String internalLink;
public int row;
public int col;
WikilinkedCell(String internalLink, int row, int col) {
this.internalLink = internalLink;
this.row = row;
this.col = col;
}
public String toURL(String wikiBaseUrl) {
return wikiBaseUrl + internalLink;
}
}
public class WikitextTableVisitor extends AstVisitor<WtNode> { public class WikitextTableVisitor extends AstVisitor<WtNode> {
public String caption; public String caption;
public List<String> header; public List<String> header;
public List<List<String>> rows; public List<List<String>> rows;
public List<WikilinkedCell> wikilinkedCells;
private List<String> currentRow; private List<String> currentRow;
private boolean blankSpanningCells; private boolean blankSpanningCells;
@ -103,12 +130,14 @@ public class WikitextImporter extends TabularImportingParserBase {
private int colspan; private int colspan;
private int rowspan; private int rowspan;
private int spanningCellIdx; private int spanningCellIdx;
private List<String> internalLinksInCell;
public WikitextTableVisitor(boolean blankSpanningCells) { public WikitextTableVisitor(boolean blankSpanningCells) {
this.blankSpanningCells = blankSpanningCells; this.blankSpanningCells = blankSpanningCells;
caption = null; caption = null;
header = new ArrayList<String>(); header = new ArrayList<String>();
rows = new ArrayList<List<String>>(); rows = new ArrayList<List<String>>();
wikilinkedCells = new ArrayList<WikilinkedCell>();
spanningCells = new ArrayList<SpanningCell>(); spanningCells = new ArrayList<SpanningCell>();
cellStringBuilder = null; cellStringBuilder = null;
xmlAttrStringBuilder = null; xmlAttrStringBuilder = null;
@ -118,6 +147,7 @@ public class WikitextImporter extends TabularImportingParserBase {
rowspan = 0; rowspan = 0;
rowId = -1; rowId = -1;
spanningCellIdx = 0; spanningCellIdx = 0;
internalLinksInCell = new ArrayList<String>();
} }
@Override @Override
@ -181,14 +211,29 @@ public class WikitextImporter extends TabularImportingParserBase {
if (currentRow != null) { if (currentRow != null) {
rowspan = 1; rowspan = 1;
colspan = 1; colspan = 1;
internalLinksInCell.clear();
String value = renderCellAsString(e); String value = renderCellAsString(e);
int colId = currentRow.size();
// Add the cell to the row we are currently building
currentRow.add(value); currentRow.add(value);
// Reconcile it if we found exactly one link in the cell
String reconciled = null;
if (internalLinksInCell.size() == 1) {
reconciled = internalLinksInCell.get(0);
wikilinkedCells.add(new WikilinkedCell(reconciled, rowId, colId));
}
// Mark it as spanning if we found the tags
if (colspan > 1 || rowspan > 1) { if (colspan > 1 || rowspan > 1) {
SpanningCell spanningCell = new SpanningCell( SpanningCell spanningCell = new SpanningCell(
value, rowId, currentRow.size()-1, rowspan, colspan); value, reconciled, rowId, colId, rowspan, colspan);
spanningCells.add(spanningCellIdx, spanningCell); spanningCells.add(spanningCellIdx, spanningCell);
} }
// Add all spanning cells that need to be inserted after this one.
addSpanningCells(); addSpanningCells();
} }
} }
@ -208,6 +253,9 @@ public class WikitextImporter extends TabularImportingParserBase {
currentRow.add(null); currentRow.add(null);
} else { } else {
currentRow.add(cell.value); currentRow.add(cell.value);
if (cell.reconciled != null) {
wikilinkedCells.add(new WikilinkedCell(cell.reconciled, rowId, currentRow.size()-1));
}
} }
} }
} }
@ -230,13 +278,12 @@ public class WikitextImporter extends TabularImportingParserBase {
iterate(e); iterate(e);
try { try {
int attrValue = Integer.parseInt(xmlAttrStringBuilder.toString()); int attrValue = Integer.parseInt(xmlAttrStringBuilder.toString());
if (currentXmlAttr.equals("colspan")) { if ("colspan".equals(currentXmlAttr)) {
colspan = attrValue; colspan = attrValue;
} else if (currentXmlAttr.equals("rowspan")) { } else if ("rowspan".equals(currentXmlAttr)) {
rowspan = attrValue; rowspan = attrValue;
} }
} catch (NumberFormatException _) { } catch (NumberFormatException _) {
;
} }
currentXmlAttr = null; currentXmlAttr = null;
xmlAttrStringBuilder = null; xmlAttrStringBuilder = null;
@ -286,6 +333,7 @@ public class WikitextImporter extends TabularImportingParserBase {
public void visit(WtInternalLink e) { public void visit(WtInternalLink e) {
currentInternalLink = e.getTarget().getAsString(); currentInternalLink = e.getTarget().getAsString();
internalLinksInCell.add(currentInternalLink);
iterate(e); iterate(e);
currentInternalLink = null; currentInternalLink = null;
} }
@ -307,10 +355,13 @@ public class WikitextImporter extends TabularImportingParserBase {
public class WikiTableDataReader implements TableDataReader { public class WikiTableDataReader implements TableDataReader {
private int currentRow = -1; private int currentRow = -1;
private WikitextTableVisitor visitor = null; private WikitextTableVisitor visitor = null;
private List<List<Recon>> reconList = null;
private List<Boolean> columnReconciled = null;
public WikiTableDataReader(WikitextTableVisitor visitor) { public WikiTableDataReader(WikitextTableVisitor visitor) {
this.visitor = visitor; this.visitor = visitor;
currentRow = -1; currentRow = -1;
reconList = null;
} }
@Override @Override
@ -322,16 +373,74 @@ public class WikitextImporter extends TabularImportingParserBase {
} else if(currentRow < this.visitor.rows.size()) { } else if(currentRow < this.visitor.rows.size()) {
origRow = this.visitor.rows.get(currentRow); origRow = this.visitor.rows.get(currentRow);
} }
currentRow++;
if (origRow != null) { if (origRow != null) {
row = new ArrayList<Object>(); row = new ArrayList<Object>();
for (int i = 0; i < origRow.size(); i++) { for (int i = 0; i < origRow.size(); i++) {
row.add(origRow.get(i)); Recon recon = null;
if (currentRow >= 0 && reconList != null) {
recon = reconList.get(currentRow).get(i);
}
row.add(new Cell(origRow.get(i), recon));
} }
} }
currentRow++;
return row; return row;
} }
/**
 * Reconciles every wikilinked cell collected by the visitor and stores
 * the resulting Recon objects in {@code reconList}, at the row/column
 * position of the cell. {@code columnReconciled} records, per column
 * index, whether at least one cell of that column received a Recon.
 *
 * Does nothing (and leaves both fields untouched) when no usable base
 * URL is given.
 *
 * @param wikiBaseUrl prefix used to turn a link target into the URL that
 *                    is sent as reconciliation query; {@code null} and
 *                    the literal string "null" disable reconciliation
 */
private void reconcileToQids(String wikiBaseUrl) {
    if (wikiBaseUrl == null || "null".equals(wikiBaseUrl)) {
        return; // TODO: more thorough URL validation instead
    }

    // Init the grid of recons: one (initially null) slot per table cell
    List<List<String>> tableRows = this.visitor.rows;
    reconList = new ArrayList<List<Recon>>(tableRows.size());
    int columnCount = 0;
    for (List<String> tableRow : tableRows) {
        int rowSize = tableRow.size();
        columnCount = Math.max(columnCount, rowSize);
        List<Recon> recons = new ArrayList<Recon>(rowSize);
        for (int j = 0; j < rowSize; j++) {
            recons.add(null);
        }
        reconList.add(recons);
    }

    // One flag per *column* (sized by the widest row). Sizing this list by
    // the number of rows made set(cell.col, ...) fail on tables that are
    // wider than they are tall.
    columnReconciled = new ArrayList<Boolean>(columnCount);
    for (int c = 0; c < columnCount; c++) {
        columnReconciled.add(false);
    }

    // Wikidata reconciliation endpoint, hardcoded because users might not have it in their services
    StandardReconConfig cfg = new StandardReconConfig(
            "https://tools.wmflabs.org/openrefine-wikidata/en/api",
            "http://www.wikidata.org/entity/",
            "http://www.wikidata.org/prop/direct/",
            "",
            "entity",
            true,
            new ArrayList<ColumnDetail>(),
            1
    );

    // Send the linked cells to the service in batches
    int batchSize = 50;
    List<WikilinkedCell> linkedCells = this.visitor.wikilinkedCells;
    int totalSize = linkedCells.size();
    for (int batchStart = 0; batchStart < totalSize; batchStart += batchSize) {
        int batchEnd = Math.min(batchStart + batchSize, totalSize);

        List<ReconJob> jobs = new ArrayList<ReconJob>(batchEnd - batchStart);
        for (int k = batchStart; k < batchEnd; k++) {
            jobs.add(cfg.createSimpleJob(linkedCells.get(k).toURL(wikiBaseUrl)));
        }

        // recons.get(n) is the result for jobs.get(n)
        List<Recon> recons = cfg.batchRecon(jobs, 0);
        for (int k = batchStart; k < batchEnd; k++) {
            WikilinkedCell cell = linkedCells.get(k);
            Recon recon = recons.get(k - batchStart);
            if (recon != null) {
                reconList.get(cell.row).set(cell.col, recon);
                columnReconciled.set(cell.col, true);
            }
        }
    }
}
} }
@Override @Override
@ -377,7 +486,13 @@ public class WikitextImporter extends TabularImportingParserBase {
final WikitextTableVisitor vs = new WikitextTableVisitor(blankSpanningCells); final WikitextTableVisitor vs = new WikitextTableVisitor(blankSpanningCells);
vs.go(parsedArticle); vs.go(parsedArticle);
TableDataReader dataReader = new WikiTableDataReader(vs); WikiTableDataReader dataReader = new WikiTableDataReader(vs);
// Reconcile if needed
String wikiUrl = JSONUtilities.getString(options, "wikiUrl", null);
if (wikiUrl != null) {
dataReader.reconcileToQids(wikiUrl);
}
JSONUtilities.safePut(options, "headerLines", 1); JSONUtilities.safePut(options, "headerLines", 1);
@ -386,8 +501,17 @@ public class WikitextImporter extends TabularImportingParserBase {
metadata.setName(vs.caption); metadata.setName(vs.caption);
// TODO this does not seem to do anything - maybe we need to pass it to OpenRefine in some other way? // TODO this does not seem to do anything - maybe we need to pass it to OpenRefine in some other way?
} }
TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions); TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions);
// Add reconciliation statistics
if (dataReader.columnReconciled != null) {
for(int i = 0; i != dataReader.columnReconciled.size(); i++) {
if (dataReader.columnReconciled.get(i)) {
project.columnModel.columns.get(i).setReconStats(ReconStats.create(project, i));
}
}
}
} catch (IOException e1) { } catch (IOException e1) {
e1.printStackTrace(); e1.printStackTrace();
} catch (ParseException e1) { } catch (ParseException e1) {
@ -395,4 +519,6 @@ public class WikitextImporter extends TabularImportingParserBase {
e1.printStackTrace(); e1.printStackTrace();
} }
} }
} }

View File

@ -232,6 +232,30 @@ public class StandardReconConfig extends ReconConfig {
public String getBriefDescription(Project project, String columnName) { public String getBriefDescription(Project project, String columnName) {
return "Reconcile cells in column " + columnName + " to type " + typeID; return "Reconcile cells in column " + columnName + " to type " + typeID;
} }
public ReconJob createSimpleJob(String query) {
/* Same as createJob, but for simpler queries
* without any properties. This is much easier
* to generate as there is no need for a Project,
* Row and Cell: this means the job can be created
* outside the usual context of reconciliation (e.g.
* in an importer).
*/
StandardReconJob job = new StandardReconJob();
try {
StringWriter stringWriter = new StringWriter();
JSONWriter jsonWriter = new JSONWriter(stringWriter);
jsonWriter.object();
jsonWriter.key("query");
jsonWriter.value(query);
jsonWriter.endObject();
job.text = query;
job.code = stringWriter.toString();
return job;
} catch (JSONException _) {
return null;
}
}
@Override @Override
public ReconJob createJob(Project project, int rowIndex, Row row, public ReconJob createJob(Project project, int rowIndex, Row row,

View File

@ -114,6 +114,7 @@
"store-blank": "Store blank rows", "store-blank": "Store blank rows",
"store-nulls": "Store blank cells as nulls", "store-nulls": "Store blank cells as nulls",
"blank-spanning-cells": "Pad cells spanning over multiple rows or columns with nulls", "blank-spanning-cells": "Pad cells spanning over multiple rows or columns with nulls",
"wiki-base-url": "Reconcile to wiki with base URL:",
"store-source": "Store file source <br/>(file names, URLs)<br/>in each row", "store-source": "Store file source <br/>(file names, URLs)<br/>in each row",
"preserve-empty": "Preserve empty strings", "preserve-empty": "Preserve empty strings",
"trim": "Trim leading &amp; trailing whitespace from strings", "trim": "Trim leading &amp; trailing whitespace from strings",

View File

@ -1,9 +1,11 @@
<div class="grid-layout layout-tightest"><table> <div class="grid-layout layout-loose"><table>
<tr><td colspan="2" id="or-import-colsep"></td></tr> <tr><td colspan="2" id="or-import-colsep"></td></tr>
<tr><td width="1%"><input type="checkbox" bind="wikiCheckbox" id="$reconcileWiki" /></td><td><label for="$reconcileWiki" id="or-import-wiki-base-url"></label>
<input bind="wikiUrlInput" type="text" class="lightweight" size="30" id="$wikiUrl" /></td></tr>
<tr><td width="1%"><input type="checkbox" bind="limitCheckbox" id="$limit" /></td> <tr><td width="1%"><input type="checkbox" bind="limitCheckbox" id="$limit" /></td>
<td><label for="$limit" id="or-import-load"></label></td> <td><label for="$limit" id="or-import-load"></label>
<td><input bind="limitInput" type="text" class="lightweight" size="2" value="0" /> <input bind="limitInput" type="text" class="lightweight" size="2" value="0" />
<label for="$limit" id="or-import-rows2"></label></td></tr> <label for="$limit" id="or-import-rows2"></label></td></tr>
<tr><td width="1%"><input type="checkbox" bind="guessCellValueTypesCheckbox" id="$guess" /></td> <tr><td width="1%"><input type="checkbox" bind="guessCellValueTypesCheckbox" id="$guess" /></td>
<td><label for="$guess" id="or-import-parseCell"></label></td></tr> <td><label for="$guess" id="or-import-parseCell"></label></td></tr>
@ -19,7 +21,7 @@
<td><label for="$include-file-sources" id="or-import-source"></label></td></tr> <td><label for="$include-file-sources" id="or-import-source"></label></td></tr>
<tr> <tr>
<td style="text-align: right;">&nbsp;</td> <td width="1%"></td>
<td width="1%"><button class="button" bind="previewButton"></button></td> <td><button class="button" bind="previewButton"></button></td>
</tr> </tr>
</table></div> </table></div>

View File

@ -75,6 +75,11 @@ Refine.WikitextParserUI.prototype.getOptions = function() {
} }
return def; return def;
}; };
if (this._optionContainerElmts.wikiCheckbox[0].checked) {
options.wikiUrl = this._optionContainerElmts.wikiUrlInput[0].value;
} else {
options.wikiUrl = null;
}
if (this._optionContainerElmts.limitCheckbox[0].checked) { if (this._optionContainerElmts.limitCheckbox[0].checked) {
options.limit = parseIntDefault(this._optionContainerElmts.limitInput[0].value, -1); options.limit = parseIntDefault(this._optionContainerElmts.limitInput[0].value, -1);
} else { } else {
@ -101,6 +106,7 @@ Refine.WikitextParserUI.prototype._initialize = function() {
this._optionContainerElmts.previewButton.html($.i18n._('core-buttons')["update-preview"]); this._optionContainerElmts.previewButton.html($.i18n._('core-buttons')["update-preview"]);
$('#or-import-wiki-base-url').text($.i18n._('core-index-parser')["wiki-base-url"]);
$('#or-import-load').text($.i18n._('core-index-parser')["load-at-most"]); $('#or-import-load').text($.i18n._('core-index-parser')["load-at-most"]);
$('#or-import-rows2').text($.i18n._('core-index-parser')["rows-data"]); $('#or-import-rows2').text($.i18n._('core-index-parser')["rows-data"]);
$('#or-import-parseCell').html($.i18n._('core-index-parser')["parse-cell"]); $('#or-import-parseCell').html($.i18n._('core-index-parser')["parse-cell"]);
@ -118,8 +124,13 @@ Refine.WikitextParserUI.prototype._initialize = function() {
}); });
}); });
*/ */
// Pre-fill the wiki base URL option from the parser configuration.
// wikiUrl is null when reconciliation is switched off (getOptions sends
// null in that case), so it has to be checked *before* calling toString()
// on it; in that case the checkbox and the text input are left untouched.
var wikiUrl = this._config.wikiUrl;
if (wikiUrl !== null && wikiUrl !== undefined) {
  this._optionContainerElmts.wikiUrlInput[0].value = wikiUrl.toString();
  this._optionContainerElmts.wikiCheckbox.prop("checked", true);
}
if (this._config.limit > 0) { if (this._config.limit > 0) {
this._optionContainerElmts.limitCheckbox.prop("checked", true); this._optionContainerElmts.limitCheckbox.prop("checked", true);
this._optionContainerElmts.limitInput[0].value = this._config.limit.toString(); this._optionContainerElmts.limitInput[0].value = this._config.limit.toString();