Support reconciliation via sitelinks.

Wikilinks are automatically reconciled at import time.

Related to #56.
This commit is contained in:
Antonin Delpeuch 2017-08-15 20:17:24 +01:00
parent aa4517ba58
commit 86dc240335
5 changed files with 180 additions and 16 deletions

View File

@ -4,6 +4,7 @@ import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.json.JSONObject;
import org.slf4j.Logger;
@ -45,9 +46,15 @@ import xtc.parser.ParseException;
import com.google.refine.ProjectMetadata;
import com.google.refine.importing.ImportingJob;
import com.google.refine.model.Cell;
import com.google.refine.model.Project;
import com.google.refine.model.Recon;
import com.google.refine.model.ReconCandidate;
import com.google.refine.model.ReconStats;
import com.google.refine.model.recon.StandardReconConfig.ColumnDetail;
import com.google.refine.util.JSONUtilities;
import com.google.refine.model.recon.StandardReconConfig;
import com.google.refine.model.recon.ReconJob;
public class WikitextImporter extends TabularImportingParserBase {
@ -64,19 +71,22 @@ public class WikitextImporter extends TabularImportingParserBase {
JSONUtilities.safePut(options, "guessCellValueTypes", false);
JSONUtilities.safePut(options, "blankSpanningCells", true);
JSONUtilities.safePut(options, "wikiUrl", "https://en.wikipedia.org/wiki/");
return options;
}
private class SpanningCell {
public String value;
public String reconciled;
public int colspan;
public int rowspan;
public int row;
public int col;
SpanningCell(String value, int row, int col, int rowspan, int colspan) {
SpanningCell(String value, String reconciled, int row, int col, int rowspan, int colspan) {
this.value = value;
this.reconciled = reconciled;
this.row = row;
this.col = col;
this.rowspan = rowspan;
@ -84,11 +94,28 @@ public class WikitextImporter extends TabularImportingParserBase {
}
}
private class WikilinkedCell {
public String internalLink;
public int row;
public int col;
WikilinkedCell(String internalLink, int row, int col) {
this.internalLink = internalLink;
this.row = row;
this.col = col;
}
public String toURL(String wikiBaseUrl) {
return wikiBaseUrl + internalLink;
}
}
public class WikitextTableVisitor extends AstVisitor<WtNode> {
public String caption;
public List<String> header;
public List<List<String>> rows;
public List<WikilinkedCell> wikilinkedCells;
private List<String> currentRow;
private boolean blankSpanningCells;
@ -103,12 +130,14 @@ public class WikitextImporter extends TabularImportingParserBase {
private int colspan;
private int rowspan;
private int spanningCellIdx;
private List<String> internalLinksInCell;
public WikitextTableVisitor(boolean blankSpanningCells) {
this.blankSpanningCells = blankSpanningCells;
caption = null;
header = new ArrayList<String>();
rows = new ArrayList<List<String>>();
wikilinkedCells = new ArrayList<WikilinkedCell>();
spanningCells = new ArrayList<SpanningCell>();
cellStringBuilder = null;
xmlAttrStringBuilder = null;
@ -118,6 +147,7 @@ public class WikitextImporter extends TabularImportingParserBase {
rowspan = 0;
rowId = -1;
spanningCellIdx = 0;
internalLinksInCell = new ArrayList<String>();
}
@Override
@ -181,14 +211,29 @@ public class WikitextImporter extends TabularImportingParserBase {
if (currentRow != null) {
rowspan = 1;
colspan = 1;
internalLinksInCell.clear();
String value = renderCellAsString(e);
int colId = currentRow.size();
// Add the cell to the row we are currently building
currentRow.add(value);
// Reconcile it if we found exactly one link in the cell
String reconciled = null;
if (internalLinksInCell.size() == 1) {
reconciled = internalLinksInCell.get(0);
wikilinkedCells.add(new WikilinkedCell(reconciled, rowId, colId));
}
// Mark it as spanning if we found the tags
if (colspan > 1 || rowspan > 1) {
SpanningCell spanningCell = new SpanningCell(
value, rowId, currentRow.size()-1, rowspan, colspan);
value, reconciled, rowId, colId, rowspan, colspan);
spanningCells.add(spanningCellIdx, spanningCell);
}
// Add all spanning cells that need to be inserted after this one.
addSpanningCells();
}
}
@ -208,6 +253,9 @@ public class WikitextImporter extends TabularImportingParserBase {
currentRow.add(null);
} else {
currentRow.add(cell.value);
if (cell.reconciled != null) {
wikilinkedCells.add(new WikilinkedCell(cell.reconciled, rowId, currentRow.size()-1));
}
}
}
}
@ -230,13 +278,12 @@ public class WikitextImporter extends TabularImportingParserBase {
iterate(e);
try {
int attrValue = Integer.parseInt(xmlAttrStringBuilder.toString());
if (currentXmlAttr.equals("colspan")) {
if ("colspan".equals(currentXmlAttr)) {
colspan = attrValue;
} else if (currentXmlAttr.equals("rowspan")) {
} else if ("rowspan".equals(currentXmlAttr)) {
rowspan = attrValue;
}
} catch (NumberFormatException _) {
;
}
currentXmlAttr = null;
xmlAttrStringBuilder = null;
@ -286,6 +333,7 @@ public class WikitextImporter extends TabularImportingParserBase {
public void visit(WtInternalLink e) {
    // Resolve the link target once; it is both remembered for the cell
    // being built (the cell visitor reconciles cells holding exactly one
    // link) and exposed as the "current" link while the children of this
    // node are visited.
    String linkTarget = e.getTarget().getAsString();
    internalLinksInCell.add(linkTarget);
    currentInternalLink = linkTarget;
    iterate(e);
    // We are leaving the link: nothing visited from here on belongs to it.
    currentInternalLink = null;
}
@ -307,10 +355,13 @@ public class WikitextImporter extends TabularImportingParserBase {
public class WikiTableDataReader implements TableDataReader {
private int currentRow = -1;
private WikitextTableVisitor visitor = null;
private List<List<Recon>> reconList = null;
private List<Boolean> columnReconciled = null;
public WikiTableDataReader(WikitextTableVisitor visitor) {
this.visitor = visitor;
currentRow = -1;
reconList = null;
}
@Override
@ -322,16 +373,74 @@ public class WikitextImporter extends TabularImportingParserBase {
} else if(currentRow < this.visitor.rows.size()) {
origRow = this.visitor.rows.get(currentRow);
}
currentRow++;
if (origRow != null) {
row = new ArrayList<Object>();
for (int i = 0; i < origRow.size(); i++) {
row.add(origRow.get(i));
Recon recon = null;
if (currentRow >= 0 && reconList != null) {
recon = reconList.get(currentRow).get(i);
}
row.add(new Cell(origRow.get(i), recon));
}
}
currentRow++;
return row;
}
/**
 * Reconciles every wikilinked cell found by the visitor and stores the
 * results in {@code reconList} (one entry per cell, row by row) and
 * {@code columnReconciled} (one flag per column, true as soon as one cell
 * of that column received a recon).
 *
 * Cells are sent to the reconciliation service in batches. If the base URL
 * is missing, both lists are left {@code null}, which callers use as the
 * "no reconciliation happened" signal.
 *
 * @param wikiBaseUrl prefix prepended to each internal link target to form
 *                    the query sent to the reconciliation service
 */
private void reconcileToQids(String wikiBaseUrl) {
    if (wikiBaseUrl == null || "null".equals(wikiBaseUrl)) {
        return; // TODO: more thorough URL validation instead
    }

    // Init the list of recons: one null slot per cell of every row.
    reconList = new ArrayList<List<Recon>>();
    int columnCount = 0;
    for (List<String> tableRow : this.visitor.rows) {
        int rowSize = tableRow.size();
        columnCount = Math.max(columnCount, rowSize);
        List<Recon> rowRecons = new ArrayList<Recon>(rowSize);
        for (int j = 0; j < rowSize; j++) {
            rowRecons.add(null);
        }
        reconList.add(rowRecons);
    }

    // One flag per COLUMN. It is indexed by cell.col below, so it must be
    // as long as the widest row, not as long as the number of rows.
    columnReconciled = new ArrayList<Boolean>(columnCount);
    for (int c = 0; c < columnCount; c++) {
        columnReconciled.add(false);
    }

    // Wikidata reconciliation endpoint, hardcoded because the user might not have it in their services
    StandardReconConfig cfg = new StandardReconConfig(
        "https://tools.wmflabs.org/openrefine-wikidata/en/api",
        "http://www.wikidata.org/entity/",
        "http://www.wikidata.org/prop/direct/",
        "",
        "entity",
        true,
        new ArrayList<ColumnDetail>(),
        1
    );

    final int batchSize = 50;
    List<WikilinkedCell> linkedCells = this.visitor.wikilinkedCells;
    int totalSize = linkedCells.size();
    for (int batchStart = 0; batchStart < totalSize; batchStart += batchSize) {
        int batchEnd = Math.min(batchStart + batchSize, totalSize);

        // Build one job per linked cell of this batch.
        List<ReconJob> jobs = new ArrayList<ReconJob>(batchEnd - batchStart);
        for (int i = batchStart; i < batchEnd; i++) {
            jobs.add(cfg.createSimpleJob(linkedCells.get(i).toURL(wikiBaseUrl)));
        }

        List<Recon> recons = cfg.batchRecon(jobs, 0);
        if (recons == null) {
            continue;
        }
        // Defensive: never read past the results actually returned.
        int answered = Math.min(recons.size(), batchEnd - batchStart);
        for (int k = 0; k < answered; k++) {
            Recon recon = recons.get(k);
            if (recon == null) {
                continue;
            }
            WikilinkedCell cell = linkedCells.get(batchStart + k);
            // Ignore cells whose coordinates do not fall inside the data
            // rows instead of failing the whole import.
            if (cell.row < 0 || cell.row >= reconList.size()) {
                continue;
            }
            List<Recon> rowRecons = reconList.get(cell.row);
            if (cell.col < 0 || cell.col >= rowRecons.size()) {
                continue;
            }
            rowRecons.set(cell.col, recon);
            columnReconciled.set(cell.col, true);
        }
    }
}
}
@Override
@ -377,7 +486,13 @@ public class WikitextImporter extends TabularImportingParserBase {
final WikitextTableVisitor vs = new WikitextTableVisitor(blankSpanningCells);
vs.go(parsedArticle);
TableDataReader dataReader = new WikiTableDataReader(vs);
WikiTableDataReader dataReader = new WikiTableDataReader(vs);
// Reconcile if needed
String wikiUrl = JSONUtilities.getString(options, "wikiUrl", null);
if (wikiUrl != null) {
dataReader.reconcileToQids(wikiUrl);
}
JSONUtilities.safePut(options, "headerLines", 1);
@ -388,6 +503,15 @@ public class WikitextImporter extends TabularImportingParserBase {
}
TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions);
// Add reconciliation statistics
if (dataReader.columnReconciled != null) {
for(int i = 0; i != dataReader.columnReconciled.size(); i++) {
if (dataReader.columnReconciled.get(i)) {
project.columnModel.columns.get(i).setReconStats(ReconStats.create(project, i));
}
}
}
} catch (IOException e1) {
e1.printStackTrace();
} catch (ParseException e1) {
@ -395,4 +519,6 @@ public class WikitextImporter extends TabularImportingParserBase {
e1.printStackTrace();
}
}
}

View File

@ -233,6 +233,30 @@ public class StandardReconConfig extends ReconConfig {
return "Reconcile cells in column " + columnName + " to type " + typeID;
}
public ReconJob createSimpleJob(String query) {
/* Same as createJob, but for simpler queries
* without any properties. This is much easier
* to generate as there is no need for a Project,
* Row and Cell: this means the job can be created
* outside the usual context of reconciliation (e.g.
* in an importer).
*/
StandardReconJob job = new StandardReconJob();
try {
StringWriter stringWriter = new StringWriter();
JSONWriter jsonWriter = new JSONWriter(stringWriter);
jsonWriter.object();
jsonWriter.key("query");
jsonWriter.value(query);
jsonWriter.endObject();
job.text = query;
job.code = stringWriter.toString();
return job;
} catch (JSONException _) {
return null;
}
}
@Override
public ReconJob createJob(Project project, int rowIndex, Row row,
String columnName, Cell cell) {

View File

@ -114,6 +114,7 @@
"store-blank": "Store blank rows",
"store-nulls": "Store blank cells as nulls",
"blank-spanning-cells": "Pad cells spanning over multiple rows or columns with nulls",
"wiki-base-url": "Reconcile to wiki with base URL:",
"store-source": "Store file source <br/>(file names, URLs)<br/>in each row",
"preserve-empty": "Preserve empty strings",
"trim": "Trim leading &amp; trailing whitespace from strings",

View File

@ -1,9 +1,11 @@
<div class="grid-layout layout-tightest"><table>
<div class="grid-layout layout-loose"><table>
<tr><td colspan="2" id="or-import-colsep"></td></tr>
<tr><td width="1%"><input type="checkbox" bind="wikiCheckbox" id="$reconcileWiki" /></td><td><label for="$reconcileWiki" id="or-import-wiki-base-url"></label>
<input bind="wikiUrlInput" type="text" class="lightweight" size="30" id="$wikiUrl" /></td></tr>
<tr><td width="1%"><input type="checkbox" bind="limitCheckbox" id="$limit" /></td>
<td><label for="$limit" id="or-import-load"></label></td>
<td><input bind="limitInput" type="text" class="lightweight" size="2" value="0" />
<td><label for="$limit" id="or-import-load"></label>
<input bind="limitInput" type="text" class="lightweight" size="2" value="0" />
<label for="$limit" id="or-import-rows2"></label></td></tr>
<tr><td width="1%"><input type="checkbox" bind="guessCellValueTypesCheckbox" id="$guess" /></td>
<td><label for="$guess" id="or-import-parseCell"></label></td></tr>
@ -19,7 +21,7 @@
<td><label for="$include-file-sources" id="or-import-source"></label></td></tr>
<tr>
<td style="text-align: right;">&nbsp;</td>
<td width="1%"><button class="button" bind="previewButton"></button></td>
<td width="1%"></td>
<td><button class="button" bind="previewButton"></button></td>
</tr>
</table></div>

View File

@ -75,6 +75,11 @@ Refine.WikitextParserUI.prototype.getOptions = function() {
}
return def;
};
if (this._optionContainerElmts.wikiCheckbox[0].checked) {
options.wikiUrl = this._optionContainerElmts.wikiUrlInput[0].value;
} else {
options.wikiUrl = null;
}
if (this._optionContainerElmts.limitCheckbox[0].checked) {
options.limit = parseIntDefault(this._optionContainerElmts.limitInput[0].value, -1);
} else {
@ -101,6 +106,7 @@ Refine.WikitextParserUI.prototype._initialize = function() {
this._optionContainerElmts.previewButton.html($.i18n._('core-buttons')["update-preview"]);
$('#or-import-wiki-base-url').text($.i18n._('core-index-parser')["wiki-base-url"]);
$('#or-import-load').text($.i18n._('core-index-parser')["load-at-most"]);
$('#or-import-rows2').text($.i18n._('core-index-parser')["rows-data"]);
$('#or-import-parseCell').html($.i18n._('core-index-parser')["parse-cell"]);
@ -118,7 +124,12 @@ Refine.WikitextParserUI.prototype._initialize = function() {
});
});
*/
console.log(this._config);
var wikiUrl = this._config.wikiUrl.toString();
if (wikiUrl != null) {
this._optionContainerElmts.wikiUrlInput[0].value = wikiUrl;
this._optionContainerElmts.wikiCheckbox.prop("checked", true);
}
if (this._config.limit > 0) {
this._optionContainerElmts.limitCheckbox.prop("checked", true);