Foundations of the edit inspector backend

This commit is contained in:
Antonin Delpeuch 2018-01-08 12:07:27 +00:00
parent 38176189e4
commit 990a404c76
12 changed files with 471 additions and 6 deletions

View File

@ -37,6 +37,8 @@ import java.io.IOException;
import java.io.LineNumberReader;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.List;
import java.util.Properties;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
@ -49,6 +51,9 @@ import com.google.refine.browsing.Engine;
import com.google.refine.commands.Command;
import org.openrefine.wikidata.exporters.QuickStatementsExporter;
import org.openrefine.wikidata.qa.EditInspector;
import org.openrefine.wikidata.qa.QAWarning;
import org.openrefine.wikidata.schema.ItemUpdate;
import org.openrefine.wikidata.schema.WikibaseSchema;
import com.google.refine.model.Project;
import com.google.refine.util.ParsingUtilities;
@ -68,20 +73,36 @@ public class PreviewWikibaseSchemaCommand extends Command {
JSONObject json = ParsingUtilities.evaluateJsonStringToObject(jsonString);
WikibaseSchema schema = WikibaseSchema.reconstruct(json);
// Evaluate project
Engine engine = getEngine(request, project);
List<ItemUpdate> editBatch = schema.evaluate(project, engine);
StringWriter sb = new StringWriter(2048);
JSONWriter writer = new JSONWriter(sb, 32);
writer.object();
{
StringWriter stringWriter = new StringWriter();
QuickStatementsExporter exporter = new QuickStatementsExporter();
Engine engine = getEngine(request, project);
exporter.translateSchema(project, engine, schema, stringWriter);
// Inspect the edits and generate warnings
EditInspector inspector = new EditInspector();
inspector.inspect(editBatch);
writer.key("warnings");
writer.array();
for (QAWarning warning : inspector.getWarnings()) {
warning.write(writer, new Properties());
}
writer.endArray();
// Export to QuickStatements
QuickStatementsExporter exporter = new QuickStatementsExporter();
exporter.translateItemList(editBatch, stringWriter);
String fullQS = stringWriter.toString();
stringWriter = new StringWriter();
LineNumberReader reader = new LineNumberReader(new StringReader(fullQS));
// Only keep the first 50 lines
int maxQSLinesForPreview = 50;
reader.setLineNumber(0);
String line = reader.readLine();

View File

@ -42,7 +42,6 @@ public class QuickStatementsExporter implements WriterExporter {
return "text";
}
@Override
public void export(Project project, Properties options, Engine engine, Writer writer)
throws IOException {
@ -53,9 +52,21 @@ public class QuickStatementsExporter implements WriterExporter {
translateSchema(project, engine, schema, writer);
}
/**
* Exports a project and a schema to a QuickStatements file
* @param project: the project to translate
* @param engine: the engine used for evaluation of the edits
* @param schema: the WikibaseSchema used for translation of tabular data to edits
* @param writer: the writer to which the QS should be written
* @throws IOException
*/
public void translateSchema(Project project, Engine engine, WikibaseSchema schema, Writer writer) throws IOException {
List<ItemUpdate> items = schema.evaluate(project, engine);
for (ItemUpdate item : items) {
translateItemList(items, writer);
}
/**
 * Translates a batch of item updates to QuickStatements.
 *
 * @param editBatch the item updates to translate
 * @param writer the writer the QuickStatements are written to
 * @throws IOException if writing to the writer fails
 */
public void translateItemList(List<ItemUpdate> editBatch, Writer writer) throws IOException {
    for (ItemUpdate update : editBatch) {
        translateItem(update, writer);
    }
}
@ -73,7 +84,7 @@ public class QuickStatementsExporter implements WriterExporter {
protected void translateItem(ItemUpdate item, Writer writer) throws IOException {
String qid = item.getItemId().getId();
if (item.getItemId().getId() == "Q0") {
if (item.isNew()) {
writer.write("CREATE\n");
qid = "LAST";
item.normalizeLabelsAndAliases();

View File

@ -0,0 +1,57 @@
package org.openrefine.wikidata.qa;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.openrefine.wikidata.qa.scrutinizers.EditScrutinizer;
import org.openrefine.wikidata.qa.scrutinizers.NewItemScrutinizer;
import org.openrefine.wikidata.schema.ItemUpdate;
/**
 * Runs a collection of edit scrutinizers on an edit batch.
 *
 * @author antonin
 */
public class EditInspector {

    // Registered scrutinizers, keyed by their class name
    private Map<String, EditScrutinizer> scrutinizers;
    // Shared store collecting the warnings emitted by all scrutinizers
    private QAWarningStore warningStore;

    public EditInspector() {
        this.scrutinizers = new HashMap<>();
        this.warningStore = new QAWarningStore();

        // All known scrutinizers are registered here
        register(new NewItemScrutinizer());
    }

    /**
     * Adds a new scrutinizer to the inspector and wires it to the shared warning store.
     *
     * @param scrutinizer the scrutinizer to register
     */
    public void register(EditScrutinizer scrutinizer) {
        scrutinizers.put(scrutinizer.getClass().getName(), scrutinizer);
        scrutinizer.setStore(warningStore);
    }

    /**
     * Inspects a batch of edits with the registered scrutinizers.
     *
     * @param editBatch the candidate edits to check
     */
    public void inspect(List<ItemUpdate> editBatch) {
        for (EditScrutinizer scrutinizer : scrutinizers.values()) {
            scrutinizer.scrutinize(editBatch);
        }
    }

    /**
     * Retrieves the warnings accumulated by the inspections so far.
     *
     * @return the aggregated warnings
     */
    public List<QAWarning> getWarnings() {
        return warningStore.getWarnings();
    }
}

View File

@ -0,0 +1,97 @@
package org.openrefine.wikidata.qa;
import org.openrefine.wikidata.utils.JacksonJsonizable;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
/**
 * A class to represent a QA warning emitted by the Wikidata schema.
 * This could probably be reused at a broader scale, for instance for
 * Data Package validation.
 *
 * @author antonin
 *
 */
public class QAWarning extends JacksonJsonizable implements Comparable<QAWarning> {

    public enum Severity {
        INFO, // We just report something to the user but it is probably fine
        WARNING, // Edits that look wrong but in some cases they are actually fine
        IMPORTANT, // There is almost surely something wrong about the edit but in rare cases we might want to allow it
        CRITICAL, // We should never edit if there is a critical issue
    }

    // The type of QA warning emitted
    private String type;
    // The key for aggregation of other QA warnings together - this specializes the type
    private String bucketId;
    // The severity of the issue
    private Severity severity;
    // The number of times this issue was found
    private int count;

    @JsonCreator
    public QAWarning(
            @JsonProperty("type") String type,
            @JsonProperty("bucket_id") String bucketId,
            @JsonProperty("severity") Severity severity,
            @JsonProperty("count") int count) {
        this.type = type;
        this.bucketId = bucketId;
        this.severity = severity;
        this.count = count;
    }

    /**
     * Returns the full key for aggregation of QA warnings.
     * Warnings with equal aggregation ids are merged together.
     *
     * @return the type, specialized by the bucket id when one is set
     */
    public String getAggregationId() {
        if (this.bucketId != null) {
            return this.type + "_" + this.bucketId;
        } else {
            return this.type;
        }
    }

    /**
     * Aggregates another QA warning with the same aggregation id:
     * counts are summed and the highest severity is kept.
     *
     * @param other the warning to merge into this one
     */
    public void aggregate(QAWarning other) {
        // Use equals(), not ==: == compares String references, so distinct but
        // equal aggregation ids would (non-deterministically) fail this check.
        assert other.getAggregationId().equals(getAggregationId());
        this.count += other.getCount();
        if (this.severity.compareTo(other.getSeverity()) < 0) {
            this.severity = other.getSeverity();
        }
    }

    @JsonProperty("type")
    public String getType() {
        return type;
    }

    // Serialized as "bucket_id" to match the @JsonCreator parameter above,
    // so that a serialized warning can be deserialized back into an equal one.
    @JsonProperty("bucket_id")
    public String getBucketId() {
        return bucketId;
    }

    @JsonProperty("severity")
    public Severity getSeverity() {
        return severity;
    }

    @JsonProperty("count")
    public int getCount() {
        return count;
    }

    /**
     * Warnings are sorted by decreasing severity.
     */
    @Override
    public int compareTo(QAWarning other) {
        return other.getSeverity().compareTo(severity);
    }
}

View File

@ -0,0 +1,60 @@
package org.openrefine.wikidata.qa;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.fasterxml.jackson.annotation.JsonProperty;
/**
 * A store for QA warnings which aggregates them by type.
 *
 * @author antonin
 */
public class QAWarningStore {

    // Aggregated warnings, keyed by QAWarning.getAggregationId()
    private Map<String, QAWarning> map;
    // Highest severity seen so far (INFO when no warning has been stored)
    private QAWarning.Severity maxSeverity;

    public QAWarningStore() {
        this.map = new HashMap<>();
        this.maxSeverity = QAWarning.Severity.INFO;
    }

    /**
     * Stores a warning, aggregating it with any existing warning
     * that shares its aggregation id.
     *
     * @param warning the warning to add
     */
    public void addWarning(QAWarning warning) {
        QAWarning.Severity severity = warning.getSeverity();
        if (severity.compareTo(maxSeverity) > 0) {
            maxSeverity = severity;
        }
        // putIfAbsent returns the previous mapping (if any) in a single lookup
        QAWarning existing = map.putIfAbsent(warning.getAggregationId(), warning);
        if (existing != null) {
            existing.aggregate(warning);
        }
    }

    /**
     * Returns the list of aggregated warnings, ordered by decreasing severity.
     * Public so that callers outside this package can read the results.
     */
    @JsonProperty("warnings")
    public List<QAWarning> getWarnings() {
        List<QAWarning> result = new ArrayList<>(map.values());
        Collections.sort(result);
        return result;
    }

    /**
     * Returns the maximum severity of the stored warnings (INFO if empty).
     */
    @JsonProperty("max_severity")
    public QAWarning.Severity getMaxSeverity() {
        return maxSeverity;
    }
}

View File

@ -0,0 +1,59 @@
package org.openrefine.wikidata.qa.scrutinizers;
import java.util.List;
import org.openrefine.wikidata.qa.QAWarning;
import org.openrefine.wikidata.qa.QAWarningStore;
import org.openrefine.wikidata.schema.ItemUpdate;
/**
 * Base class for checkers that inspect a batch of candidate edits
 * and report issues about them as QA warnings.
 *
 * @author antonin
 *
 */
public abstract class EditScrutinizer {

    private QAWarningStore store;

    public void setStore(QAWarningStore store) {
        this.store = store;
    }

    /**
     * Reads the candidate edits and emits warnings in the store.
     *
     * @param edit the list of ItemUpdates to scrutinize
     */
    public abstract void scrutinize(List<ItemUpdate> edit);

    // Shared emitter: one warning of the given type and severity,
    // with no bucket and a count of 1.
    private void emitWarning(String type, QAWarning.Severity severity) {
        store.addWarning(new QAWarning(type, null, severity, 1));
    }

    /**
     * Helper to be used by subclasses to emit INFO warnings.
     *
     * @param type the type of the warning to emit
     */
    protected void info(String type) {
        emitWarning(type, QAWarning.Severity.INFO);
    }

    /**
     * Helper to be used by subclasses to emit warnings.
     *
     * @param type the type of the warning to emit
     */
    protected void warning(String type) {
        emitWarning(type, QAWarning.Severity.WARNING);
    }

    /**
     * Helper to be used by subclasses to emit important warnings.
     *
     * @param type the type of the warning to emit
     */
    protected void important(String type) {
        emitWarning(type, QAWarning.Severity.IMPORTANT);
    }

    /**
     * Helper to be used by subclasses to emit critical warnings.
     *
     * @param type the type of the warning to emit
     */
    protected void critical(String type) {
        emitWarning(type, QAWarning.Severity.CRITICAL);
    }
}

View File

@ -0,0 +1,19 @@
package org.openrefine.wikidata.qa.scrutinizers;
import java.util.List;
import org.openrefine.wikidata.schema.ItemUpdate;
public abstract class ItemEditScrutinizer extends EditScrutinizer {

    /**
     * Dispatches the batch item by item, skipping null updates.
     */
    @Override
    public void scrutinize(List<ItemUpdate> edit) {
        for (ItemUpdate itemUpdate : edit) {
            if (itemUpdate.isNull()) {
                continue;
            }
            scrutinize(itemUpdate);
        }
    }

    /**
     * Inspects a single (non-null) item update.
     *
     * @param update the update to inspect
     */
    public abstract void scrutinize(ItemUpdate update);
}

View File

@ -0,0 +1,44 @@
package org.openrefine.wikidata.qa.scrutinizers;
import org.openrefine.wikidata.schema.ItemUpdate;
import org.wikidata.wdtk.datamodel.interfaces.StatementGroup;
/**
 * A scrutinizer that inspects new items.
 *
 * @author antonin
 */
public class NewItemScrutinizer extends ItemEditScrutinizer {

    @Override
    public void scrutinize(ItemUpdate update) {
        // Only new items are of interest here
        if (!update.isNew()) {
            return;
        }
        info("new-item-created");

        if (update.getLabels().isEmpty() && update.getAliases().isEmpty()) {
            important("new-item-without-labels-or-aliases");
        }
        if (update.getDescriptions().isEmpty()) {
            warning("new-item-without-descriptions");
        }
        if (!update.getDeletedStatements().isEmpty()) {
            warning("new-item-with-deleted-statements");
        }

        // Look for an "instance of" (P31) or "subclass of" (P279) claim
        boolean hasType = false;
        for (StatementGroup group : update.getAddedStatementGroups()) {
            String pid = group.getProperty().getId();
            if ("P31".equals(pid) || "P279".equals(pid)) {
                hasType = true;
                break;
            }
        }
        if (!hasType) {
            warning("new-item-without-P31-or-P279");
        }
    }
}

View File

@ -0,0 +1,53 @@
package org.openrefine.wikidata.qa.scrutinizers;
import java.util.Iterator;
import org.wikidata.wdtk.datamodel.interfaces.Reference;
import org.wikidata.wdtk.datamodel.interfaces.Snak;
import org.wikidata.wdtk.datamodel.interfaces.Statement;
public abstract class SnakScrutinizer extends StatementScrutinizer {

    /** Inspects a single snak that is being added. */
    public abstract void scrutinizeAdded(Snak snak);

    /** Inspects a single snak that is being deleted. */
    public abstract void scrutinizeDeleted(Snak snak);

    @Override
    public void scrutinizeAdded(Statement statement) {
        scrutinizeStatement(statement, true);
    }

    @Override
    public void scrutinizeDeleted(Statement statement) {
        scrutinizeStatement(statement, false);
    }

    // Walks every snak of a statement, in order: main snak, then
    // qualifiers, then the snaks of each reference.
    private void scrutinizeStatement(Statement statement, boolean added) {
        scrutinizeOne(statement.getClaim().getMainSnak(), added);
        scrutinizeSnakSet(statement.getClaim().getAllQualifiers(), added);
        for (Reference reference : statement.getReferences()) {
            scrutinizeSnakSet(reference.getAllSnaks(), added);
        }
    }

    private void scrutinizeSnakSet(Iterator<Snak> snaks, boolean added) {
        while (snaks.hasNext()) {
            scrutinizeOne(snaks.next(), added);
        }
    }

    // Routes one snak to the added or deleted inspection hook.
    private void scrutinizeOne(Snak snak, boolean added) {
        if (added) {
            scrutinizeAdded(snak);
        } else {
            scrutinizeDeleted(snak);
        }
    }
}

View File

@ -0,0 +1,19 @@
package org.openrefine.wikidata.qa.scrutinizers;
import org.openrefine.wikidata.schema.ItemUpdate;
import org.wikidata.wdtk.datamodel.interfaces.StatementGroup;
public abstract class StatementGroupScrutinizer extends ItemEditScrutinizer {

    /**
     * Dispatches each added statement group to {@link #scrutinizeAdded}.
     * NOTE(review): deleted statement groups are never dispatched here, so
     * scrutinizeDeleted is currently not invoked — confirm this is intended.
     */
    @Override
    public void scrutinize(ItemUpdate update) {
        for (StatementGroup added : update.getAddedStatementGroups()) {
            scrutinizeAdded(added);
        }
    }

    public abstract void scrutinizeAdded(StatementGroup statementGroup);

    public abstract void scrutinizeDeleted(StatementGroup statementGroup);
}

View File

@ -0,0 +1,18 @@
package org.openrefine.wikidata.qa.scrutinizers;
import org.openrefine.wikidata.schema.ItemUpdate;
import org.wikidata.wdtk.datamodel.interfaces.Statement;
public abstract class StatementScrutinizer extends ItemEditScrutinizer {

    /**
     * Dispatches each added statement to {@link #scrutinizeAdded}.
     * NOTE(review): deleted statements are never dispatched here, so
     * scrutinizeDeleted is currently not invoked — confirm this is intended.
     */
    @Override
    public void scrutinize(ItemUpdate update) {
        for (Statement added : update.getAddedStatements()) {
            scrutinizeAdded(added);
        }
    }

    public abstract void scrutinizeAdded(Statement statement);

    public abstract void scrutinizeDeleted(Statement statement);
}

View File

@ -180,4 +180,11 @@ public class ItemUpdate {
}
aliases = filteredAliases;
}
/**
 * Is this update about a new item? New items are identified
 * by the placeholder item id "Q0".
 */
public boolean isNew() {
    String qid = getItemId().getId();
    return "Q0".equals(qid);
}
}