Introduce a scheduler package to reorganize item updates

This commit is contained in:
Antonin Delpeuch 2018-03-01 22:20:29 +00:00
parent 88178d7c04
commit bb044612e0
20 changed files with 1010 additions and 80 deletions

View File

@ -18,35 +18,15 @@ import org.wikidata.wdtk.datamodel.interfaces.ValueVisitor;
* Format documentation: * Format documentation:
* https://www.wikidata.org/wiki/Help:QuickStatements * https://www.wikidata.org/wiki/Help:QuickStatements
* *
* Any new entity id will be
* assumed to be the last one created, represented with "LAST". It is
* fine to make this assumption because we are working on edit batches
* previously scheduled by {@link QuickStatementsUpdateScheduler}.
*
* @author Antonin Delpeuch * @author Antonin Delpeuch
* *
*/ */
public class QSValuePrinter implements ValueVisitor<String> { public class QSValuePrinter implements ValueVisitor<String> {
private final ReconEntityIdValue lastCreatedEntityIdValue;
/**
* Constructor.
*
* Creates a printer for a context where no entity was previously
* created with the "CREATE" command. Any new entity id will not
* be printed.
*/
public QSValuePrinter() {
lastCreatedEntityIdValue = null;
}
/**
* Creates a printer for a context where an entity was previously
* created with the "CREATE" command. If this id is encountered,
* it will be printed as "LAST".
*
* @param lastCreatedEntityIdValue
* the virtual id of the last created entity
*/
public QSValuePrinter(ReconEntityIdValue lastCreatedEntityIdValue) {
this.lastCreatedEntityIdValue = lastCreatedEntityIdValue;
}
@Override @Override
public String visit(DatatypeIdValue value) { public String visit(DatatypeIdValue value) {
@ -57,11 +37,8 @@ public class QSValuePrinter implements ValueVisitor<String> {
@Override @Override
public String visit(EntityIdValue value) { public String visit(EntityIdValue value) {
if (lastCreatedEntityIdValue != null && lastCreatedEntityIdValue.equals(value)) { if (ReconEntityIdValue.class.isInstance(value) && ((ReconEntityIdValue)value).isNew()) {
return "LAST"; return "LAST";
} else if (ReconEntityIdValue.class.isInstance(value)) {
// oops, we are trying to print another newly created entity (not the last one)
return null;
} }
return value.getId(); return value.getId();
} }

View File

@ -12,6 +12,8 @@ import com.google.refine.model.Project;
import org.openrefine.wikidata.schema.WikibaseSchema; import org.openrefine.wikidata.schema.WikibaseSchema;
import org.openrefine.wikidata.updates.ItemUpdate; import org.openrefine.wikidata.updates.ItemUpdate;
import org.openrefine.wikidata.updates.scheduler.ImpossibleSchedulingException;
import org.openrefine.wikidata.updates.scheduler.QuickStatementsUpdateScheduler;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import org.wikidata.wdtk.datamodel.interfaces.Claim; import org.wikidata.wdtk.datamodel.interfaces.Claim;
@ -27,6 +29,9 @@ import org.wikidata.wdtk.datamodel.interfaces.ValueVisitor;
public class QuickStatementsExporter implements WriterExporter { public class QuickStatementsExporter implements WriterExporter {
final static Logger logger = LoggerFactory.getLogger("QuickStatementsExporter"); final static Logger logger = LoggerFactory.getLogger("QuickStatementsExporter");
public static final String impossibleSchedulingErrorMessage =
"This edit batch cannot be performed with QuickStatements due to the structure of its new items.";
public QuickStatementsExporter(){ public QuickStatementsExporter(){
} }
@ -64,10 +69,17 @@ public class QuickStatementsExporter implements WriterExporter {
translateItemList(items, writer); translateItemList(items, writer);
} }
public void translateItemList(List<ItemUpdate> editBatch, Writer writer) throws IOException { public void translateItemList(List<ItemUpdate> updates, Writer writer) throws IOException {
for (ItemUpdate item : editBatch) { QuickStatementsUpdateScheduler scheduler = new QuickStatementsUpdateScheduler();
translateItem(item, writer); try {
List<ItemUpdate> scheduled = scheduler.schedule(updates);
for (ItemUpdate item : scheduled) {
translateItem(item, writer);
}
} catch(ImpossibleSchedulingException e) {
writer.write(impossibleSchedulingErrorMessage);
} }
} }
protected void translateNameDescr(String qid, Set<MonolingualTextValue> values, String prefix, ItemIdValue id, Writer writer) throws IOException { protected void translateNameDescr(String qid, Set<MonolingualTextValue> values, String prefix, ItemIdValue id, Writer writer) throws IOException {
@ -86,7 +98,7 @@ public class QuickStatementsExporter implements WriterExporter {
if (item.isNew()) { if (item.isNew()) {
writer.write("CREATE\n"); writer.write("CREATE\n");
qid = "LAST"; qid = "LAST";
item.normalizeLabelsAndAliases(); item = item.normalizeLabelsAndAliases();
} }
translateNameDescr(qid, item.getLabels(), "L", item.getItemId(), writer); translateNameDescr(qid, item.getLabels(), "L", item.getItemId(), writer);

View File

@ -17,6 +17,9 @@ import org.json.JSONWriter;
import org.openrefine.wikidata.editing.ConnectionManager; import org.openrefine.wikidata.editing.ConnectionManager;
import org.openrefine.wikidata.editing.NewItemLibrary; import org.openrefine.wikidata.editing.NewItemLibrary;
import org.openrefine.wikidata.updates.ItemUpdate; import org.openrefine.wikidata.updates.ItemUpdate;
import org.openrefine.wikidata.updates.scheduler.ImpossibleSchedulingException;
import org.openrefine.wikidata.updates.scheduler.UpdateScheduler;
import org.openrefine.wikidata.updates.scheduler.WikibaseAPIUpdateScheduler;
import org.openrefine.wikidata.schema.WikibaseSchema; import org.openrefine.wikidata.schema.WikibaseSchema;
import org.openrefine.wikidata.schema.entityvalues.ReconEntityIdValue; import org.openrefine.wikidata.schema.entityvalues.ReconEntityIdValue;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -29,6 +32,7 @@ import org.wikidata.wdtk.datamodel.interfaces.ItemDocument;
import org.wikidata.wdtk.datamodel.interfaces.MonolingualTextValue; import org.wikidata.wdtk.datamodel.interfaces.MonolingualTextValue;
import org.wikidata.wdtk.util.WebResourceFetcherImpl; import org.wikidata.wdtk.util.WebResourceFetcherImpl;
import org.wikidata.wdtk.wikibaseapi.ApiConnection; import org.wikidata.wdtk.wikibaseapi.ApiConnection;
import org.wikidata.wdtk.wikibaseapi.TermStatementUpdate;
import org.wikidata.wdtk.wikibaseapi.WikibaseDataEditor; import org.wikidata.wdtk.wikibaseapi.WikibaseDataEditor;
import org.wikidata.wdtk.wikibaseapi.WikibaseDataFetcher; import org.wikidata.wdtk.wikibaseapi.WikibaseDataFetcher;
import org.wikidata.wdtk.wikibaseapi.apierrors.MediaWikiApiErrorException; import org.wikidata.wdtk.wikibaseapi.apierrors.MediaWikiApiErrorException;
@ -215,8 +219,10 @@ public class PerformWikibaseEditsOperation extends EngineDependentOperation {
// Evaluate the schema // Evaluate the schema
List<ItemUpdate> itemDocuments = _schema.evaluate(_project, _engine); List<ItemUpdate> itemDocuments = _schema.evaluate(_project, _engine);
// Group statements by item // Schedule the edit batch
Map<EntityIdValue, ItemUpdate> updates = ItemUpdate.groupBySubject(itemDocuments); WikibaseAPIUpdateScheduler scheduler = new WikibaseAPIUpdateScheduler();
List<ItemUpdate> updates = null;
updates = scheduler.schedule(itemDocuments);
/** /**
* TODO: * TODO:
@ -228,7 +234,7 @@ public class PerformWikibaseEditsOperation extends EngineDependentOperation {
NewItemLibrary newItemLibrary = new NewItemLibrary(); NewItemLibrary newItemLibrary = new NewItemLibrary();
DataObjectFactory factory = new DataObjectFactoryImpl(); DataObjectFactory factory = new DataObjectFactoryImpl();
List<ItemUpdate> remainingItemUpdates = new ArrayList<>(); List<ItemUpdate> remainingItemUpdates = new ArrayList<>();
remainingItemUpdates.addAll(updates.values()); remainingItemUpdates.addAll(updates);
int totalItemUpdates = updates.size(); int totalItemUpdates = updates.size();
int updatesDone = 0; int updatesDone = 0;
int batchSize = 50; int batchSize = 50;
@ -295,6 +301,20 @@ public class PerformWikibaseEditsOperation extends EngineDependentOperation {
} else { } else {
// Existing item // Existing item
ItemDocument currentDocument = (ItemDocument)currentDocs.get(update.getItemId().getId()); ItemDocument currentDocument = (ItemDocument)currentDocs.get(update.getItemId().getId());
/*
TermStatementUpdate tsUpdate = new TermStatementUpdate(
currentDocument,
update.getAddedStatements().stream().collect(Collectors.toList()),
update.getDeletedStatements().stream().collect(Collectors.toList()),
update.getLabels().stream().collect(Collectors.toList()),
update.getDescriptions().stream().collect(Collectors.toList()),
update.getAliases().stream().collect(Collectors.toList()),
new ArrayList<MonolingualTextValue>()
);
ObjectMapper mapper = new ObjectMapper();
logger.info(mapper.writeValueAsString(update));
logger.info(update.toString());
logger.info(tsUpdate.getJsonUpdateString()); */
wbde.updateTermsStatements(currentDocument, wbde.updateTermsStatements(currentDocument,
update.getLabels().stream().collect(Collectors.toList()), update.getLabels().stream().collect(Collectors.toList()),
update.getDescriptions().stream().collect(Collectors.toList()), update.getDescriptions().stream().collect(Collectors.toList()),

View File

@ -18,11 +18,15 @@ import org.openrefine.wikidata.qa.scrutinizers.SingleValueScrutinizer;
import org.openrefine.wikidata.qa.scrutinizers.UnsourcedScrutinizer; import org.openrefine.wikidata.qa.scrutinizers.UnsourcedScrutinizer;
import org.openrefine.wikidata.qa.scrutinizers.WhitespaceScrutinizer; import org.openrefine.wikidata.qa.scrutinizers.WhitespaceScrutinizer;
import org.openrefine.wikidata.updates.ItemUpdate; import org.openrefine.wikidata.updates.ItemUpdate;
import org.openrefine.wikidata.updates.scheduler.ImpossibleSchedulingException;
import org.openrefine.wikidata.updates.scheduler.UpdateScheduler;
import org.openrefine.wikidata.updates.scheduler.WikibaseAPIUpdateScheduler;
import org.wikidata.wdtk.datamodel.interfaces.EntityIdValue; import org.wikidata.wdtk.datamodel.interfaces.EntityIdValue;
/** /**
* Runs a collection of edit scrutinizers on an edit batch * Runs a collection of edit scrutinizers on an edit batch.
* @author antonin *
* @author Antonin Delpeuch
* *
*/ */
public class EditInspector { public class EditInspector {
@ -63,10 +67,19 @@ public class EditInspector {
* @param editBatch * @param editBatch
*/ */
public void inspect(List<ItemUpdate> editBatch) { public void inspect(List<ItemUpdate> editBatch) {
Map<EntityIdValue, ItemUpdate> updates = ItemUpdate.groupBySubject(editBatch); // First, schedule them with some scheduler,
List<ItemUpdate> mergedUpdates = updates.values().stream().collect(Collectors.toList()); // so that all newly created entities appear in the batch
for(EditScrutinizer scrutinizer : scrutinizers.values()) { UpdateScheduler scheduler = new WikibaseAPIUpdateScheduler();
scrutinizer.scrutinize(mergedUpdates); try {
editBatch = scheduler.schedule(editBatch);
Map<EntityIdValue, ItemUpdate> updates = ItemUpdate.groupBySubject(editBatch);
List<ItemUpdate> mergedUpdates = updates.values().stream().collect(Collectors.toList());
for(EditScrutinizer scrutinizer : scrutinizers.values()) {
scrutinizer.scrutinize(mergedUpdates);
}
} catch(ImpossibleSchedulingException e) {
warningStore.addWarning(new QAWarning(
"scheduling-failed", null, QAWarning.Severity.CRITICAL, 1));
} }
if (warningStore.getNbWarnings() == 0) { if (warningStore.getNbWarnings() == 0) {

View File

@ -8,6 +8,8 @@ import org.wikidata.wdtk.datamodel.helpers.Hash;
import org.wikidata.wdtk.datamodel.interfaces.EntityIdValue; import org.wikidata.wdtk.datamodel.interfaces.EntityIdValue;
import org.wikidata.wdtk.datamodel.interfaces.ValueVisitor; import org.wikidata.wdtk.datamodel.interfaces.ValueVisitor;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.google.refine.model.Recon; import com.google.refine.model.Recon;
/** /**
@ -38,11 +40,13 @@ public abstract class ReconEntityIdValue implements PrefetchedEntityIdValue {
Recon.Judgment.New.equals(_recon.judgment)); Recon.Judgment.New.equals(_recon.judgment));
} }
protected boolean isMatched() { @JsonIgnore
public boolean isMatched() {
return Recon.Judgment.Matched.equals(_recon.judgment) && _recon.match != null; return Recon.Judgment.Matched.equals(_recon.judgment) && _recon.match != null;
} }
protected boolean isNew() { @JsonIgnore
public boolean isNew() {
return !isMatched(); return !isMatched();
} }

View File

@ -31,7 +31,7 @@ import com.fasterxml.jackson.annotation.JsonProperty;
*/ */
public class ItemUpdate { public class ItemUpdate {
private final ItemIdValue qid; private final ItemIdValue qid;
private final Set<Statement> addedStatements; private final List<Statement> addedStatements;
private final Set<Statement> deletedStatements; private final Set<Statement> deletedStatements;
private final Set<MonolingualTextValue> labels; private final Set<MonolingualTextValue> labels;
private final Set<MonolingualTextValue> descriptions; private final Set<MonolingualTextValue> descriptions;
@ -42,11 +42,24 @@ public class ItemUpdate {
* *
* @param qid * @param qid
* the subject of the document. It can be a reconciled item value for new items. * the subject of the document. It can be a reconciled item value for new items.
* @param addedStatements
* the statements to add on the item. They should be distinct. They
* are modelled as a list because their insertion order matters.
* @param deletedStatements
* the statements to remove from the item
* @param labels
* the labels to add on the item
* @param descriptions
* the descriptions to add on the item
* @param aliases
* the aliases to add on the item. In theory their order should matter
* but in practice people rarely rely on the order of aliases so this
* is just kept as a set for simplicity.
*/ */
@JsonCreator @JsonCreator
public ItemUpdate( public ItemUpdate(
@JsonProperty("subject") ItemIdValue qid, @JsonProperty("subject") ItemIdValue qid,
@JsonProperty("addedStatements") Set<Statement> addedStatements, @JsonProperty("addedStatements") List<Statement> addedStatements,
@JsonProperty("deletedStatements") Set<Statement> deletedStatements, @JsonProperty("deletedStatements") Set<Statement> deletedStatements,
@JsonProperty("labels") Set<MonolingualTextValue> labels, @JsonProperty("labels") Set<MonolingualTextValue> labels,
@JsonProperty("descriptions") Set<MonolingualTextValue> descriptions, @JsonProperty("descriptions") Set<MonolingualTextValue> descriptions,
@ -54,7 +67,7 @@ public class ItemUpdate {
Validate.notNull(qid); Validate.notNull(qid);
this.qid = qid; this.qid = qid;
if(addedStatements == null) { if(addedStatements == null) {
addedStatements = Collections.emptySet(); addedStatements = Collections.emptyList();
} }
this.addedStatements = addedStatements; this.addedStatements = addedStatements;
if(deletedStatements == null) { if(deletedStatements == null) {
@ -84,10 +97,13 @@ public class ItemUpdate {
} }
/** /**
* @return the set of all added statements * Added statements are recorded as a list because
* their order of insertion matters.
*
* @return the list of all added statements
*/ */
@JsonProperty("addedStatements") @JsonProperty("addedStatements")
public Set<Statement> getAddedStatements() { public List<Statement> getAddedStatements() {
return addedStatements; return addedStatements;
} }
@ -124,11 +140,18 @@ public class ItemUpdate {
} }
/** /**
* @return true when this change is empty * @return true when this change is empty and its subject is not new
* (no statements or terms changed)
*/ */
@JsonIgnore @JsonIgnore
public boolean isNull() { public boolean isNull() {
return isEmpty() && !isNew();
}
/**
* @return true when this change leaves the content of the document untouched
*/
@JsonIgnore
public boolean isEmpty() {
return (addedStatements.isEmpty() return (addedStatements.isEmpty()
&& deletedStatements.isEmpty() && deletedStatements.isEmpty()
&& labels.isEmpty() && labels.isEmpty()
@ -145,8 +168,12 @@ public class ItemUpdate {
*/ */
public ItemUpdate merge(ItemUpdate other) { public ItemUpdate merge(ItemUpdate other) {
Validate.isTrue(qid.equals(other.getItemId())); Validate.isTrue(qid.equals(other.getItemId()));
Set<Statement> newAddedStatements = new HashSet<>(addedStatements); List<Statement> newAddedStatements = new ArrayList<>(addedStatements);
newAddedStatements.addAll(other.getAddedStatements()); for(Statement statement : other.getAddedStatements()) {
if (!newAddedStatements.contains(statement)) {
newAddedStatements.add(statement);
}
}
Set<Statement> newDeletedStatements = new HashSet<>(deletedStatements); Set<Statement> newDeletedStatements = new HashSet<>(deletedStatements);
newDeletedStatements.addAll(other.getDeletedStatements()); newDeletedStatements.addAll(other.getDeletedStatements());
Set<MonolingualTextValue> newLabels = new HashSet<>(labels); Set<MonolingualTextValue> newLabels = new HashSet<>(labels);
@ -264,16 +291,29 @@ public class ItemUpdate {
StringBuilder builder = new StringBuilder(); StringBuilder builder = new StringBuilder();
builder.append("<Update on "); builder.append("<Update on ");
builder.append(qid); builder.append(qid);
builder.append("\n Labels: "); if (!labels.isEmpty()) {
builder.append(labels); builder.append("\n Labels: ");
builder.append("\n Descriptions: "); builder.append(labels);
builder.append(descriptions); }
builder.append("\n Aliases: "); if (!descriptions.isEmpty()) {
builder.append(aliases); builder.append("\n Descriptions: ");
builder.append("\n Added statements: "); builder.append(descriptions);
builder.append(addedStatements); }
builder.append("\n Deleted statements: "); if (!aliases.isEmpty()) {
builder.append(deletedStatements); builder.append("\n Aliases: ");
builder.append(aliases);
}
if (!addedStatements.isEmpty()) {
builder.append("\n Added statements: ");
builder.append(addedStatements);
}
if (!deletedStatements.isEmpty()) {
builder.append("\n Deleted statements: ");
builder.append(deletedStatements);
}
if (isNull()) {
builder.append(" (null update)");
}
builder.append("\n>"); builder.append("\n>");
return builder.toString(); return builder.toString();
} }

View File

@ -1,7 +1,9 @@
package org.openrefine.wikidata.updates; package org.openrefine.wikidata.updates;
import java.util.Set; import java.util.Set;
import java.util.ArrayList;
import java.util.HashSet; import java.util.HashSet;
import java.util.List;
import org.jsoup.helper.Validate; import org.jsoup.helper.Validate;
import org.wikidata.wdtk.datamodel.interfaces.ItemIdValue; import org.wikidata.wdtk.datamodel.interfaces.ItemIdValue;
@ -17,7 +19,7 @@ import org.wikidata.wdtk.datamodel.interfaces.Statement;
*/ */
public class ItemUpdateBuilder { public class ItemUpdateBuilder {
private ItemIdValue qid; private ItemIdValue qid;
private Set<Statement> addedStatements; private List<Statement> addedStatements;
private Set<Statement> deletedStatements; private Set<Statement> deletedStatements;
private Set<MonolingualTextValue> labels; private Set<MonolingualTextValue> labels;
private Set<MonolingualTextValue> descriptions; private Set<MonolingualTextValue> descriptions;
@ -33,7 +35,7 @@ public class ItemUpdateBuilder {
public ItemUpdateBuilder(ItemIdValue qid) { public ItemUpdateBuilder(ItemIdValue qid) {
Validate.notNull(qid); Validate.notNull(qid);
this.qid = qid; this.qid = qid;
this.addedStatements = new HashSet<>(); this.addedStatements = new ArrayList<>();
this.deletedStatements = new HashSet<Statement>(); this.deletedStatements = new HashSet<Statement>();
this.labels = new HashSet<MonolingualTextValue>(); this.labels = new HashSet<MonolingualTextValue>();
this.descriptions = new HashSet<MonolingualTextValue>(); this.descriptions = new HashSet<MonolingualTextValue>();
@ -103,6 +105,19 @@ public class ItemUpdateBuilder {
labels.add(label); labels.add(label);
return this; return this;
} }
/**
* Adds a list of labels to the item. It will override any
* existing label in each language.
*
* @param labels
* the labels to add
*/
public ItemUpdateBuilder addLabels(Set<MonolingualTextValue> labels) {
Validate.isTrue(!built, "ItemUpdate has already been built");
this.labels.addAll(labels);
return this;
}
/** /**
* Adds a description to the item. It will override any existing * Adds a description to the item. It will override any existing
@ -116,6 +131,19 @@ public class ItemUpdateBuilder {
descriptions.add(description); descriptions.add(description);
return this; return this;
} }
/**
* Adds a list of descriptions to the item. It will override any
* existing description in each language.
*
* @param descriptions
* the descriptions to add
*/
public ItemUpdateBuilder addDescriptions(Set<MonolingualTextValue> descriptions) {
Validate.isTrue(!built, "ItemUpdate has already been built");
this.descriptions.addAll(descriptions);
return this;
}
/** /**
* Adds an alias to the item. It will be added to any existing * Adds an alias to the item. It will be added to any existing
@ -129,6 +157,19 @@ public class ItemUpdateBuilder {
aliases.add(alias); aliases.add(alias);
return this; return this;
} }
/**
* Adds a list of aliases to the item. They will be added to any
* existing aliases in each language.
*
* @param aliases
* the aliases to add
*/
public ItemUpdateBuilder addAliases(Set<MonolingualTextValue> aliases) {
Validate.isTrue(!built, "ItemUpdate has already been built");
this.aliases.addAll(aliases);
return this;
}
/** /**
* Constructs the {@link ItemUpdate}. * Constructs the {@link ItemUpdate}.

View File

@ -0,0 +1,8 @@
package org.openrefine.wikidata.updates.scheduler;
/**
 * Checked exception signalling that an update scheduler could not
 * produce a schedule for the edit batch it was given.
 */
public class ImpossibleSchedulingException extends Exception {

    /** Version identifier used by Java serialization. */
    private static final long serialVersionUID = 6621563898380564148L;

}

View File

@ -0,0 +1,152 @@
package org.openrefine.wikidata.updates.scheduler;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.openrefine.wikidata.schema.entityvalues.ReconEntityIdValue;
import org.openrefine.wikidata.schema.entityvalues.ReconItemIdValue;
import org.wikidata.wdtk.datamodel.interfaces.DatatypeIdValue;
import org.wikidata.wdtk.datamodel.interfaces.EntityIdValue;
import org.wikidata.wdtk.datamodel.interfaces.GlobeCoordinatesValue;
import org.wikidata.wdtk.datamodel.interfaces.MonolingualTextValue;
import org.wikidata.wdtk.datamodel.interfaces.QuantityValue;
import org.wikidata.wdtk.datamodel.interfaces.Snak;
import org.wikidata.wdtk.datamodel.interfaces.SnakGroup;
import org.wikidata.wdtk.datamodel.interfaces.Statement;
import org.wikidata.wdtk.datamodel.interfaces.StringValue;
import org.wikidata.wdtk.datamodel.interfaces.TimeValue;
import org.wikidata.wdtk.datamodel.interfaces.Value;
import org.wikidata.wdtk.datamodel.interfaces.ValueVisitor;
/**
 * A value visitor that collects the new item ids referred to
 * in a statement, or in any of the parts a statement is made of.
 *
 * None of the methods of this class return {@code null}: when nothing
 * is found, an empty set is returned instead.
 *
 * @author Antonin Delpeuch
 *
 */
public class PointerExtractor implements ValueVisitor<Set<ReconItemIdValue>> {

    /**
     * Extracts all the new entities mentioned by this statement. This
     * does not include the subject of the statement.
     *
     * @param statement
     *          the statement to inspect
     * @return
     *          the set of all new entities mentioned by the statement,
     *          gathered from its main snak, its qualifiers and its references
     */
    public Set<ReconItemIdValue> extractPointers(Statement statement) {
        Set<ReconItemIdValue> result = new HashSet<>();
        result.addAll(extractPointers(statement.getClaim().getMainSnak()));
        result.addAll(extractPointers(statement.getClaim().getQualifiers()));
        // A lambda is used here so that the reference type does not have to be named.
        statement.getReferences()
                .forEach(reference -> result.addAll(extractPointers(reference.getSnakGroups())));
        return result;
    }

    /**
     * Extracts all the new entities mentioned by this list of snak groups.
     *
     * @param snakGroups
     *          the snak groups to inspect
     * @return
     *          the union of the new entities mentioned by each group
     */
    public Set<ReconItemIdValue> extractPointers(List<SnakGroup> snakGroups) {
        Set<ReconItemIdValue> result = new HashSet<>();
        for (SnakGroup snakGroup : snakGroups) {
            result.addAll(extractPointers(snakGroup));
        }
        return result;
    }

    /**
     * Extracts all the new entities mentioned by this snak group.
     *
     * @param snakGroup
     *          the snak group to inspect
     * @return
     *          the union of the new entities mentioned by each snak of the group
     */
    public Set<ReconItemIdValue> extractPointers(SnakGroup snakGroup) {
        Set<ReconItemIdValue> result = new HashSet<>();
        for (Snak snak : snakGroup.getSnaks()) {
            result.addAll(extractPointers(snak));
        }
        return result;
    }

    /**
     * Extracts all new entities mentioned by this snak. Both the
     * property id and the value of the snak are inspected.
     * Currently there will be at most one: the target of the snak
     * (as property ids cannot be new for now).
     *
     * @param snak
     *          the snak to inspect
     * @return
     *          the set of new entities mentioned by the snak
     */
    public Set<ReconItemIdValue> extractPointers(Snak snak) {
        Set<ReconItemIdValue> result = new HashSet<>();
        result.addAll(extractPointers(snak.getPropertyId()));
        result.addAll(extractPointers(snak.getValue()));
        return result;
    }

    /**
     * Extracts any new entity from the value.
     *
     * @param value
     *          the value to inspect, which may be {@code null}
     * @return
     *          the new entities found in the value, or an empty
     *          set if the value is {@code null} or mentions none
     */
    public Set<ReconItemIdValue> extractPointers(Value value) {
        if (value == null) {
            return Collections.emptySet();
        }
        Set<ReconItemIdValue> pointers = value.accept(this);
        // Defensive: a subclass overriding a visit method might still return null.
        if (pointers == null) {
            return Collections.emptySet();
        }
        return pointers;
    }

    @Override
    public Set<ReconItemIdValue> visit(DatatypeIdValue value) {
        return Collections.emptySet();
    }

    /**
     * @return a singleton containing the value if it is a
     *         {@link ReconItemIdValue} which is new, an empty set otherwise
     */
    @Override
    public Set<ReconItemIdValue> visit(EntityIdValue value) {
        if (value instanceof ReconItemIdValue) {
            ReconItemIdValue recon = (ReconItemIdValue) value;
            if (recon.isNew()) {
                return Collections.singleton(recon);
            }
        }
        return Collections.emptySet();
    }

    @Override
    public Set<ReconItemIdValue> visit(GlobeCoordinatesValue value) {
        return Collections.emptySet();
    }

    @Override
    public Set<ReconItemIdValue> visit(MonolingualTextValue value) {
        return Collections.emptySet();
    }

    @Override
    public Set<ReconItemIdValue> visit(QuantityValue value) {
        // units cannot be new because WDTK represents them as strings already
        return Collections.emptySet();
    }

    @Override
    public Set<ReconItemIdValue> visit(StringValue value) {
        return Collections.emptySet();
    }

    @Override
    public Set<ReconItemIdValue> visit(TimeValue value) {
        return Collections.emptySet();
    }
}

View File

@ -0,0 +1,118 @@
package org.openrefine.wikidata.updates.scheduler;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.openrefine.wikidata.schema.entityvalues.ReconEntityIdValue;
import org.openrefine.wikidata.schema.entityvalues.ReconItemIdValue;
import org.openrefine.wikidata.updates.ItemUpdate;
import org.openrefine.wikidata.updates.ItemUpdateBuilder;
import org.wikidata.wdtk.datamodel.interfaces.ItemIdValue;
import org.wikidata.wdtk.datamodel.interfaces.Statement;
/**
 * An update scheduler which places every statement that refers to a
 * new item right after the update on that new item.
 *
 * Scheduling fails with an {@link ImpossibleSchedulingException} when
 * a statement refers to more than one new item, or when a statement
 * on a new item refers to another new item.
 */
public class QuickStatementsUpdateScheduler implements UpdateScheduler {

    private PointerExtractor extractor = new PointerExtractor();

    /**
     * For each new item id, the updates whose statements refer to
     * that id. They are scheduled right after the update on that item.
     */
    private Map<ItemIdValue, UpdateSequence> pointerUpdates;

    /**
     * The updates which do not refer to any new item (apart from
     * possibly their own subject), in the order they were supplied.
     */
    private UpdateSequence pointerFreeUpdates;

    /**
     * Splits an update into the part that refers to no new item and
     * the parts that refer to exactly one new item each. The former is
     * appended to {@link #pointerFreeUpdates} (unless it is a null update),
     * the latter are recorded in {@link #pointerUpdates} under the
     * new item they refer to.
     *
     * @param update
     *          the update to split
     * @throws ImpossibleSchedulingException
     *          if a statement refers to several new items, or refers
     *          to a new item while the subject of the update is itself new
     */
    protected void splitUpdate(ItemUpdate update) throws ImpossibleSchedulingException {
        // Terms and deleted statements never point to new items:
        // they all go to the pointer-free part.
        ItemUpdateBuilder pointerFreeBuilder = new ItemUpdateBuilder(update.getItemId())
                .addLabels(update.getLabels())
                .addDescriptions(update.getDescriptions())
                .addAliases(update.getAliases())
                .deleteStatements(update.getDeletedStatements());
        Map<ItemIdValue, ItemUpdateBuilder> buildersByPointer = new HashMap<>();

        for (Statement statement : update.getAddedStatements()) {
            Set<ReconItemIdValue> pointers = extractor.extractPointers(statement);
            if (pointers.isEmpty()) {
                pointerFreeBuilder.addStatement(statement);
                continue;
            }
            if (pointers.size() != 1 || update.isNew()) {
                throw new ImpossibleSchedulingException();
            }
            ItemIdValue pointer = pointers.iterator().next();
            buildersByPointer
                    .computeIfAbsent(pointer, key -> new ItemUpdateBuilder(update.getItemId()))
                    .addStatement(statement);
        }

        // Schedule the part which does not refer to anything
        ItemUpdate pointerFree = pointerFreeBuilder.build();
        if (!pointerFree.isNull()) {
            pointerFreeUpdates.add(pointerFree);
        }

        // File the other parts under the new item they refer to
        for (Entry<ItemIdValue, ItemUpdateBuilder> entry : buildersByPointer.entrySet()) {
            ItemUpdate referencing = entry.getValue().build();
            pointerUpdates
                    .computeIfAbsent(entry.getKey(), key -> new UpdateSequence())
                    .add(referencing);
        }
    }

    /**
     * Schedules the updates: each pointer-free update is emitted in
     * its original order, immediately followed by the updates which
     * refer to its subject.
     *
     * @param updates
     *          the updates to schedule
     * @return the reorganized updates
     * @throws ImpossibleSchedulingException
     *          if one of the updates cannot be split (see {@link #splitUpdate})
     */
    @Override
    public List<ItemUpdate> schedule(List<ItemUpdate> updates) throws ImpossibleSchedulingException {
        pointerUpdates = new HashMap<>();
        pointerFreeUpdates = new UpdateSequence();

        for (ItemUpdate update : updates) {
            splitUpdate(update);
        }

        // Reconstruct
        List<ItemUpdate> schedule = new ArrayList<>();
        Set<ItemIdValue> notYetScheduled = new HashSet<>(pointerUpdates.keySet());
        for (ItemUpdate update : pointerFreeUpdates.getUpdates()) {
            ItemIdValue subject = update.getItemId();
            schedule.add(update);
            UpdateSequence followUps = pointerUpdates.get(subject);
            if (followUps != null) {
                schedule.addAll(followUps.getUpdates());
            }
            notYetScheduled.remove(subject);
        }

        // Create any item that was referred to but untouched
        // (this is just for the sake of correctness, it would be bad to do that
        // as the items would remain blank in this batch).
        for (ItemIdValue missingId : notYetScheduled) {
            schedule.add(new ItemUpdateBuilder(missingId).build());
            schedule.addAll(pointerUpdates.get(missingId).getUpdates());
        }
        return schedule;
    }

}

View File

@ -0,0 +1,32 @@
package org.openrefine.wikidata.updates.scheduler;
import java.util.List;
import org.openrefine.wikidata.updates.ItemUpdate;
/**
 * A strategy to schedule item updates.
 * A scheduler takes an initial list of updates and rearranges
 * it (updates may be split or merged along the way) so that
 * the resulting sequence suits a particular import process.
 *
 * @author Antonin Delpeuch
 *
 */
public interface UpdateScheduler {

    /**
     * Schedules the given updates. They are passed as a list so
     * that implementations can try to follow the original order,
     * although this is not guaranteed in general.
     *
     * @param updates
     *          the updates to schedule
     * @return
     *          the reorganized updates
     * @throws ImpossibleSchedulingException
     *          when the scheduler cannot cope with a particular edit plan.
     */
    List<ItemUpdate> schedule(List<ItemUpdate> updates) throws ImpossibleSchedulingException;

}

View File

@ -0,0 +1,59 @@
package org.openrefine.wikidata.updates.scheduler;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.openrefine.wikidata.updates.ItemUpdate;
import org.wikidata.wdtk.datamodel.interfaces.ItemIdValue;
/**
 * An insertion-ordered container of updates in which every
 * subject occurs at most once: adding a second update on the same
 * subject merges it into the first one.
 *
 * @author Antonin Delpeuch
 */
public class UpdateSequence {

    /**
     * The updates held by this container, in order of first insertion
     */
    private List<ItemUpdate> updates = new ArrayList<>();

    /**
     * For each subject, the position of its update in {@link #updates}
     */
    private Map<ItemIdValue, Integer> index = new HashMap<>();

    /**
     * Adds an update to the sequence. If the sequence already holds
     * an update with the same subject, the new update is merged into
     * it and the result keeps the position of the existing one;
     * otherwise the update is appended at the end.
     *
     * @param update
     *          the update to add
     */
    public void add(ItemUpdate update) {
        ItemIdValue subject = update.getItemId();
        Integer known = index.get(subject);
        if (known == null) {
            // First time this subject is seen: remember where it is stored
            index.put(subject, updates.size());
            updates.add(update);
            return;
        }
        int position = known;
        ItemUpdate merged = updates.get(position).merge(update);
        updates.set(position, merged);
    }

    /**
     * @return the list of merged updates
     */
    public List<ItemUpdate> getUpdates() {
        return updates;
    }

    /**
     * @return the set of touched subjects
     */
    public Set<ItemIdValue> getSubjects() {
        return index.keySet();
    }

}

View File

@ -0,0 +1,115 @@
package org.openrefine.wikidata.updates.scheduler;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.openrefine.wikidata.schema.entityvalues.ReconItemIdValue;
import org.openrefine.wikidata.updates.ItemUpdate;
import org.openrefine.wikidata.updates.ItemUpdateBuilder;
import org.wikidata.wdtk.datamodel.interfaces.ItemIdValue;
import org.wikidata.wdtk.datamodel.interfaces.Statement;
/**
 * A simple scheduler for batches committed via the Wikibase API.
 *
 * The strategy is quite simple and makes at most two edits
 * per touched item (which is not minimal though). Each update
 * on a new item is split between statements making references
 * to new items, and statements not making these references.
 * All updates with no references to new items are done first
 * (which creates all new items), then all other updates are done.
 *
 * The working state of a scheduling run is kept in instance
 * fields which are reset at the start of {@link #schedule(List)},
 * so a given instance should not run several schedulings at the
 * same time.
 *
 * @author Antonin Delpeuch
 *
 */
public class WikibaseAPIUpdateScheduler implements UpdateScheduler {

    /**
     * The first part of updates: the ones which create new items
     * without referring to any other new item.
     */
    private UpdateSequence pointerFreeUpdates;

    /**
     * The second part of the updates: all existing items, plus
     * all parts of new items that refer to other new items.
     */
    private UpdateSequence pointerFullUpdates;

    /**
     * The set of all new items referred to in the whole batch.
     * It iterates in the order in which the pointers were recorded,
     * so that the schedule does not depend on hash ordering.
     */
    private Set<ItemIdValue> allPointers;

    private PointerExtractor extractor = new PointerExtractor();

    /**
     * Schedules the updates in three consecutive parts: the
     * pointer-free parts of updates on new items, then a bare
     * creation for every new item that is referred to but has
     * no update of its own, then everything else.
     *
     * @param updates
     *      the updates to schedule
     * @return
     *      the reorganized updates
     */
    @Override
    public List<ItemUpdate> schedule(List<ItemUpdate> updates) {
        List<ItemUpdate> result = new ArrayList<>();
        pointerFreeUpdates = new UpdateSequence();
        pointerFullUpdates = new UpdateSequence();
        allPointers = new LinkedHashSet<>();

        for (ItemUpdate update : updates) {
            splitUpdate(update);
        }

        // Part 1: add all the pointer free updates
        result.addAll(pointerFreeUpdates.getUpdates());

        // Part 1': add the remaining new items that have not been touched.
        // The copy keeps the recording order of the pointers, so these
        // creations come out in a reproducible order.
        Set<ItemIdValue> unseenPointers = new LinkedHashSet<>(allPointers);
        unseenPointers.removeAll(pointerFreeUpdates.getSubjects());
        result.addAll(unseenPointers.stream()
                .map(e -> new ItemUpdateBuilder(e).build())
                .collect(Collectors.toList()));

        // Part 2: add all the pointer full updates
        result.addAll(pointerFullUpdates.getUpdates());
        return result;
    }

    /**
     * Dispatches an update to the two parts of the schedule and
     * records every new item its added statements refer to.
     *
     * An update on an existing item is never split: it goes as
     * a whole to the second part. An update on a new item is
     * split between its pointer-free content (terms, deleted
     * statements and added statements without reference to new
     * items) and the added statements referring to new items.
     *
     * @param update
     *      the update to split
     */
    protected void splitUpdate(ItemUpdate update) {
        if (!update.isNew()) {
            // Existing item: we only need to know which new items it
            // points to, and to make sure this edit is done after
            // all item creations. No builder is needed for that.
            for (Statement statement : update.getAddedStatements()) {
                allPointers.addAll(extractor.extractPointers(statement));
            }
            pointerFullUpdates.add(update);
            return;
        }

        // New item: we might need to split the update in two
        // (if it refers to any other new entity).
        ItemUpdateBuilder pointerFreeBuilder = new ItemUpdateBuilder(update.getItemId())
                .addLabels(update.getLabels())
                .addDescriptions(update.getDescriptions())
                .addAliases(update.getAliases())
                .deleteStatements(update.getDeletedStatements());
        ItemUpdateBuilder pointerFullBuilder = new ItemUpdateBuilder(update.getItemId());

        for (Statement statement : update.getAddedStatements()) {
            Set<ReconItemIdValue> pointers = extractor.extractPointers(statement);
            if (pointers.isEmpty()) {
                pointerFreeBuilder.addStatement(statement);
            } else {
                pointerFullBuilder.addStatement(statement);
            }
            allPointers.addAll(pointers);
        }

        ItemUpdate pointerFree = pointerFreeBuilder.build();
        if (!pointerFree.isNull()) {
            pointerFreeUpdates.add(pointerFree);
        }
        ItemUpdate pointerFull = pointerFullBuilder.build();
        if (!pointerFull.isEmpty()) {
            pointerFullUpdates.add(pointerFull);
        }
    }
}

View File

@ -40,15 +40,11 @@ public class QSValuePrinterTest {
@Test @Test
public void printNewItemId() { public void printNewItemId() {
ReconEntityIdValue id = TestingDataGenerator.makeNewItemIdValue(12345L, "my new item"); ReconEntityIdValue id = TestingDataGenerator.makeNewItemIdValue(12345L, "my new item");
assertNull(id.accept(printer)); assertEquals("LAST", id.accept(printer));
// because no entity was previously created // because no entity was previously created
QSValuePrinter printerAfterCreate = new QSValuePrinter(id);
ReconEntityIdValue equalId = TestingDataGenerator.makeNewItemIdValue(12345L, "my other new item");
assertEquals("LAST", printerAfterCreate.visit(equalId));
ReconEntityIdValue differentId = TestingDataGenerator.makeNewItemIdValue(34567L, "my new item"); ReconEntityIdValue differentId = TestingDataGenerator.makeMatchedItemIdValue("Q78", "my existing item");
assertNull(printerAfterCreate.visit(differentId)); assertEquals("Q78", differentId.accept(printer));
} }
// Globe coordinates // Globe coordinates

View File

@ -18,6 +18,7 @@ import org.json.JSONWriter;
import org.openrefine.wikidata.testing.TestingDataGenerator; import org.openrefine.wikidata.testing.TestingDataGenerator;
import org.openrefine.wikidata.updates.ItemUpdate; import org.openrefine.wikidata.updates.ItemUpdate;
import org.openrefine.wikidata.updates.ItemUpdateBuilder; import org.openrefine.wikidata.updates.ItemUpdateBuilder;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import org.wikidata.wdtk.datamodel.helpers.Datamodel; import org.wikidata.wdtk.datamodel.helpers.Datamodel;
import org.wikidata.wdtk.datamodel.interfaces.Claim; import org.wikidata.wdtk.datamodel.interfaces.Claim;
@ -63,12 +64,24 @@ public class WikibaseSchemaTest extends RefineTest {
Collections.singletonList(Datamodel.makeReference(Collections.singletonList(retrievedSnakGroup))), Collections.singletonList(Datamodel.makeReference(Collections.singletonList(retrievedSnakGroup))),
StatementRank.NORMAL, ""); StatementRank.NORMAL, "");
private Project project;
static JSONObject jsonFromFile(String filename) throws IOException, JSONException { static JSONObject jsonFromFile(String filename) throws IOException, JSONException {
byte[] contents = Files.readAllBytes(Paths.get(filename)); byte[] contents = Files.readAllBytes(Paths.get(filename));
String decoded = new String(contents, "utf-8"); String decoded = new String(contents, "utf-8");
return ParsingUtilities.evaluateJsonStringToObject(decoded); return ParsingUtilities.evaluateJsonStringToObject(decoded);
} }
@BeforeMethod
public void setUpProject() {
project = this.createCSVProject(
"subject,inception,reference\n"+
"Q1377,1919,http://www.ljubljana-slovenia.com/university-ljubljana\n"+
"Q865528,1965,");
project.rows.get(0).cells.set(0, TestingDataGenerator.makeMatchedCell("Q1377", "University of Ljubljana"));
project.rows.get(1).cells.set(0, TestingDataGenerator.makeMatchedCell("Q865528", "University of Warwick"));
}
@Test @Test
public void testSerialize() throws JSONException, IOException { public void testSerialize() throws JSONException, IOException {
JSONObject serialized = jsonFromFile("data/schema/history_of_medicine.json"); JSONObject serialized = jsonFromFile("data/schema/history_of_medicine.json");
@ -94,12 +107,6 @@ public class WikibaseSchemaTest extends RefineTest {
public void testEvaluate() throws JSONException, IOException { public void testEvaluate() throws JSONException, IOException {
JSONObject serialized = jsonFromFile("data/schema/inception.json"); JSONObject serialized = jsonFromFile("data/schema/inception.json");
WikibaseSchema schema = WikibaseSchema.reconstruct(serialized); WikibaseSchema schema = WikibaseSchema.reconstruct(serialized);
Project project = this.createCSVProject(
"subject,inception,reference\n"+
"Q1377,1919,http://www.ljubljana-slovenia.com/university-ljubljana\n"+
"Q865528,1965,");
project.rows.get(0).cells.set(0, TestingDataGenerator.makeMatchedCell("Q1377", "University of Ljubljana"));
project.rows.get(1).cells.set(0, TestingDataGenerator.makeMatchedCell("Q865528", "University of Warwick"));
Engine engine = new Engine(project); Engine engine = new Engine(project);
List<ItemUpdate> updates = schema.evaluate(project, engine); List<ItemUpdate> updates = schema.evaluate(project, engine);
List<ItemUpdate> expected = new ArrayList<>(); List<ItemUpdate> expected = new ArrayList<>();
@ -109,4 +116,31 @@ public class WikibaseSchemaTest extends RefineTest {
expected.add(update2); expected.add(update2);
assertEquals(expected, updates); assertEquals(expected, updates);
} }
@Test
public void testEvaluateRespectsFacets() throws JSONException, IOException {
JSONObject serialized = jsonFromFile("data/schema/inception.json");
WikibaseSchema schema = WikibaseSchema.reconstruct(serialized);
Engine engine = new Engine(project);
JSONObject engineConfig = new JSONObject("{\n" +
" \"mode\": \"row-based\",\n" +
" \"facets\": [\n" +
" {\n" +
" \"mode\": \"text\",\n" +
" \"invert\": false,\n" +
" \"caseSensitive\": false,\n" +
" \"query\": \"www\",\n" +
" \"name\": \"reference\",\n" +
" \"type\": \"text\",\n" +
" \"columnName\": \"reference\"\n" +
" }\n" +
" ]\n" +
" }");
engine.initializeFromJSON(engineConfig);
List<ItemUpdate> updates = schema.evaluate(project, engine);
List<ItemUpdate> expected = new ArrayList<>();
ItemUpdate update1 = new ItemUpdateBuilder(qid1).addStatement(statement1).build();
expected.add(update1);
assertEquals(expected, updates);
}
} }

View File

@ -62,6 +62,16 @@ public class ItemUpdateTest {
public void testIsNull() { public void testIsNull() {
ItemUpdate update = new ItemUpdateBuilder(existingSubject).build(); ItemUpdate update = new ItemUpdateBuilder(existingSubject).build();
assertTrue(update.isNull()); assertTrue(update.isNull());
ItemUpdate update2 = new ItemUpdateBuilder(newSubject).build();
assertFalse(update2.isNull());
}
@Test
public void testIsEmpty() {
ItemUpdate update = new ItemUpdateBuilder(existingSubject).build();
assertTrue(update.isEmpty());
ItemUpdate update2 = new ItemUpdateBuilder(newSubject).build();
assertTrue(update2.isEmpty());
} }
@Test @Test
@ -78,8 +88,8 @@ public class ItemUpdateTest {
.addStatement(statement1) .addStatement(statement1)
.addStatement(statement2) .addStatement(statement2)
.build(); .build();
assertEquals(Arrays.asList(statement1, statement2).stream().collect(Collectors.toSet()), assertFalse(update.isNull());
update.getAddedStatements()); assertEquals(Arrays.asList(statement1, statement2), update.getAddedStatements());
assertEquals(statementGroups, update.getAddedStatementGroups().stream().collect(Collectors.toSet())); assertEquals(statementGroups, update.getAddedStatementGroups().stream().collect(Collectors.toSet()));
} }
@ -130,6 +140,7 @@ public class ItemUpdateTest {
.addAlias(aliasEn) .addAlias(aliasEn)
.addAlias(aliasFr) .addAlias(aliasFr)
.build(); .build();
assertFalse(updateA.isNull());
ItemUpdate normalized = updateA.normalizeLabelsAndAliases(); ItemUpdate normalized = updateA.normalizeLabelsAndAliases();
ItemUpdate expectedUpdate = new ItemUpdateBuilder(newSubject) ItemUpdate expectedUpdate = new ItemUpdateBuilder(newSubject)
.addLabel(label) .addLabel(label)

View File

@ -0,0 +1,95 @@
package org.openrefine.wikidata.updates.scheduler;
import static org.junit.Assert.assertEquals;
import java.math.BigDecimal;
import java.util.Collections;
import java.util.Set;
import org.openrefine.wikidata.schema.entityvalues.ReconItemIdValue;
import org.openrefine.wikidata.testing.TestingDataGenerator;
import org.testng.annotations.Test;
import org.wikidata.wdtk.datamodel.helpers.Datamodel;
import org.wikidata.wdtk.datamodel.interfaces.Claim;
import org.wikidata.wdtk.datamodel.interfaces.GlobeCoordinatesValue;
import org.wikidata.wdtk.datamodel.interfaces.ItemIdValue;
import org.wikidata.wdtk.datamodel.interfaces.PropertyIdValue;
import org.wikidata.wdtk.datamodel.interfaces.Reference;
import org.wikidata.wdtk.datamodel.interfaces.Snak;
import org.wikidata.wdtk.datamodel.interfaces.SnakGroup;
import org.wikidata.wdtk.datamodel.interfaces.StatementRank;
import org.wikidata.wdtk.datamodel.interfaces.TimeValue;
import org.wikidata.wdtk.datamodel.interfaces.Value;
/**
 * Tests for the extraction of pointers (references to new items)
 * from values, snaks, snak groups and statements.
 */
public class PointerExtractorTest {

    private static PointerExtractor extractor = new PointerExtractor();

    private ItemIdValue existingId = Datamodel.makeWikidataItemIdValue("Q43");
    private ItemIdValue matchedId = TestingDataGenerator.makeMatchedItemIdValue("Q89","eist");
    private ItemIdValue newIdA = TestingDataGenerator.makeNewItemIdValue(1234L, "new item A");
    private ItemIdValue newIdB = TestingDataGenerator.makeNewItemIdValue(4567L, "new item B");

    private PropertyIdValue pid = Datamodel.makeWikidataPropertyIdValue("P89");
    private Snak snakWithNew = Datamodel.makeValueSnak(pid, newIdA);
    private Snak snakWithoutNew = Datamodel.makeValueSnak(pid, matchedId);
    private SnakGroup snakGroupWithNew = Datamodel.makeSnakGroup(Collections.singletonList(snakWithNew));
    private SnakGroup snakGroupWithoutNew = Datamodel.makeSnakGroup(Collections.singletonList(snakWithoutNew));

    // a new item as main value, as subject, and inside a qualifier
    private Claim claimWithNew = Datamodel.makeClaim(existingId, snakWithNew, Collections.emptyList());
    private Claim claimNewSubject = Datamodel.makeClaim(newIdB, snakWithoutNew, Collections.emptyList());
    private Claim claimNewQualifier = Datamodel.makeClaim(matchedId, snakWithoutNew,
            Collections.singletonList(snakGroupWithNew));

    @Test
    public void testExtractEntityId() {
        assertOnlyPointer(newIdA, extractor.extractPointers(newIdA));
        assertEmpty(extractor.extractPointers(existingId));
        assertEmpty(extractor.extractPointers(matchedId));
    }

    @Test
    public void testExtractDatavalues() {
        assertEmpty(Datamodel.makeDatatypeIdValue("string"));
        assertEmpty(Datamodel.makeGlobeCoordinatesValue(1.34, 2.354, 0.1, GlobeCoordinatesValue.GLOBE_EARTH));
        assertEmpty(Datamodel.makeStringValue("est"));
        assertEmpty(Datamodel.makeMonolingualTextValue("srtu", "en"));
        assertEmpty(Datamodel.makeWikidataPropertyIdValue("P78"));
        assertEmpty(Datamodel.makeQuantityValue(new BigDecimal("898")));
        assertEmpty(Datamodel.makeQuantityValue(new BigDecimal("7.87"), "http://www.wikidata.org/entity/Q34"));
        assertEmpty(Datamodel.makeTimeValue(1898, (byte)2, (byte)3, TimeValue.CM_GREGORIAN_PRO));
    }

    @Test
    public void testSnak() {
        assertEmpty(extractor.extractPointers(snakWithoutNew));
        assertOnlyPointer(newIdA, extractor.extractPointers(snakWithNew));
        assertEmpty(extractor.extractPointers(Datamodel.makeNoValueSnak(pid)));
    }

    @Test
    public void testSnakGroup() {
        assertEmpty(extractor.extractPointers(snakGroupWithoutNew));
        assertOnlyPointer(newIdA, extractor.extractPointers(snakGroupWithNew));
    }

    @Test
    public void testStatement() {
        // a new item as subject of the statement is not reported
        assertEmpty(extractor.extractPointers(Datamodel.makeStatement(claimNewSubject,
                Collections.emptyList(), StatementRank.NORMAL, "")));
        // new item as main value
        assertOnlyPointer(newIdA, extractor.extractPointers(Datamodel.makeStatement(claimWithNew,
                Collections.emptyList(), StatementRank.NORMAL, "")));
        // new item in a qualifier
        assertOnlyPointer(newIdA, extractor.extractPointers(Datamodel.makeStatement(claimNewQualifier,
                Collections.emptyList(), StatementRank.NORMAL, "")));
        // new item in a reference
        Reference referenceWithNew = Datamodel.makeReference(Collections.singletonList(snakGroupWithNew));
        assertOnlyPointer(newIdA, extractor.extractPointers(Datamodel.makeStatement(claimNewSubject,
                Collections.singletonList(referenceWithNew), StatementRank.NORMAL, "")));
    }

    /**
     * Checks that no pointer is extracted from the given value.
     */
    private static void assertEmpty(Value v) {
        assertEmpty(extractor.extractPointers(v));
    }

    /**
     * Checks that the given set of extracted pointers is empty.
     */
    private static void assertEmpty(Set<ReconItemIdValue> pointers) {
        assertEquals(Collections.emptySet(), pointers);
    }

    /**
     * Checks that the extracted pointers consist of exactly the expected id.
     */
    private static void assertOnlyPointer(ItemIdValue expected, Set<ReconItemIdValue> pointers) {
        assertEquals(Collections.singleton(expected), pointers);
    }
}

View File

@ -0,0 +1,51 @@
package org.openrefine.wikidata.updates.scheduler;
import static org.junit.Assert.assertEquals;
import java.util.Arrays;
import java.util.List;
import org.openrefine.wikidata.updates.ItemUpdate;
import org.openrefine.wikidata.updates.ItemUpdateBuilder;
import org.testng.annotations.Test;
public class QuickStatementsUpdateSchedulerTest extends UpdateSchedulerTest {
@Test
public void testNoNewItem() throws ImpossibleSchedulingException {
ItemUpdate updateA = new ItemUpdateBuilder(existingIdA).addStatement(sAtoB).build();
ItemUpdate updateB = new ItemUpdateBuilder(existingIdB).addStatement(sBtoA).build();
List<ItemUpdate> scheduled = schedule(updateA, updateB);
assertEquals(Arrays.asList(updateA,updateB), scheduled);
}
@Test
public void testSplitUpdate() throws ImpossibleSchedulingException {
ItemUpdate updateA = new ItemUpdateBuilder(existingIdA)
.addStatement(sAtoNewA)
.addStatement(sAtoNewB)
.build();
ItemUpdate newUpdateA = new ItemUpdateBuilder(newIdA).build();
ItemUpdate newUpdateB = new ItemUpdateBuilder(newIdB).build();
ItemUpdate splitUpdateA = new ItemUpdateBuilder(existingIdA)
.addStatement(sAtoNewA)
.build();
ItemUpdate splitUpdateB = new ItemUpdateBuilder(existingIdA)
.addStatement(sAtoNewB)
.build();
List<ItemUpdate> scheduled = schedule(updateA);
assertSetEquals(Arrays.asList(newUpdateA, splitUpdateA, newUpdateB, splitUpdateB), scheduled);
}
@Test(expectedExceptions=ImpossibleSchedulingException.class)
public void testImpossibleForQS() throws ImpossibleSchedulingException {
ItemUpdate update = new ItemUpdateBuilder(newIdA).addStatement(sNewAtoNewB).build();
schedule(update);
}
@Override
public UpdateScheduler getScheduler() {
return new QuickStatementsUpdateScheduler();
}
}

View File

@ -0,0 +1,94 @@
package org.openrefine.wikidata.updates.scheduler;
import static org.junit.Assert.assertEquals;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import org.openrefine.wikidata.testing.TestingDataGenerator;
import org.openrefine.wikidata.updates.ItemUpdate;
import org.openrefine.wikidata.updates.ItemUpdateBuilder;
import org.testng.annotations.Test;
import org.wikidata.wdtk.datamodel.helpers.Datamodel;
import org.wikidata.wdtk.datamodel.interfaces.Claim;
import org.wikidata.wdtk.datamodel.interfaces.ItemIdValue;
import org.wikidata.wdtk.datamodel.interfaces.PropertyIdValue;
import org.wikidata.wdtk.datamodel.interfaces.Statement;
import org.wikidata.wdtk.datamodel.interfaces.StatementRank;
/**
 * Fixtures and test cases shared by the tests of all schedulers.
 * Subclasses supply the scheduler under test via {@link #getScheduler()}.
 */
public abstract class UpdateSchedulerTest {

    protected ItemIdValue existingIdA = Datamodel.makeWikidataItemIdValue("Q43");
    protected ItemIdValue existingIdB = Datamodel.makeWikidataItemIdValue("Q538");
    protected ItemIdValue newIdA = TestingDataGenerator.makeNewItemIdValue(1234L, "new item A");
    protected ItemIdValue newIdB = TestingDataGenerator.makeNewItemIdValue(5678L, "new item B");

    protected static PropertyIdValue pid = Datamodel.makeWikidataPropertyIdValue("P38");

    // statements are named after their subject and their target
    protected Statement sAtoB = generateStatement(existingIdA, existingIdB);
    protected Statement sBtoA = generateStatement(existingIdB, existingIdA);
    protected Statement sAtoNewA = generateStatement(existingIdA, newIdA);
    protected Statement sAtoNewB = generateStatement(existingIdA, newIdB);
    protected Statement sNewAtoB = generateStatement(newIdA, existingIdB);
    protected Statement sNewAtoNewB = generateStatement(newIdA, newIdB);

    /**
     * Builds a statement on {@code from} whose main value is {@code to},
     * with no qualifiers and no references.
     */
    public static Statement generateStatement(ItemIdValue from, ItemIdValue to) {
        Claim link = Datamodel.makeClaim(from, Datamodel.makeValueSnak(pid, to), Collections.emptyList());
        return Datamodel.makeStatement(link, Collections.emptyList(), StatementRank.NORMAL, "");
    }

    /**
     * @return the scheduler the tests should be run against
     */
    public abstract UpdateScheduler getScheduler();

    /**
     * Schedules the given updates with the scheduler under test.
     */
    protected List<ItemUpdate> schedule(ItemUpdate... itemUpdates) throws ImpossibleSchedulingException {
        List<ItemUpdate> input = Arrays.asList(itemUpdates);
        return getScheduler().schedule(input);
    }

    /**
     * Compares two lists of updates regardless of order and duplicates.
     */
    protected static void assertSetEquals(List<ItemUpdate> expected, List<ItemUpdate> actual) {
        assertEquals(
                expected.stream().collect(Collectors.toSet()),
                actual.stream().collect(Collectors.toSet()));
    }

    @Test
    public void testNewItemNotMentioned() throws ImpossibleSchedulingException {
        ItemUpdate pointingToNew = new ItemUpdateBuilder(existingIdA).addStatement(sAtoNewA).build();
        List<ItemUpdate> result = schedule(pointingToNew);

        // the new item must be created first, even though no update mentions it
        ItemUpdate creation = new ItemUpdateBuilder(newIdA).build();
        assertEquals(Arrays.asList(creation, pointingToNew), result);
    }

    @Test
    public void testNewItemMentioned() throws ImpossibleSchedulingException {
        ItemUpdate pointingToNew = new ItemUpdateBuilder(existingIdA).addStatement(sAtoNewA).build();
        ItemUpdate onNewItem = new ItemUpdateBuilder(newIdA).addStatement(sNewAtoB).build();
        List<ItemUpdate> result = schedule(pointingToNew, onNewItem);

        // the update on the new item is moved before the one referring to it
        assertEquals(Arrays.asList(onNewItem, pointingToNew), result);
    }

    @Test
    public void testMerge() throws ImpossibleSchedulingException {
        ItemUpdate first = new ItemUpdateBuilder(existingIdA)
                .addStatement(sAtoB)
                .build();
        ItemUpdate second = new ItemUpdateBuilder(existingIdA)
                .addLabel(Datamodel.makeMonolingualTextValue("hello", "fr"))
                .addStatement(sAtoB)
                .build();
        ItemUpdate combined = first.merge(second);

        List<ItemUpdate> result = schedule(first, second);
        assertEquals(Collections.singletonList(combined), result);
    }

    @Test
    public void testMergeNew() throws ImpossibleSchedulingException {
        ItemUpdate first = new ItemUpdateBuilder(newIdA)
                .addLabel(Datamodel.makeMonolingualTextValue("hello", "fr"))
                .addStatement(sNewAtoB)
                .build();
        ItemUpdate second = new ItemUpdateBuilder(newIdA)
                .addLabel(Datamodel.makeMonolingualTextValue("hello", "fr"))
                .build();
        ItemUpdate combined = first.merge(second);

        List<ItemUpdate> result = schedule(first, second);
        assertEquals(Collections.singletonList(combined), result);
    }
}

View File

@ -0,0 +1,58 @@
package org.openrefine.wikidata.updates.scheduler;
import static org.junit.Assert.assertEquals;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.openrefine.wikidata.updates.ItemUpdate;
import org.openrefine.wikidata.updates.ItemUpdateBuilder;
import org.testng.annotations.Test;
import org.wikidata.wdtk.datamodel.helpers.Datamodel;
public class WikibaseAPIUpdateSchedulerTest extends UpdateSchedulerTest {
@Test
public void testOrderPreserved() throws ImpossibleSchedulingException {
ItemUpdate updateA = new ItemUpdateBuilder(existingIdA).addStatement(sAtoB).build();
ItemUpdate updateB = new ItemUpdateBuilder(existingIdB).addStatement(sBtoA).build();
List<ItemUpdate> scheduled = schedule(updateA, updateB);
assertEquals(Arrays.asList(updateA,updateB), scheduled);
}
@Test
public void testUpdateIsNotSplit() throws ImpossibleSchedulingException {
ItemUpdate updateA = new ItemUpdateBuilder(existingIdA)
.addStatement(sAtoNewA)
.addStatement(sAtoNewB)
.build();
ItemUpdate newUpdateA = new ItemUpdateBuilder(newIdA).build();
ItemUpdate newUpdateB = new ItemUpdateBuilder(newIdB).build();
List<ItemUpdate> scheduled = schedule(updateA);
assertSetEquals(Arrays.asList(newUpdateA, newUpdateB, updateA), scheduled);
}
@Test
public void testMixedUpdate() throws ImpossibleSchedulingException {
ItemUpdate updateA = new ItemUpdateBuilder(existingIdA)
.addStatement(sAtoNewA)
.addStatement(sAtoNewB)
.addStatement(sAtoB)
.build();
ItemUpdate newUpdateA = new ItemUpdateBuilder(newIdA)
.addStatement(sNewAtoB)
.build();
ItemUpdate newUpdateB = new ItemUpdateBuilder(newIdB)
.build();
List<ItemUpdate> scheduled = schedule(updateA, newUpdateA);
assertEquals(Arrays.asList(newUpdateA, newUpdateB, updateA), scheduled);
}
@Override
public UpdateScheduler getScheduler() {
return new WikibaseAPIUpdateScheduler();
}
}