Introduce a scheduler package to reorganize item updates
This commit is contained in:
parent
88178d7c04
commit
bb044612e0
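
In outline: instead of grouping statements by item, edit batches are now passed through a scheduler before being exported or uploaded. A minimal sketch of the intended call flow, assuming only the API introduced in this commit (rawUpdates is a placeholder name):

    UpdateScheduler scheduler = new WikibaseAPIUpdateScheduler();
    try {
        // Reorders (and possibly splits or merges) the updates so that
        // new items are created before any statement that points to them.
        List<ItemUpdate> ordered = scheduler.schedule(rawUpdates);
    } catch (ImpossibleSchedulingException e) {
        // Thrown when no valid ordering exists for the target import process.
    }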
@@ -18,35 +18,15 @@ import org.wikidata.wdtk.datamodel.interfaces.ValueVisitor;
 * Format documentation:
 * https://www.wikidata.org/wiki/Help:QuickStatements
 *
+ * Any new entity id will be
+ * assumed to be the last one created, represented with "LAST". It is
+ * fine to make this assumption because we are working on edit batches
+ * previously scheduled by {@link QuickStatementsUpdateScheduler}.
+ *
 * @author Antonin Delpeuch
 *
 */
public class QSValuePrinter implements ValueVisitor<String> {

-    private final ReconEntityIdValue lastCreatedEntityIdValue;
-
-    /**
-     * Constructor.
-     *
-     * Creates a printer for a context where no entity was previously
-     * created with the "CREATE" command. Any new entity id will not
-     * be printed.
-     */
-    public QSValuePrinter() {
-        lastCreatedEntityIdValue = null;
-    }
-
-    /**
-     * Creates a printer for a context where an entity was previously
-     * created with the "CREATE" command. If this id is encountered,
-     * it will be printed as "LAST".
-     *
-     * @param lastCreatedEntityIdValue
-     *     the virtual id of the last created entity
-     */
-    public QSValuePrinter(ReconEntityIdValue lastCreatedEntityIdValue) {
-        this.lastCreatedEntityIdValue = lastCreatedEntityIdValue;
-    }
-
    @Override
    public String visit(DatatypeIdValue value) {
@@ -57,11 +37,8 @@ public class QSValuePrinter implements ValueVisitor<String> {

    @Override
    public String visit(EntityIdValue value) {
-        if (lastCreatedEntityIdValue != null && lastCreatedEntityIdValue.equals(value)) {
+        if (ReconEntityIdValue.class.isInstance(value) && ((ReconEntityIdValue)value).isNew()) {
            return "LAST";
-        } else if (ReconEntityIdValue.class.isInstance(value)) {
-            // oops, we are trying to print another newly created entity (not the last one)
-            return null;
        }
        return value.getId();
    }
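
For context on the "LAST" convention documented at the Help:QuickStatements link above: CREATE mints a new item and LAST refers to that item on the following lines. A hypothetical batch (property and values invented for illustration):

    CREATE
    LAST	P31	Q5
    LAST	Len	"a freshly created item"

This is why the printer can render every new entity id as "LAST": the scheduler introduced below guarantees that each statement mentioning a new item is emitted right after that item's CREATE.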
@@ -12,6 +12,8 @@ import com.google.refine.model.Project;

import org.openrefine.wikidata.schema.WikibaseSchema;
import org.openrefine.wikidata.updates.ItemUpdate;
+import org.openrefine.wikidata.updates.scheduler.ImpossibleSchedulingException;
+import org.openrefine.wikidata.updates.scheduler.QuickStatementsUpdateScheduler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wikidata.wdtk.datamodel.interfaces.Claim;
@@ -27,6 +29,9 @@ import org.wikidata.wdtk.datamodel.interfaces.ValueVisitor;
public class QuickStatementsExporter implements WriterExporter {

    final static Logger logger = LoggerFactory.getLogger("QuickStatementsExporter");
+
+    public static final String impossibleSchedulingErrorMessage =
+            "This edit batch cannot be performed with QuickStatements due to the structure of its new items.";

    public QuickStatementsExporter(){
    }
@@ -64,10 +69,17 @@ public class QuickStatementsExporter implements WriterExporter {
        translateItemList(items, writer);
    }

-    public void translateItemList(List<ItemUpdate> editBatch, Writer writer) throws IOException {
-        for (ItemUpdate item : editBatch) {
-            translateItem(item, writer);
+    public void translateItemList(List<ItemUpdate> updates, Writer writer) throws IOException {
+        QuickStatementsUpdateScheduler scheduler = new QuickStatementsUpdateScheduler();
+        try {
+            List<ItemUpdate> scheduled = scheduler.schedule(updates);
+            for (ItemUpdate item : scheduled) {
+                translateItem(item, writer);
+            }
+        } catch(ImpossibleSchedulingException e) {
+            writer.write(impossibleSchedulingErrorMessage);
        }

    }

    protected void translateNameDescr(String qid, Set<MonolingualTextValue> values, String prefix, ItemIdValue id, Writer writer) throws IOException {
@@ -86,7 +98,7 @@ public class QuickStatementsExporter implements WriterExporter {
        if (item.isNew()) {
            writer.write("CREATE\n");
            qid = "LAST";
-            item.normalizeLabelsAndAliases();
+            item = item.normalizeLabelsAndAliases();
        }

        translateNameDescr(qid, item.getLabels(), "L", item.getItemId(), writer);
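
A short usage sketch for the reworked exporter; illustrative only, not part of the commit. The writer receives either the QuickStatements commands or, if the batch cannot be linearized, the error message above:

    class QuickStatementsSketch {
        static String toQuickStatements(java.util.List<org.openrefine.wikidata.updates.ItemUpdate> updates)
                throws java.io.IOException {
            QuickStatementsExporter exporter = new QuickStatementsExporter();
            java.io.StringWriter writer = new java.io.StringWriter();
            exporter.translateItemList(updates, writer); // schedules first, then prints
            return writer.toString(); // QS commands, or impossibleSchedulingErrorMessage
        }
    }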
@@ -17,6 +17,9 @@ import org.json.JSONWriter;
import org.openrefine.wikidata.editing.ConnectionManager;
import org.openrefine.wikidata.editing.NewItemLibrary;
import org.openrefine.wikidata.updates.ItemUpdate;
+import org.openrefine.wikidata.updates.scheduler.ImpossibleSchedulingException;
+import org.openrefine.wikidata.updates.scheduler.UpdateScheduler;
+import org.openrefine.wikidata.updates.scheduler.WikibaseAPIUpdateScheduler;
import org.openrefine.wikidata.schema.WikibaseSchema;
import org.openrefine.wikidata.schema.entityvalues.ReconEntityIdValue;
import org.slf4j.Logger;
@@ -29,6 +32,7 @@ import org.wikidata.wdtk.datamodel.interfaces.ItemDocument;
import org.wikidata.wdtk.datamodel.interfaces.MonolingualTextValue;
import org.wikidata.wdtk.util.WebResourceFetcherImpl;
import org.wikidata.wdtk.wikibaseapi.ApiConnection;
+import org.wikidata.wdtk.wikibaseapi.TermStatementUpdate;
import org.wikidata.wdtk.wikibaseapi.WikibaseDataEditor;
import org.wikidata.wdtk.wikibaseapi.WikibaseDataFetcher;
import org.wikidata.wdtk.wikibaseapi.apierrors.MediaWikiApiErrorException;
@@ -215,8 +219,10 @@ public class PerformWikibaseEditsOperation extends EngineDependentOperation {
            // Evaluate the schema
            List<ItemUpdate> itemDocuments = _schema.evaluate(_project, _engine);

-            // Group statements by item
-            Map<EntityIdValue, ItemUpdate> updates = ItemUpdate.groupBySubject(itemDocuments);
+            // Schedule the edit batch
+            WikibaseAPIUpdateScheduler scheduler = new WikibaseAPIUpdateScheduler();
+            List<ItemUpdate> updates = null;
+            updates = scheduler.schedule(itemDocuments);

            /**
             * TODO:
@@ -228,7 +234,7 @@ public class PerformWikibaseEditsOperation extends EngineDependentOperation {
            NewItemLibrary newItemLibrary = new NewItemLibrary();
            DataObjectFactory factory = new DataObjectFactoryImpl();
            List<ItemUpdate> remainingItemUpdates = new ArrayList<>();
-            remainingItemUpdates.addAll(updates.values());
+            remainingItemUpdates.addAll(updates);
            int totalItemUpdates = updates.size();
            int updatesDone = 0;
            int batchSize = 50;
@@ -295,6 +301,20 @@ public class PerformWikibaseEditsOperation extends EngineDependentOperation {
                } else {
                    // Existing item
                    ItemDocument currentDocument = (ItemDocument)currentDocs.get(update.getItemId().getId());
+                    /*
+                    TermStatementUpdate tsUpdate = new TermStatementUpdate(
+                            currentDocument,
+                            update.getAddedStatements().stream().collect(Collectors.toList()),
+                            update.getDeletedStatements().stream().collect(Collectors.toList()),
+                            update.getLabels().stream().collect(Collectors.toList()),
+                            update.getDescriptions().stream().collect(Collectors.toList()),
+                            update.getAliases().stream().collect(Collectors.toList()),
+                            new ArrayList<MonolingualTextValue>()
+                    );
+                    ObjectMapper mapper = new ObjectMapper();
+                    logger.info(mapper.writeValueAsString(update));
+                    logger.info(update.toString());
+                    logger.info(tsUpdate.getJsonUpdateString()); */
                    wbde.updateTermsStatements(currentDocument,
                            update.getLabels().stream().collect(Collectors.toList()),
                            update.getDescriptions().stream().collect(Collectors.toList()),
@@ -18,11 +18,15 @@ import org.openrefine.wikidata.qa.scrutinizers.SingleValueScrutinizer;
import org.openrefine.wikidata.qa.scrutinizers.UnsourcedScrutinizer;
import org.openrefine.wikidata.qa.scrutinizers.WhitespaceScrutinizer;
import org.openrefine.wikidata.updates.ItemUpdate;
+import org.openrefine.wikidata.updates.scheduler.ImpossibleSchedulingException;
+import org.openrefine.wikidata.updates.scheduler.UpdateScheduler;
+import org.openrefine.wikidata.updates.scheduler.WikibaseAPIUpdateScheduler;
import org.wikidata.wdtk.datamodel.interfaces.EntityIdValue;

/**
- * Runs a collection of edit scrutinizers on an edit batch
- * @author antonin
+ * Runs a collection of edit scrutinizers on an edit batch.
+ *
+ * @author Antonin Delpeuch
+ *
 */
public class EditInspector {
@@ -63,10 +67,19 @@ public class EditInspector {
     * @param editBatch
     */
    public void inspect(List<ItemUpdate> editBatch) {
-        Map<EntityIdValue, ItemUpdate> updates = ItemUpdate.groupBySubject(editBatch);
-        List<ItemUpdate> mergedUpdates = updates.values().stream().collect(Collectors.toList());
-        for(EditScrutinizer scrutinizer : scrutinizers.values()) {
-            scrutinizer.scrutinize(mergedUpdates);
+        // First, schedule them with some scheduler,
+        // so that all newly created entities appear in the batch
+        UpdateScheduler scheduler = new WikibaseAPIUpdateScheduler();
+        try {
+            editBatch = scheduler.schedule(editBatch);
+            Map<EntityIdValue, ItemUpdate> updates = ItemUpdate.groupBySubject(editBatch);
+            List<ItemUpdate> mergedUpdates = updates.values().stream().collect(Collectors.toList());
+            for(EditScrutinizer scrutinizer : scrutinizers.values()) {
+                scrutinizer.scrutinize(mergedUpdates);
+            }
+        } catch(ImpossibleSchedulingException e) {
+            warningStore.addWarning(new QAWarning(
+                    "scheduling-failed", null, QAWarning.Severity.CRITICAL, 1));
        }

        if (warningStore.getNbWarnings() == 0) {
@@ -8,6 +8,8 @@ import org.wikidata.wdtk.datamodel.helpers.Hash;
import org.wikidata.wdtk.datamodel.interfaces.EntityIdValue;
import org.wikidata.wdtk.datamodel.interfaces.ValueVisitor;

+import com.fasterxml.jackson.annotation.JsonIgnore;
+
import com.google.refine.model.Recon;

/**
@@ -38,11 +40,13 @@ public abstract class ReconEntityIdValue implements PrefetchedEntityIdValue {
                Recon.Judgment.New.equals(_recon.judgment));
    }

-    protected boolean isMatched() {
+    @JsonIgnore
+    public boolean isMatched() {
        return Recon.Judgment.Matched.equals(_recon.judgment) && _recon.match != null;
    }

-    protected boolean isNew() {
+    @JsonIgnore
+    public boolean isNew() {
        return !isMatched();
    }
@@ -31,7 +31,7 @@ import com.fasterxml.jackson.annotation.JsonProperty;
 */
public class ItemUpdate {
    private final ItemIdValue qid;
-    private final Set<Statement> addedStatements;
+    private final List<Statement> addedStatements;
    private final Set<Statement> deletedStatements;
    private final Set<MonolingualTextValue> labels;
    private final Set<MonolingualTextValue> descriptions;
@@ -42,11 +42,24 @@ public class ItemUpdate {
     *
     * @param qid
     *     the subject of the document. It can be a reconciled item value for new items.
+     * @param addedStatements
+     *     the statements to add on the item. They should be distinct. They
+     *     are modelled as a list because their insertion order matters.
+     * @param deletedStatements
+     *     the statements to remove from the item
+     * @param labels
+     *     the labels to add on the item
+     * @param descriptions
+     *     the descriptions to add on the item
+     * @param aliases
+     *     the aliases to add on the item. In theory their order should matter
+     *     but in practice people rarely rely on the order of aliases so this
+     *     is just kept as a set for simplicity.
     */
    @JsonCreator
    public ItemUpdate(
            @JsonProperty("subject") ItemIdValue qid,
-            @JsonProperty("addedStatements") Set<Statement> addedStatements,
+            @JsonProperty("addedStatements") List<Statement> addedStatements,
            @JsonProperty("deletedStatements") Set<Statement> deletedStatements,
            @JsonProperty("labels") Set<MonolingualTextValue> labels,
            @JsonProperty("descriptions") Set<MonolingualTextValue> descriptions,
@@ -54,7 +67,7 @@ public class ItemUpdate {
        Validate.notNull(qid);
        this.qid = qid;
        if(addedStatements == null) {
-            addedStatements = Collections.emptySet();
+            addedStatements = Collections.emptyList();
        }
        this.addedStatements = addedStatements;
        if(deletedStatements == null) {
@@ -84,10 +97,13 @@ public class ItemUpdate {
    }

    /**
-     * @return the set of all added statements
+     * Added statements are recorded as a list because
+     * their order of insertion matters.
+     *
+     * @return the list of all added statements
     */
    @JsonProperty("addedStatements")
-    public Set<Statement> getAddedStatements() {
+    public List<Statement> getAddedStatements() {
        return addedStatements;
    }

@@ -124,11 +140,18 @@ public class ItemUpdate {
    }

    /**
-     * @return true when this change is empty
-     *     (no statements or terms changed)
+     * @return true when this change is empty and its subject is not new
     */
    @JsonIgnore
    public boolean isNull() {
+        return isEmpty() && !isNew();
+    }
+
+    /**
+     * @return true when this change leaves the content of the document untouched
+     */
+    @JsonIgnore
+    public boolean isEmpty() {
        return (addedStatements.isEmpty()
            && deletedStatements.isEmpty()
            && labels.isEmpty()
@@ -145,8 +168,12 @@ public class ItemUpdate {
     */
    public ItemUpdate merge(ItemUpdate other) {
        Validate.isTrue(qid.equals(other.getItemId()));
-        Set<Statement> newAddedStatements = new HashSet<>(addedStatements);
-        newAddedStatements.addAll(other.getAddedStatements());
+        List<Statement> newAddedStatements = new ArrayList<>(addedStatements);
+        for(Statement statement : other.getAddedStatements()) {
+            if (!newAddedStatements.contains(statement)) {
+                newAddedStatements.add(statement);
+            }
+        }
        Set<Statement> newDeletedStatements = new HashSet<>(deletedStatements);
        newDeletedStatements.addAll(other.getDeletedStatements());
        Set<MonolingualTextValue> newLabels = new HashSet<>(labels);
@@ -264,16 +291,29 @@ public class ItemUpdate {
        StringBuilder builder = new StringBuilder();
        builder.append("<Update on ");
        builder.append(qid);
-        builder.append("\n Labels: ");
-        builder.append(labels);
-        builder.append("\n Descriptions: ");
-        builder.append(descriptions);
-        builder.append("\n Aliases: ");
-        builder.append(aliases);
-        builder.append("\n Added statements: ");
-        builder.append(addedStatements);
-        builder.append("\n Deleted statements: ");
-        builder.append(deletedStatements);
+        if (!labels.isEmpty()) {
+            builder.append("\n Labels: ");
+            builder.append(labels);
+        }
+        if (!descriptions.isEmpty()) {
+            builder.append("\n Descriptions: ");
+            builder.append(descriptions);
+        }
+        if (!aliases.isEmpty()) {
+            builder.append("\n Aliases: ");
+            builder.append(aliases);
+        }
+        if (!addedStatements.isEmpty()) {
+            builder.append("\n Added statements: ");
+            builder.append(addedStatements);
+        }
+        if (!deletedStatements.isEmpty()) {
+            builder.append("\n Deleted statements: ");
+            builder.append(deletedStatements);
+        }
+        if (isNull()) {
+            builder.append(" (null update)");
+        }
        builder.append("\n>");
        return builder.toString();
    }
@@ -1,7 +1,9 @@
package org.openrefine.wikidata.updates;

import java.util.Set;
+import java.util.ArrayList;
import java.util.HashSet;
+import java.util.List;

import org.jsoup.helper.Validate;
import org.wikidata.wdtk.datamodel.interfaces.ItemIdValue;
@@ -17,7 +19,7 @@ import org.wikidata.wdtk.datamodel.interfaces.Statement;
 */
public class ItemUpdateBuilder {
    private ItemIdValue qid;
-    private Set<Statement> addedStatements;
+    private List<Statement> addedStatements;
    private Set<Statement> deletedStatements;
    private Set<MonolingualTextValue> labels;
    private Set<MonolingualTextValue> descriptions;
@@ -33,7 +35,7 @@ public class ItemUpdateBuilder {
    public ItemUpdateBuilder(ItemIdValue qid) {
        Validate.notNull(qid);
        this.qid = qid;
-        this.addedStatements = new HashSet<>();
+        this.addedStatements = new ArrayList<>();
        this.deletedStatements = new HashSet<Statement>();
        this.labels = new HashSet<MonolingualTextValue>();
        this.descriptions = new HashSet<MonolingualTextValue>();
@@ -103,6 +105,19 @@ public class ItemUpdateBuilder {
        labels.add(label);
        return this;
    }

+    /**
+     * Adds a list of labels to the item. It will override any
+     * existing label in each language.
+     *
+     * @param labels
+     *     the labels to add
+     */
+    public ItemUpdateBuilder addLabels(Set<MonolingualTextValue> labels) {
+        Validate.isTrue(!built, "ItemUpdate has already been built");
+        this.labels.addAll(labels);
+        return this;
+    }
+
    /**
     * Adds a description to the item. It will override any existing
@@ -116,6 +131,19 @@ public class ItemUpdateBuilder {
        descriptions.add(description);
        return this;
    }

+    /**
+     * Adds a list of descriptions to the item. It will override any
+     * existing description in each language.
+     *
+     * @param descriptions
+     *     the descriptions to add
+     */
+    public ItemUpdateBuilder addDescriptions(Set<MonolingualTextValue> descriptions) {
+        Validate.isTrue(!built, "ItemUpdate has already been built");
+        this.descriptions.addAll(descriptions);
+        return this;
+    }
+
    /**
     * Adds an alias to the item. It will be added to any existing
@@ -129,6 +157,19 @@ public class ItemUpdateBuilder {
        aliases.add(alias);
        return this;
    }

+    /**
+     * Adds a list of aliases to the item. They will be added to any
+     * existing aliases in each language.
+     *
+     * @param aliases
+     *     the aliases to add
+     */
+    public ItemUpdateBuilder addAliases(Set<MonolingualTextValue> aliases) {
+        Validate.isTrue(!built, "ItemUpdate has already been built");
+        this.aliases.addAll(aliases);
+        return this;
+    }
+
    /**
     * Constructs the {@link ItemUpdate}.
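
Taken together with the new bulk setters, building an update now reads like this; a sketch with placeholder values (otherLabels, descriptions and aliases are hypothetical sets), using WDTK's Datamodel helpers:

    // Illustrative only, not part of the commit:
    ItemUpdate update = new ItemUpdateBuilder(Datamodel.makeWikidataItemIdValue("Q42"))
            .addLabel(Datamodel.makeMonolingualTextValue("Douglas Adams", "en"))
            .addLabels(otherLabels)          // bulk variant added in this commit
            .addDescriptions(descriptions)   // idem
            .addAliases(aliases)             // idem
            .build();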
@@ -0,0 +1,8 @@
package org.openrefine.wikidata.updates.scheduler;


public class ImpossibleSchedulingException extends Exception {

    private static final long serialVersionUID = 6621563898380564148L;

}
@@ -0,0 +1,152 @@
package org.openrefine.wikidata.updates.scheduler;

import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.openrefine.wikidata.schema.entityvalues.ReconEntityIdValue;
import org.openrefine.wikidata.schema.entityvalues.ReconItemIdValue;
import org.wikidata.wdtk.datamodel.interfaces.DatatypeIdValue;
import org.wikidata.wdtk.datamodel.interfaces.EntityIdValue;
import org.wikidata.wdtk.datamodel.interfaces.GlobeCoordinatesValue;
import org.wikidata.wdtk.datamodel.interfaces.MonolingualTextValue;
import org.wikidata.wdtk.datamodel.interfaces.QuantityValue;
import org.wikidata.wdtk.datamodel.interfaces.Snak;
import org.wikidata.wdtk.datamodel.interfaces.SnakGroup;
import org.wikidata.wdtk.datamodel.interfaces.Statement;
import org.wikidata.wdtk.datamodel.interfaces.StringValue;
import org.wikidata.wdtk.datamodel.interfaces.TimeValue;
import org.wikidata.wdtk.datamodel.interfaces.Value;
import org.wikidata.wdtk.datamodel.interfaces.ValueVisitor;

/**
 * A class that extracts the new entity ids referred to
 * in a statement.
 *
 * @author Antonin Delpeuch
 *
 */
public class PointerExtractor implements ValueVisitor<Set<ReconItemIdValue>> {

    /**
     * Extracts all the new entities mentioned by this statement. This
     * does not include the subject of the statement.
     *
     * @param statement
     *     the statement to inspect
     * @return
     *     the set of all new entities mentioned by the statement
     */
    public Set<ReconItemIdValue> extractPointers(Statement statement) {
        Set<ReconItemIdValue> result = new HashSet<>();
        result.addAll(extractPointers(statement.getClaim().getMainSnak()));
        result.addAll(extractPointers(statement.getClaim().getQualifiers()));
        statement.getReferences().stream()
            .map(l -> extractPointers(l.getSnakGroups()))
            .forEach(s -> result.addAll(s));
        return result;
    }

    /**
     * Extracts all the new entities mentioned by this list of snak groups.
     *
     * @param snakGroups
     * @return
     */
    public Set<ReconItemIdValue> extractPointers(List<SnakGroup> snakGroups) {
        Set<ReconItemIdValue> result = new HashSet<>();
        snakGroups.stream()
            .map(s -> extractPointers(s))
            .forEach(s -> result.addAll(s));
        return result;
    }

    /**
     * Extracts all the new entities mentioned by this snak group.
     *
     * @param snakGroup
     * @return
     */
    public Set<ReconItemIdValue> extractPointers(SnakGroup snakGroup) {
        Set<ReconItemIdValue> result = new HashSet<>();
        snakGroup.getSnaks().stream()
            .map(s -> extractPointers(s))
            .forEach(s -> result.addAll(s));
        return result;
    }

    /**
     * Extracts all new entities mentioned by this snak.
     * Currently there will be at most one: the target of the snak
     * (as property ids cannot be new for now).
     *
     * @param snak
     * @return
     */
    public Set<ReconItemIdValue> extractPointers(Snak snak) {
        Set<ReconItemIdValue> result = new HashSet<>();
        result.addAll(extractPointers(snak.getPropertyId()));
        result.addAll(extractPointers(snak.getValue()));
        return result;
    }

    /**
     * Extracts any new entity from the value.
     *
     * @param value
     * @return
     */
    public Set<ReconItemIdValue> extractPointers(Value value) {
        if (value == null) {
            return Collections.emptySet();
        }
        Set<ReconItemIdValue> pointers = value.accept(this);
        if (pointers == null) {
            return Collections.emptySet();
        }
        return pointers;
    }

    @Override
    public Set<ReconItemIdValue> visit(DatatypeIdValue value) {
        return null;
    }

    @Override
    public Set<ReconItemIdValue> visit(EntityIdValue value) {
        if(ReconItemIdValue.class.isInstance(value)) {
            ReconItemIdValue recon = (ReconItemIdValue)value;
            if(recon.isNew()) {
                return Collections.singleton(recon);
            }
        }
        return null;
    }

    @Override
    public Set<ReconItemIdValue> visit(GlobeCoordinatesValue value) {
        return null;
    }

    @Override
    public Set<ReconItemIdValue> visit(MonolingualTextValue value) {
        return null;
    }

    @Override
    public Set<ReconItemIdValue> visit(QuantityValue value) {
        // units cannot be new because WDTK represents them as strings already
        return null;
    }

    @Override
    public Set<ReconItemIdValue> visit(StringValue value) {
        return null;
    }

    @Override
    public Set<ReconItemIdValue> visit(TimeValue value) {
        return null;
    }
}
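
A sketch of how the extractor is meant to be used (it mirrors the tests added later in this commit; the statement variable is a placeholder):

    PointerExtractor extractor = new PointerExtractor();
    // Empty when the statement only mentions existing or matched items;
    // otherwise the set of new (still unreconciled) items it points to,
    // collected from the main snak, the qualifiers and the references.
    Set<ReconItemIdValue> pointers = extractor.extractPointers(statement);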
@@ -0,0 +1,118 @@
package org.openrefine.wikidata.updates.scheduler;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.openrefine.wikidata.schema.entityvalues.ReconEntityIdValue;
import org.openrefine.wikidata.schema.entityvalues.ReconItemIdValue;
import org.openrefine.wikidata.updates.ItemUpdate;
import org.openrefine.wikidata.updates.ItemUpdateBuilder;
import org.wikidata.wdtk.datamodel.interfaces.ItemIdValue;
import org.wikidata.wdtk.datamodel.interfaces.Statement;


public class QuickStatementsUpdateScheduler implements UpdateScheduler {

    private PointerExtractor extractor = new PointerExtractor();

    /**
     * This map holds for each new entity id value a list of updates
     * that refer to this id (and should hence be scheduled right after
     * creation of that entity).
     */
    private Map<ItemIdValue, UpdateSequence> pointerUpdates;

    /**
     * This contains all updates which do not refer to any new entity
     * apart from possibly the subject, in the order that they were supplied to us.
     */
    private UpdateSequence pointerFreeUpdates;

    /**
     * Separates out the statements which refer to new items from the rest
     * of the update. The resulting updates are stored in {@link #pointerUpdates}
     * and {@link #pointerFreeUpdates}.
     *
     * @param update
     * @throws ImpossibleSchedulingException
     *     if two new item ids are referred to in the same statement
     */
    protected void splitUpdate(ItemUpdate update) throws ImpossibleSchedulingException {
        ItemUpdateBuilder remainingUpdateBuilder = new ItemUpdateBuilder(update.getItemId())
                .addLabels(update.getLabels())
                .addDescriptions(update.getDescriptions())
                .addAliases(update.getAliases())
                .deleteStatements(update.getDeletedStatements());
        Map<ItemIdValue, ItemUpdateBuilder> referencingUpdates = new HashMap<>();

        for(Statement statement : update.getAddedStatements()) {
            Set<ReconItemIdValue> pointers = extractor.extractPointers(statement);
            if (pointers.isEmpty()) {
                remainingUpdateBuilder.addStatement(statement);
            } else if (pointers.size() == 1 && !update.isNew()) {
                ItemIdValue pointer = pointers.stream().findFirst().get();
                ItemUpdateBuilder referencingBuilder = referencingUpdates.get(pointer);
                if (referencingBuilder == null) {
                    referencingBuilder = new ItemUpdateBuilder(update.getItemId());
                }
                referencingBuilder.addStatement(statement);
                referencingUpdates.put(pointer, referencingBuilder);
            } else {
                throw new ImpossibleSchedulingException();
            }
        }

        // Add the update that is not referring to anything to the schedule
        ItemUpdate pointerFree = remainingUpdateBuilder.build();
        if (!pointerFree.isNull()) {
            pointerFreeUpdates.add(pointerFree);
        }
        // Add the other updates to the map
        for(Entry<ItemIdValue, ItemUpdateBuilder> entry : referencingUpdates.entrySet()) {
            ItemUpdate pointerUpdate = entry.getValue().build();
            UpdateSequence pointerUpdatesForKey = pointerUpdates.get(entry.getKey());
            if (pointerUpdatesForKey == null) {
                pointerUpdatesForKey = new UpdateSequence();
            }
            pointerUpdatesForKey.add(pointerUpdate);
            pointerUpdates.put(entry.getKey(), pointerUpdatesForKey);
        }
    }

    @Override
    public List<ItemUpdate> schedule(List<ItemUpdate> updates) throws ImpossibleSchedulingException {
        pointerUpdates = new HashMap<>();
        pointerFreeUpdates = new UpdateSequence();

        for(ItemUpdate update : updates) {
            splitUpdate(update);
        }

        // Reconstruct
        List<ItemUpdate> fullSchedule = new ArrayList<>();
        Set<ItemIdValue> mentionedNewEntities = new HashSet<>(pointerUpdates.keySet());
        for(ItemUpdate update : pointerFreeUpdates.getUpdates()) {
            fullSchedule.add(update);
            UpdateSequence backPointers = pointerUpdates.get(update.getItemId());
            if (backPointers != null) {
                fullSchedule.addAll(backPointers.getUpdates());
            }
            mentionedNewEntities.remove(update.getItemId());
        }

        // Create any item that was referred to but otherwise left untouched
        // (this is just for the sake of correctness; such items will
        // remain blank in this batch).
        for(ItemIdValue missingId : mentionedNewEntities) {
            fullSchedule.add(new ItemUpdateBuilder(missingId).build());
            fullSchedule.addAll(pointerUpdates.get(missingId).getUpdates());
        }
        return fullSchedule;
    }

}
@@ -0,0 +1,32 @@
package org.openrefine.wikidata.updates.scheduler;

import java.util.List;

import org.openrefine.wikidata.updates.ItemUpdate;

/**
 * A scheduling strategy for item updates.
 * Given a list of initial updates, the scheduler
 * reorganizes these updates (possibly splitting them
 * or merging them) to create a sequence that is suitable
 * for a particular import process.
 *
 * @author Antonin Delpeuch
 *
 */
public interface UpdateScheduler {

    /**
     * Performs the scheduling. The initial updates are provided
     * as a list so that the scheduler can attempt to respect the
     * initial order (but no guarantee is made for that in general).
     *
     * @param updates
     *     the updates to schedule
     * @return
     *     the reorganized updates
     * @throws ImpossibleSchedulingException
     *     when the scheduler cannot cope with a particular edit plan.
     */
    public List<ItemUpdate> schedule(List<ItemUpdate> updates) throws ImpossibleSchedulingException;
}
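
To make the contract concrete, the smallest conceivable implementation is an identity scheduler; illustrative only and not part of the commit, valid just for batches whose statements never point to new items:

    class IdentityScheduler implements UpdateScheduler {
        @Override
        public List<ItemUpdate> schedule(List<ItemUpdate> updates) {
            return updates; // no reordering, no splitting, no merging
        }
    }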
@@ -0,0 +1,59 @@
package org.openrefine.wikidata.updates.scheduler;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.openrefine.wikidata.updates.ItemUpdate;
import org.wikidata.wdtk.datamodel.interfaces.ItemIdValue;

/**
 * Helper class to store a list of updates where each subject
 * appears at most once. It preserves order of insertion.
 *
 * @author Antonin Delpeuch
 */
public class UpdateSequence {
    /**
     * The list of updates stored by this container
     */
    private List<ItemUpdate> updates = new ArrayList<>();
    /**
     * An index to keep track of where each item is touched in the sequence
     */
    private Map<ItemIdValue, Integer> index = new HashMap<>();

    /**
     * Adds a new update to the list, merging it with any existing
     * one with the same subject.
     *
     * @param update
     */
    public void add(ItemUpdate update) {
        ItemIdValue subject = update.getItemId();
        if(index.containsKey(subject)) {
            int i = index.get(subject);
            ItemUpdate oldUpdate = updates.get(i);
            updates.set(i, oldUpdate.merge(update));
        } else {
            index.put(subject, updates.size());
            updates.add(update);
        }
    }

    /**
     * @return the list of merged updates
     */
    public List<ItemUpdate> getUpdates() {
        return updates;
    }

    /**
     * @return the set of touched subjects
     */
    public Set<ItemIdValue> getSubjects() {
        return index.keySet();
    }
}
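
A quick illustration of the merge-on-add behaviour, using the builder from this commit (identifiers are placeholders):

    UpdateSequence seq = new UpdateSequence();
    ItemIdValue qid = Datamodel.makeWikidataItemIdValue("Q42");
    seq.add(new ItemUpdateBuilder(qid).addLabel(Datamodel.makeMonolingualTextValue("a", "en")).build());
    seq.add(new ItemUpdateBuilder(qid).addLabel(Datamodel.makeMonolingualTextValue("b", "fr")).build());
    // Both updates share the subject Q42, so the sequence now holds a single
    // merged update, kept at the position where Q42 was first inserted.
    assert seq.getUpdates().size() == 1;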
@@ -0,0 +1,115 @@
package org.openrefine.wikidata.updates.scheduler;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

import org.openrefine.wikidata.schema.entityvalues.ReconItemIdValue;
import org.openrefine.wikidata.updates.ItemUpdate;
import org.openrefine.wikidata.updates.ItemUpdateBuilder;
import org.wikidata.wdtk.datamodel.interfaces.ItemIdValue;
import org.wikidata.wdtk.datamodel.interfaces.Statement;

/**
 * A simple scheduler for batches committed via the Wikibase API.
 *
 * The strategy is quite simple and makes at most two edits
 * per touched item (which is not minimal though). Each update
 * is split between statements making references to new items,
 * and statements not making these references. All updates with no
 * references to new items are done first (which creates all new
 * items), then all other updates are done.
 *
 * @author Antonin Delpeuch
 *
 */
public class WikibaseAPIUpdateScheduler implements UpdateScheduler {

    /**
     * The first part of updates: the ones which create new items
     * without referring to any other new item.
     */
    private UpdateSequence pointerFreeUpdates;
    /**
     * The second part of the updates: all existing items, plus
     * all parts of new items that refer to other new items.
     */
    private UpdateSequence pointerFullUpdates;
    /**
     * The set of all new items referred to in the whole batch.
     */
    private Set<ItemIdValue> allPointers;

    private PointerExtractor extractor = new PointerExtractor();

    @Override
    public List<ItemUpdate> schedule(List<ItemUpdate> updates) {
        List<ItemUpdate> result = new ArrayList<>();
        pointerFreeUpdates = new UpdateSequence();
        pointerFullUpdates = new UpdateSequence();
        allPointers = new HashSet<>();

        for(ItemUpdate update : updates) {
            splitUpdate(update);
        }

        // Part 1: add all the pointer free updates
        result.addAll(pointerFreeUpdates.getUpdates());

        // Part 1': add the remaining new items that have not been touched
        Set<ItemIdValue> unseenPointers = new HashSet<>(allPointers);
        unseenPointers.removeAll(pointerFreeUpdates.getSubjects());

        result.addAll(unseenPointers.stream()
            .map(e -> new ItemUpdateBuilder(e).build())
            .collect(Collectors.toList()));

        // Part 2: add all the pointer full updates
        result.addAll(pointerFullUpdates.getUpdates());

        return result;
    }

    /**
     * Splits an update into two parts
     * @param update
     */
    protected void splitUpdate(ItemUpdate update) {
        ItemUpdateBuilder pointerFreeBuilder = new ItemUpdateBuilder(update.getItemId())
                .addLabels(update.getLabels())
                .addDescriptions(update.getDescriptions())
                .addAliases(update.getAliases())
                .deleteStatements(update.getDeletedStatements());
        ItemUpdateBuilder pointerFullBuilder = new ItemUpdateBuilder(update.getItemId());

        for(Statement statement : update.getAddedStatements()) {
            Set<ReconItemIdValue> pointers = extractor.extractPointers(statement);
            if (pointers.isEmpty()) {
                pointerFreeBuilder.addStatement(statement);
            } else {
                pointerFullBuilder.addStatement(statement);
            }
            allPointers.addAll(pointers);
        }

        if(update.isNew()) {
            // If the update is new, we might need to split it
            // in two (if it refers to any other new entity).
            ItemUpdate pointerFree = pointerFreeBuilder.build();
            if (!pointerFree.isNull()) {
                pointerFreeUpdates.add(pointerFree);
            }
            ItemUpdate pointerFull = pointerFullBuilder.build();
            if (!pointerFull.isEmpty()) {
                pointerFullUpdates.add(pointerFull);
            }
        } else {
            // Otherwise, we just make sure this edit is done after
            // all item creations.
            pointerFullUpdates.add(update);
        }
    }

}
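
To make the two-pass order concrete, a worked example of the expected schedule (consistent with the tests below; identifiers are invented):

    // Input batch:
    //   update(Q1): adds Q1 -> P: <new item X>       (existing item, has a pointer)
    //   update(X):  adds X  -> P: Q2, plus a label   (new item, pointer-free)
    // Scheduled output:
    //   1. update(X)   -- pass 1: creates X, so it exists when Q1 refers to it
    //   2. update(Q1)  -- pass 2: performed after all item creations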
@@ -40,15 +40,11 @@ public class QSValuePrinterTest {
    @Test
    public void printNewItemId() {
        ReconEntityIdValue id = TestingDataGenerator.makeNewItemIdValue(12345L, "my new item");
-        assertNull(id.accept(printer));
-        // because no entity was previously created
-
-        QSValuePrinter printerAfterCreate = new QSValuePrinter(id);
-        ReconEntityIdValue equalId = TestingDataGenerator.makeNewItemIdValue(12345L, "my other new item");
-        assertEquals("LAST", printerAfterCreate.visit(equalId));
-
-        ReconEntityIdValue differentId = TestingDataGenerator.makeNewItemIdValue(34567L, "my new item");
-        assertNull(printerAfterCreate.visit(differentId));
+        assertEquals("LAST", id.accept(printer));
+
+        ReconEntityIdValue differentId = TestingDataGenerator.makeMatchedItemIdValue("Q78", "my existing item");
+        assertEquals("Q78", differentId.accept(printer));
    }

    // Globe coordinates
@@ -18,6 +18,7 @@ import org.json.JSONWriter;
import org.openrefine.wikidata.testing.TestingDataGenerator;
import org.openrefine.wikidata.updates.ItemUpdate;
+import org.openrefine.wikidata.updates.ItemUpdateBuilder;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
import org.wikidata.wdtk.datamodel.helpers.Datamodel;
import org.wikidata.wdtk.datamodel.interfaces.Claim;
@@ -63,12 +64,24 @@ public class WikibaseSchemaTest extends RefineTest {
            Collections.singletonList(Datamodel.makeReference(Collections.singletonList(retrievedSnakGroup))),
            StatementRank.NORMAL, "");

+    private Project project;
+
    static JSONObject jsonFromFile(String filename) throws IOException, JSONException {
        byte[] contents = Files.readAllBytes(Paths.get(filename));
        String decoded = new String(contents, "utf-8");
        return ParsingUtilities.evaluateJsonStringToObject(decoded);
    }

+    @BeforeMethod
+    public void setUpProject() {
+        project = this.createCSVProject(
+                "subject,inception,reference\n"+
+                "Q1377,1919,http://www.ljubljana-slovenia.com/university-ljubljana\n"+
+                "Q865528,1965,");
+        project.rows.get(0).cells.set(0, TestingDataGenerator.makeMatchedCell("Q1377", "University of Ljubljana"));
+        project.rows.get(1).cells.set(0, TestingDataGenerator.makeMatchedCell("Q865528", "University of Warwick"));
+    }
+
    @Test
    public void testSerialize() throws JSONException, IOException {
        JSONObject serialized = jsonFromFile("data/schema/history_of_medicine.json");
@@ -94,12 +107,6 @@ public class WikibaseSchemaTest extends RefineTest {
    public void testEvaluate() throws JSONException, IOException {
        JSONObject serialized = jsonFromFile("data/schema/inception.json");
        WikibaseSchema schema = WikibaseSchema.reconstruct(serialized);
-        Project project = this.createCSVProject(
-                "subject,inception,reference\n"+
-                "Q1377,1919,http://www.ljubljana-slovenia.com/university-ljubljana\n"+
-                "Q865528,1965,");
-        project.rows.get(0).cells.set(0, TestingDataGenerator.makeMatchedCell("Q1377", "University of Ljubljana"));
-        project.rows.get(1).cells.set(0, TestingDataGenerator.makeMatchedCell("Q865528", "University of Warwick"));
        Engine engine = new Engine(project);
        List<ItemUpdate> updates = schema.evaluate(project, engine);
        List<ItemUpdate> expected = new ArrayList<>();
@@ -109,4 +116,31 @@ public class WikibaseSchemaTest extends RefineTest {
        expected.add(update2);
        assertEquals(expected, updates);
    }

+    @Test
+    public void testEvaluateRespectsFacets() throws JSONException, IOException {
+        JSONObject serialized = jsonFromFile("data/schema/inception.json");
+        WikibaseSchema schema = WikibaseSchema.reconstruct(serialized);
+        Engine engine = new Engine(project);
+        JSONObject engineConfig = new JSONObject("{\n" +
+                "    \"mode\": \"row-based\",\n" +
+                "    \"facets\": [\n" +
+                "      {\n" +
+                "        \"mode\": \"text\",\n" +
+                "        \"invert\": false,\n" +
+                "        \"caseSensitive\": false,\n" +
+                "        \"query\": \"www\",\n" +
+                "        \"name\": \"reference\",\n" +
+                "        \"type\": \"text\",\n" +
+                "        \"columnName\": \"reference\"\n" +
+                "      }\n" +
+                "    ]\n" +
+                "  }");
+        engine.initializeFromJSON(engineConfig);
+        List<ItemUpdate> updates = schema.evaluate(project, engine);
+        List<ItemUpdate> expected = new ArrayList<>();
+        ItemUpdate update1 = new ItemUpdateBuilder(qid1).addStatement(statement1).build();
+        expected.add(update1);
+        assertEquals(expected, updates);
+    }
}
@@ -62,6 +62,16 @@ public class ItemUpdateTest {
    public void testIsNull() {
        ItemUpdate update = new ItemUpdateBuilder(existingSubject).build();
        assertTrue(update.isNull());
+        ItemUpdate update2 = new ItemUpdateBuilder(newSubject).build();
+        assertFalse(update2.isNull());
+    }
+
+    @Test
+    public void testIsEmpty() {
+        ItemUpdate update = new ItemUpdateBuilder(existingSubject).build();
+        assertTrue(update.isEmpty());
+        ItemUpdate update2 = new ItemUpdateBuilder(newSubject).build();
+        assertTrue(update2.isEmpty());
    }

    @Test
@@ -78,8 +88,8 @@ public class ItemUpdateTest {
            .addStatement(statement1)
            .addStatement(statement2)
            .build();
-        assertEquals(Arrays.asList(statement1, statement2).stream().collect(Collectors.toSet()),
-                update.getAddedStatements());
+        assertFalse(update.isNull());
+        assertEquals(Arrays.asList(statement1, statement2), update.getAddedStatements());
        assertEquals(statementGroups, update.getAddedStatementGroups().stream().collect(Collectors.toSet()));
    }

@@ -130,6 +140,7 @@ public class ItemUpdateTest {
            .addAlias(aliasEn)
            .addAlias(aliasFr)
            .build();
+        assertFalse(updateA.isNull());
        ItemUpdate normalized = updateA.normalizeLabelsAndAliases();
        ItemUpdate expectedUpdate = new ItemUpdateBuilder(newSubject)
            .addLabel(label)
@@ -0,0 +1,95 @@
package org.openrefine.wikidata.updates.scheduler;

import static org.junit.Assert.assertEquals;

import java.math.BigDecimal;
import java.util.Collections;
import java.util.Set;

import org.openrefine.wikidata.schema.entityvalues.ReconItemIdValue;
import org.openrefine.wikidata.testing.TestingDataGenerator;
import org.testng.annotations.Test;
import org.wikidata.wdtk.datamodel.helpers.Datamodel;
import org.wikidata.wdtk.datamodel.interfaces.Claim;
import org.wikidata.wdtk.datamodel.interfaces.GlobeCoordinatesValue;
import org.wikidata.wdtk.datamodel.interfaces.ItemIdValue;
import org.wikidata.wdtk.datamodel.interfaces.PropertyIdValue;
import org.wikidata.wdtk.datamodel.interfaces.Reference;
import org.wikidata.wdtk.datamodel.interfaces.Snak;
import org.wikidata.wdtk.datamodel.interfaces.SnakGroup;
import org.wikidata.wdtk.datamodel.interfaces.StatementRank;
import org.wikidata.wdtk.datamodel.interfaces.TimeValue;
import org.wikidata.wdtk.datamodel.interfaces.Value;

public class PointerExtractorTest {

    private ItemIdValue existingId = Datamodel.makeWikidataItemIdValue("Q43");
    private ItemIdValue matchedId = TestingDataGenerator.makeMatchedItemIdValue("Q89","eist");
    private ItemIdValue newIdA = TestingDataGenerator.makeNewItemIdValue(1234L, "new item A");
    private ItemIdValue newIdB = TestingDataGenerator.makeNewItemIdValue(4567L, "new item B");

    private PropertyIdValue pid = Datamodel.makeWikidataPropertyIdValue("P89");
    private Snak snakWithNew = Datamodel.makeValueSnak(pid, newIdA);
    private Snak snakWithoutNew = Datamodel.makeValueSnak(pid, matchedId);
    private SnakGroup snakGroupWithNew = Datamodel.makeSnakGroup(Collections.singletonList(snakWithNew));
    private SnakGroup snakGroupWithoutNew = Datamodel.makeSnakGroup(Collections.singletonList(snakWithoutNew));
    private Claim claimWithNew = Datamodel.makeClaim(existingId, snakWithNew, Collections.emptyList());
    private Claim claimNewSubject = Datamodel.makeClaim(newIdB, snakWithoutNew, Collections.emptyList());
    private Claim claimNewQualifier = Datamodel.makeClaim(matchedId, snakWithoutNew,
            Collections.singletonList(snakGroupWithNew));

    private static PointerExtractor e = new PointerExtractor();

    @Test
    public void testExtractEntityId() {
        assertEquals(Collections.singleton(newIdA), e.extractPointers(newIdA));
        assertEmpty(e.extractPointers(existingId));
        assertEmpty(e.extractPointers(matchedId));
    }

    @Test
    public void testExtractDatavalues() {
        assertEmpty(Datamodel.makeDatatypeIdValue("string"));
        assertEmpty(Datamodel.makeGlobeCoordinatesValue(1.34, 2.354, 0.1, GlobeCoordinatesValue.GLOBE_EARTH));
        assertEmpty(Datamodel.makeStringValue("est"));
        assertEmpty(Datamodel.makeMonolingualTextValue("srtu", "en"));
        assertEmpty(Datamodel.makeWikidataPropertyIdValue("P78"));
        assertEmpty(Datamodel.makeQuantityValue(new BigDecimal("898")));
        assertEmpty(Datamodel.makeQuantityValue(new BigDecimal("7.87"), "http://www.wikidata.org/entity/Q34"));
        assertEmpty(Datamodel.makeTimeValue(1898, (byte)2, (byte)3, TimeValue.CM_GREGORIAN_PRO));
    }

    @Test
    public void testSnak() {
        assertEmpty(e.extractPointers(snakWithoutNew));
        assertEquals(Collections.singleton(newIdA), e.extractPointers(snakWithNew));
        assertEmpty(e.extractPointers(Datamodel.makeNoValueSnak(pid)));
    }

    @Test
    public void testSnakGroup() {
        assertEmpty(e.extractPointers(snakGroupWithoutNew));
        assertEquals(Collections.singleton(newIdA), e.extractPointers(snakGroupWithNew));
    }

    @Test
    public void testStatement() {
        assertEmpty(e.extractPointers(Datamodel.makeStatement(claimNewSubject,
                Collections.emptyList(), StatementRank.NORMAL, "")));
        assertEquals(Collections.singleton(newIdA), e.extractPointers(Datamodel.makeStatement(claimWithNew,
                Collections.emptyList(), StatementRank.NORMAL, "")));
        assertEquals(Collections.singleton(newIdA), e.extractPointers(Datamodel.makeStatement(claimNewQualifier,
                Collections.emptyList(), StatementRank.NORMAL, "")));
        Reference reference = Datamodel.makeReference(Collections.singletonList(snakGroupWithNew));
        assertEquals(Collections.singleton(newIdA), e.extractPointers(Datamodel.makeStatement(claimNewSubject,
                Collections.singletonList(reference), StatementRank.NORMAL, "")));
    }

    private static void assertEmpty(Value v) {
        assertEmpty(e.extractPointers(v));
    }

    private static void assertEmpty(Set<ReconItemIdValue> pointers) {
        assertEquals(Collections.emptySet(), pointers);
    }
}
@@ -0,0 +1,51 @@
package org.openrefine.wikidata.updates.scheduler;

import static org.junit.Assert.assertEquals;

import java.util.Arrays;
import java.util.List;

import org.openrefine.wikidata.updates.ItemUpdate;
import org.openrefine.wikidata.updates.ItemUpdateBuilder;
import org.testng.annotations.Test;


public class QuickStatementsUpdateSchedulerTest extends UpdateSchedulerTest {

    @Test
    public void testNoNewItem() throws ImpossibleSchedulingException {
        ItemUpdate updateA = new ItemUpdateBuilder(existingIdA).addStatement(sAtoB).build();
        ItemUpdate updateB = new ItemUpdateBuilder(existingIdB).addStatement(sBtoA).build();
        List<ItemUpdate> scheduled = schedule(updateA, updateB);
        assertEquals(Arrays.asList(updateA, updateB), scheduled);
    }

    @Test
    public void testSplitUpdate() throws ImpossibleSchedulingException {
        ItemUpdate updateA = new ItemUpdateBuilder(existingIdA)
                .addStatement(sAtoNewA)
                .addStatement(sAtoNewB)
                .build();
        ItemUpdate newUpdateA = new ItemUpdateBuilder(newIdA).build();
        ItemUpdate newUpdateB = new ItemUpdateBuilder(newIdB).build();
        ItemUpdate splitUpdateA = new ItemUpdateBuilder(existingIdA)
                .addStatement(sAtoNewA)
                .build();
        ItemUpdate splitUpdateB = new ItemUpdateBuilder(existingIdA)
                .addStatement(sAtoNewB)
                .build();
        List<ItemUpdate> scheduled = schedule(updateA);
        assertSetEquals(Arrays.asList(newUpdateA, splitUpdateA, newUpdateB, splitUpdateB), scheduled);
    }

    @Test(expectedExceptions=ImpossibleSchedulingException.class)
    public void testImpossibleForQS() throws ImpossibleSchedulingException {
        ItemUpdate update = new ItemUpdateBuilder(newIdA).addStatement(sNewAtoNewB).build();
        schedule(update);
    }

    @Override
    public UpdateScheduler getScheduler() {
        return new QuickStatementsUpdateScheduler();
    }
}
@@ -0,0 +1,94 @@
package org.openrefine.wikidata.updates.scheduler;

import static org.junit.Assert.assertEquals;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;

import org.openrefine.wikidata.testing.TestingDataGenerator;
import org.openrefine.wikidata.updates.ItemUpdate;
import org.openrefine.wikidata.updates.ItemUpdateBuilder;
import org.testng.annotations.Test;
import org.wikidata.wdtk.datamodel.helpers.Datamodel;
import org.wikidata.wdtk.datamodel.interfaces.Claim;
import org.wikidata.wdtk.datamodel.interfaces.ItemIdValue;
import org.wikidata.wdtk.datamodel.interfaces.PropertyIdValue;
import org.wikidata.wdtk.datamodel.interfaces.Statement;
import org.wikidata.wdtk.datamodel.interfaces.StatementRank;

public abstract class UpdateSchedulerTest {

    protected ItemIdValue existingIdA = Datamodel.makeWikidataItemIdValue("Q43");
    protected ItemIdValue existingIdB = Datamodel.makeWikidataItemIdValue("Q538");
    protected ItemIdValue newIdA = TestingDataGenerator.makeNewItemIdValue(1234L, "new item A");
    protected ItemIdValue newIdB = TestingDataGenerator.makeNewItemIdValue(5678L, "new item B");

    protected static PropertyIdValue pid = Datamodel.makeWikidataPropertyIdValue("P38");

    protected Statement sAtoB = generateStatement(existingIdA, existingIdB);
    protected Statement sBtoA = generateStatement(existingIdB, existingIdA);
    protected Statement sAtoNewA = generateStatement(existingIdA, newIdA);
    protected Statement sAtoNewB = generateStatement(existingIdA, newIdB);
    protected Statement sNewAtoB = generateStatement(newIdA, existingIdB);
    protected Statement sNewAtoNewB = generateStatement(newIdA, newIdB);

    public static Statement generateStatement(ItemIdValue from, ItemIdValue to) {
        Claim claim = Datamodel.makeClaim(from, Datamodel.makeValueSnak(pid, to), Collections.emptyList());
        return Datamodel.makeStatement(claim, Collections.emptyList(), StatementRank.NORMAL, "");
    }

    public abstract UpdateScheduler getScheduler();

    protected List<ItemUpdate> schedule(ItemUpdate... itemUpdates) throws ImpossibleSchedulingException {
        return getScheduler().schedule(Arrays.asList(itemUpdates));
    }

    protected static void assertSetEquals(List<ItemUpdate> expected, List<ItemUpdate> actual) {
        assertEquals(expected.stream().collect(Collectors.toSet()),
                actual.stream().collect(Collectors.toSet()));
    }

    @Test
    public void testNewItemNotMentioned() throws ImpossibleSchedulingException {
        ItemUpdate updateA = new ItemUpdateBuilder(existingIdA).addStatement(sAtoNewA).build();
        List<ItemUpdate> scheduled = schedule(updateA);
        ItemUpdate newUpdate = new ItemUpdateBuilder(newIdA).build();
        assertEquals(Arrays.asList(newUpdate, updateA), scheduled);
    }

    @Test
    public void testNewItemMentioned() throws ImpossibleSchedulingException {
        ItemUpdate updateA = new ItemUpdateBuilder(existingIdA).addStatement(sAtoNewA).build();
        ItemUpdate newUpdate = new ItemUpdateBuilder(newIdA).addStatement(sNewAtoB).build();
        List<ItemUpdate> scheduled = schedule(updateA, newUpdate);
        assertEquals(Arrays.asList(newUpdate, updateA), scheduled);
    }

    @Test
    public void testMerge() throws ImpossibleSchedulingException {
        ItemUpdate update1 = new ItemUpdateBuilder(existingIdA)
                .addStatement(sAtoB)
                .build();
        ItemUpdate update2 = new ItemUpdateBuilder(existingIdA)
                .addLabel(Datamodel.makeMonolingualTextValue("hello", "fr"))
                .addStatement(sAtoB)
                .build();
        ItemUpdate merged = update1.merge(update2);
        assertEquals(Collections.singletonList(merged), schedule(update1, update2));
    }

    @Test
    public void testMergeNew() throws ImpossibleSchedulingException {
        ItemUpdate update1 = new ItemUpdateBuilder(newIdA)
                .addLabel(Datamodel.makeMonolingualTextValue("hello", "fr"))
                .addStatement(sNewAtoB)
                .build();
        ItemUpdate update2 = new ItemUpdateBuilder(newIdA)
                .addLabel(Datamodel.makeMonolingualTextValue("hello", "fr"))
                .build();
        ItemUpdate merged = update1.merge(update2);
        assertEquals(Collections.singletonList(merged), schedule(update1, update2));
    }
}
@@ -0,0 +1,58 @@
package org.openrefine.wikidata.updates.scheduler;

import static org.junit.Assert.assertEquals;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import org.openrefine.wikidata.updates.ItemUpdate;
import org.openrefine.wikidata.updates.ItemUpdateBuilder;
import org.testng.annotations.Test;
import org.wikidata.wdtk.datamodel.helpers.Datamodel;


public class WikibaseAPIUpdateSchedulerTest extends UpdateSchedulerTest {

    @Test
    public void testOrderPreserved() throws ImpossibleSchedulingException {
        ItemUpdate updateA = new ItemUpdateBuilder(existingIdA).addStatement(sAtoB).build();
        ItemUpdate updateB = new ItemUpdateBuilder(existingIdB).addStatement(sBtoA).build();
        List<ItemUpdate> scheduled = schedule(updateA, updateB);
        assertEquals(Arrays.asList(updateA, updateB), scheduled);
    }

    @Test
    public void testUpdateIsNotSplit() throws ImpossibleSchedulingException {
        ItemUpdate updateA = new ItemUpdateBuilder(existingIdA)
                .addStatement(sAtoNewA)
                .addStatement(sAtoNewB)
                .build();
        ItemUpdate newUpdateA = new ItemUpdateBuilder(newIdA).build();
        ItemUpdate newUpdateB = new ItemUpdateBuilder(newIdB).build();
        List<ItemUpdate> scheduled = schedule(updateA);
        assertSetEquals(Arrays.asList(newUpdateA, newUpdateB, updateA), scheduled);
    }

    @Test
    public void testMixedUpdate() throws ImpossibleSchedulingException {
        ItemUpdate updateA = new ItemUpdateBuilder(existingIdA)
                .addStatement(sAtoNewA)
                .addStatement(sAtoNewB)
                .addStatement(sAtoB)
                .build();
        ItemUpdate newUpdateA = new ItemUpdateBuilder(newIdA)
                .addStatement(sNewAtoB)
                .build();
        ItemUpdate newUpdateB = new ItemUpdateBuilder(newIdB)
                .build();
        List<ItemUpdate> scheduled = schedule(updateA, newUpdateA);
        assertEquals(Arrays.asList(newUpdateA, newUpdateB, updateA), scheduled);
    }

    @Override
    public UpdateScheduler getScheduler() {
        return new WikibaseAPIUpdateScheduler();
    }

}