From bb044612e064123b305c2cb583806ce3e444d82d Mon Sep 17 00:00:00 2001 From: Antonin Delpeuch Date: Thu, 1 Mar 2018 22:20:29 +0000 Subject: [PATCH] Introduce a scheduler package to reorganize item updates --- .../wikidata/exporters/QSValuePrinter.java | 35 +--- .../exporters/QuickStatementsExporter.java | 20 ++- .../PerformWikibaseEditsOperation.java | 26 ++- .../openrefine/wikidata/qa/EditInspector.java | 25 ++- .../entityvalues/ReconEntityIdValue.java | 8 +- .../wikidata/updates/ItemUpdate.java | 78 ++++++--- .../wikidata/updates/ItemUpdateBuilder.java | 45 +++++- .../ImpossibleSchedulingException.java | 8 + .../updates/scheduler/PointerExtractor.java | 152 ++++++++++++++++++ .../QuickStatementsUpdateScheduler.java | 118 ++++++++++++++ .../updates/scheduler/UpdateScheduler.java | 32 ++++ .../updates/scheduler/UpdateSequence.java | 59 +++++++ .../scheduler/WikibaseAPIUpdateScheduler.java | 115 +++++++++++++ .../exporters/QSValuePrinterTest.java | 10 +- .../wikidata/schema/WikibaseSchemaTest.java | 46 +++++- .../wikidata/updates/ItemUpdateTest.java | 15 +- .../scheduler/PointerExtractorTest.java | 95 +++++++++++ .../QuickStatementsUpdateSchedulerTest.java | 51 ++++++ .../scheduler/UpdateSchedulerTest.java | 94 +++++++++++ .../WikibaseAPIUpdateSchedulerTest.java | 58 +++++++ 20 files changed, 1010 insertions(+), 80 deletions(-) create mode 100644 extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/ImpossibleSchedulingException.java create mode 100644 extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/PointerExtractor.java create mode 100644 extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/QuickStatementsUpdateScheduler.java create mode 100644 extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/UpdateScheduler.java create mode 100644 extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/UpdateSequence.java create mode 100644 
extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/WikibaseAPIUpdateScheduler.java create mode 100644 extensions/wikidata/tests/src/org/openrefine/wikidata/updates/scheduler/PointerExtractorTest.java create mode 100644 extensions/wikidata/tests/src/org/openrefine/wikidata/updates/scheduler/QuickStatementsUpdateSchedulerTest.java create mode 100644 extensions/wikidata/tests/src/org/openrefine/wikidata/updates/scheduler/UpdateSchedulerTest.java create mode 100644 extensions/wikidata/tests/src/org/openrefine/wikidata/updates/scheduler/WikibaseAPIUpdateSchedulerTest.java diff --git a/extensions/wikidata/src/org/openrefine/wikidata/exporters/QSValuePrinter.java b/extensions/wikidata/src/org/openrefine/wikidata/exporters/QSValuePrinter.java index f5b5ae3b6..6bf3e14ba 100644 --- a/extensions/wikidata/src/org/openrefine/wikidata/exporters/QSValuePrinter.java +++ b/extensions/wikidata/src/org/openrefine/wikidata/exporters/QSValuePrinter.java @@ -18,35 +18,15 @@ import org.wikidata.wdtk.datamodel.interfaces.ValueVisitor; * Format documentation: * https://www.wikidata.org/wiki/Help:QuickStatements * + * Any new entity id will be + * assumed to be the last one created, represented with "LAST". It is + * fine to make this assumption because we are working on edit batches + * previously scheduled by {@link QuickStatementsUpdateScheduler}. + * * @author Antonin Delpeuch * */ public class QSValuePrinter implements ValueVisitor { - - private final ReconEntityIdValue lastCreatedEntityIdValue; - - /** - * Constructor. - * - * Creates a printer for a context where no entity was previously - * created with the "CREATE" command. Any new entity id will not - * be printed. - */ - public QSValuePrinter() { - lastCreatedEntityIdValue = null; - } - - /** - * Creates a printer for a context where an entity was previously - * created with the "CREATE" command. If this id is encountered, - * it will be printed as "LAST". 
- * - * @param lastCreatedEntityIdValue - * the virtual id of the last created entity - */ - public QSValuePrinter(ReconEntityIdValue lastCreatedEntityIdValue) { - this.lastCreatedEntityIdValue = lastCreatedEntityIdValue; - } @Override public String visit(DatatypeIdValue value) { @@ -57,11 +37,8 @@ public class QSValuePrinter implements ValueVisitor { @Override public String visit(EntityIdValue value) { - if (lastCreatedEntityIdValue != null && lastCreatedEntityIdValue.equals(value)) { + if (ReconEntityIdValue.class.isInstance(value) && ((ReconEntityIdValue)value).isNew()) { return "LAST"; - } else if (ReconEntityIdValue.class.isInstance(value)) { - // oops, we are trying to print another newly created entity (not the last one) - return null; } return value.getId(); } diff --git a/extensions/wikidata/src/org/openrefine/wikidata/exporters/QuickStatementsExporter.java b/extensions/wikidata/src/org/openrefine/wikidata/exporters/QuickStatementsExporter.java index 44459da7e..413b22f19 100644 --- a/extensions/wikidata/src/org/openrefine/wikidata/exporters/QuickStatementsExporter.java +++ b/extensions/wikidata/src/org/openrefine/wikidata/exporters/QuickStatementsExporter.java @@ -12,6 +12,8 @@ import com.google.refine.model.Project; import org.openrefine.wikidata.schema.WikibaseSchema; import org.openrefine.wikidata.updates.ItemUpdate; +import org.openrefine.wikidata.updates.scheduler.ImpossibleSchedulingException; +import org.openrefine.wikidata.updates.scheduler.QuickStatementsUpdateScheduler; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.wikidata.wdtk.datamodel.interfaces.Claim; @@ -27,6 +29,9 @@ import org.wikidata.wdtk.datamodel.interfaces.ValueVisitor; public class QuickStatementsExporter implements WriterExporter { final static Logger logger = LoggerFactory.getLogger("QuickStatementsExporter"); + + public static final String impossibleSchedulingErrorMessage = + "This edit batch cannot be performed with QuickStatements due to the structure of 
its new items."; public QuickStatementsExporter(){ } @@ -64,10 +69,17 @@ public class QuickStatementsExporter implements WriterExporter { translateItemList(items, writer); } - public void translateItemList(List editBatch, Writer writer) throws IOException { - for (ItemUpdate item : editBatch) { - translateItem(item, writer); + public void translateItemList(List updates, Writer writer) throws IOException { + QuickStatementsUpdateScheduler scheduler = new QuickStatementsUpdateScheduler(); + try { + List scheduled = scheduler.schedule(updates); + for (ItemUpdate item : scheduled) { + translateItem(item, writer); + } + } catch(ImpossibleSchedulingException e) { + writer.write(impossibleSchedulingErrorMessage); } + } protected void translateNameDescr(String qid, Set values, String prefix, ItemIdValue id, Writer writer) throws IOException { @@ -86,7 +98,7 @@ public class QuickStatementsExporter implements WriterExporter { if (item.isNew()) { writer.write("CREATE\n"); qid = "LAST"; - item.normalizeLabelsAndAliases(); + item = item.normalizeLabelsAndAliases(); } translateNameDescr(qid, item.getLabels(), "L", item.getItemId(), writer); diff --git a/extensions/wikidata/src/org/openrefine/wikidata/operations/PerformWikibaseEditsOperation.java b/extensions/wikidata/src/org/openrefine/wikidata/operations/PerformWikibaseEditsOperation.java index 3062604ec..5315759db 100644 --- a/extensions/wikidata/src/org/openrefine/wikidata/operations/PerformWikibaseEditsOperation.java +++ b/extensions/wikidata/src/org/openrefine/wikidata/operations/PerformWikibaseEditsOperation.java @@ -17,6 +17,9 @@ import org.json.JSONWriter; import org.openrefine.wikidata.editing.ConnectionManager; import org.openrefine.wikidata.editing.NewItemLibrary; import org.openrefine.wikidata.updates.ItemUpdate; +import org.openrefine.wikidata.updates.scheduler.ImpossibleSchedulingException; +import org.openrefine.wikidata.updates.scheduler.UpdateScheduler; +import 
org.openrefine.wikidata.updates.scheduler.WikibaseAPIUpdateScheduler; import org.openrefine.wikidata.schema.WikibaseSchema; import org.openrefine.wikidata.schema.entityvalues.ReconEntityIdValue; import org.slf4j.Logger; @@ -29,6 +32,7 @@ import org.wikidata.wdtk.datamodel.interfaces.ItemDocument; import org.wikidata.wdtk.datamodel.interfaces.MonolingualTextValue; import org.wikidata.wdtk.util.WebResourceFetcherImpl; import org.wikidata.wdtk.wikibaseapi.ApiConnection; +import org.wikidata.wdtk.wikibaseapi.TermStatementUpdate; import org.wikidata.wdtk.wikibaseapi.WikibaseDataEditor; import org.wikidata.wdtk.wikibaseapi.WikibaseDataFetcher; import org.wikidata.wdtk.wikibaseapi.apierrors.MediaWikiApiErrorException; @@ -215,8 +219,10 @@ public class PerformWikibaseEditsOperation extends EngineDependentOperation { // Evaluate the schema List itemDocuments = _schema.evaluate(_project, _engine); - // Group statements by item - Map updates = ItemUpdate.groupBySubject(itemDocuments); + // Schedule the edit batch + WikibaseAPIUpdateScheduler scheduler = new WikibaseAPIUpdateScheduler(); + List updates = null; + updates = scheduler.schedule(itemDocuments); /** * TODO: @@ -228,7 +234,7 @@ public class PerformWikibaseEditsOperation extends EngineDependentOperation { NewItemLibrary newItemLibrary = new NewItemLibrary(); DataObjectFactory factory = new DataObjectFactoryImpl(); List remainingItemUpdates = new ArrayList<>(); - remainingItemUpdates.addAll(updates.values()); + remainingItemUpdates.addAll(updates); int totalItemUpdates = updates.size(); int updatesDone = 0; int batchSize = 50; @@ -295,6 +301,20 @@ public class PerformWikibaseEditsOperation extends EngineDependentOperation { } else { // Existing item ItemDocument currentDocument = (ItemDocument)currentDocs.get(update.getItemId().getId()); + /* + TermStatementUpdate tsUpdate = new TermStatementUpdate( + currentDocument, + update.getAddedStatements().stream().collect(Collectors.toList()), + 
update.getDeletedStatements().stream().collect(Collectors.toList()), + update.getLabels().stream().collect(Collectors.toList()), + update.getDescriptions().stream().collect(Collectors.toList()), + update.getAliases().stream().collect(Collectors.toList()), + new ArrayList() + ); + ObjectMapper mapper = new ObjectMapper(); + logger.info(mapper.writeValueAsString(update)); + logger.info(update.toString()); + logger.info(tsUpdate.getJsonUpdateString()); */ wbde.updateTermsStatements(currentDocument, update.getLabels().stream().collect(Collectors.toList()), update.getDescriptions().stream().collect(Collectors.toList()), diff --git a/extensions/wikidata/src/org/openrefine/wikidata/qa/EditInspector.java b/extensions/wikidata/src/org/openrefine/wikidata/qa/EditInspector.java index c40acf16a..530b1da4a 100644 --- a/extensions/wikidata/src/org/openrefine/wikidata/qa/EditInspector.java +++ b/extensions/wikidata/src/org/openrefine/wikidata/qa/EditInspector.java @@ -18,11 +18,15 @@ import org.openrefine.wikidata.qa.scrutinizers.SingleValueScrutinizer; import org.openrefine.wikidata.qa.scrutinizers.UnsourcedScrutinizer; import org.openrefine.wikidata.qa.scrutinizers.WhitespaceScrutinizer; import org.openrefine.wikidata.updates.ItemUpdate; +import org.openrefine.wikidata.updates.scheduler.ImpossibleSchedulingException; +import org.openrefine.wikidata.updates.scheduler.UpdateScheduler; +import org.openrefine.wikidata.updates.scheduler.WikibaseAPIUpdateScheduler; import org.wikidata.wdtk.datamodel.interfaces.EntityIdValue; /** - * Runs a collection of edit scrutinizers on an edit batch - * @author antonin + * Runs a collection of edit scrutinizers on an edit batch. 
+ * + * @author Antonin Delpeuch * */ public class EditInspector { @@ -63,10 +67,19 @@ public class EditInspector { * @param editBatch */ public void inspect(List editBatch) { - Map updates = ItemUpdate.groupBySubject(editBatch); - List mergedUpdates = updates.values().stream().collect(Collectors.toList()); - for(EditScrutinizer scrutinizer : scrutinizers.values()) { - scrutinizer.scrutinize(mergedUpdates); + // First, schedule them with some scheduler, + // so that all newly created entities appear in the batch + UpdateScheduler scheduler = new WikibaseAPIUpdateScheduler(); + try { + editBatch = scheduler.schedule(editBatch); + Map updates = ItemUpdate.groupBySubject(editBatch); + List mergedUpdates = updates.values().stream().collect(Collectors.toList()); + for(EditScrutinizer scrutinizer : scrutinizers.values()) { + scrutinizer.scrutinize(mergedUpdates); + } + } catch(ImpossibleSchedulingException e) { + warningStore.addWarning(new QAWarning( + "scheduling-failed", null, QAWarning.Severity.CRITICAL, 1)); } if (warningStore.getNbWarnings() == 0) { diff --git a/extensions/wikidata/src/org/openrefine/wikidata/schema/entityvalues/ReconEntityIdValue.java b/extensions/wikidata/src/org/openrefine/wikidata/schema/entityvalues/ReconEntityIdValue.java index 9b7320f68..4975aa98a 100644 --- a/extensions/wikidata/src/org/openrefine/wikidata/schema/entityvalues/ReconEntityIdValue.java +++ b/extensions/wikidata/src/org/openrefine/wikidata/schema/entityvalues/ReconEntityIdValue.java @@ -8,6 +8,8 @@ import org.wikidata.wdtk.datamodel.helpers.Hash; import org.wikidata.wdtk.datamodel.interfaces.EntityIdValue; import org.wikidata.wdtk.datamodel.interfaces.ValueVisitor; +import com.fasterxml.jackson.annotation.JsonIgnore; + import com.google.refine.model.Recon; /** @@ -38,11 +40,13 @@ public abstract class ReconEntityIdValue implements PrefetchedEntityIdValue { Recon.Judgment.New.equals(_recon.judgment)); } - protected boolean isMatched() { + @JsonIgnore + public boolean isMatched() 
{ return Recon.Judgment.Matched.equals(_recon.judgment) && _recon.match != null; } - protected boolean isNew() { + @JsonIgnore + public boolean isNew() { return !isMatched(); } diff --git a/extensions/wikidata/src/org/openrefine/wikidata/updates/ItemUpdate.java b/extensions/wikidata/src/org/openrefine/wikidata/updates/ItemUpdate.java index d0722237c..19ecf8623 100644 --- a/extensions/wikidata/src/org/openrefine/wikidata/updates/ItemUpdate.java +++ b/extensions/wikidata/src/org/openrefine/wikidata/updates/ItemUpdate.java @@ -31,7 +31,7 @@ import com.fasterxml.jackson.annotation.JsonProperty; */ public class ItemUpdate { private final ItemIdValue qid; - private final Set addedStatements; + private final List addedStatements; private final Set deletedStatements; private final Set labels; private final Set descriptions; @@ -42,11 +42,24 @@ public class ItemUpdate { * * @param qid * the subject of the document. It can be a reconciled item value for new items. + * @param addedStatements + * the statements to add on the item. They should be distinct. They + * are modelled as a list because their insertion order matters. + * @param deletedStatements + * the statements to remove from the item + * @param labels + * the labels to add on the item + * @param descriptions + * the descriptions to add on the item + * @param aliases + * the aliases to add on the item. In theory their order should matter + * but in practice people rarely rely on the order of aliases so this + * is just kept as a set for simplicity. 
*/ @JsonCreator public ItemUpdate( @JsonProperty("subject") ItemIdValue qid, - @JsonProperty("addedStatements") Set addedStatements, + @JsonProperty("addedStatements") List addedStatements, @JsonProperty("deletedStatements") Set deletedStatements, @JsonProperty("labels") Set labels, @JsonProperty("descriptions") Set descriptions, @@ -54,7 +67,7 @@ public class ItemUpdate { Validate.notNull(qid); this.qid = qid; if(addedStatements == null) { - addedStatements = Collections.emptySet(); + addedStatements = Collections.emptyList(); } this.addedStatements = addedStatements; if(deletedStatements == null) { @@ -84,10 +97,13 @@ public class ItemUpdate { } /** - * @return the set of all added statements + * Added statements are recorded as a list because + * their order of insertion matters. + * + * @return the list of all added statements */ @JsonProperty("addedStatements") - public Set getAddedStatements() { + public List getAddedStatements() { return addedStatements; } @@ -124,11 +140,18 @@ public class ItemUpdate { } /** - * @return true when this change is empty - * (no statements or terms changed) + * @return true when this change is empty and its subject is not new */ @JsonIgnore public boolean isNull() { + return isEmpty() && !isNew(); + } + + /** + * @return true when this change leaves the content of the document untouched + */ + @JsonIgnore + public boolean isEmpty() { return (addedStatements.isEmpty() && deletedStatements.isEmpty() && labels.isEmpty() @@ -145,8 +168,12 @@ public class ItemUpdate { */ public ItemUpdate merge(ItemUpdate other) { Validate.isTrue(qid.equals(other.getItemId())); - Set newAddedStatements = new HashSet<>(addedStatements); - newAddedStatements.addAll(other.getAddedStatements()); + List newAddedStatements = new ArrayList<>(addedStatements); + for(Statement statement : other.getAddedStatements()) { + if (!newAddedStatements.contains(statement)) { + newAddedStatements.add(statement); + } + } Set newDeletedStatements = new 
HashSet<>(deletedStatements); newDeletedStatements.addAll(other.getDeletedStatements()); Set newLabels = new HashSet<>(labels); @@ -264,16 +291,29 @@ public class ItemUpdate { StringBuilder builder = new StringBuilder(); builder.append(""); return builder.toString(); } diff --git a/extensions/wikidata/src/org/openrefine/wikidata/updates/ItemUpdateBuilder.java b/extensions/wikidata/src/org/openrefine/wikidata/updates/ItemUpdateBuilder.java index 8c3bc27fd..925ff23bb 100644 --- a/extensions/wikidata/src/org/openrefine/wikidata/updates/ItemUpdateBuilder.java +++ b/extensions/wikidata/src/org/openrefine/wikidata/updates/ItemUpdateBuilder.java @@ -1,7 +1,9 @@ package org.openrefine.wikidata.updates; import java.util.Set; +import java.util.ArrayList; import java.util.HashSet; +import java.util.List; import org.jsoup.helper.Validate; import org.wikidata.wdtk.datamodel.interfaces.ItemIdValue; @@ -17,7 +19,7 @@ import org.wikidata.wdtk.datamodel.interfaces.Statement; */ public class ItemUpdateBuilder { private ItemIdValue qid; - private Set addedStatements; + private List addedStatements; private Set deletedStatements; private Set labels; private Set descriptions; @@ -33,7 +35,7 @@ public class ItemUpdateBuilder { public ItemUpdateBuilder(ItemIdValue qid) { Validate.notNull(qid); this.qid = qid; - this.addedStatements = new HashSet<>(); + this.addedStatements = new ArrayList<>(); this.deletedStatements = new HashSet(); this.labels = new HashSet(); this.descriptions = new HashSet(); @@ -103,6 +105,19 @@ public class ItemUpdateBuilder { labels.add(label); return this; } + + /** + * Adds a list of labels to the item. It will override any + * existing label in each language. + * + * @param labels + * the labels to add + */ + public ItemUpdateBuilder addLabels(Set labels) { + Validate.isTrue(!built, "ItemUpdate has already been built"); + this.labels.addAll(labels); + return this; + } /** * Adds a description to the item. 
It will override any existing @@ -116,6 +131,19 @@ public class ItemUpdateBuilder { descriptions.add(description); return this; } + + /** + * Adds a list of descriptions to the item. It will override any + * existing description in each language. + * + * @param descriptions + * the descriptions to add + */ + public ItemUpdateBuilder addDescriptions(Set descriptions) { + Validate.isTrue(!built, "ItemUpdate has already been built"); + this.descriptions.addAll(descriptions); + return this; + } /** * Adds an alias to the item. It will be added to any existing @@ -129,6 +157,19 @@ public class ItemUpdateBuilder { aliases.add(alias); return this; } + + /** + * Adds a list of aliases to the item. They will be added to any + * existing aliases in each language. + * + * @param aliases + * the aliases to add + */ + public ItemUpdateBuilder addAliases(Set aliases) { + Validate.isTrue(!built, "ItemUpdate has already been built"); + this.aliases.addAll(aliases); + return this; + } /** * Constructs the {@link ItemUpdate}. 
diff --git a/extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/ImpossibleSchedulingException.java b/extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/ImpossibleSchedulingException.java new file mode 100644 index 000000000..c50833d1b --- /dev/null +++ b/extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/ImpossibleSchedulingException.java @@ -0,0 +1,8 @@ +package org.openrefine.wikidata.updates.scheduler; + + +public class ImpossibleSchedulingException extends Exception { + + private static final long serialVersionUID = 6621563898380564148L; + +} diff --git a/extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/PointerExtractor.java b/extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/PointerExtractor.java new file mode 100644 index 000000000..546b93e9a --- /dev/null +++ b/extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/PointerExtractor.java @@ -0,0 +1,152 @@ +package org.openrefine.wikidata.updates.scheduler; + +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.openrefine.wikidata.schema.entityvalues.ReconEntityIdValue; +import org.openrefine.wikidata.schema.entityvalues.ReconItemIdValue; +import org.wikidata.wdtk.datamodel.interfaces.DatatypeIdValue; +import org.wikidata.wdtk.datamodel.interfaces.EntityIdValue; +import org.wikidata.wdtk.datamodel.interfaces.GlobeCoordinatesValue; +import org.wikidata.wdtk.datamodel.interfaces.MonolingualTextValue; +import org.wikidata.wdtk.datamodel.interfaces.QuantityValue; +import org.wikidata.wdtk.datamodel.interfaces.Snak; +import org.wikidata.wdtk.datamodel.interfaces.SnakGroup; +import org.wikidata.wdtk.datamodel.interfaces.Statement; +import org.wikidata.wdtk.datamodel.interfaces.StringValue; +import org.wikidata.wdtk.datamodel.interfaces.TimeValue; +import org.wikidata.wdtk.datamodel.interfaces.Value; +import org.wikidata.wdtk.datamodel.interfaces.ValueVisitor; + 
+/** + * A class that extracts the new entity ids referred to + * in a statement. + * + * @author Antonin Delpeuch + * + */ +public class PointerExtractor implements ValueVisitor<Set<ReconItemIdValue>> { + + /** + * Extracts all the new entities mentioned by this statement. This + * does not include the subject of the statement. + * + * @param statement + * the statement to inspect + * @return + * the set of all new entities mentioned by the statement + */ + public Set extractPointers(Statement statement) { + Set result = new HashSet<>(); + result.addAll(extractPointers(statement.getClaim().getMainSnak())); + result.addAll(extractPointers(statement.getClaim().getQualifiers())); + statement.getReferences().stream() + .map(l -> extractPointers(l.getSnakGroups())) + .forEach(s -> result.addAll(s)); + return result; + } + + /** + * Extracts all the new entities mentioned by this list of snak groups. + * + * @param snakGroups + * @return + */ + public Set extractPointers(List snakGroups) { + Set result = new HashSet<>(); + snakGroups.stream() + .map(s -> extractPointers(s)) + .forEach(s -> result.addAll(s)); + return result; + } + + /** + * Extracts all the new entities mentioned by this snak group. + * + * @param snakGroup + * @return + */ + public Set extractPointers(SnakGroup snakGroup) { + Set result = new HashSet<>(); + snakGroup.getSnaks().stream() + .map(s -> extractPointers(s)) + .forEach(s -> result.addAll(s)); + return result; + } + + /** + * Extracts all new entities mentioned by this snak. + * Currently there will be at most one: the target of the snak + * (as property ids cannot be new for now). + * + * @param snak + * @return + */ + public Set extractPointers(Snak snak) { + Set result = new HashSet<>(); + result.addAll(extractPointers(snak.getPropertyId())); + result.addAll(extractPointers(snak.getValue())); + return result; + } + + /** + * Extracts any new entity from the value. 
+ * + * @param value + * @return + */ + public Set extractPointers(Value value) { + if (value == null) { + return Collections.emptySet(); + } + Set pointers = value.accept(this); + if (pointers == null) { + return Collections.emptySet(); + } + return pointers; + } + + @Override + public Set visit(DatatypeIdValue value) { + return null; + } + + @Override + public Set visit(EntityIdValue value) { + if(ReconItemIdValue.class.isInstance(value)) { + ReconItemIdValue recon = (ReconItemIdValue)value; + if(recon.isNew()) { + return Collections.singleton(recon); + } + } + return null; + } + + @Override + public Set visit(GlobeCoordinatesValue value) { + return null; + } + + @Override + public Set visit(MonolingualTextValue value) { + return null; + } + + @Override + public Set visit(QuantityValue value) { + // units cannot be new because WDTK represents them as strings already + return null; + } + + @Override + public Set visit(StringValue value) { + return null; + } + + @Override + public Set visit(TimeValue value) { + return null; + } +} diff --git a/extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/QuickStatementsUpdateScheduler.java b/extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/QuickStatementsUpdateScheduler.java new file mode 100644 index 000000000..86cd049db --- /dev/null +++ b/extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/QuickStatementsUpdateScheduler.java @@ -0,0 +1,118 @@ +package org.openrefine.wikidata.updates.scheduler; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + +import org.openrefine.wikidata.schema.entityvalues.ReconEntityIdValue; +import org.openrefine.wikidata.schema.entityvalues.ReconItemIdValue; +import org.openrefine.wikidata.updates.ItemUpdate; +import org.openrefine.wikidata.updates.ItemUpdateBuilder; +import 
org.wikidata.wdtk.datamodel.interfaces.ItemIdValue; +import org.wikidata.wdtk.datamodel.interfaces.Statement; + + +public class QuickStatementsUpdateScheduler implements UpdateScheduler { + + private PointerExtractor extractor = new PointerExtractor(); + + /** + * This map holds for each new entity id value a list of updates + * that refer to this id (and should hence be scheduled right after + * creation of that entity). + */ + private Map pointerUpdates; + + /** + * This contains all updates which do not refer to any new entity + * apart from possibly the subject, in the order that they were supplied to us. + */ + private UpdateSequence pointerFreeUpdates; + + /** + * Separates out the statements which refer to new items from the rest + * of the update. The resulting updates are stored in {@link #pointerUpdates} + * and {@link #pointerFreeUpdates}. + * + * @param update + * @throws ImpossibleSchedulingException + * if two new item ids are referred to in the same statement, or if a statement on a new item refers to a new item + */ + protected void splitUpdate(ItemUpdate update) throws ImpossibleSchedulingException { + ItemUpdateBuilder remainingUpdateBuilder = new ItemUpdateBuilder(update.getItemId()) + .addLabels(update.getLabels()) + .addDescriptions(update.getDescriptions()) + .addAliases(update.getAliases()) + .deleteStatements(update.getDeletedStatements()); + Map referencingUpdates = new HashMap<>(); + + for(Statement statement : update.getAddedStatements()) { + Set pointers = extractor.extractPointers(statement); + if (pointers.isEmpty()) { + remainingUpdateBuilder.addStatement(statement); + } else if (pointers.size() == 1 && !update.isNew()) { + ItemIdValue pointer = pointers.stream().findFirst().get(); + ItemUpdateBuilder referencingBuilder = referencingUpdates.get(pointer); + if (referencingBuilder == null) { + referencingBuilder = new ItemUpdateBuilder(update.getItemId()); + } + referencingBuilder.addStatement(statement); + referencingUpdates.put(pointer, referencingBuilder); + } else { + throw new 
ImpossibleSchedulingException(); + } + } + + // Add the update that is not referring to anything to the schedule + ItemUpdate pointerFree = remainingUpdateBuilder.build(); + if (!pointerFree.isNull()) { + pointerFreeUpdates.add(pointerFree); + } + // Add the other updates to the map + for(Entry entry : referencingUpdates.entrySet()) { + ItemUpdate pointerUpdate = entry.getValue().build(); + UpdateSequence pointerUpdatesForKey = pointerUpdates.get(entry.getKey()); + if (pointerUpdatesForKey == null) { + pointerUpdatesForKey = new UpdateSequence(); + } + pointerUpdatesForKey.add(pointerUpdate); + pointerUpdates.put(entry.getKey(), pointerUpdatesForKey); + } + } + + @Override + public List schedule(List updates) throws ImpossibleSchedulingException { + pointerUpdates = new HashMap<>(); + pointerFreeUpdates = new UpdateSequence(); + + for(ItemUpdate update : updates) { + splitUpdate(update); + } + + // Reconstruct + List fullSchedule = new ArrayList<>(); + Set mentionedNewEntities = new HashSet<>(pointerUpdates.keySet()); + for(ItemUpdate update : pointerFreeUpdates.getUpdates()) { + fullSchedule.add(update); + UpdateSequence backPointers = pointerUpdates.get(update.getItemId()); + if (backPointers != null) { + fullSchedule.addAll(backPointers.getUpdates()); + } + mentionedNewEntities.remove(update.getItemId()); + } + + // Create any item that was referred to but untouched + // (this is just for the sake of correctness, it would be bad to do that + // as the items would remain blank in this batch). 
+ for(ItemIdValue missingId : mentionedNewEntities) { + fullSchedule.add(new ItemUpdateBuilder(missingId).build()); + fullSchedule.addAll(pointerUpdates.get(missingId).getUpdates()); + } + return fullSchedule; + } + +} diff --git a/extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/UpdateScheduler.java b/extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/UpdateScheduler.java new file mode 100644 index 000000000..dfd2c8436 --- /dev/null +++ b/extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/UpdateScheduler.java @@ -0,0 +1,32 @@ +package org.openrefine.wikidata.updates.scheduler; + +import java.util.List; + +import org.openrefine.wikidata.updates.ItemUpdate; + +/** + * A scheduling strategy for item updates. + * Given a list of initial updates, the scheduler + * reorganizes these updates (possibly splitting them + * or merging them) to create a sequence that is suitable + * for a particular import process. + * + * @author Antonin Delpeuch + * + */ +public interface UpdateScheduler { + + /** + * Performs the scheduling. The initial updates are provided + * as a list so that the scheduler can attempt to respect the + * initial order (but no guarantee is made for that in general). + * + * @param updates + * the updates to schedule + * @return + * the reorganized updates + * @throws ImpossibleSchedulingException + * when the scheduler cannot cope with a particular edit plan. 
+ */ + public List schedule(List updates) throws ImpossibleSchedulingException; +} diff --git a/extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/UpdateSequence.java b/extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/UpdateSequence.java new file mode 100644 index 000000000..37cbecc3f --- /dev/null +++ b/extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/UpdateSequence.java @@ -0,0 +1,59 @@ +package org.openrefine.wikidata.updates.scheduler; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.openrefine.wikidata.updates.ItemUpdate; +import org.wikidata.wdtk.datamodel.interfaces.ItemIdValue; + +/** + * Helper class to store a list of updates where each subject + * appears at most once. It preserves order of insertion. + * + * @author Antonin Delpeuch + */ +public class UpdateSequence { + /** + * The list of updates stored by this container + */ + private List updates = new ArrayList<>(); + /** + * An index to keep track of where each item is touched in the sequence + */ + private Map index = new HashMap<>(); + + /** + * Adds a new update to the list, merging it with any existing + * one with the same subject. 
+ * + * @param update + */ + public void add(ItemUpdate update) { + ItemIdValue subject = update.getItemId(); + if(index.containsKey(subject)) { + int i = index.get(subject); + ItemUpdate oldUpdate = updates.get(i); + updates.set(i, oldUpdate.merge(update)); + } else { + index.put(subject, updates.size()); + updates.add(update); + } + } + + /** + * @return the list of merged updates + */ + public List getUpdates() { + return updates; + } + + /** + * @return the set of touched subjects + */ + public Set getSubjects() { + return index.keySet(); + } +} \ No newline at end of file diff --git a/extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/WikibaseAPIUpdateScheduler.java b/extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/WikibaseAPIUpdateScheduler.java new file mode 100644 index 000000000..2833cb904 --- /dev/null +++ b/extensions/wikidata/src/org/openrefine/wikidata/updates/scheduler/WikibaseAPIUpdateScheduler.java @@ -0,0 +1,115 @@ +package org.openrefine.wikidata.updates.scheduler; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import org.openrefine.wikidata.schema.entityvalues.ReconItemIdValue; +import org.openrefine.wikidata.updates.ItemUpdate; +import org.openrefine.wikidata.updates.ItemUpdateBuilder; +import org.wikidata.wdtk.datamodel.interfaces.ItemIdValue; +import org.wikidata.wdtk.datamodel.interfaces.Statement; + +/** + * A simple scheduler for batches committed via the Wikibase API. + * + * The strategy is quite simple and makes at most two edits + * per touched item (which is not minimal though). Each update + * is split between statements making references to new items, + * and statements not making these references. All updates with no + * references to new items are done first (which creates all new + * items), then all other updates are done. 
+ * + * @author Antonin Delpeuch + * + */ +public class WikibaseAPIUpdateScheduler implements UpdateScheduler { + + /** + * The first part of updates: the ones which create new items + * without referring to any other new item. + */ + private UpdateSequence pointerFreeUpdates; + /** + * The second part of the updates: all existing items, plus + * all parts of new items that refer to other new items. + */ + private UpdateSequence pointerFullUpdates; + /** + * The set of all new items referred to in the whole batch. + */ + private Set allPointers; + + private PointerExtractor extractor = new PointerExtractor(); + + @Override + public List schedule(List updates) { + List result = new ArrayList<>(); + pointerFreeUpdates = new UpdateSequence(); + pointerFullUpdates = new UpdateSequence(); + allPointers = new HashSet<>(); + + for(ItemUpdate update : updates) { + splitUpdate(update); + } + + // Part 1: add all the pointer free updates + result.addAll(pointerFreeUpdates.getUpdates()); + + // Part 1': add the remaining new items that have not been touched + Set unseenPointers = new HashSet<>(allPointers); + unseenPointers.removeAll(pointerFreeUpdates.getSubjects()); + + result.addAll(unseenPointers.stream() + .map(e -> new ItemUpdateBuilder(e).build()) + .collect(Collectors.toList())); + + // Part 2: add all the pointer full updates + result.addAll(pointerFullUpdates.getUpdates()); + + return result; + } + + /** + * Splits an update into two parts + * @param update + */ + protected void splitUpdate(ItemUpdate update) { + ItemUpdateBuilder pointerFreeBuilder = new ItemUpdateBuilder(update.getItemId()) + .addLabels(update.getLabels()) + .addDescriptions(update.getDescriptions()) + .addAliases(update.getAliases()) + .deleteStatements(update.getDeletedStatements()); + ItemUpdateBuilder pointerFullBuilder = new ItemUpdateBuilder(update.getItemId()); + + for(Statement statement : update.getAddedStatements()) { + Set pointers = extractor.extractPointers(statement); + if 
(pointers.isEmpty()) { + pointerFreeBuilder.addStatement(statement); + } else { + pointerFullBuilder.addStatement(statement); + } + allPointers.addAll(pointers); + } + + if(update.isNew()) { + // If the update is new, we might need to split it + // in two (if it refers to any other new entity). + ItemUpdate pointerFree = pointerFreeBuilder.build(); + if (!pointerFree.isNull()) { + pointerFreeUpdates.add(pointerFree); + } + ItemUpdate pointerFull = pointerFullBuilder.build(); + if (!pointerFull.isEmpty()) { + pointerFullUpdates.add(pointerFull); + } + } else { + // Otherwise, we just make sure this edit is done after + // all item creations. + pointerFullUpdates.add(update); + } + } + +} diff --git a/extensions/wikidata/tests/src/org/openrefine/wikidata/exporters/QSValuePrinterTest.java b/extensions/wikidata/tests/src/org/openrefine/wikidata/exporters/QSValuePrinterTest.java index 72b79ac1f..8382364f7 100644 --- a/extensions/wikidata/tests/src/org/openrefine/wikidata/exporters/QSValuePrinterTest.java +++ b/extensions/wikidata/tests/src/org/openrefine/wikidata/exporters/QSValuePrinterTest.java @@ -40,15 +40,11 @@ public class QSValuePrinterTest { @Test public void printNewItemId() { ReconEntityIdValue id = TestingDataGenerator.makeNewItemIdValue(12345L, "my new item"); - assertNull(id.accept(printer)); + assertEquals("LAST", id.accept(printer)); // because no entity was previously created - - QSValuePrinter printerAfterCreate = new QSValuePrinter(id); - ReconEntityIdValue equalId = TestingDataGenerator.makeNewItemIdValue(12345L, "my other new item"); - assertEquals("LAST", printerAfterCreate.visit(equalId)); - ReconEntityIdValue differentId = TestingDataGenerator.makeNewItemIdValue(34567L, "my new item"); - assertNull(printerAfterCreate.visit(differentId)); + ReconEntityIdValue differentId = TestingDataGenerator.makeMatchedItemIdValue("Q78", "my existing item"); + assertEquals("Q78", differentId.accept(printer)); } // Globe coordinates diff --git 
a/extensions/wikidata/tests/src/org/openrefine/wikidata/schema/WikibaseSchemaTest.java b/extensions/wikidata/tests/src/org/openrefine/wikidata/schema/WikibaseSchemaTest.java index 7628f955f..b0baa1478 100644 --- a/extensions/wikidata/tests/src/org/openrefine/wikidata/schema/WikibaseSchemaTest.java +++ b/extensions/wikidata/tests/src/org/openrefine/wikidata/schema/WikibaseSchemaTest.java @@ -18,6 +18,7 @@ import org.json.JSONWriter; import org.openrefine.wikidata.testing.TestingDataGenerator; import org.openrefine.wikidata.updates.ItemUpdate; import org.openrefine.wikidata.updates.ItemUpdateBuilder; +import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; import org.wikidata.wdtk.datamodel.helpers.Datamodel; import org.wikidata.wdtk.datamodel.interfaces.Claim; @@ -63,12 +64,24 @@ public class WikibaseSchemaTest extends RefineTest { Collections.singletonList(Datamodel.makeReference(Collections.singletonList(retrievedSnakGroup))), StatementRank.NORMAL, ""); + private Project project; + static JSONObject jsonFromFile(String filename) throws IOException, JSONException { byte[] contents = Files.readAllBytes(Paths.get(filename)); String decoded = new String(contents, "utf-8"); return ParsingUtilities.evaluateJsonStringToObject(decoded); } + @BeforeMethod + public void setUpProject() { + project = this.createCSVProject( + "subject,inception,reference\n"+ + "Q1377,1919,http://www.ljubljana-slovenia.com/university-ljubljana\n"+ + "Q865528,1965,"); + project.rows.get(0).cells.set(0, TestingDataGenerator.makeMatchedCell("Q1377", "University of Ljubljana")); + project.rows.get(1).cells.set(0, TestingDataGenerator.makeMatchedCell("Q865528", "University of Warwick")); + } + @Test public void testSerialize() throws JSONException, IOException { JSONObject serialized = jsonFromFile("data/schema/history_of_medicine.json"); @@ -94,12 +107,6 @@ public class WikibaseSchemaTest extends RefineTest { public void testEvaluate() throws JSONException, IOException { 
JSONObject serialized = jsonFromFile("data/schema/inception.json"); WikibaseSchema schema = WikibaseSchema.reconstruct(serialized); - Project project = this.createCSVProject( - "subject,inception,reference\n"+ - "Q1377,1919,http://www.ljubljana-slovenia.com/university-ljubljana\n"+ - "Q865528,1965,"); - project.rows.get(0).cells.set(0, TestingDataGenerator.makeMatchedCell("Q1377", "University of Ljubljana")); - project.rows.get(1).cells.set(0, TestingDataGenerator.makeMatchedCell("Q865528", "University of Warwick")); Engine engine = new Engine(project); List updates = schema.evaluate(project, engine); List expected = new ArrayList<>(); @@ -109,4 +116,31 @@ public class WikibaseSchemaTest extends RefineTest { expected.add(update2); assertEquals(expected, updates); } + + @Test + public void testEvaluateRespectsFacets() throws JSONException, IOException { + JSONObject serialized = jsonFromFile("data/schema/inception.json"); + WikibaseSchema schema = WikibaseSchema.reconstruct(serialized); + Engine engine = new Engine(project); + JSONObject engineConfig = new JSONObject("{\n" + + " \"mode\": \"row-based\",\n" + + " \"facets\": [\n" + + " {\n" + + " \"mode\": \"text\",\n" + + " \"invert\": false,\n" + + " \"caseSensitive\": false,\n" + + " \"query\": \"www\",\n" + + " \"name\": \"reference\",\n" + + " \"type\": \"text\",\n" + + " \"columnName\": \"reference\"\n" + + " }\n" + + " ]\n" + + " }"); + engine.initializeFromJSON(engineConfig); + List updates = schema.evaluate(project, engine); + List expected = new ArrayList<>(); + ItemUpdate update1 = new ItemUpdateBuilder(qid1).addStatement(statement1).build(); + expected.add(update1); + assertEquals(expected, updates); + } } diff --git a/extensions/wikidata/tests/src/org/openrefine/wikidata/updates/ItemUpdateTest.java b/extensions/wikidata/tests/src/org/openrefine/wikidata/updates/ItemUpdateTest.java index 99b82cda0..ba8c4fbc2 100644 --- a/extensions/wikidata/tests/src/org/openrefine/wikidata/updates/ItemUpdateTest.java +++ 
b/extensions/wikidata/tests/src/org/openrefine/wikidata/updates/ItemUpdateTest.java @@ -62,6 +62,16 @@ public class ItemUpdateTest { public void testIsNull() { ItemUpdate update = new ItemUpdateBuilder(existingSubject).build(); assertTrue(update.isNull()); + ItemUpdate update2 = new ItemUpdateBuilder(newSubject).build(); + assertFalse(update2.isNull()); + } + + @Test + public void testIsEmpty() { + ItemUpdate update = new ItemUpdateBuilder(existingSubject).build(); + assertTrue(update.isEmpty()); + ItemUpdate update2 = new ItemUpdateBuilder(newSubject).build(); + assertTrue(update2.isEmpty()); } @Test @@ -78,8 +88,8 @@ public class ItemUpdateTest { .addStatement(statement1) .addStatement(statement2) .build(); - assertEquals(Arrays.asList(statement1, statement2).stream().collect(Collectors.toSet()), - update.getAddedStatements()); + assertFalse(update.isNull()); + assertEquals(Arrays.asList(statement1, statement2), update.getAddedStatements()); assertEquals(statementGroups, update.getAddedStatementGroups().stream().collect(Collectors.toSet())); } @@ -130,6 +140,7 @@ public class ItemUpdateTest { .addAlias(aliasEn) .addAlias(aliasFr) .build(); + assertFalse(updateA.isNull()); ItemUpdate normalized = updateA.normalizeLabelsAndAliases(); ItemUpdate expectedUpdate = new ItemUpdateBuilder(newSubject) .addLabel(label) diff --git a/extensions/wikidata/tests/src/org/openrefine/wikidata/updates/scheduler/PointerExtractorTest.java b/extensions/wikidata/tests/src/org/openrefine/wikidata/updates/scheduler/PointerExtractorTest.java new file mode 100644 index 000000000..441c70d7c --- /dev/null +++ b/extensions/wikidata/tests/src/org/openrefine/wikidata/updates/scheduler/PointerExtractorTest.java @@ -0,0 +1,95 @@ +package org.openrefine.wikidata.updates.scheduler; + +import static org.junit.Assert.assertEquals; + +import java.math.BigDecimal; +import java.util.Collections; +import java.util.Set; + +import org.openrefine.wikidata.schema.entityvalues.ReconItemIdValue; +import 
org.openrefine.wikidata.testing.TestingDataGenerator; +import org.testng.annotations.Test; +import org.wikidata.wdtk.datamodel.helpers.Datamodel; +import org.wikidata.wdtk.datamodel.interfaces.Claim; +import org.wikidata.wdtk.datamodel.interfaces.GlobeCoordinatesValue; +import org.wikidata.wdtk.datamodel.interfaces.ItemIdValue; +import org.wikidata.wdtk.datamodel.interfaces.PropertyIdValue; +import org.wikidata.wdtk.datamodel.interfaces.Reference; +import org.wikidata.wdtk.datamodel.interfaces.Snak; +import org.wikidata.wdtk.datamodel.interfaces.SnakGroup; +import org.wikidata.wdtk.datamodel.interfaces.StatementRank; +import org.wikidata.wdtk.datamodel.interfaces.TimeValue; +import org.wikidata.wdtk.datamodel.interfaces.Value; + +public class PointerExtractorTest { + + private ItemIdValue existingId = Datamodel.makeWikidataItemIdValue("Q43"); + private ItemIdValue matchedId = TestingDataGenerator.makeMatchedItemIdValue("Q89","eist"); + private ItemIdValue newIdA = TestingDataGenerator.makeNewItemIdValue(1234L, "new item A"); + private ItemIdValue newIdB = TestingDataGenerator.makeNewItemIdValue(4567L, "new item B"); + + private PropertyIdValue pid = Datamodel.makeWikidataPropertyIdValue("P89"); + private Snak snakWithNew = Datamodel.makeValueSnak(pid, newIdA); + private Snak snakWithoutNew = Datamodel.makeValueSnak(pid, matchedId); + private SnakGroup snakGroupWithNew = Datamodel.makeSnakGroup(Collections.singletonList(snakWithNew)); + private SnakGroup snakGroupWithoutNew = Datamodel.makeSnakGroup(Collections.singletonList(snakWithoutNew)); + private Claim claimWithNew = Datamodel.makeClaim(existingId, snakWithNew, Collections.emptyList()); + private Claim claimNewSubject = Datamodel.makeClaim(newIdB, snakWithoutNew, Collections.emptyList()); + private Claim claimNewQualifier = Datamodel.makeClaim(matchedId, snakWithoutNew, + Collections.singletonList(snakGroupWithNew)); + + private static PointerExtractor e = new PointerExtractor(); + + @Test + public void 
testExtractEntityId() { + assertEquals(Collections.singleton(newIdA), e.extractPointers(newIdA)); + assertEmpty(e.extractPointers(existingId)); + assertEmpty(e.extractPointers(matchedId)); + } + + @Test + public void testExtractDatavalues() { + assertEmpty(Datamodel.makeDatatypeIdValue("string")); + assertEmpty(Datamodel.makeGlobeCoordinatesValue(1.34, 2.354, 0.1, GlobeCoordinatesValue.GLOBE_EARTH)); + assertEmpty(Datamodel.makeStringValue("est")); + assertEmpty(Datamodel.makeMonolingualTextValue("srtu", "en")); + assertEmpty(Datamodel.makeWikidataPropertyIdValue("P78")); + assertEmpty(Datamodel.makeQuantityValue(new BigDecimal("898"))); + assertEmpty(Datamodel.makeQuantityValue(new BigDecimal("7.87"), "http://www.wikidata.org/entity/Q34")); + assertEmpty(Datamodel.makeTimeValue(1898, (byte)2, (byte)3, TimeValue.CM_GREGORIAN_PRO)); + } + + @Test + public void testSnak() { + assertEmpty(e.extractPointers(snakWithoutNew)); + assertEquals(Collections.singleton(newIdA), e.extractPointers(snakWithNew)); + assertEmpty(e.extractPointers(Datamodel.makeNoValueSnak(pid))); + } + + @Test + public void testSnakGroup() { + assertEmpty(e.extractPointers(snakGroupWithoutNew)); + assertEquals(Collections.singleton(newIdA), e.extractPointers(snakGroupWithNew)); + } + + @Test + public void testStatement() { + assertEmpty(e.extractPointers(Datamodel.makeStatement(claimNewSubject, + Collections.emptyList(), StatementRank.NORMAL, ""))); + assertEquals(Collections.singleton(newIdA), e.extractPointers(Datamodel.makeStatement(claimWithNew, + Collections.emptyList(), StatementRank.NORMAL, ""))); + assertEquals(Collections.singleton(newIdA), e.extractPointers(Datamodel.makeStatement(claimNewQualifier, + Collections.emptyList(), StatementRank.NORMAL, ""))); + Reference reference = Datamodel.makeReference(Collections.singletonList(snakGroupWithNew)); + assertEquals(Collections.singleton(newIdA), e.extractPointers(Datamodel.makeStatement(claimNewSubject, + Collections.singletonList(reference), 
StatementRank.NORMAL, ""))); + } + + private static void assertEmpty(Value v) { + assertEmpty(e.extractPointers(v)); + } + + private static void assertEmpty(Set pointers) { + assertEquals(Collections.emptySet(), pointers); + } +} diff --git a/extensions/wikidata/tests/src/org/openrefine/wikidata/updates/scheduler/QuickStatementsUpdateSchedulerTest.java b/extensions/wikidata/tests/src/org/openrefine/wikidata/updates/scheduler/QuickStatementsUpdateSchedulerTest.java new file mode 100644 index 000000000..c46eae437 --- /dev/null +++ b/extensions/wikidata/tests/src/org/openrefine/wikidata/updates/scheduler/QuickStatementsUpdateSchedulerTest.java @@ -0,0 +1,51 @@ +package org.openrefine.wikidata.updates.scheduler; + +import static org.junit.Assert.assertEquals; + +import java.util.Arrays; +import java.util.List; + +import org.openrefine.wikidata.updates.ItemUpdate; +import org.openrefine.wikidata.updates.ItemUpdateBuilder; +import org.testng.annotations.Test; + + +public class QuickStatementsUpdateSchedulerTest extends UpdateSchedulerTest { + + @Test + public void testNoNewItem() throws ImpossibleSchedulingException { + ItemUpdate updateA = new ItemUpdateBuilder(existingIdA).addStatement(sAtoB).build(); + ItemUpdate updateB = new ItemUpdateBuilder(existingIdB).addStatement(sBtoA).build(); + List scheduled = schedule(updateA, updateB); + assertEquals(Arrays.asList(updateA,updateB), scheduled); + } + + @Test + public void testSplitUpdate() throws ImpossibleSchedulingException { + ItemUpdate updateA = new ItemUpdateBuilder(existingIdA) + .addStatement(sAtoNewA) + .addStatement(sAtoNewB) + .build(); + ItemUpdate newUpdateA = new ItemUpdateBuilder(newIdA).build(); + ItemUpdate newUpdateB = new ItemUpdateBuilder(newIdB).build(); + ItemUpdate splitUpdateA = new ItemUpdateBuilder(existingIdA) + .addStatement(sAtoNewA) + .build(); + ItemUpdate splitUpdateB = new ItemUpdateBuilder(existingIdA) + .addStatement(sAtoNewB) + .build(); + List scheduled = schedule(updateA); + 
assertSetEquals(Arrays.asList(newUpdateA, splitUpdateA, newUpdateB, splitUpdateB), scheduled); + } + + @Test(expectedExceptions=ImpossibleSchedulingException.class) + public void testImpossibleForQS() throws ImpossibleSchedulingException { + ItemUpdate update = new ItemUpdateBuilder(newIdA).addStatement(sNewAtoNewB).build(); + schedule(update); + } + + @Override + public UpdateScheduler getScheduler() { + return new QuickStatementsUpdateScheduler(); + } +} diff --git a/extensions/wikidata/tests/src/org/openrefine/wikidata/updates/scheduler/UpdateSchedulerTest.java b/extensions/wikidata/tests/src/org/openrefine/wikidata/updates/scheduler/UpdateSchedulerTest.java new file mode 100644 index 000000000..3980d9cac --- /dev/null +++ b/extensions/wikidata/tests/src/org/openrefine/wikidata/updates/scheduler/UpdateSchedulerTest.java @@ -0,0 +1,94 @@ +package org.openrefine.wikidata.updates.scheduler; + +import static org.junit.Assert.assertEquals; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; + +import org.openrefine.wikidata.testing.TestingDataGenerator; +import org.openrefine.wikidata.updates.ItemUpdate; +import org.openrefine.wikidata.updates.ItemUpdateBuilder; +import org.testng.annotations.Test; +import org.wikidata.wdtk.datamodel.helpers.Datamodel; +import org.wikidata.wdtk.datamodel.interfaces.Claim; +import org.wikidata.wdtk.datamodel.interfaces.ItemIdValue; +import org.wikidata.wdtk.datamodel.interfaces.PropertyIdValue; +import org.wikidata.wdtk.datamodel.interfaces.Statement; +import org.wikidata.wdtk.datamodel.interfaces.StatementRank; + +public abstract class UpdateSchedulerTest { + + protected ItemIdValue existingIdA = Datamodel.makeWikidataItemIdValue("Q43"); + protected ItemIdValue existingIdB = Datamodel.makeWikidataItemIdValue("Q538"); + protected ItemIdValue newIdA = TestingDataGenerator.makeNewItemIdValue(1234L, "new item A"); + protected ItemIdValue newIdB = 
TestingDataGenerator.makeNewItemIdValue(5678L, "new item B"); + + protected static PropertyIdValue pid = Datamodel.makeWikidataPropertyIdValue("P38"); + + protected Statement sAtoB = generateStatement(existingIdA, existingIdB); + protected Statement sBtoA = generateStatement(existingIdB, existingIdA); + protected Statement sAtoNewA = generateStatement(existingIdA, newIdA); + protected Statement sAtoNewB = generateStatement(existingIdA, newIdB); + protected Statement sNewAtoB = generateStatement(newIdA, existingIdB); + protected Statement sNewAtoNewB = generateStatement(newIdA, newIdB); + + public static Statement generateStatement(ItemIdValue from, ItemIdValue to) { + Claim claim = Datamodel.makeClaim(from, Datamodel.makeValueSnak(pid, to), Collections.emptyList()); + return Datamodel.makeStatement(claim, Collections.emptyList(), StatementRank.NORMAL, ""); + } + + public abstract UpdateScheduler getScheduler(); + + protected List schedule(ItemUpdate... itemUpdates) throws ImpossibleSchedulingException { + return getScheduler().schedule(Arrays.asList(itemUpdates)); + } + + protected static void assertSetEquals(List expected, List actual) { + assertEquals(expected.stream().collect(Collectors.toSet()), + actual.stream().collect(Collectors.toSet())); + } + + @Test + public void testNewItemNotMentioned() throws ImpossibleSchedulingException { + ItemUpdate updateA = new ItemUpdateBuilder(existingIdA).addStatement(sAtoNewA).build(); + List scheduled = schedule(updateA); + ItemUpdate newUpdate = new ItemUpdateBuilder(newIdA).build(); + assertEquals(Arrays.asList(newUpdate, updateA), scheduled); + } + + @Test + public void testNewItemMentioned() throws ImpossibleSchedulingException { + ItemUpdate updateA = new ItemUpdateBuilder(existingIdA).addStatement(sAtoNewA).build(); + ItemUpdate newUpdate = new ItemUpdateBuilder(newIdA).addStatement(sNewAtoB).build(); + List scheduled = schedule(updateA, newUpdate); + assertEquals(Arrays.asList(newUpdate, updateA), scheduled); + } + + 
@Test + public void testMerge() throws ImpossibleSchedulingException { + ItemUpdate update1 = new ItemUpdateBuilder(existingIdA) + .addStatement(sAtoB) + .build(); + ItemUpdate update2 = new ItemUpdateBuilder(existingIdA) + .addLabel(Datamodel.makeMonolingualTextValue("hello", "fr")) + .addStatement(sAtoB) + .build(); + ItemUpdate merged = update1.merge(update2); + assertEquals(Collections.singletonList(merged), schedule(update1, update2)); + } + + @Test + public void testMergeNew() throws ImpossibleSchedulingException { + ItemUpdate update1 = new ItemUpdateBuilder(newIdA) + .addLabel(Datamodel.makeMonolingualTextValue("hello", "fr")) + .addStatement(sNewAtoB) + .build(); + ItemUpdate update2 = new ItemUpdateBuilder(newIdA) + .addLabel(Datamodel.makeMonolingualTextValue("hello", "fr")) + .build(); + ItemUpdate merged = update1.merge(update2); + assertEquals(Collections.singletonList(merged), schedule(update1, update2)); + } +} diff --git a/extensions/wikidata/tests/src/org/openrefine/wikidata/updates/scheduler/WikibaseAPIUpdateSchedulerTest.java b/extensions/wikidata/tests/src/org/openrefine/wikidata/updates/scheduler/WikibaseAPIUpdateSchedulerTest.java new file mode 100644 index 000000000..27505e664 --- /dev/null +++ b/extensions/wikidata/tests/src/org/openrefine/wikidata/updates/scheduler/WikibaseAPIUpdateSchedulerTest.java @@ -0,0 +1,58 @@ +package org.openrefine.wikidata.updates.scheduler; + +import static org.junit.Assert.assertEquals; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import org.openrefine.wikidata.updates.ItemUpdate; +import org.openrefine.wikidata.updates.ItemUpdateBuilder; +import org.testng.annotations.Test; +import org.wikidata.wdtk.datamodel.helpers.Datamodel; + + +public class WikibaseAPIUpdateSchedulerTest extends UpdateSchedulerTest { + + @Test + public void testOrderPreserved() throws ImpossibleSchedulingException { + ItemUpdate updateA = new 
ItemUpdateBuilder(existingIdA).addStatement(sAtoB).build(); + ItemUpdate updateB = new ItemUpdateBuilder(existingIdB).addStatement(sBtoA).build(); + List scheduled = schedule(updateA, updateB); + assertEquals(Arrays.asList(updateA,updateB), scheduled); + } + + @Test + public void testUpdateIsNotSplit() throws ImpossibleSchedulingException { + ItemUpdate updateA = new ItemUpdateBuilder(existingIdA) + .addStatement(sAtoNewA) + .addStatement(sAtoNewB) + .build(); + ItemUpdate newUpdateA = new ItemUpdateBuilder(newIdA).build(); + ItemUpdate newUpdateB = new ItemUpdateBuilder(newIdB).build(); + List scheduled = schedule(updateA); + assertSetEquals(Arrays.asList(newUpdateA, newUpdateB, updateA), scheduled); + } + + @Test + public void testMixedUpdate() throws ImpossibleSchedulingException { + ItemUpdate updateA = new ItemUpdateBuilder(existingIdA) + .addStatement(sAtoNewA) + .addStatement(sAtoNewB) + .addStatement(sAtoB) + .build(); + ItemUpdate newUpdateA = new ItemUpdateBuilder(newIdA) + .addStatement(sNewAtoB) + .build(); + ItemUpdate newUpdateB = new ItemUpdateBuilder(newIdB) + .build(); + List scheduled = schedule(updateA, newUpdateA); + assertEquals(Arrays.asList(newUpdateA, newUpdateB, updateA), scheduled); + } + + @Override + public UpdateScheduler getScheduler() { + return new WikibaseAPIUpdateScheduler(); + } + +}