Introduce a rewriter to update references to new items after creation

This commit is contained in:
Antonin Delpeuch 2018-03-03 02:36:32 +00:00
parent 7cb8757028
commit 773be2e161
5 changed files with 258 additions and 10 deletions

View File

@ -7,6 +7,10 @@ import java.util.Set;
import java.util.HashSet; import java.util.HashSet;
import com.google.refine.model.Project; import com.google.refine.model.Project;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.refine.model.Cell; import com.google.refine.model.Cell;
import com.google.refine.model.Column; import com.google.refine.model.Column;
import com.google.refine.model.Recon; import com.google.refine.model.Recon;
@ -29,6 +33,11 @@ public class NewItemLibrary {
map = new HashMap<>(); map = new HashMap<>();
} }
@JsonCreator
public NewItemLibrary(@JsonProperty("qidMap") Map<Long, String> map) {
this.map = map;
}
/** /**
* Retrieves the Qid allocated to a given new cell * Retrieves the Qid allocated to a given new cell
* @param id: the fake ItemId generated by the cell * @param id: the fake ItemId generated by the cell
@ -67,23 +76,26 @@ public class NewItemLibrary {
*/ */
for(Row row : project.rows) { for(Row row : project.rows) {
for(Cell cell : row.cells) { for(int i = 0; i != row.cells.size(); i++) {
Cell cell = row.cells.get(i);
if (cell == null || cell.recon == null) { if (cell == null || cell.recon == null) {
continue; continue;
} }
Recon recon = cell.recon; Recon recon = cell.recon;
if (Recon.Judgment.New.equals(recon.judgment) && !reset && if (Recon.Judgment.New.equals(recon.judgment) && !reset &&
map.containsKey(recon.id)) { map.containsKey(recon.judgmentHistoryEntry)) {
recon.judgment = Recon.Judgment.Matched; recon.judgment = Recon.Judgment.Matched;
recon.match = new ReconCandidate( recon.match = new ReconCandidate(
map.get(recon.id), map.get(recon.judgmentHistoryEntry),
cell.value.toString(), cell.value.toString(),
new String[0], new String[0],
100); 100);
impactedColumns.add(i);
} else if (Recon.Judgment.Matched.equals(recon.judgment) && reset && } else if (Recon.Judgment.Matched.equals(recon.judgment) && reset &&
map.containsKey(recon.id)) { map.containsKey(recon.judgmentHistoryEntry)) {
recon.judgment = Recon.Judgment.New; recon.judgment = Recon.Judgment.New;
recon.match = null; recon.match = null;
impactedColumns.add(i);
} }
} }
} }
@ -98,14 +110,27 @@ public class NewItemLibrary {
* Getter, only meant to be used by Jackson * Getter, only meant to be used by Jackson
* @return the underlying map * @return the underlying map
*/ */
@JsonProperty("qidMap")
public Map<Long, String> getQidMap() { public Map<Long, String> getQidMap() {
return map; return map;
} }
/** @Override
* Setter, only meant to be used by Jackson public boolean equals(Object other) {
*/ if(other == null || !NewItemLibrary.class.isInstance(other)) {
public void setQidMap(Map<Long, String> newMap) { return false;
map = newMap; }
NewItemLibrary otherLibrary = (NewItemLibrary)other;
return map.equals(otherLibrary.getQidMap());
}
@Override
public int hashCode() {
return map.hashCode();
}
@Override
public String toString() {
return map.toString();
} }
} }

View File

@ -0,0 +1,88 @@
package org.openrefine.wikidata.editing;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.openrefine.wikidata.schema.entityvalues.ReconEntityIdValue;
import org.openrefine.wikidata.schema.entityvalues.ReconItemIdValue;
import org.openrefine.wikidata.updates.ItemUpdate;
import org.wikidata.wdtk.datamodel.helpers.Datamodel;
import org.wikidata.wdtk.datamodel.helpers.DatamodelConverter;
import org.wikidata.wdtk.datamodel.implementation.DataObjectFactoryImpl;
import org.wikidata.wdtk.datamodel.interfaces.EntityIdValue;
import org.wikidata.wdtk.datamodel.interfaces.ItemIdValue;
import org.wikidata.wdtk.datamodel.interfaces.MonolingualTextValue;
import org.wikidata.wdtk.datamodel.interfaces.Statement;
import org.wikidata.wdtk.datamodel.interfaces.Value;
/**
 * Converter that rewrites an {@link ItemUpdate} so that item ids which
 * were marked as new during reconciliation are replaced by the concrete
 * Qids recorded in a {@link NewItemLibrary}.
 *
 * If the library holds no Qid for a new item encountered during the
 * rewrite, an {@link IllegalArgumentException} is thrown.
 *
 * The subject supplied at construction time is a special case and is
 * always returned unchanged. It is not expected to appear as a value in
 * the update itself, but the datamodel hands it to the Claim objects of
 * its document, so the converter still encounters it.
 *
 * @author Antonin Delpeuch
 *
 */
public class ReconEntityRewriter extends DatamodelConverter {

    /** Source of the Qids allocated to new items. */
    private final NewItemLibrary library;

    /** Item id that must be passed through untouched. */
    private final ItemIdValue subject;

    /**
     * Builds a rewriter backed by the given library.
     *
     * @param library
     *            where the Qids of new items are looked up
     * @param subject
     *            the item id that should never be rewritten
     */
    public ReconEntityRewriter(NewItemLibrary library, ItemIdValue subject) {
        super(new DataObjectFactoryImpl());
        this.library = library;
        this.subject = subject;
    }

    /**
     * Copies an item id, substituting the Qid from the library when the
     * value is a new reconciled item.
     *
     * @param value
     *            the item id to copy
     * @return the value itself when it equals the subject, a fresh item id
     *         carrying the library's Qid when the value is a new
     *         {@link ReconItemIdValue}, and the superclass copy otherwise
     * @throws IllegalArgumentException
     *             if the value is new and the library has no Qid for it
     */
    @Override
    public ItemIdValue copy(ItemIdValue value) {
        // The subject is deliberately left alone.
        if (subject.equals(value)) {
            return value;
        }
        // Anything that is not a reconciled value goes through the default copy.
        if (!(value instanceof ReconItemIdValue)) {
            return super.copy(value);
        }
        ReconItemIdValue reconValue = (ReconItemIdValue) value;
        if (!reconValue.isNew()) {
            return super.copy(value);
        }
        String qid = library.getQid(reconValue.getReconInternalId());
        if (qid == null) {
            throw new IllegalArgumentException(
                    "Trying to rewrite an update where a new item was not created yet.");
        }
        return Datamodel.makeItemIdValue(qid, reconValue.getSiteIri());
    }

    /**
     * Produces a copy of the given update in which every term and statement
     * has been passed through this converter. The item id of the update is
     * reused as is.
     *
     * @param update
     *            the update to rewrite
     * @return a new update with the same item id and rewritten contents
     */
    public ItemUpdate rewrite(ItemUpdate update) {
        Set<MonolingualTextValue> labels = update.getLabels()
                .stream()
                .map(label -> copy(label))
                .collect(Collectors.toSet());
        Set<MonolingualTextValue> descriptions = update.getDescriptions()
                .stream()
                .map(description -> copy(description))
                .collect(Collectors.toSet());
        Set<MonolingualTextValue> aliases = update.getAliases()
                .stream()
                .map(alias -> copy(alias))
                .collect(Collectors.toSet());
        List<Statement> addedStatements = update.getAddedStatements()
                .stream()
                .map(statement -> copy(statement))
                .collect(Collectors.toList());
        Set<Statement> deletedStatements = update.getDeletedStatements()
                .stream()
                .map(statement -> copy(statement))
                .collect(Collectors.toSet());
        return new ItemUpdate(
                update.getItemId(),
                addedStatements,
                deletedStatements,
                labels,
                descriptions,
                aliases);
    }
}

View File

@ -16,6 +16,7 @@ import org.json.JSONWriter;
import org.openrefine.wikidata.editing.ConnectionManager; import org.openrefine.wikidata.editing.ConnectionManager;
import org.openrefine.wikidata.editing.NewItemLibrary; import org.openrefine.wikidata.editing.NewItemLibrary;
import org.openrefine.wikidata.editing.ReconEntityRewriter;
import org.openrefine.wikidata.updates.ItemUpdate; import org.openrefine.wikidata.updates.ItemUpdate;
import org.openrefine.wikidata.updates.scheduler.ImpossibleSchedulingException; import org.openrefine.wikidata.updates.scheduler.ImpossibleSchedulingException;
import org.openrefine.wikidata.updates.scheduler.UpdateScheduler; import org.openrefine.wikidata.updates.scheduler.UpdateScheduler;
@ -279,10 +280,13 @@ public class PerformWikibaseEditsOperation extends EngineDependentOperation {
logger.info("Performing edits"); logger.info("Performing edits");
for(ItemUpdate update : batch) { for(ItemUpdate update : batch) {
// Rewrite the update
ReconEntityRewriter rewriter = new ReconEntityRewriter(newItemLibrary, update.getItemId());
update = rewriter.rewrite(update);
try { try {
// New item // New item
if (update.getItemId().getId() == "Q0") { if (update.getItemId().getId().equals("Q0")) {
ReconEntityIdValue newCell = (ReconEntityIdValue)update.getItemId(); ReconEntityIdValue newCell = (ReconEntityIdValue)update.getItemId();
update.normalizeLabelsAndAliases(); update.normalizeLabelsAndAliases();

View File

@ -0,0 +1,64 @@
package org.openrefine.wikidata.editing;
import static org.junit.Assert.assertEquals;
import org.openrefine.wikidata.testing.JacksonSerializationTest;
import org.openrefine.wikidata.testing.TestingData;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
import com.google.refine.model.Cell;
import com.google.refine.model.Project;
import com.google.refine.model.Recon;
import com.google.refine.tests.RefineTest;
/**
 * Tests for {@link NewItemLibrary}: Qid lookup, switching reconciled cells
 * between the "new" and "matched" judgments, and JSON serialization.
 */
public class NewItemLibraryTest extends RefineTest {

    private NewItemLibrary library;

    /** Starts every test with a library holding two allocated Qids. */
    @BeforeMethod
    public void setUp() {
        library = new NewItemLibrary();
        library.setQid(1234L, "Q345");
        library.setQid(3289L, "Q384");
    }

    @Test
    public void testRetrieveItem() {
        assertEquals("Q345", library.getQid(1234L));
    }

    @Test
    public void testUpdateReconciledCells() {
        Project project = createCSVProject(TestingData.inceptionWithNewCsv);
        project.rows.get(0).cells.set(0,
                TestingData.makeNewItemCell(3289L, "University of Ljubljana"));
        project.rows.get(1).cells.set(0,
                TestingData.makeMatchedCell("Q865528", "University of Warwick"));
        project.rows.get(2).cells.set(0,
                TestingData.makeNewItemCell(1234L, "new uni"));

        // Initial state: rows 0 and 2 are new, row 1 is already matched.
        isNewTo(3289L, firstCell(project, 0));
        isMatchedTo("Q865528", firstCell(project, 1));
        isNewTo(1234L, firstCell(project, 2));

        // Forward update: new cells become matched to the Qids from the library.
        library.updateReconciledCells(project, false);
        isMatchedTo("Q384", firstCell(project, 0));
        isMatchedTo("Q865528", firstCell(project, 1));
        isMatchedTo("Q345", firstCell(project, 2));

        // Reset: cells matched through the library go back to new.
        library.updateReconciledCells(project, true);
        isNewTo(3289L, firstCell(project, 0));
        isMatchedTo("Q865528", firstCell(project, 1));
        isNewTo(1234L, firstCell(project, 2));
    }

    @Test
    public void testSerialize() {
        JacksonSerializationTest.canonicalSerialization(NewItemLibrary.class, library,
                "{\"qidMap\":{\"1234\":\"Q345\",\"3289\":\"Q384\"}}");
    }

    /** Fetches the cell in the first column of the given row. */
    private Cell firstCell(Project project, int rowIndex) {
        return project.rows.get(rowIndex).cells.get(0);
    }

    /** Asserts that the cell is judged as matched to the given Qid. */
    private void isMatchedTo(String qid, Cell cell) {
        Recon recon = cell.recon;
        assertEquals(Recon.Judgment.Matched, recon.judgment);
        assertEquals(qid, recon.match.id);
    }

    /** Asserts that the cell is judged as new and carries the given internal id. */
    private void isNewTo(long id, Cell cell) {
        Recon recon = cell.recon;
        assertEquals(Recon.Judgment.New, recon.judgment);
        assertEquals(id, recon.judgmentHistoryEntry);
    }
}

View File

@ -0,0 +1,67 @@
package org.openrefine.wikidata.editing;
import static org.junit.Assert.assertEquals;
import org.openrefine.wikidata.testing.TestingData;
import org.openrefine.wikidata.updates.ItemUpdate;
import org.openrefine.wikidata.updates.ItemUpdateBuilder;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
import org.wikidata.wdtk.datamodel.helpers.Datamodel;
import org.wikidata.wdtk.datamodel.interfaces.ItemIdValue;
/**
 * Tests for {@link ReconEntityRewriter}: rewriting of individual item ids
 * and of a whole {@link ItemUpdate}.
 */
public class ReconEntityRewriterTest {

    NewItemLibrary library = null;
    ReconEntityRewriter rewriter = null;
    ItemIdValue subject = TestingData.newIdA;
    ItemIdValue newlyCreated = Datamodel.makeWikidataItemIdValue("Q1234");

    /** Gives each test an empty library and a rewriter bound to the subject. */
    @BeforeMethod
    public void setUp() {
        library = new NewItemLibrary();
        rewriter = new ReconEntityRewriter(library, subject);
    }

    @Test(expectedExceptions = IllegalArgumentException.class)
    public void testNotCreatedYet() {
        // The library is empty, so the new item has no Qid to be rewritten to.
        rewriter.copy(TestingData.newIdB);
    }

    @Test
    public void testSuccessfulRewrite() {
        library.setQid(4567L, "Q1234");
        ItemIdValue rewrittenId = rewriter.copy(TestingData.newIdB);
        assertEquals(newlyCreated, rewrittenId);
    }

    @Test
    public void testSubjectNotRewriten() {
        ItemIdValue copiedSubject = rewriter.copy(subject);
        assertEquals(subject, copiedSubject);
    }

    @Test
    public void testMatched() {
        ItemIdValue copiedMatch = rewriter.copy(TestingData.matchedId);
        assertEquals(TestingData.matchedId, copiedMatch);
    }

    @Test
    public void testRewriteUpdate() {
        library.setQid(4567L, "Q1234");
        ItemUpdate rewritten = rewriter.rewrite(updateReferringTo(TestingData.newIdB));
        ItemUpdate expected = updateReferringTo(newlyCreated);
        assertEquals(expected, rewritten);
    }

    /**
     * Builds the update shared by the input and the expectation of
     * {@link #testRewriteUpdate()}; only the value of the added statement varies.
     */
    private ItemUpdate updateReferringTo(ItemIdValue target) {
        ItemUpdateBuilder builder = new ItemUpdateBuilder(subject);
        return builder
                .addStatement(TestingData.generateStatement(subject, target))
                .deleteStatement(TestingData.generateStatement(subject, TestingData.existingId))
                .addLabel(Datamodel.makeMonolingualTextValue("label", "de"))
                .addDescription(Datamodel.makeMonolingualTextValue("beschreibung", "de"))
                .addAlias(Datamodel.makeMonolingualTextValue("darstellung", "de"))
                .build();
    }
}