* Harden reconciliation - Fixes #2590
  - check for non-JSON / unparseable JSON returns
  - handle malformed results response with no name for candidates
  - catch any Exception, not just IOExceptions
  - call processManager.onFailedProcess() for cleanup on error
* Add default constructor for Jackson
  Jackson complains about needing a default constructor for the NON_DEFAULT annotation, but I'm not sure why this worked before.
* Clean up indentation and unused variable - no functional changes
  Make indentation consistent throughout the module, changing recently added lines to use the standard all spaces convention. Remove unused count variable.
* Simplify control flow
* Update limit parameter comment. No functional change.
* Replace ternary expression which is causing NPE
* Add reconciliation tests using mock HTTP server
This commit is contained in:
parent
6e66cb5144
commit
1849e62234
@@ -306,12 +306,21 @@ public class StandardReconConfig extends ReconConfig {
     @JsonInclude(Include.NON_EMPTY)
     protected List<QueryProperty> properties;

-    // Only send limit if it's non-default to preserve backward compatibility with
-    // services which might choke on this
+    // Only send limit if it's non-default (default = 0) to preserve backward
+    // compatibility with services which might choke on this (pre-2013)
     @JsonProperty("limit")
     @JsonInclude(Include.NON_DEFAULT)
     protected int limit;

+    public ReconQuery() {
+        super();
+        this.query = "";
+        this.typeID = null;
+        this.properties = null;
+        this.limit = 0;
+    }
+
     @JsonCreator
     public ReconQuery(
             String query,
             String typeID,
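The NON_DEFAULT inclusion above is also why the commit adds the no-args ReconQuery() constructor: to decide whether limit still holds its default value, Jackson may need to build a reference instance of the bean, which can fail when the class only exposes the @JsonCreator constructor (the commit message itself is unsure why it worked before). A minimal, self-contained sketch of the behaviour; the Query class and its fields are hypothetical stand-ins, not OpenRefine code:

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonInclude.Include;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;

public class NonDefaultExample {

    static class Query {
        @JsonProperty("query")
        String query;

        // Serialized only when it differs from the default (0 for an int field).
        @JsonProperty("limit")
        @JsonInclude(Include.NON_DEFAULT)
        int limit;

        // Without this no-args constructor, some Jackson configurations cannot
        // build the "default instance" used to decide what NON_DEFAULT suppresses.
        Query() {
        }

        @JsonCreator
        Query(@JsonProperty("query") String query, @JsonProperty("limit") int limit) {
            this.query = query;
            this.limit = limit;
        }
    }

    public static void main(String[] args) throws Exception {
        ObjectMapper mapper = new ObjectMapper();
        System.out.println(mapper.writeValueAsString(new Query("david lynch", 0)));
        // {"query":"david lynch"}           <- limit omitted because it is the default
        System.out.println(mapper.writeValueAsString(new Query("david lynch", 3)));
        // {"query":"david lynch","limit":3} <- limit sent only when non-default
    }
}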
@@ -351,12 +360,7 @@ public class StandardReconConfig extends ReconConfig {
         for (int i = 0; i != types.size(); i++) {
             bareTypes[i] = types.get(i).id;
         }
-        ReconCandidate result = new ReconCandidate(
-                id,
-                name,
-                bareTypes,
-                score
-        );
+        ReconCandidate result = new ReconCandidate(id, name, bareTypes, score);

         return result;
     }
@@ -419,6 +423,7 @@ public class StandardReconConfig extends ReconConfig {
             job.code = ParsingUtilities.defaultWriter.writeValueAsString(query);
         } catch (JsonProcessingException e) {
             e.printStackTrace();
+            return null; // TODO: Throw exception instead?
         }
         return job;
     }
@@ -446,7 +451,7 @@ public class StandardReconConfig extends ReconConfig {
         HttpURLConnection connection = (HttpURLConnection) url.openConnection();
         {
             connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
-            connection.setConnectTimeout(30000);
+            connection.setConnectTimeout(30000); // TODO parameterize
             connection.setDoOutput(true);

             DataOutputStream dos = new DataOutputStream(connection.getOutputStream());
@@ -464,16 +469,18 @@ public class StandardReconConfig extends ReconConfig {

             if (connection.getResponseCode() >= 400) {
                 InputStream is = connection.getErrorStream();
+                String msg = is == null ? "" : ParsingUtilities.inputStreamToString(is);
                 logger.error("Failed - code: "
                         + Integer.toString(connection.getResponseCode())
-                        + " message: " + is == null ? ""
-                                : ParsingUtilities.inputStreamToString(is));
+                        + " message: " + msg);
             } else {
                 InputStream is = connection.getInputStream();
                 try {
                     String s = ParsingUtilities.inputStreamToString(is);
                     ObjectNode o = ParsingUtilities.evaluateJsonStringToObjectNode(s);

+                    if (o == null) { // utility method returns null instead of throwing
+                        logger.error("Failed to parse string as JSON: " + s);
+                    } else {
                     for (int i = 0; i < jobs.size(); i++) {
                         StandardReconJob job = (StandardReconJob) jobs.get(i);
                         Recon recon = null;
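The two removed lines above are the NPE the commit message mentions: because + binds tighter than == and ?:, the old argument parses as ("Failed - code: " ... + " message: " + is) == null ? "" : ParsingUtilities.inputStreamToString(is). The comparison is always false, so inputStreamToString(is) runs even when getErrorStream() returned null. A standalone sketch of the precedence trap and the fix; readAll() is a hypothetical stand-in for ParsingUtilities.inputStreamToString():

import java.io.IOException;
import java.io.InputStream;

public class TernaryPrecedence {

    static String readAll(InputStream is) throws IOException {
        return new String(is.readAllBytes()); // throws NPE if is == null
    }

    public static void main(String[] args) throws IOException {
        // HttpURLConnection.getErrorStream() may legitimately return null.
        InputStream is = null;

        // Broken: '+' binds tighter than '==' and '?:', so this parses as
        //     ("message: " + is) == null ? "" : readAll(is)
        // The comparison is always false, so readAll(null) runs and throws NPE.
        // String broken = "message: " + is == null ? "" : readAll(is);

        // Fixed: evaluate the ternary first, then concatenate.
        String msg = is == null ? "" : readAll(is);
        System.out.println("message: " + msg); // prints "message: "
    }
}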
@@ -490,6 +497,7 @@ public class StandardReconConfig extends ReconConfig {
                             logger.warn("Service error for text: " + text + "\n Job code: " + job.code + "\n Response: " + o2.toString());
                         }
                     } else {
+                        // TODO: better error reporting
                         logger.warn("Service error for text: " + text + "\n Job code: " + job.code);
                     }

@@ -498,14 +506,16 @@ public class StandardReconConfig extends ReconConfig {
                         }
                         recons.add(recon);
                     }
+                    }
                 } finally {
                     is.close();
                 }
             }
-        } catch (IOException e) {
+        } catch (Exception e) {
             logger.error("Failed to batch recon with load:\n" + queriesString, e);
         }

+        // TODO: This code prevents the retry mechanism in ReconOperation from working
         while (recons.size() < jobs.size()) {
             Recon recon = new Recon(historyEntryID, identifierSpace, schemaSpace);
             recon.service = service;
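Widening the catch above to Exception is what lets batchRecon() survive the malformed and non-JSON responses exercised by the new tests; the commit message also mentions calling processManager.onFailedProcess() for cleanup on error, which is not visible in this excerpt. A rough sketch of that catch-everything-then-clean-up pattern, using simplified hypothetical types rather than the OpenRefine ProcessManager API:

public class FailedProcessCleanup {

    // Hypothetical cleanup hook; stands in for ProcessManager.onFailedProcess().
    interface Manager {
        void onFailedProcess(Runnable process);
    }

    static void run(Manager manager, Runnable work) {
        try {
            work.run();
        } catch (Exception e) {
            // Catch Exception rather than IOException so runtime failures
            // (e.g. an NPE triggered by a malformed response) cannot leave the
            // process stuck in a "running" state; then let the manager clean up.
            System.err.println("Process failed: " + e);
            manager.onFailedProcess(work);
        }
    }

    public static void main(String[] args) {
        Manager manager = process -> System.out.println("cleaned up failed process");
        run(manager, () -> { throw new RuntimeException("malformed response"); });
    }
}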
@@ -538,7 +548,6 @@ public class StandardReconConfig extends ReconConfig {
         });

         int length = results.size();
-        int count = 0;
         for (int i = 0; i < length; i++) {
             ReconResult result = results.get(i);

@@ -552,7 +561,6 @@ public class StandardReconConfig extends ReconConfig {
             }

             recon.addCandidate(candidate);
-            count++;
         }

         computeFeatures(recon, text);
@@ -563,18 +571,18 @@ public class StandardReconConfig extends ReconConfig {
      * Recomputes the features associated with this reconciliation
      * object (only if we have at least one candidate).
      *
-     * @param text
-     *            the cell value to compare the reconciliation data to
+     * @param text the cell value to compare the reconciliation data to
      */
     public void computeFeatures(Recon recon, String text) {
         if (recon.candidates != null && !recon.candidates.isEmpty() && text != null) {
             ReconCandidate candidate = recon.candidates.get(0);

             if (candidate.name != null) {
                 recon.setFeature(Recon.Feature_nameMatch, text.equalsIgnoreCase(candidate.name));
                 recon.setFeature(Recon.Feature_nameLevenshtein,
                         StringUtils.getLevenshteinDistance(StringUtils.lowerCase(text), StringUtils.lowerCase(candidate.name)));
                 recon.setFeature(Recon.Feature_nameWordDistance, wordDistance(text, candidate.name));

             }
             recon.setFeature(Recon.Feature_typeMatch, false);
             if (this.typeID != null) {
                 for (String typeID : candidate.types) {
@@ -286,22 +286,26 @@ public class ReconOperation extends EngineDependentOperation {
                     JobGroup group = jobToGroup.get(job);
                     List<ReconEntry> entries = group.entries;

+                    /*
+                     * TODO: Not sure what this retry is meant to handle, but it's currently
+                     * non-functional due the code at the end of StandardReconConfig#batchRecon()
+                     * which tops up any missing entries.
+                     */
                     if (recon == null) {
                         group.trials++;
                         if (group.trials < 3) {
                             logger.warn("Re-trying job including cell containing: " + entries.get(0).cell.value);
                             continue; // try again next time
                         }
-                        logger.warn("Failed after 3 trials for job including cell containing: " + entries.get(0).cell.value);
+                        String msg = "Failed after 3 trials for job including cell containing: " + entries.get(0).cell.value;
+                        logger.warn(msg);
                         recon = _reconConfig.createNewRecon(_historyEntryID);
                     }

                     jobToGroup.remove(job);
                     jobs.remove(j);
                     done++;

                     if (recon == null) {
                         recon = _reconConfig.createNewRecon(_historyEntryID);
                     }
                     recon.judgmentBatchSize = entries.size();

                     for (ReconEntry entry : entries) {
@@ -328,6 +332,7 @@ public class ReconOperation extends EngineDependentOperation {
             }
         }

+        // TODO: Option to keep partial results after cancellation?
         if (!_canceled) {
             Change reconChange = new ReconChange(
                     cellChanges,
@@ -32,7 +32,9 @@ import static org.testng.Assert.assertNull;
 import static org.testng.Assert.assertTrue;

 import java.io.IOException;
+import java.net.URLEncoder;
 import java.util.ArrayList;
+import java.util.Properties;

 import org.slf4j.LoggerFactory;
 import org.testng.Assert;
@@ -43,9 +45,13 @@ import org.testng.annotations.Test;
 import com.fasterxml.jackson.core.JsonParseException;
 import com.fasterxml.jackson.databind.JsonMappingException;
 import com.fasterxml.jackson.databind.node.ArrayNode;

 import com.google.refine.RefineTest;
+import com.google.refine.browsing.EngineConfig;
+import com.google.refine.model.Cell;
 import com.google.refine.model.Project;
 import com.google.refine.model.Recon;
+import com.google.refine.model.ReconCandidate;
+import com.google.refine.model.Row;
 import com.google.refine.model.recon.ReconConfig;
 import com.google.refine.model.recon.ReconJob;
@@ -54,9 +60,16 @@ import com.google.refine.model.recon.StandardReconConfig.ColumnDetail;
 import com.google.refine.model.recon.StandardReconConfig.ReconResult;
 import com.google.refine.operations.OperationRegistry;
+import com.google.refine.operations.recon.ReconOperation;
+import com.google.refine.process.Process;
+import com.google.refine.process.ProcessManager;
 import com.google.refine.util.ParsingUtilities;
 import com.google.refine.util.TestUtils;

+import okhttp3.HttpUrl;
+import okhttp3.mockwebserver.MockResponse;
+import okhttp3.mockwebserver.MockWebServer;
+import okhttp3.mockwebserver.RecordedRequest;
+
 public class StandardReconConfigTests extends RefineTest {

     @BeforeMethod
@@ -202,6 +215,183 @@ public class StandardReconConfigTests extends RefineTest {
                 + "\"type_strict\":\"should\"}", job.toString());
     }

+    @Test
+    public void reconNonJsonTest() throws Exception {
+        Project project = createCSVProject("title,director\n"
+                + "mulholland drive,david lynch");
+
+        String nonJsonResponse = "<!DOCTYPE html>\n" +
+                "<html lang=\"en\">\n" +
+                " <head>\n" +
+                " <meta charset=\"utf-8\">\n" +
+                " <title>Error</title>\n" +
+                " </head>\n" +
+                " <body>\n" +
+                " You have reached an error page.\n" +
+                " </body>\n" +
+                "</html>";
+
+        try (MockWebServer server = new MockWebServer()) {
+            server.start();
+            HttpUrl url = server.url("/openrefine-wikidata/en/api");
+            server.enqueue(new MockResponse().setBody(nonJsonResponse));
+            server.enqueue(new MockResponse());
+
+            String configJson = " {\n" +
+                    " \"mode\": \"standard-service\",\n" +
+                    " \"service\": \"" + url + "\",\n" +
+                    " \"identifierSpace\": \"http://www.wikidata.org/entity/\",\n" +
+                    " \"schemaSpace\": \"http://www.wikidata.org/prop/direct/\",\n" +
+                    " \"type\": {\n" +
+                    " \"id\": \"Q11424\",\n" +
+                    " \"name\": \"film\"\n" +
+                    " },\n" +
+                    " \"autoMatch\": true,\n" +
+                    " \"columnDetails\": [\n" +
+                    " {\n" +
+                    " \"column\": \"director\",\n" +
+                    " \"propertyName\": \"Director\",\n" +
+                    " \"propertyID\": \"P57\"\n" +
+                    " }\n" +
+                    " ]}";
+            StandardReconConfig config = StandardReconConfig.reconstruct(configJson);
+            ReconOperation op = new ReconOperation(EngineConfig.reconstruct(null), "director", config);
+            Process process = op.createProcess(project, new Properties());
+            ProcessManager pm = project.getProcessManager();
+            process.startPerforming(pm);
+            Assert.assertTrue(process.isRunning());
+            try {
+                Thread.sleep(1000);
+            } catch (InterruptedException e) {
+                Assert.fail("Test interrupted");
+            }
+            Assert.assertFalse(process.isRunning());
+
+            RecordedRequest request1 = server.takeRequest();
+
+            assertNotNull(request1);
+
+            // We won't have gotten a result, but we want to make sure things didn't die
+            Row row = project.rows.get(0);
+            Cell cell = row.cells.get(1);
+            assertNotNull(cell.recon);
+            assertEquals(cell.recon.service, url.toString());
+            ReconCandidate candidate = cell.recon.getBestCandidate();
+            assertNull(candidate);
+        }
+    }
+
+    @Test
+    public void reconTest() throws Exception {
+        Project project = createCSVProject("title,director\n"
+                + "mulholland drive,david lynch");
+
+        String reconResponse = "{\n" +
+                "q0: {\n" +
+                " result: [\n" +
+                " {\n" +
+                " P57: {\n" +
+                "score: 100,\n" +
+                "weighted: 40\n" +
+                "},\n" +
+                "all_labels: {\n" +
+                "score: 59,\n" +
+                "weighted: 59\n" +
+                "},\n" +
+                "score: 70.71428571428572,\n" +
+                "id: \"Q3989262\",\n" +
+                "name: \"The Short Films of David Lynch\",\n" +
+                "type: [\n" +
+                "{\n" +
+                "id: \"Q24862\",\n" +
+                "name: \"short film\"\n" +
+                "},\n" +
+                "{\n" +
+                "id: \"Q202866\",\n" +
+                "name: \"animated film\"\n" +
+                "}\n" +
+                "],\n" +
+                "match: false\n" +
+                "},\n" +
+                "{\n" +
+                "P57: {\n" +
+                "score: 100,\n" +
+                "weighted: 40\n" +
+                "},\n" +
+                "all_labels: {\n" +
+                "score: 44,\n" +
+                "weighted: 44\n" +
+                "},\n" +
+                "score: 60.00000000000001,\n" +
+                "id: \"Q83365219\",\n" +
+                "name: \"What Did Jack Do?\",\n" +
+                "type: [\n" +
+                "{\n" +
+                "id: \"Q24862\",\n" +
+                "name: \"short film\"\n" +
+                "}\n" +
+                "],\n" +
+                "match: false\n" +
+                " }\n" +
+                " ]\n" +
+                " }\n" +
+                "}\n";
+        try (MockWebServer server = new MockWebServer()) {
+            server.start();
+            HttpUrl url = server.url("/openrefine-wikidata/en/api");
+            // FIXME: Retry doesn't currently work, but should be tested
+            // server.enqueue(new MockResponse().setResponseCode(503)); // service overloaded
+            server.enqueue(new MockResponse().setBody(reconResponse));
+            server.enqueue(new MockResponse());
+
+            String configJson = " {\n" +
+                    " \"mode\": \"standard-service\",\n" +
+                    " \"service\": \"" + url + "\",\n" +
+                    " \"identifierSpace\": \"http://www.wikidata.org/entity/\",\n" +
+                    " \"schemaSpace\": \"http://www.wikidata.org/prop/direct/\",\n" +
+                    " \"type\": {\n" +
+                    " \"id\": \"Q11424\",\n" +
+                    " \"name\": \"film\"\n" +
+                    " },\n" +
+                    " \"autoMatch\": true,\n" +
+                    " \"columnDetails\": [\n" +
+                    " {\n" +
+                    " \"column\": \"director\",\n" +
+                    " \"propertyName\": \"Director\",\n" +
+                    " \"propertyID\": \"P57\"\n" +
+                    " }\n" +
+                    " ]}";
+            StandardReconConfig config = StandardReconConfig.reconstruct(configJson);
+            ReconOperation op = new ReconOperation(EngineConfig.reconstruct(null), "director", config);
+            Process process = op.createProcess(project, new Properties());
+            ProcessManager pm = project.getProcessManager();
+            process.startPerforming(pm);
+            Assert.assertTrue(process.isRunning());
+            try {
+                Thread.sleep(1000); // TODO: timeout will need to increase for retries
+            } catch (InterruptedException e) {
+                Assert.fail("Test interrupted");
+            }
+            Assert.assertFalse(process.isRunning());
+
+            // RecordedRequest scratchFirstRquest = server.takeRequest();
+            RecordedRequest request1 = server.takeRequest();
+
+            assertNotNull(request1);
+            String query = request1.getBody().readUtf8Line();
+            assertNotNull(query);
+            String expected = "queries=" + URLEncoder.encode("{\"q0\":{\"query\":\"david lynch\",\"type\":\"Q11424\",\"properties\":[{\"pid\":\"P57\",\"v\":\"david lynch\"}],\"type_strict\":\"should\"}}", "UTF-8");
+            assertEquals(query, expected);
+
+            Row row = project.rows.get(0);
+            Cell cell = row.cells.get(1);
+            assertNotNull(cell.recon);
+            assertEquals(cell.recon.service, url.toString());
+            assertEquals(cell.recon.getBestCandidate().types[0], "Q24862");
+        }
+    }
+
     /**
      * The UI format and the backend format differ for serialization
      * (the UI never deserializes and the backend serialization did not matter).