From 343c8afbea9bc5a1825a20ad53ba0058a9b0c3f1 Mon Sep 17 00:00:00 2001 From: Antonin Delpeuch Date: Sun, 4 Apr 2021 19:25:07 +0200 Subject: [PATCH] Ignore invalid regexes from Wikibase format constraints. (#3721) * Ignore invalid regexes from Wikibase format constraints. Closes #3274. * Add logging --- .../qa/scrutinizers/FormatScrutinizer.java | 14 +++++- .../scrutinizers/FormatScrutinizerTest.java | 43 ++++++++++++------- 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/extensions/wikidata/src/org/openrefine/wikidata/qa/scrutinizers/FormatScrutinizer.java b/extensions/wikidata/src/org/openrefine/wikidata/qa/scrutinizers/FormatScrutinizer.java index 2151687fc..bacdcc936 100644 --- a/extensions/wikidata/src/org/openrefine/wikidata/qa/scrutinizers/FormatScrutinizer.java +++ b/extensions/wikidata/src/org/openrefine/wikidata/qa/scrutinizers/FormatScrutinizer.java @@ -24,6 +24,8 @@ package org.openrefine.wikidata.qa.scrutinizers; import org.openrefine.wikidata.qa.QAWarning; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.wikidata.wdtk.datamodel.interfaces.EntityIdValue; import org.wikidata.wdtk.datamodel.interfaces.PropertyIdValue; import org.wikidata.wdtk.datamodel.interfaces.Snak; @@ -38,6 +40,7 @@ import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; /** * A scrutinizer that detects incorrect formats in text values (mostly @@ -48,6 +51,8 @@ import java.util.regex.Pattern; */ public class FormatScrutinizer extends SnakScrutinizer { + private static final Logger logger = LoggerFactory.getLogger(FormatScrutinizer.class); + public static final String type = "add-statements-with-invalid-format"; public String formatConstraintQid; public String formatRegexPid; @@ -97,9 +102,14 @@ public class FormatScrutinizer extends SnakScrutinizer { String regex = constraint.regularExpressionFormat; Pattern pattern = null; if (regex != null) { - pattern = Pattern.compile(regex); + try { + pattern = Pattern.compile(regex); + patterns.add(pattern); + } catch(PatternSyntaxException e) { + logger.info(String.format("Ignoring invalid format constraint for property %s. Regex %s is invalid: %s", + pid.getId(), regex, e.getMessage())); + } } - patterns.add(pattern); } _patterns.put(pid, patterns); return patterns; diff --git a/extensions/wikidata/tests/src/org/openrefine/wikidata/qa/scrutinizers/FormatScrutinizerTest.java b/extensions/wikidata/tests/src/org/openrefine/wikidata/qa/scrutinizers/FormatScrutinizerTest.java index c13247fba..f13885f7e 100644 --- a/extensions/wikidata/tests/src/org/openrefine/wikidata/qa/scrutinizers/FormatScrutinizerTest.java +++ b/extensions/wikidata/tests/src/org/openrefine/wikidata/qa/scrutinizers/FormatScrutinizerTest.java @@ -54,6 +54,7 @@ public class FormatScrutinizerTest extends ScrutinizerTest { public static Value noMatchValue = Datamodel.makeStringValue("image"); public static Value incompleteMatchValue = Datamodel.makeStringValue(".jpg"); public static String regularExpression = "(?i).+\\.(jpg|jpeg|jpe|png|svg|tif|tiff|gif|xcf|pdf|djvu|webp)"; + public static String invalidRegularExpression = "(?[A-Za-z]+)"; public static ItemIdValue entityIdValue = Datamodel.makeWikidataItemIdValue(FORMAT_CONSTRAINT_QID); public static PropertyIdValue regularExpressionParameter = Datamodel.makeWikidataPropertyIdValue(FORMAT_REGEX_PID); @@ -72,11 +73,7 @@ public class FormatScrutinizerTest extends ScrutinizerTest { Statement statement = new StatementImpl("P18", value, idA); ItemUpdate updateA = new ItemUpdateBuilder(idA).addStatement(statement).build(); - Snak qualifierSnak = Datamodel.makeValueSnak(regularExpressionParameter, regularExpressionFormat); - List qualifierSnakList = Collections.singletonList(qualifierSnak); - SnakGroup qualifierSnakGroup = Datamodel.makeSnakGroup(qualifierSnakList); - List constraintQualifiers = Collections.singletonList(qualifierSnakGroup); - List constraintDefinitions = constraintParameterStatementList(entityIdValue, constraintQualifiers); + List constraintDefinitions = generateFormatConstraint(regularExpression); ConstraintFetcher fetcher = mock(ConstraintFetcher.class); when(fetcher.getConstraintsByType(propertyIdValue, FORMAT_CONSTRAINT_QID)).thenReturn(constraintDefinitions); @@ -92,11 +89,7 @@ public class FormatScrutinizerTest extends ScrutinizerTest { Statement statement = new StatementImpl("P18", value, idA); ItemUpdate updateA = new ItemUpdateBuilder(idA).addStatement(statement).build(); - Snak qualifierSnak = Datamodel.makeValueSnak(regularExpressionParameter, regularExpressionFormat); - List qualifierSnakList = Collections.singletonList(qualifierSnak); - SnakGroup qualifierSnakGroup = Datamodel.makeSnakGroup(qualifierSnakList); - List constraintQualifiers = Collections.singletonList(qualifierSnakGroup); - List constraintDefinitions = constraintParameterStatementList(entityIdValue, constraintQualifiers); + List constraintDefinitions = generateFormatConstraint(regularExpression); ConstraintFetcher fetcher = mock(ConstraintFetcher.class); when(fetcher.getConstraintsByType(propertyIdValue, FORMAT_CONSTRAINT_QID)).thenReturn(constraintDefinitions); @@ -112,11 +105,7 @@ public class FormatScrutinizerTest extends ScrutinizerTest { Statement statement = new StatementImpl("P18", value, idA); ItemUpdate updateA = new ItemUpdateBuilder(idA).addStatement(statement).build(); - Snak qualifierSnak = Datamodel.makeValueSnak(regularExpressionParameter, regularExpressionFormat); - List qualifierSnakList = Collections.singletonList(qualifierSnak); - SnakGroup qualifierSnakGroup = Datamodel.makeSnakGroup(qualifierSnakList); - List constraintQualifiers = Collections.singletonList(qualifierSnakGroup); - List constraintDefinitions = constraintParameterStatementList(entityIdValue, constraintQualifiers); + List constraintDefinitions = generateFormatConstraint(regularExpression); ConstraintFetcher fetcher = mock(ConstraintFetcher.class); when(fetcher.getConstraintsByType(propertyIdValue, FORMAT_CONSTRAINT_QID)).thenReturn(constraintDefinitions); @@ -124,5 +113,29 @@ public class FormatScrutinizerTest extends ScrutinizerTest { scrutinize(updateA); assertWarningsRaised(FormatScrutinizer.type); } + + @Test + public void testInvalidRegex() { + ItemIdValue idA = TestingData.existingId; + ValueSnak value = Datamodel.makeValueSnak(propertyIdValue, incompleteMatchValue); + Statement statement = new StatementImpl("P18", value, idA); + ItemUpdate updateA = new ItemUpdateBuilder(idA).addStatement(statement).build(); + + List constraintDefinitions = generateFormatConstraint(invalidRegularExpression); + + ConstraintFetcher fetcher = mock(ConstraintFetcher.class); + when(fetcher.getConstraintsByType(propertyIdValue, FORMAT_CONSTRAINT_QID)).thenReturn(constraintDefinitions); + setFetcher(fetcher); + scrutinize(updateA); + assertNoWarningRaised(); + } + + protected List generateFormatConstraint(String regex) { + Snak qualifierSnak = Datamodel.makeValueSnak(regularExpressionParameter, Datamodel.makeStringValue(regex)); + List qualifierSnakList = Collections.singletonList(qualifierSnak); + SnakGroup qualifierSnakGroup = Datamodel.makeSnakGroup(qualifierSnakList); + List constraintQualifiers = Collections.singletonList(qualifierSnakGroup); + return constraintParameterStatementList(entityIdValue, constraintQualifiers); + } }