Ignore invalid regexes from Wikibase format constraints. (#3721)

* Ignore invalid regexes from Wikibase format constraints. Closes #3274.

* Add logging
This commit is contained in:
Antonin Delpeuch 2021-04-04 19:25:07 +02:00 committed by GitHub
parent 4c930fed9a
commit 343c8afbea
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 40 additions and 17 deletions

View File

@ -24,6 +24,8 @@
package org.openrefine.wikidata.qa.scrutinizers;
import org.openrefine.wikidata.qa.QAWarning;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wikidata.wdtk.datamodel.interfaces.EntityIdValue;
import org.wikidata.wdtk.datamodel.interfaces.PropertyIdValue;
import org.wikidata.wdtk.datamodel.interfaces.Snak;
@ -38,6 +40,7 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
/**
* A scrutinizer that detects incorrect formats in text values (mostly
@ -48,6 +51,8 @@ import java.util.regex.Pattern;
*/
public class FormatScrutinizer extends SnakScrutinizer {
private static final Logger logger = LoggerFactory.getLogger(FormatScrutinizer.class);
public static final String type = "add-statements-with-invalid-format";
public String formatConstraintQid;
public String formatRegexPid;
@ -97,9 +102,14 @@ public class FormatScrutinizer extends SnakScrutinizer {
String regex = constraint.regularExpressionFormat;
Pattern pattern = null;
if (regex != null) {
pattern = Pattern.compile(regex);
try {
pattern = Pattern.compile(regex);
patterns.add(pattern);
} catch(PatternSyntaxException e) {
logger.info(String.format("Ignoring invalid format constraint for property %s. Regex %s is invalid: %s",
pid.getId(), regex, e.getMessage()));
}
}
patterns.add(pattern);
}
_patterns.put(pid, patterns);
return patterns;

View File

@ -54,6 +54,7 @@ public class FormatScrutinizerTest extends ScrutinizerTest {
public static Value noMatchValue = Datamodel.makeStringValue("image");
public static Value incompleteMatchValue = Datamodel.makeStringValue(".jpg");
public static String regularExpression = "(?i).+\\.(jpg|jpeg|jpe|png|svg|tif|tiff|gif|xcf|pdf|djvu|webp)";
public static String invalidRegularExpression = "(?[A-Za-z]+)";
public static ItemIdValue entityIdValue = Datamodel.makeWikidataItemIdValue(FORMAT_CONSTRAINT_QID);
public static PropertyIdValue regularExpressionParameter = Datamodel.makeWikidataPropertyIdValue(FORMAT_REGEX_PID);
@ -72,11 +73,7 @@ public class FormatScrutinizerTest extends ScrutinizerTest {
Statement statement = new StatementImpl("P18", value, idA);
ItemUpdate updateA = new ItemUpdateBuilder(idA).addStatement(statement).build();
Snak qualifierSnak = Datamodel.makeValueSnak(regularExpressionParameter, regularExpressionFormat);
List<Snak> qualifierSnakList = Collections.singletonList(qualifierSnak);
SnakGroup qualifierSnakGroup = Datamodel.makeSnakGroup(qualifierSnakList);
List<SnakGroup> constraintQualifiers = Collections.singletonList(qualifierSnakGroup);
List<Statement> constraintDefinitions = constraintParameterStatementList(entityIdValue, constraintQualifiers);
List<Statement> constraintDefinitions = generateFormatConstraint(regularExpression);
ConstraintFetcher fetcher = mock(ConstraintFetcher.class);
when(fetcher.getConstraintsByType(propertyIdValue, FORMAT_CONSTRAINT_QID)).thenReturn(constraintDefinitions);
@ -92,11 +89,7 @@ public class FormatScrutinizerTest extends ScrutinizerTest {
Statement statement = new StatementImpl("P18", value, idA);
ItemUpdate updateA = new ItemUpdateBuilder(idA).addStatement(statement).build();
Snak qualifierSnak = Datamodel.makeValueSnak(regularExpressionParameter, regularExpressionFormat);
List<Snak> qualifierSnakList = Collections.singletonList(qualifierSnak);
SnakGroup qualifierSnakGroup = Datamodel.makeSnakGroup(qualifierSnakList);
List<SnakGroup> constraintQualifiers = Collections.singletonList(qualifierSnakGroup);
List<Statement> constraintDefinitions = constraintParameterStatementList(entityIdValue, constraintQualifiers);
List<Statement> constraintDefinitions = generateFormatConstraint(regularExpression);
ConstraintFetcher fetcher = mock(ConstraintFetcher.class);
when(fetcher.getConstraintsByType(propertyIdValue, FORMAT_CONSTRAINT_QID)).thenReturn(constraintDefinitions);
@ -112,11 +105,7 @@ public class FormatScrutinizerTest extends ScrutinizerTest {
Statement statement = new StatementImpl("P18", value, idA);
ItemUpdate updateA = new ItemUpdateBuilder(idA).addStatement(statement).build();
Snak qualifierSnak = Datamodel.makeValueSnak(regularExpressionParameter, regularExpressionFormat);
List<Snak> qualifierSnakList = Collections.singletonList(qualifierSnak);
SnakGroup qualifierSnakGroup = Datamodel.makeSnakGroup(qualifierSnakList);
List<SnakGroup> constraintQualifiers = Collections.singletonList(qualifierSnakGroup);
List<Statement> constraintDefinitions = constraintParameterStatementList(entityIdValue, constraintQualifiers);
List<Statement> constraintDefinitions = generateFormatConstraint(regularExpression);
ConstraintFetcher fetcher = mock(ConstraintFetcher.class);
when(fetcher.getConstraintsByType(propertyIdValue, FORMAT_CONSTRAINT_QID)).thenReturn(constraintDefinitions);
@ -125,4 +114,28 @@ public class FormatScrutinizerTest extends ScrutinizerTest {
assertWarningsRaised(FormatScrutinizer.type);
}
@Test
public void testInvalidRegex() {
ItemIdValue idA = TestingData.existingId;
ValueSnak value = Datamodel.makeValueSnak(propertyIdValue, incompleteMatchValue);
Statement statement = new StatementImpl("P18", value, idA);
ItemUpdate updateA = new ItemUpdateBuilder(idA).addStatement(statement).build();
List<Statement> constraintDefinitions = generateFormatConstraint(invalidRegularExpression);
ConstraintFetcher fetcher = mock(ConstraintFetcher.class);
when(fetcher.getConstraintsByType(propertyIdValue, FORMAT_CONSTRAINT_QID)).thenReturn(constraintDefinitions);
setFetcher(fetcher);
scrutinize(updateA);
assertNoWarningRaised();
}
protected List<Statement> generateFormatConstraint(String regex) {
Snak qualifierSnak = Datamodel.makeValueSnak(regularExpressionParameter, Datamodel.makeStringValue(regex));
List<Snak> qualifierSnakList = Collections.singletonList(qualifierSnak);
SnakGroup qualifierSnakGroup = Datamodel.makeSnakGroup(qualifierSnakList);
List<SnakGroup> constraintQualifiers = Collections.singletonList(qualifierSnakGroup);
return constraintParameterStatementList(entityIdValue, constraintQualifiers);
}
}