From 0d5875b35b14c1614d4e1d64db4455f1514df739 Mon Sep 17 00:00:00 2001
From: Antonin Delpeuch
Date: Tue, 9 Jan 2018 10:21:50 +0000
Subject: [PATCH] Add format scrutinizer

---
 .../wikidata/module/langs/translation-en.json | 10 +++-
 .../openrefine/wikidata/qa/EditInspector.java |  3 +
 .../FormatConstraintScrutinizer.java          | 74 +++++++++++++++++++
 3 files changed, 86 insertions(+), 1 deletion(-)
 create mode 100644 extensions/wikidata/src/org/openrefine/wikidata/qa/scrutinizers/FormatConstraintScrutinizer.java

diff --git a/extensions/wikidata/module/langs/translation-en.json b/extensions/wikidata/module/langs/translation-en.json
index 66cf8f014..9e32dfd41 100644
--- a/extensions/wikidata/module/langs/translation-en.json
+++ b/extensions/wikidata/module/langs/translation-en.json
@@ -24,7 +24,7 @@
         },
         "new-item-without-descriptions": {
             "title": "Some new items will be created without any description.",
-            "body": "Adding descriptions will make it easier to disambiguate them from namesakes."
+            "body": "Adding descriptions will make it easier to disambiguate the items from namesakes."
         },
         "new-item-with-deleted-statements": {
             "title": "You are trying to delete statements on new items.",
@@ -37,6 +37,14 @@
         "statement-without-reference": {
             "title": "Some statements are not referenced.",
             "body": "Please provide references for the statements that you add."
+        },
+        "add-statements-with-invalid-format": {
+            "title": "Invalid format for some text statements.",
+            "body": "Please consult the documentation of the properties to find out the correct format for their values."
+        },
+        "remove-statements-with-invalid-format": {
+            "title": "Statements with invalid format will be removed.",
+            "body": "If these statements currently exist on Wikidata, this will solve constraint violations."
         }
     }
 }
diff --git a/extensions/wikidata/src/org/openrefine/wikidata/qa/EditInspector.java b/extensions/wikidata/src/org/openrefine/wikidata/qa/EditInspector.java
index cf09be4e0..59b8016b5 100644
--- a/extensions/wikidata/src/org/openrefine/wikidata/qa/EditInspector.java
+++ b/extensions/wikidata/src/org/openrefine/wikidata/qa/EditInspector.java
@@ -5,6 +5,8 @@ import java.util.List;
 import java.util.Map;
 
 import org.openrefine.wikidata.qa.scrutinizers.EditScrutinizer;
+import org.openrefine.wikidata.qa.scrutinizers.FormatConstraintScrutinizer;
+import org.openrefine.wikidata.qa.scrutinizers.InverseConstraintScrutinizer;
 import org.openrefine.wikidata.qa.scrutinizers.NewItemScrutinizer;
 import org.openrefine.wikidata.schema.ItemUpdate;
 
@@ -23,6 +25,7 @@ public class EditInspector {
 
         // Register all known scrutinizers here
         register(new NewItemScrutinizer());
+        register(new FormatConstraintScrutinizer());
     }
 
     /**
diff --git a/extensions/wikidata/src/org/openrefine/wikidata/qa/scrutinizers/FormatConstraintScrutinizer.java b/extensions/wikidata/src/org/openrefine/wikidata/qa/scrutinizers/FormatConstraintScrutinizer.java
new file mode 100644
index 000000000..66b6f8d51
--- /dev/null
+++ b/extensions/wikidata/src/org/openrefine/wikidata/qa/scrutinizers/FormatConstraintScrutinizer.java
@@ -0,0 +1,74 @@
+package org.openrefine.wikidata.qa.scrutinizers;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.openrefine.wikidata.qa.ConstraintFetcher;
+import org.wikidata.wdtk.datamodel.interfaces.Snak;
+import org.wikidata.wdtk.datamodel.interfaces.StringValue;
+
+/**
+ * Checks string values against the format regex fetched for their
+ * property, and reports the values that do not match it.
+ */
+public class FormatConstraintScrutinizer extends SnakScrutinizer {
+
+    // Compiled patterns by property id. A null value records that no
+    // usable regex was found, so that the lookup is not repeated.
+    private final Map<String, Pattern> _patterns;
+    private final ConstraintFetcher _fetcher;
+
+    public FormatConstraintScrutinizer() {
+        _patterns = new HashMap<>();
+        _fetcher = new ConstraintFetcher();
+    }
+
+    /**
+     * Loads the regex for a property and compiles it to a pattern
+     * (this is cached upstream, plus we are doing it only once per
+     * property and batch).
+     *
+     * @param pid the id of the property to fetch the constraints for
+     * @return the compiled pattern, or null if the fetcher returned no
+     *         regex for this property or the regex could not be compiled
+     */
+    protected Pattern getPattern(String pid) {
+        if (_patterns.containsKey(pid)) {
+            return _patterns.get(pid);
+        }
+        String regex = _fetcher.getFormatRegex(pid);
+        Pattern pattern = null;
+        if (regex != null) {
+            try {
+                pattern = Pattern.compile(regex);
+            } catch (PatternSyntaxException ignored) {
+                // The regex is not valid Java regex syntax: treat the
+                // property as having no checkable format.
+            }
+        }
+        _patterns.put(pid, pattern);
+        return pattern;
+    }
+
+    @Override
+    public void scrutinize(Snak snak, boolean added) {
+        if (!(snak.getValue() instanceof StringValue)) {
+            return;
+        }
+        String value = ((StringValue) snak.getValue()).getString();
+        String pid = snak.getPropertyId().getId();
+        Pattern pattern = getPattern(pid);
+        // No pattern means there is nothing to check for this property.
+        if (pattern == null || pattern.matcher(value).matches()) {
+            return;
+        }
+        if (added) {
+            important("add-statements-with-invalid-format");
+        } else {
+            info("remove-statements-with-invalid-format");
+        }
+    }
+
+}