Add scrutinizer for whitespace issues
This commit is contained in:
parent
42d9ca0393
commit
784e866b4e
@ -116,6 +116,22 @@
|
|||||||
"monolingual-text-without-language": {
|
"monolingual-text-without-language": {
|
||||||
"title": "No language provided for monolingual text.",
|
"title": "No language provided for monolingual text.",
|
||||||
"body": "Some labels, descriptions, aliases or monolingual text values have been skipped because no language was provided. Example value: <span class=\"wb-issue-preformat\">{example_text}</span>."
|
"body": "Some labels, descriptions, aliases or monolingual text values have been skipped because no language was provided. Example value: <span class=\"wb-issue-preformat\">{example_text}</span>."
|
||||||
|
},
|
||||||
|
"leading-whitespace": {
|
||||||
|
"title": "Leading whitespace in strings.",
|
||||||
|
"body": "Strings such as <span class=\"wb-issue-preformat\">{example_string}</span> have leading whitespace."
|
||||||
|
},
|
||||||
|
"trailing-whitespace": {
|
||||||
|
"title": "Trailing whitespace in strings.",
|
||||||
|
"body": "Strings such as <span class=\"wb-issue-preformat\">{example_string}</span> have trailing whitespace."
|
||||||
|
},
|
||||||
|
"duplicate-whitespace": {
|
||||||
|
"title": "Duplicate whitespace in strings.",
|
||||||
|
"body": "Strings such as <span class=\"wb-issue-preformat\">{example_string}</span> contain duplicate whitespace."
|
||||||
|
},
|
||||||
|
"non-printable-characters": {
|
||||||
|
"title": "Non-printable characters in strings.",
|
||||||
|
"body": "Strings such as <span class=\"wb-issue-preformat\">{example_string}</span> contain non-printable characters."
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -16,6 +16,7 @@ import org.openrefine.wikidata.qa.scrutinizers.RestrictedPositionScrutinizer;
|
|||||||
import org.openrefine.wikidata.qa.scrutinizers.SelfReferentialScrutinizer;
|
import org.openrefine.wikidata.qa.scrutinizers.SelfReferentialScrutinizer;
|
||||||
import org.openrefine.wikidata.qa.scrutinizers.SingleValueScrutinizer;
|
import org.openrefine.wikidata.qa.scrutinizers.SingleValueScrutinizer;
|
||||||
import org.openrefine.wikidata.qa.scrutinizers.UnsourcedScrutinizer;
|
import org.openrefine.wikidata.qa.scrutinizers.UnsourcedScrutinizer;
|
||||||
|
import org.openrefine.wikidata.qa.scrutinizers.WhitespaceScrutinizer;
|
||||||
import org.openrefine.wikidata.schema.ItemUpdate;
|
import org.openrefine.wikidata.schema.ItemUpdate;
|
||||||
import org.wikidata.wdtk.datamodel.interfaces.EntityIdValue;
|
import org.wikidata.wdtk.datamodel.interfaces.EntityIdValue;
|
||||||
|
|
||||||
@ -43,6 +44,7 @@ public class EditInspector {
|
|||||||
register(new SingleValueScrutinizer());
|
register(new SingleValueScrutinizer());
|
||||||
register(new DistinctValuesScrutinizer());
|
register(new DistinctValuesScrutinizer());
|
||||||
register(new NoEditsMadeScrutinizer());
|
register(new NoEditsMadeScrutinizer());
|
||||||
|
register(new WhitespaceScrutinizer());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -0,0 +1,38 @@
|
|||||||
|
package org.openrefine.wikidata.qa.scrutinizers;
|
||||||
|
|
||||||
|
import org.openrefine.wikidata.schema.ItemUpdate;
|
||||||
|
import org.wikidata.wdtk.datamodel.interfaces.EntityIdValue;
|
||||||
|
import org.wikidata.wdtk.datamodel.interfaces.MonolingualTextValue;
|
||||||
|
import org.wikidata.wdtk.datamodel.interfaces.Snak;
|
||||||
|
import org.wikidata.wdtk.datamodel.interfaces.Value;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A scrutinizer that inspects the values of snaks and terms
|
||||||
|
* @author antonin
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public abstract class ValueScrutinizer extends SnakScrutinizer {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void scrutinize(ItemUpdate update) {
|
||||||
|
super.scrutinize(update);
|
||||||
|
|
||||||
|
for(MonolingualTextValue label : update.getLabels()) {
|
||||||
|
scrutinize(label);
|
||||||
|
}
|
||||||
|
for(MonolingualTextValue alias : update.getAliases()) {
|
||||||
|
scrutinize(alias);
|
||||||
|
}
|
||||||
|
for(MonolingualTextValue description : update.getDescriptions()) {
|
||||||
|
scrutinize(description);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public abstract void scrutinize(Value value);
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void scrutinize(Snak snak, EntityIdValue entityId, boolean added) {
|
||||||
|
scrutinize(snak.getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,56 @@
|
|||||||
|
package org.openrefine.wikidata.qa.scrutinizers;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Map.Entry;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import org.openrefine.wikidata.qa.QAWarning;
|
||||||
|
import org.wikidata.wdtk.datamodel.interfaces.MonolingualTextValue;
|
||||||
|
import org.wikidata.wdtk.datamodel.interfaces.StringValue;
|
||||||
|
import org.wikidata.wdtk.datamodel.interfaces.Value;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Scrutinizes strings for trailing / leading whitespace, and others
|
||||||
|
* @author antonin
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class WhitespaceScrutinizer extends ValueScrutinizer {
|
||||||
|
|
||||||
|
private Map<String,Pattern> _issuesMap;
|
||||||
|
|
||||||
|
public WhitespaceScrutinizer() {
|
||||||
|
_issuesMap = new HashMap<>();
|
||||||
|
_issuesMap.put("leading-whitespace", Pattern.compile("^\\s"));
|
||||||
|
_issuesMap.put("trailing-whitespace", Pattern.compile("\\s$"));
|
||||||
|
_issuesMap.put("duplicate-whitespace", Pattern.compile("\\s\\s"));
|
||||||
|
|
||||||
|
// https://stackoverflow.com/questions/14565934/regular-expression-to-remove-all-non-printable-characters
|
||||||
|
_issuesMap.put("non-printable-characters", Pattern.compile("[\\x00\\x08\\x0B\\x0C\\x0E-\\x1F]"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void scrutinize(Value value) {
|
||||||
|
String str = null;
|
||||||
|
if(MonolingualTextValue.class.isInstance(value)) {
|
||||||
|
str = ((MonolingualTextValue)value).getText();
|
||||||
|
} else if (StringValue.class.isInstance(value)) {
|
||||||
|
str = ((StringValue)value).getString();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (str != null) {
|
||||||
|
for(Entry<String,Pattern> entry : _issuesMap.entrySet()) {
|
||||||
|
if(entry.getValue().matcher(str).find()) {
|
||||||
|
emitWarning(entry.getKey(), str);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void emitWarning(String type, String example) {
|
||||||
|
QAWarning warning = new QAWarning(type, null, QAWarning.Severity.WARNING, 1);
|
||||||
|
warning.setProperty("example_string", example);
|
||||||
|
addIssue(warning);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user