Add scrutinizer for whitespace issues
This commit is contained in:
parent
42d9ca0393
commit
784e866b4e
@ -116,6 +116,22 @@
|
||||
"monolingual-text-without-language": {
|
||||
"title": "No language provided for monolingual text.",
|
||||
"body": "Some labels, descriptions, aliases or monolingual text values have been skipped because no language was provided. Example value: <span class=\"wb-issue-preformat\">{example_text}</span>."
|
||||
},
|
||||
"leading-whitespace": {
|
||||
"title": "Leading whitespace in strings.",
|
||||
"body": "Strings such as <span class=\"wb-issue-preformat\">{example_string}</span> have leading whitespace."
|
||||
},
|
||||
"trailing-whitespace": {
|
||||
"title": "Trailing whitespace in strings.",
|
||||
"body": "Strings such as <span class=\"wb-issue-preformat\">{example_string}</span> have trailing whitespace."
|
||||
},
|
||||
"duplicate-whitespace": {
|
||||
"title": "Duplicate whitespace in strings.",
|
||||
"body": "Strings such as <span class=\"wb-issue-preformat\">{example_string}</span> contain duplicate whitespace."
|
||||
},
|
||||
"non-printable-characters": {
|
||||
"title": "Non-printable characters in strings.",
|
||||
"body": "Strings such as <span class=\"wb-issue-preformat\">{example_string}</span> contain non-printable characters."
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -16,6 +16,7 @@ import org.openrefine.wikidata.qa.scrutinizers.RestrictedPositionScrutinizer;
|
||||
import org.openrefine.wikidata.qa.scrutinizers.SelfReferentialScrutinizer;
|
||||
import org.openrefine.wikidata.qa.scrutinizers.SingleValueScrutinizer;
|
||||
import org.openrefine.wikidata.qa.scrutinizers.UnsourcedScrutinizer;
|
||||
import org.openrefine.wikidata.qa.scrutinizers.WhitespaceScrutinizer;
|
||||
import org.openrefine.wikidata.schema.ItemUpdate;
|
||||
import org.wikidata.wdtk.datamodel.interfaces.EntityIdValue;
|
||||
|
||||
@ -43,6 +44,7 @@ public class EditInspector {
|
||||
register(new SingleValueScrutinizer());
|
||||
register(new DistinctValuesScrutinizer());
|
||||
register(new NoEditsMadeScrutinizer());
|
||||
register(new WhitespaceScrutinizer());
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -0,0 +1,38 @@
|
||||
package org.openrefine.wikidata.qa.scrutinizers;
|
||||
|
||||
import org.openrefine.wikidata.schema.ItemUpdate;
|
||||
import org.wikidata.wdtk.datamodel.interfaces.EntityIdValue;
|
||||
import org.wikidata.wdtk.datamodel.interfaces.MonolingualTextValue;
|
||||
import org.wikidata.wdtk.datamodel.interfaces.Snak;
|
||||
import org.wikidata.wdtk.datamodel.interfaces.Value;
|
||||
|
||||
/**
|
||||
* A scrutinizer that inspects the values of snaks and terms
|
||||
* @author antonin
|
||||
*
|
||||
*/
|
||||
public abstract class ValueScrutinizer extends SnakScrutinizer {
|
||||
|
||||
@Override
|
||||
public void scrutinize(ItemUpdate update) {
|
||||
super.scrutinize(update);
|
||||
|
||||
for(MonolingualTextValue label : update.getLabels()) {
|
||||
scrutinize(label);
|
||||
}
|
||||
for(MonolingualTextValue alias : update.getAliases()) {
|
||||
scrutinize(alias);
|
||||
}
|
||||
for(MonolingualTextValue description : update.getDescriptions()) {
|
||||
scrutinize(description);
|
||||
}
|
||||
}
|
||||
|
||||
public abstract void scrutinize(Value value);
|
||||
|
||||
@Override
|
||||
public void scrutinize(Snak snak, EntityIdValue entityId, boolean added) {
|
||||
scrutinize(snak.getValue());
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,56 @@
|
||||
package org.openrefine.wikidata.qa.scrutinizers;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.openrefine.wikidata.qa.QAWarning;
|
||||
import org.wikidata.wdtk.datamodel.interfaces.MonolingualTextValue;
|
||||
import org.wikidata.wdtk.datamodel.interfaces.StringValue;
|
||||
import org.wikidata.wdtk.datamodel.interfaces.Value;
|
||||
|
||||
/**
|
||||
* Scrutinizes strings for trailing / leading whitespace, and others
|
||||
* @author antonin
|
||||
*
|
||||
*/
|
||||
public class WhitespaceScrutinizer extends ValueScrutinizer {
|
||||
|
||||
private Map<String,Pattern> _issuesMap;
|
||||
|
||||
public WhitespaceScrutinizer() {
|
||||
_issuesMap = new HashMap<>();
|
||||
_issuesMap.put("leading-whitespace", Pattern.compile("^\\s"));
|
||||
_issuesMap.put("trailing-whitespace", Pattern.compile("\\s$"));
|
||||
_issuesMap.put("duplicate-whitespace", Pattern.compile("\\s\\s"));
|
||||
|
||||
// https://stackoverflow.com/questions/14565934/regular-expression-to-remove-all-non-printable-characters
|
||||
_issuesMap.put("non-printable-characters", Pattern.compile("[\\x00\\x08\\x0B\\x0C\\x0E-\\x1F]"));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void scrutinize(Value value) {
|
||||
String str = null;
|
||||
if(MonolingualTextValue.class.isInstance(value)) {
|
||||
str = ((MonolingualTextValue)value).getText();
|
||||
} else if (StringValue.class.isInstance(value)) {
|
||||
str = ((StringValue)value).getString();
|
||||
}
|
||||
|
||||
if (str != null) {
|
||||
for(Entry<String,Pattern> entry : _issuesMap.entrySet()) {
|
||||
if(entry.getValue().matcher(str).find()) {
|
||||
emitWarning(entry.getKey(), str);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void emitWarning(String type, String example) {
|
||||
QAWarning warning = new QAWarning(type, null, QAWarning.Severity.WARNING, 1);
|
||||
warning.setProperty("example_string", example);
|
||||
addIssue(warning);
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user