Add scrutinizer for whitespace issues

This commit is contained in:
Antonin Delpeuch 2018-01-12 18:30:54 +00:00
parent 42d9ca0393
commit 784e866b4e
4 changed files with 112 additions and 0 deletions

View File

@ -116,6 +116,22 @@
"monolingual-text-without-language": {
"title": "No language provided for monolingual text.",
"body": "Some labels, descriptions, aliases or monolingual text values have been skipped because no language was provided. Example value: <span class=\"wb-issue-preformat\">{example_text}</span>."
},
"leading-whitespace": {
"title": "Leading whitespace in strings.",
"body": "Strings such as <span class=\"wb-issue-preformat\">{example_string}</span> have leading whitespace."
},
"trailing-whitespace": {
"title": "Trailing whitespace in strings.",
"body": "Strings such as <span class=\"wb-issue-preformat\">{example_string}</span> have trailing whitespace."
},
"duplicate-whitespace": {
"title": "Duplicate whitespace in strings.",
"body": "Strings such as <span class=\"wb-issue-preformat\">{example_string}</span> contain duplicate whitespace."
},
"non-printable-characters": {
"title": "Non-printable characters in strings.",
"body": "Strings such as <span class=\"wb-issue-preformat\">{example_string}</span> contain non-printable characters."
}
}
}

View File

@ -16,6 +16,7 @@ import org.openrefine.wikidata.qa.scrutinizers.RestrictedPositionScrutinizer;
import org.openrefine.wikidata.qa.scrutinizers.SelfReferentialScrutinizer;
import org.openrefine.wikidata.qa.scrutinizers.SingleValueScrutinizer;
import org.openrefine.wikidata.qa.scrutinizers.UnsourcedScrutinizer;
import org.openrefine.wikidata.qa.scrutinizers.WhitespaceScrutinizer;
import org.openrefine.wikidata.schema.ItemUpdate;
import org.wikidata.wdtk.datamodel.interfaces.EntityIdValue;
@ -43,6 +44,7 @@ public class EditInspector {
register(new SingleValueScrutinizer());
register(new DistinctValuesScrutinizer());
register(new NoEditsMadeScrutinizer());
register(new WhitespaceScrutinizer());
}
/**

View File

@ -0,0 +1,38 @@
package org.openrefine.wikidata.qa.scrutinizers;
import org.openrefine.wikidata.schema.ItemUpdate;
import org.wikidata.wdtk.datamodel.interfaces.EntityIdValue;
import org.wikidata.wdtk.datamodel.interfaces.MonolingualTextValue;
import org.wikidata.wdtk.datamodel.interfaces.Snak;
import org.wikidata.wdtk.datamodel.interfaces.Value;
/**
 * A scrutinizer that inspects the values of snaks and terms.
 *
 * Subclasses implement {@link #scrutinize(Value)}; this class feeds it the
 * value of every snak it is handed, as well as every label, alias and
 * description of the update under scrutiny.
 *
 * @author antonin
 */
public abstract class ValueScrutinizer extends SnakScrutinizer {

    /**
     * Runs the inherited scrutiny of the update, then scrutinizes each of
     * its labels, aliases and descriptions as values.
     *
     * @param update
     *            the item update to inspect
     */
    @Override
    public void scrutinize(ItemUpdate update) {
        super.scrutinize(update);

        for (MonolingualTextValue label : update.getLabels()) {
            scrutinize(label);
        }
        for (MonolingualTextValue alias : update.getAliases()) {
            scrutinize(alias);
        }
        for (MonolingualTextValue description : update.getDescriptions()) {
            scrutinize(description);
        }
    }

    /**
     * Inspects a single value. When called by this class, the value is
     * never null.
     *
     * @param value
     *            the value to inspect
     */
    public abstract void scrutinize(Value value);

    /**
     * Scrutinizes the value held by the snak, if there is one. The entity id
     * and the added flag are not used here.
     *
     * @param snak
     *            the snak whose value should be inspected
     * @param entityId
     *            the entity the snak belongs to (unused)
     * @param added
     *            whether the snak is being added (unused)
     */
    @Override
    public void scrutinize(Snak snak, EntityIdValue entityId, boolean added) {
        // NOTE(review): a snak may carry no value (presumably "some value" /
        // "no value" snaks - confirm against the Wikidata Toolkit version in
        // use). Guard here so that subclasses never have to deal with null.
        Value value = snak.getValue();
        if (value != null) {
            scrutinize(value);
        }
    }
}

View File

@ -0,0 +1,56 @@
package org.openrefine.wikidata.qa.scrutinizers;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import org.openrefine.wikidata.qa.QAWarning;
import org.wikidata.wdtk.datamodel.interfaces.MonolingualTextValue;
import org.wikidata.wdtk.datamodel.interfaces.StringValue;
import org.wikidata.wdtk.datamodel.interfaces.Value;
/**
 * Scrutinizes strings for leading, trailing and duplicate whitespace, and
 * for non-printable characters. Only monolingual text values and string
 * values are inspected; other kinds of values are ignored.
 *
 * @author antonin
 */
public class WhitespaceScrutinizer extends ValueScrutinizer {

    /**
     * Maps each issue type to the pattern that detects it. The patterns are
     * compiled once for all instances, and insertion order is kept so that
     * warnings for a given string are always emitted in the same order.
     */
    private static final Map<String, Pattern> ISSUE_PATTERNS;

    static {
        Map<String, Pattern> patterns = new LinkedHashMap<>();
        patterns.put("leading-whitespace", Pattern.compile("^\\s"));
        patterns.put("trailing-whitespace", Pattern.compile("\\s$"));
        patterns.put("duplicate-whitespace", Pattern.compile("\\s\\s"));
        // https://stackoverflow.com/questions/14565934/regular-expression-to-remove-all-non-printable-characters
        // Control characters, except tab, line feed and carriage return.
        // The range \x00-\x08 covers U+0001..U+0007 as well, which the
        // previous "\x00\x08" (two single characters) left out.
        patterns.put("non-printable-characters", Pattern.compile("[\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F]"));
        ISSUE_PATTERNS = Collections.unmodifiableMap(patterns);
    }

    /**
     * Emits one warning per issue type detected in the text of the value.
     *
     * @param value
     *            the value to inspect; anything other than a monolingual
     *            text value or a string value (including null) is skipped
     */
    @Override
    public void scrutinize(Value value) {
        String str = extractString(value);
        if (str == null) {
            return;
        }
        for (Entry<String, Pattern> entry : ISSUE_PATTERNS.entrySet()) {
            // find() rather than matches(): the patterns describe a
            // fragment of the string, not the whole string.
            if (entry.getValue().matcher(str).find()) {
                emitWarning(entry.getKey(), str);
            }
        }
    }

    /**
     * Returns the text carried by the value, or null if the value is not of
     * a textual kind handled by this scrutinizer.
     */
    private static String extractString(Value value) {
        if (value instanceof MonolingualTextValue) {
            return ((MonolingualTextValue) value).getText();
        }
        if (value instanceof StringValue) {
            return ((StringValue) value).getString();
        }
        return null;
    }

    /**
     * Reports a warning of the given type, with the offending string
     * attached as the "example_string" property.
     */
    private void emitWarning(String type, String example) {
        QAWarning warning = new QAWarning(type, null, QAWarning.Severity.WARNING, 1);
        warning.setProperty("example_string", example);
        addIssue(warning);
    }
}