Merge pull request #1794 from OpenRefine/issue1781

Trim strings automatically in Wikibase schema.
This commit is contained in:
Antonin Delpeuch 2018-11-05 17:32:50 +00:00 committed by GitHub
commit e5061cc44c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 21 additions and 52 deletions

View File

@ -169,14 +169,6 @@
"title": "No language provided for monolingual text.",
"body": "Some label, description, alias or monolingual text value have been skipped because no language was provided. Example value: <span class=\"wb-issue-preformat\">{example_text}</span>."
},
"leading-whitespace": {
"title": "Leading whitespace in strings.",
"body": "Strings such as <span class=\"wb-issue-preformat\">{example_string}</span> have leading whitespace."
},
"trailing-whitespace": {
"title": "Trailing whitespace in strings.",
"body": "Strings such as <span class=\"wb-issue-preformat\">{example_string}</span> have trailing whitespace."
},
"duplicate-whitespace": {
"title": "Duplicate whitespace in strings.",
"body": "Strings such as <span class=\"wb-issue-preformat\">{example_string}</span> contain duplicate whitespace."

View File

@ -168,14 +168,6 @@
"title": "Pas de langue fournie pour des textes monolingues.",
"body": "Des libellés, descriptions, alias ou textes monolingues ont été ignorés car aucune langue n'a été fournie. Exemple: <span class=\"wb-issue-preformat\">{example_text}</span>."
},
"leading-whitespace": {
"title": "Espaces au début de chaînes de caractères.",
"body": "Des chaînes telles que <span class=\"wb-issue-preformat\">{example_string}</span> ont des espaces au début."
},
"trailing-whitespace": {
"title": "Espaces à la fin de chaînes de caractères.",
"body": "Des chaînes telles que <span class=\"wb-issue-preformat\">{example_string}</span> ont des espaces à la fin."
},
"duplicate-whitespace": {
"title": "Espaces dédoublées dans des chaînes de caractères.",
"body": "Des chaînes telles que <span class=\"wb-issue-preformat\">{example_string}</span> contiennent des espaces dédoublées."

View File

@ -180,14 +180,6 @@
"title": "言語指定がありません.",
"body": "言語指定がないので、ラベル・記述・別名・単一言語テキストが無視されました。例えば: <span class=\"wb-issue-preformat\">{example_text}</span>."
},
"leading-whitespace": {
"title": "文頭に空白文字があります.",
"body": "<span class=\"wb-issue-preformat\">{example_string}</span>の文頭に空白文字があります."
},
"trailing-whitespace": {
"title": "文末に空白文字があります.",
"body": "<span class=\"wb-issue-preformat\">{example_string}</span>の文末に空白文字があります."
},
"duplicate-whitespace": {
"title": "二重の空白文字があります.",
"body": "<span class=\"wb-issue-preformat\">{example_string}</span>には二重の空白文字があります."

View File

@ -43,15 +43,11 @@ public class WhitespaceScrutinizer extends ValueScrutinizer {
private Map<String, Pattern> _issuesMap;
public static final String leadingWhitespaceType = "leading-whitespace";
public static final String trailingWhitespaceType = "trailing-whitespace";
public static final String duplicateWhitespaceType = "duplicate-whitespace";
public static final String nonPrintableCharsType = "non-printable-characters";
public WhitespaceScrutinizer() {
_issuesMap = new HashMap<>();
_issuesMap.put(leadingWhitespaceType, Pattern.compile("^\\s"));
_issuesMap.put(trailingWhitespaceType, Pattern.compile("\\s$"));
_issuesMap.put(duplicateWhitespaceType, Pattern.compile("\\s\\s"));
// https://stackoverflow.com/questions/14565934/regular-expression-to-remove-all-non-printable-characters

View File

@ -53,7 +53,7 @@ public class WbMonolingualExpr implements WbExpression<MonolingualTextValue> {
String text = getValueExpr().evaluate(ctxt).getString();
try {
String lang = getLanguageExpr().evaluate(ctxt);
return Datamodel.makeMonolingualTextValue(text, lang);
return Datamodel.makeMonolingualTextValue(text.trim(), lang);
} catch (SkipSchemaExpressionException e) {
QAWarning warning = new QAWarning("monolingual-text-without-language", null, QAWarning.Severity.WARNING, 1);

View File

@ -39,7 +39,7 @@ public class WbStringConstant implements WbExpression<StringValue> {
Validate.notNull(value);
Validate.isTrue(!value.isEmpty()); // for now we don't accept empty strings
// because in the variable counterpart of this expression, they are skipped
this.value = value;
this.value = value.trim();
}
@Override

View File

@ -62,7 +62,7 @@ public class WbStringVariable extends WbVariableExpr<StringValue> {
if (cell.value instanceof Double && ((Double)cell.value) % 1 == 0) {
stringValue = Integer.toString(((Double)cell.value).intValue());
}
return Datamodel.makeStringValue(stringValue);
return Datamodel.makeStringValue(stringValue.trim());
}
throw new SkipSchemaExpressionException();
}

View File

@ -33,18 +33,6 @@ public class WhitespaceScrutinizerTest extends ValueScrutinizerTest {
return new WhitespaceScrutinizer();
}
@Test
public void testLeadingWhitespace() {
scrutinize(Datamodel.makeStringValue(" a"));
assertWarningsRaised(WhitespaceScrutinizer.leadingWhitespaceType);
}
@Test
public void testTrailingWhitespace() {
scrutinize(Datamodel.makeStringValue("a\t"));
assertWarningsRaised(WhitespaceScrutinizer.trailingWhitespaceType);
}
@Test
public void testDuplicateWhitespace() {
scrutinize(Datamodel.makeStringValue("a\t b"));
@ -65,14 +53,13 @@ public class WhitespaceScrutinizerTest extends ValueScrutinizerTest {
@Test
public void testMultipleIssues() {
scrutinize(Datamodel.makeStringValue(" a\t b "));
assertWarningsRaised(WhitespaceScrutinizer.duplicateWhitespaceType, WhitespaceScrutinizer.leadingWhitespaceType,
WhitespaceScrutinizer.trailingWhitespaceType);
scrutinize(Datamodel.makeStringValue("a\t b\u0003"));
assertWarningsRaised(WhitespaceScrutinizer.duplicateWhitespaceType, WhitespaceScrutinizer.nonPrintableCharsType);
}
@Test
public void testMonolingualTextValue() {
scrutinizeLabel(Datamodel.makeMonolingualTextValue(" a", "fr"));
assertWarningsRaised(WhitespaceScrutinizer.leadingWhitespaceType);
scrutinizeLabel(Datamodel.makeMonolingualTextValue("a b", "fr"));
assertWarningsRaised(WhitespaceScrutinizer.duplicateWhitespaceType);
}
}

View File

@ -44,6 +44,12 @@ public class WbMonolingualExprTest extends WbExpressionTest<MonolingualTextValue
evaluatesTo(Datamodel.makeMonolingualTextValue("hello", "en"), expr);
}
@Test
public void testTrim() {
setRow("en", " hello ");
evaluatesTo(Datamodel.makeMonolingualTextValue("hello", "en"), expr);
}
@Test
public void testInvalidLanguageCode() {
setRow("ueuue", "my label");

View File

@ -20,6 +20,11 @@ public class WbStringConstantTest extends WbExpressionTest<StringValue> {
evaluatesTo(Datamodel.makeStringValue("hello world"), constant);
}
@Test
public void testTrim() {
evaluatesTo(Datamodel.makeStringValue("hello world"), new WbStringConstant(" hello world "));
}
@Test(expectedExceptions = IllegalArgumentException.class)
public void testEmpty() {
new WbStringConstant("");

View File

@ -47,12 +47,11 @@ public class WbStringVariableTest extends WbVariableTest<StringValue> {
}
/**
* It is not up to the evaluator to clean up the strings it gets. This is
* flagged later on by scrutinizers.
* The evaluator cleans up leading and trailing whitespace, but not duplicate spaces
*/
@Test
public void testTrailingWhitespace() {
evaluatesTo(Datamodel.makeStringValue("dirty \t"), "dirty \t");
evaluatesTo(Datamodel.makeStringValue("dirty"), "dirty \t");
}
/**
@ -74,7 +73,7 @@ public class WbStringVariableTest extends WbVariableTest<StringValue> {
@Test
public void testLeadingWhitespace() {
evaluatesTo(Datamodel.makeStringValue(" dirty"), " dirty");
evaluatesTo(Datamodel.makeStringValue("dirty"), " dirty");
}
@Test