From 8b4cf84bfd6eabe93fc40e51b5f2916b9ea68175 Mon Sep 17 00:00:00 2001 From: Antonin Delpeuch Date: Thu, 1 Nov 2018 15:29:57 +0000 Subject: [PATCH] Trim strings automatically in Wikibase schema. Closes #1781. --- .../wikidata/module/langs/translation-en.json | 8 ------- .../wikidata/module/langs/translation-fr.json | 8 ------- .../wikidata/module/langs/translation-jp.json | 8 ------- .../scrutinizers/WhitespaceScrutinizer.java | 4 ---- .../wikidata/schema/WbStringVariable.java | 2 +- .../WhitespaceScrutinizerTest.java | 21 ++++--------------- .../wikidata/schema/WbStringConstantTest.java | 5 +++++ .../wikidata/schema/WbStringVariableTest.java | 7 +++---- 8 files changed, 13 insertions(+), 50 deletions(-) diff --git a/extensions/wikidata/module/langs/translation-en.json b/extensions/wikidata/module/langs/translation-en.json index 40a298048..18a82adb3 100644 --- a/extensions/wikidata/module/langs/translation-en.json +++ b/extensions/wikidata/module/langs/translation-en.json @@ -169,14 +169,6 @@ "title": "No language provided for monolingual text.", "body": "Some label, description, alias or monolingual text value have been skipped because no language was provided. Example value: {example_text}." }, - "leading-whitespace": { - "title": "Leading whitespace in strings.", - "body": "Strings such as {example_string} have leading whitespace." - }, - "trailing-whitespace": { - "title": "Trailing whitespace in strings.", - "body": "Strings such as {example_string} have trailing whitespace." - }, "duplicate-whitespace": { "title": "Duplicate whitespace in strings.", "body": "Strings such as {example_string} contain duplicate whitespace." diff --git a/extensions/wikidata/module/langs/translation-fr.json b/extensions/wikidata/module/langs/translation-fr.json index 450667be4..1e6c634cd 100644 --- a/extensions/wikidata/module/langs/translation-fr.json +++ b/extensions/wikidata/module/langs/translation-fr.json @@ -167,14 +167,6 @@ "title": "Pas de langue fournie pour des textes monolingues.", "body": "Des libellés, descriptions, alias ou textes monolingues ont été ignorés car aucune langue n'a été fournie. Exemple: {example_text}." }, - "leading-whitespace": { - "title": "Espaces au début de chaînes de caractères.", - "body": "Des chaînes telles que {example_string} ont des espaces au début." - }, - "trailing-whitespace": { - "title": "Espaces à la fin de chaînes de caractères.", - "body": "Des chaînes telles que {example_string} ont des espaces à la fin." - }, "duplicate-whitespace": { "title": "Espaces dédoublées dans des chaînes de caractères.", "body": "Des chaînes telles que {example_string} contiennent des espaces dédoublées." diff --git a/extensions/wikidata/module/langs/translation-jp.json b/extensions/wikidata/module/langs/translation-jp.json index c1b4009bf..2094def7e 100644 --- a/extensions/wikidata/module/langs/translation-jp.json +++ b/extensions/wikidata/module/langs/translation-jp.json @@ -180,14 +180,6 @@ "title": "言語指定がありません.", "body": "言語指定がないので、ラベル・記述・別名・単一言語テキストが無視されました。例えば: {example_text}." }, - "leading-whitespace": { - "title": "文頭に空白文字があります.", - "body": "{example_string}の文頭に空白文字があります." - }, - "trailing-whitespace": { - "title": "文末に空白文字があります.", - "body": "{example_string}の文末に空白文字があります." - }, "duplicate-whitespace": { "title": "二重の空白文字があります.", "body": "{example_string}には二重の空白文字があります." diff --git a/extensions/wikidata/src/org/openrefine/wikidata/qa/scrutinizers/WhitespaceScrutinizer.java b/extensions/wikidata/src/org/openrefine/wikidata/qa/scrutinizers/WhitespaceScrutinizer.java index a5685384e..cdcbcc7f1 100644 --- a/extensions/wikidata/src/org/openrefine/wikidata/qa/scrutinizers/WhitespaceScrutinizer.java +++ b/extensions/wikidata/src/org/openrefine/wikidata/qa/scrutinizers/WhitespaceScrutinizer.java @@ -43,15 +43,11 @@ public class WhitespaceScrutinizer extends ValueScrutinizer { private Map _issuesMap; - public static final String leadingWhitespaceType = "leading-whitespace"; - public static final String trailingWhitespaceType = "trailing-whitespace"; public static final String duplicateWhitespaceType = "duplicate-whitespace"; public static final String nonPrintableCharsType = "non-printable-characters"; public WhitespaceScrutinizer() { _issuesMap = new HashMap<>(); - _issuesMap.put(leadingWhitespaceType, Pattern.compile("^\\s")); - _issuesMap.put(trailingWhitespaceType, Pattern.compile("\\s$")); _issuesMap.put(duplicateWhitespaceType, Pattern.compile("\\s\\s")); // https://stackoverflow.com/questions/14565934/regular-expression-to-remove-all-non-printable-characters diff --git a/extensions/wikidata/src/org/openrefine/wikidata/schema/WbStringVariable.java b/extensions/wikidata/src/org/openrefine/wikidata/schema/WbStringVariable.java index cd02e7883..7726127d1 100644 --- a/extensions/wikidata/src/org/openrefine/wikidata/schema/WbStringVariable.java +++ b/extensions/wikidata/src/org/openrefine/wikidata/schema/WbStringVariable.java @@ -58,7 +58,7 @@ public class WbStringVariable extends WbVariableExpr { public StringValue fromCell(Cell cell, ExpressionContext ctxt) throws SkipSchemaExpressionException { if (!cell.value.toString().isEmpty()) { - return Datamodel.makeStringValue(cell.value.toString()); + return Datamodel.makeStringValue(cell.value.toString().trim()); } throw new SkipSchemaExpressionException(); } diff --git a/extensions/wikidata/tests/src/org/openrefine/wikidata/qa/scrutinizers/WhitespaceScrutinizerTest.java b/extensions/wikidata/tests/src/org/openrefine/wikidata/qa/scrutinizers/WhitespaceScrutinizerTest.java index c2ffad6a1..25a70049c 100644 --- a/extensions/wikidata/tests/src/org/openrefine/wikidata/qa/scrutinizers/WhitespaceScrutinizerTest.java +++ b/extensions/wikidata/tests/src/org/openrefine/wikidata/qa/scrutinizers/WhitespaceScrutinizerTest.java @@ -33,18 +33,6 @@ public class WhitespaceScrutinizerTest extends ValueScrutinizerTest { return new WhitespaceScrutinizer(); } - @Test - public void testLeadingWhitespace() { - scrutinize(Datamodel.makeStringValue(" a")); - assertWarningsRaised(WhitespaceScrutinizer.leadingWhitespaceType); - } - - @Test - public void testTrailingWhitespace() { - scrutinize(Datamodel.makeStringValue("a\t")); - assertWarningsRaised(WhitespaceScrutinizer.trailingWhitespaceType); - } - @Test public void testDuplicateWhitespace() { scrutinize(Datamodel.makeStringValue("a\t b")); @@ -65,14 +53,13 @@ public class WhitespaceScrutinizerTest extends ValueScrutinizerTest { @Test public void testMultipleIssues() { - scrutinize(Datamodel.makeStringValue(" a\t b ")); - assertWarningsRaised(WhitespaceScrutinizer.duplicateWhitespaceType, WhitespaceScrutinizer.leadingWhitespaceType, - WhitespaceScrutinizer.trailingWhitespaceType); + scrutinize(Datamodel.makeStringValue("a\t b\u0003")); + assertWarningsRaised(WhitespaceScrutinizer.duplicateWhitespaceType, WhitespaceScrutinizer.nonPrintableCharsType); } @Test public void testMonolingualTextValue() { - scrutinizeLabel(Datamodel.makeMonolingualTextValue(" a", "fr")); - assertWarningsRaised(WhitespaceScrutinizer.leadingWhitespaceType); + scrutinizeLabel(Datamodel.makeMonolingualTextValue("a b", "fr")); + assertWarningsRaised(WhitespaceScrutinizer.duplicateWhitespaceType); } } diff --git a/extensions/wikidata/tests/src/org/openrefine/wikidata/schema/WbStringConstantTest.java b/extensions/wikidata/tests/src/org/openrefine/wikidata/schema/WbStringConstantTest.java index 9066b6c93..57bc29c19 100644 --- a/extensions/wikidata/tests/src/org/openrefine/wikidata/schema/WbStringConstantTest.java +++ b/extensions/wikidata/tests/src/org/openrefine/wikidata/schema/WbStringConstantTest.java @@ -20,6 +20,11 @@ public class WbStringConstantTest extends WbExpressionTest { evaluatesTo(Datamodel.makeStringValue("hello world"), constant); } + @Test + public void testTrim() { + evaluatesTo(Datamodel.makeStringValue("hello world"), new WbStringConstant(" hello world ")); + } + @Test(expectedExceptions = IllegalArgumentException.class) public void testEmpty() { new WbStringConstant(""); diff --git a/extensions/wikidata/tests/src/org/openrefine/wikidata/schema/WbStringVariableTest.java b/extensions/wikidata/tests/src/org/openrefine/wikidata/schema/WbStringVariableTest.java index 036c9bd10..63fcd2a62 100644 --- a/extensions/wikidata/tests/src/org/openrefine/wikidata/schema/WbStringVariableTest.java +++ b/extensions/wikidata/tests/src/org/openrefine/wikidata/schema/WbStringVariableTest.java @@ -45,17 +45,16 @@ public class WbStringVariableTest extends WbVariableTest { } /** - * It is not up to the evaluator to clean up the strings it gets. This is - * flagged later on by scrutinizers. + * The evaluator cleans up leading and trailing whitespace, but not duplicate spaces */ @Test public void testTrailingWhitespace() { - evaluatesTo(Datamodel.makeStringValue("dirty \t"), "dirty \t"); + evaluatesTo(Datamodel.makeStringValue("dirty"), "dirty \t"); } @Test public void testLeadingWhitespace() { - evaluatesTo(Datamodel.makeStringValue(" dirty"), " dirty"); + evaluatesTo(Datamodel.makeStringValue("dirty"), " dirty"); } @Test