diff --git a/extensions/wikidata/module/langs/translation-en.json b/extensions/wikidata/module/langs/translation-en.json index 0a1717be4..c63393aec 100644 --- a/extensions/wikidata/module/langs/translation-en.json +++ b/extensions/wikidata/module/langs/translation-en.json @@ -156,5 +156,15 @@ "warnings-messages/invalid-entity-type/title": "{property_entity} used on items", "warnings-messages/invalid-entity-type/body": "Uses of {property_entity} on items such as {example_entity} are invalid.", "warnings-messages/early-gregorian-date/title": "Early dates in the Gregorian calendar", - "warnings-messages/early-gregorian-date/body": "Dates earlier than October 1582 (such as in year {example_year}) are unlikely to be expressed using the Gregorian calendar. See the manual to specify the appropriate calendar for your dates." + "warnings-messages/early-gregorian-date/body": "Dates earlier than October 1582 (such as in year {example_year}) are unlikely to be expressed using the Gregorian calendar. See the manual to specify the appropriate calendar for your dates.", + "warnings-messages/item-description-too-long/title": "Description is too long", + "warnings-messages/item-description-too-long/body": "Description ({lang}) such as {description} on {example_entity} is too long. Its length is {length}, which is more than {max_length}. Descriptions are not full sentences, but small bits of information. In most cases, the proper length is between two and twelve words. See the manual for more information.", + "warnings-messages/item-description-identical-with-label/title": "Description is identical with label", + "warnings-messages/item-description-identical-with-label/body": "Both the description ({lang}) and the label ({label_lang}) on {example_entity} are {description}. Descriptions are expected to be more specific than labels. 
See the manual for more information.", + "warnings-messages/item-description-end-by-punctuation-sign/title": "Description ends with a punctuation sign", + "warnings-messages/item-description-end-by-punctuation-sign/body": "Description ({lang}) such as {description} on {example_entity} ends with a punctuation sign \"{punctuation_sign}\". Descriptions are not sentences, so the punctuation sign at the end should be avoided. See the manual for more information.", + "warnings-messages/item-description-begin-with-uppercase/title": "Description begins with uppercase letter", + "warnings-messages/item-description-begin-with-uppercase/body": "Description ({lang}) such as {description} on {example_entity} begins with uppercase letter \"{uppercase_letter}\". Descriptions begin with a lowercase letter except when uppercase would normally be required or expected. See the manual for more information.", + "warnings-messages/item-description-begin-with-article/title": "Description begins with article (\"a\", \"an\" or \"the\")", + "warnings-messages/item-description-begin-with-article/body": "Description ({lang}) such as {description} on {example_entity} begins with article \"{article}\". Descriptions should not normally begin with initial articles (\"a\", \"an\", \"the\"). See the manual for more information." 
} diff --git a/extensions/wikidata/src/org/openrefine/wikidata/qa/EditInspector.java b/extensions/wikidata/src/org/openrefine/wikidata/qa/EditInspector.java index e41fe81f0..7165ffaef 100644 --- a/extensions/wikidata/src/org/openrefine/wikidata/qa/EditInspector.java +++ b/extensions/wikidata/src/org/openrefine/wikidata/qa/EditInspector.java @@ -28,22 +28,7 @@ import java.util.List; import java.util.Map; import java.util.stream.Collectors; -import org.openrefine.wikidata.qa.scrutinizers.CalendarScrutinizer; -import org.openrefine.wikidata.qa.scrutinizers.DistinctValuesScrutinizer; -import org.openrefine.wikidata.qa.scrutinizers.EditScrutinizer; -import org.openrefine.wikidata.qa.scrutinizers.EntityTypeScrutinizer; -import org.openrefine.wikidata.qa.scrutinizers.FormatScrutinizer; -import org.openrefine.wikidata.qa.scrutinizers.InverseConstraintScrutinizer; -import org.openrefine.wikidata.qa.scrutinizers.NewItemScrutinizer; -import org.openrefine.wikidata.qa.scrutinizers.NoEditsMadeScrutinizer; -import org.openrefine.wikidata.qa.scrutinizers.QualifierCompatibilityScrutinizer; -import org.openrefine.wikidata.qa.scrutinizers.QuantityScrutinizer; -import org.openrefine.wikidata.qa.scrutinizers.RestrictedPositionScrutinizer; -import org.openrefine.wikidata.qa.scrutinizers.RestrictedValuesScrutinizer; -import org.openrefine.wikidata.qa.scrutinizers.SelfReferentialScrutinizer; -import org.openrefine.wikidata.qa.scrutinizers.SingleValueScrutinizer; -import org.openrefine.wikidata.qa.scrutinizers.UnsourcedScrutinizer; -import org.openrefine.wikidata.qa.scrutinizers.WhitespaceScrutinizer; +import org.openrefine.wikidata.qa.scrutinizers.*; import org.openrefine.wikidata.updates.ItemUpdate; import org.openrefine.wikidata.updates.scheduler.WikibaseAPIUpdateScheduler; import org.openrefine.wikidata.utils.EntityCache; @@ -82,6 +67,8 @@ public class EditInspector { register(new RestrictedValuesScrutinizer()); register(new EntityTypeScrutinizer()); register(new 
CalendarScrutinizer());
+        register(new CommonDescriptionScrutinizer());
+        register(new EnglishDescriptionScrutinizer());
     }
 
     /**
diff --git a/extensions/wikidata/src/org/openrefine/wikidata/qa/scrutinizers/CommonDescriptionScrutinizer.java b/extensions/wikidata/src/org/openrefine/wikidata/qa/scrutinizers/CommonDescriptionScrutinizer.java
new file mode 100644
index 000000000..be78919a4
--- /dev/null
+++ b/extensions/wikidata/src/org/openrefine/wikidata/qa/scrutinizers/CommonDescriptionScrutinizer.java
@@ -0,0 +1,90 @@
+package org.openrefine.wikidata.qa.scrutinizers;
+
+import org.openrefine.wikidata.qa.QAWarning;
+import org.openrefine.wikidata.updates.ItemUpdate;
+import org.wikidata.wdtk.datamodel.interfaces.MonolingualTextValue;
+
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+/**
+ * Scrutinizer for the checks that apply to item descriptions in any language:
+ * an upper bound on the length, and equality with one of the item's labels.
+ *
+ * @author Lu Liu
+ */
+public class CommonDescriptionScrutinizer extends DescriptionScrutinizer {
+
+    public static final String descTooLongType = "item-description-too-long";
+    public static final String descIdenticalWithLabel = "item-description-identical-with-label";
+
+    /** Longest description length, in characters, that raises no issue. */
+    private static final int MAX_DESCRIPTION_LENGTH = 250;
+
+    /**
+     * Runs the length check, then the label check, on one description.
+     *
+     * @param update   the edit the description belongs to
+     * @param descText the trimmed, non-empty text of the description
+     * @param lang     the language code of the description
+     */
+    @Override
+    public void scrutinize(ItemUpdate update, String descText, String lang) {
+        checkLength(update, descText, lang);
+        checkLabel(update, descText, lang);
+    }
+
+    /**
+     * Raises a critical {@link #descTooLongType} issue when the description is
+     * longer than {@code MAX_DESCRIPTION_LENGTH} characters. Descriptions are
+     * not full sentences, but small bits of information; in most cases, the
+     * proper length is between two and twelve words.
+     *
+     * @param update   the edit the description belongs to
+     * @param descText the text of the description
+     * @param lang     the language code of the description
+     */
+    protected void checkLength(ItemUpdate update, String descText, String lang) {
+        if (descText.length() > MAX_DESCRIPTION_LENGTH) {
+            QAWarning issue = new QAWarning(descTooLongType, null, QAWarning.Severity.CRITICAL, 1);
+            issue.setProperty("example_entity", update.getItemId());
+            issue.setProperty("description", descText);
+            issue.setProperty("lang", lang);
+            issue.setProperty("length", descText.length());
+            issue.setProperty("max_length", MAX_DESCRIPTION_LENGTH);
+            addIssue(issue);
+        }
+    }
+
+    /**
+     * Raises a {@link #descIdenticalWithLabel} warning when the description is
+     * equal to the trimmed text of one of the item's labels, whatever the
+     * label's language. Descriptions are expected to be more specific than
+     * labels. At most one issue is raised per description.
+     *
+     * @param update   the edit the description and the labels belong to
+     * @param descText the text of the description
+     * @param lang     the language code of the description
+     */
+    protected void checkLabel(ItemUpdate update, String descText, String lang) {
+        // Merge into a copy so that the sets held by the update are never altered.
+        Set<MonolingualTextValue> labels = new LinkedHashSet<>(update.getLabels());
+        labels.addAll(update.getLabelsIfNew());
+        for (MonolingualTextValue label : labels) {
+            String labelText = label.getText();
+            if (labelText == null) {
+                continue;
+            }
+            if (labelText.trim().equals(descText)) {
+                QAWarning issue = new QAWarning(descIdenticalWithLabel, null, QAWarning.Severity.WARNING, 1);
+                issue.setProperty("example_entity", update.getItemId());
+                issue.setProperty("description", descText);
+                issue.setProperty("lang", lang);
+                issue.setProperty("label_lang", label.getLanguageCode());
+                addIssue(issue);
+                break;
+            }
+        }
+    }
+
+}
diff --git a/extensions/wikidata/src/org/openrefine/wikidata/qa/scrutinizers/DescriptionScrutinizer.java b/extensions/wikidata/src/org/openrefine/wikidata/qa/scrutinizers/DescriptionScrutinizer.java
new file mode 100644
index 000000000..3a9075adf
--- /dev/null
+++ b/extensions/wikidata/src/org/openrefine/wikidata/qa/scrutinizers/DescriptionScrutinizer.java
@@ -0,0 +1,54 @@
+package org.openrefine.wikidata.qa.scrutinizers;
+
+import org.openrefine.wikidata.qa.QAWarning;
+import org.openrefine.wikidata.updates.ItemUpdate;
+import org.wikidata.wdtk.datamodel.interfaces.MonolingualTextValue;
+
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+/**
+ * Base class of the scrutinizers that inspect item descriptions. It walks over
+ * every description of an edit and hands each usable one, trimmed, to
+ * {@link #scrutinize(ItemUpdate, String, String)}.
+ *
+ * @author Lu Liu
+ */
+public abstract class DescriptionScrutinizer extends EditScrutinizer {
+
+    /**
+     * Scrutinizes every description of the edit, both the ones returned by
+     * {@code getDescriptions()} and by {@code getDescriptionsIfNew()}.
+     * Descriptions whose text is {@code null} or empty after trimming are skipped.
+     *
+     * @param update the edit whose descriptions are inspected
+     */
+    @Override
+    public void scrutinize(ItemUpdate update) {
+        // Merge into a copy so that the sets held by the update are never altered.
+        Set<MonolingualTextValue> descriptions = new LinkedHashSet<>(update.getDescriptions());
+        descriptions.addAll(update.getDescriptionsIfNew());
+        for (MonolingualTextValue description : descriptions) {
+            String descText = description.getText();
+            if (descText == null) {
+                continue;
+            }
+            descText = descText.trim();
+            if (descText.isEmpty()) {
+                // subclasses read the first and last characters of the text
+                continue;
+            }
+            scrutinize(update, descText, description.getLanguageCode());
+        }
+    }
+
+    /**
+     * Scrutinizes a single description.
+     *
+     * @param update   the edit the description belongs to
+     * @param descText the trimmed, non-empty text of the description
+     * @param lang     the language code of the description
+     */
+    public abstract void scrutinize(ItemUpdate update, String descText, String lang);
+
+}
diff --git 
a/extensions/wikidata/src/org/openrefine/wikidata/qa/scrutinizers/EnglishDescriptionScrutinizer.java b/extensions/wikidata/src/org/openrefine/wikidata/qa/scrutinizers/EnglishDescriptionScrutinizer.java
new file mode 100644
index 000000000..ce3274a8d
--- /dev/null
+++ b/extensions/wikidata/src/org/openrefine/wikidata/qa/scrutinizers/EnglishDescriptionScrutinizer.java
@@ -0,0 +1,100 @@
+package org.openrefine.wikidata.qa.scrutinizers;
+
+import org.openrefine.wikidata.qa.QAWarning;
+import org.openrefine.wikidata.updates.ItemUpdate;
+
+/**
+ * Scrutinizer for the conventions checked on English descriptions only:
+ * trailing punctuation, leading uppercase letter and leading article.
+ *
+ * @author Lu Liu
+ */
+public class EnglishDescriptionScrutinizer extends DescriptionScrutinizer {
+
+    public static final String descEndsByPunctuationSign = "item-description-end-by-punctuation-sign";
+    public static final String descBeginWithUppercase = "item-description-begin-with-uppercase";
+    public static final String descBeginWithArticle = "item-description-begin-with-article";
+
+    private static final String LANG = "en";
+
+    /**
+     * Runs the three English checks when the language code equals "en",
+     * ignoring case; descriptions in any other language are left alone.
+     *
+     * @param update   the edit the description belongs to
+     * @param descText the text of the description
+     * @param lang     the language code of the description
+     */
+    @Override
+    public void scrutinize(ItemUpdate update, String descText, String lang) {
+        if (LANG.equalsIgnoreCase(lang)) {
+            checkPunctuationSign(update, descText);
+            checkUppercase(update, descText);
+            checkArticle(update, descText);
+        }
+    }
+
+    /**
+     * Raises a {@link #descEndsByPunctuationSign} warning when the last character
+     * of the description is one of {@code . ? ! ; : , ' "}. Descriptions are not
+     * sentences, so the punctuation sign at the end should be avoided.
+     *
+     * @param update   the edit the description belongs to
+     * @param descText the text of the description, expected to be non-empty
+     */
+    protected void checkPunctuationSign(ItemUpdate update, String descText) {
+        assert descText.length() > 0;
+
+        char lastChar = descText.charAt(descText.length() - 1);
+        boolean endsWithSign = ".?!;:,'\"".indexOf(lastChar) >= 0;
+        if (!endsWithSign) {
+            return;
+        }
+        QAWarning warning = new QAWarning(descEndsByPunctuationSign, null, QAWarning.Severity.WARNING, 1);
+        warning.setProperty("example_entity", update.getItemId());
+        warning.setProperty("description", descText);
+        warning.setProperty("lang", LANG);
+        warning.setProperty("punctuation_sign", lastChar);
+        addIssue(warning);
+    }
+
+    /**
+     * Raises an informational {@link #descBeginWithUppercase} issue when the first
+     * character of the description is an ASCII uppercase letter. Descriptions
+     * begin with a lowercase letter except when uppercase would normally be
+     * required or expected.
+     *
+     * @param update   the edit the description belongs to
+     * @param descText the text of the description, expected to be non-empty
+     */
+    protected void checkUppercase(ItemUpdate update, String descText) {
+        assert descText.length() > 0;
+
+        char firstChar = descText.charAt(0);
+        if (firstChar < 'A' || firstChar > 'Z') {
+            return;
+        }
+        QAWarning warning = new QAWarning(descBeginWithUppercase, null, QAWarning.Severity.INFO, 1);
+        warning.setProperty("example_entity", update.getItemId());
+        warning.setProperty("description", descText);
+        warning.setProperty("lang", LANG);
+        warning.setProperty("uppercase_letter", firstChar);
+        addIssue(warning);
+    }
+
+    // Descriptions should not normally begin with initial articles ("a", "an", "the").
+    protected void checkArticle(ItemUpdate update, String descText) {
+        assert descText.length() > 0;
+
+        String firstWord = descText.split("\\s")[0].toLowerCase();
+        if ("a".equals(firstWord) || "an".equals(firstWord) || "the".equals(firstWord)) {
+            QAWarning issue = new QAWarning(descBeginWithArticle, null, QAWarning.Severity.WARNING, 1);
+            issue.setProperty("example_entity", update.getItemId());
+            issue.setProperty("description", descText);
+            issue.setProperty("lang", LANG);
+            issue.setProperty("article", firstWord);
+            addIssue(issue);
+        }
+    }
+
+}
diff --git a/extensions/wikidata/tests/src/org/openrefine/wikidata/qa/scrutinizers/CommonDescriptionScrutinizerTest.java b/extensions/wikidata/tests/src/org/openrefine/wikidata/qa/scrutinizers/CommonDescriptionScrutinizerTest.java
new file mode 100644
index 000000000..83850e312
--- /dev/null
+++ b/extensions/wikidata/tests/src/org/openrefine/wikidata/qa/scrutinizers/CommonDescriptionScrutinizerTest.java
@@ -0,0 +1,75 @@
+package org.openrefine.wikidata.qa.scrutinizers;
+
+import org.openrefine.wikidata.testing.TestingData;
+import org.openrefine.wikidata.updates.ItemUpdate;
+import org.openrefine.wikidata.updates.ItemUpdateBuilder;
+import org.testng.annotations.Test;
+import org.wikidata.wdtk.datamodel.helpers.Datamodel;
+
+/**
+ * Tests of {@link CommonDescriptionScrutinizer}: one test per issue type, plus
+ * the cases where no issue, or both issues, are expected.
+ */
+public class CommonDescriptionScrutinizerTest extends ScrutinizerTest {
+
+    @Override
+    public EditScrutinizer getScrutinizer() {
+        return new CommonDescriptionScrutinizer();
+    }
+
+    /** Returns "long description" repeated sixteen times, separated by single spaces. */
+    private static String tooLongDescription() {
+        StringBuilder text = new StringBuilder("long description");
+        for (int i = 1; i < 16; i++) {
+            text.append(" long description");
+        }
+        return text.toString();
+    }
+
+    /** Builds an update for {@code TestingData.newIdA} with a single English description. */
+    private static ItemUpdate describedItem(String description) {
+        return new ItemUpdateBuilder(TestingData.newIdA)
+                .addDescription(Datamodel.makeMonolingualTextValue(description, "en"), true)
+                .build();
+    }
+
+    /** Builds an update for {@code TestingData.newIdA} with an English description and one label. */
+    private static ItemUpdate describedItem(String description, String label, String labelLang) {
+        return new ItemUpdateBuilder(TestingData.newIdA)
+                .addDescription(Datamodel.makeMonolingualTextValue(description, "en"), true)
+                .addLabel(Datamodel.makeMonolingualTextValue(label, labelLang), true)
+                .build();
+    }
+
+    @Test
+    public void testGoodDesc() {
+        scrutinize(describedItem("good description"));
+        assertNoWarningRaised();
+    }
+
+    @Test
+    public void testTooLong() {
+        scrutinize(describedItem(tooLongDescription()));
+        assertWarningsRaised(CommonDescriptionScrutinizer.descTooLongType);
+    }
+
+    @Test
+    public void testIdenticalWithLabel() {
+        String text = "identical with label";
+        scrutinize(describedItem(text, text, "en"));
+        assertWarningsRaised(CommonDescriptionScrutinizer.descIdenticalWithLabel);
+    }
+
+    @Test
+    public void testIdenticalWithLabel1() {
+        scrutinize(describedItem("identical with label", "bonjour", "fr"));
+        assertNoWarningRaised();
+    }
+
+    @Test
+    public void testAwfulDesc() {
+        String text = tooLongDescription();
+        scrutinize(describedItem(text, text, "en"));
+        assertWarningsRaised(CommonDescriptionScrutinizer.descTooLongType, CommonDescriptionScrutinizer.descIdenticalWithLabel);
+    }
+}
diff --git 
a/extensions/wikidata/tests/src/org/openrefine/wikidata/qa/scrutinizers/EnglishDescriptionScrutinizerTest.java b/extensions/wikidata/tests/src/org/openrefine/wikidata/qa/scrutinizers/EnglishDescriptionScrutinizerTest.java
new file mode 100644
index 000000000..e3e2dc796
--- /dev/null
+++ b/extensions/wikidata/tests/src/org/openrefine/wikidata/qa/scrutinizers/EnglishDescriptionScrutinizerTest.java
@@ -0,0 +1,65 @@
+package org.openrefine.wikidata.qa.scrutinizers;
+
+import org.openrefine.wikidata.testing.TestingData;
+import org.openrefine.wikidata.updates.ItemUpdate;
+import org.openrefine.wikidata.updates.ItemUpdateBuilder;
+import org.testng.annotations.Test;
+import org.wikidata.wdtk.datamodel.helpers.Datamodel;
+
+/**
+ * Tests of {@link EnglishDescriptionScrutinizer}: one test per issue type, plus
+ * the cases where no issue, or all three issues, are expected.
+ */
+public class EnglishDescriptionScrutinizerTest extends ScrutinizerTest {
+
+    @Override
+    public EditScrutinizer getScrutinizer() {
+        return new EnglishDescriptionScrutinizer();
+    }
+
+    /**
+     * Builds an update for {@code TestingData.newIdA} with a single English
+     * description; {@code flag} is handed to {@code addDescription} unchanged.
+     */
+    private static ItemUpdate englishDescription(String text, boolean flag) {
+        return new ItemUpdateBuilder(TestingData.newIdA)
+                .addDescription(Datamodel.makeMonolingualTextValue(text, "en"), flag)
+                .build();
+    }
+
+    @Test
+    public void testGoodDesc() {
+        scrutinize(englishDescription("good description", true));
+        assertNoWarningRaised();
+    }
+
+    @Test
+    public void testEndWithPunctuationSign() {
+        scrutinize(englishDescription("description with punctuationSign.", false));
+        assertWarningsRaised(EnglishDescriptionScrutinizer.descEndsByPunctuationSign);
+    }
+
+    @Test
+    public void testBeginWithUppercase() {
+        scrutinize(englishDescription("Begin with uppercase", true));
+        assertWarningsRaised(EnglishDescriptionScrutinizer.descBeginWithUppercase);
+    }
+
+    @Test
+    public void testBeginWithArticle() {
+        scrutinize(englishDescription("an article test", false));
+        assertWarningsRaised(EnglishDescriptionScrutinizer.descBeginWithArticle);
+    }
+
+    @Test
+    public void testAwfulDesc() {
+        String text = "An awful description.";
+        ItemUpdate labelled = new ItemUpdateBuilder(TestingData.newIdA)
+                .addDescription(Datamodel.makeMonolingualTextValue(text, "en"), true)
+                .addLabel(Datamodel.makeMonolingualTextValue(text, "en"), true)
+                .build();
+        scrutinize(labelled);
+        assertWarningsRaised(EnglishDescriptionScrutinizer.descEndsByPunctuationSign,
+                EnglishDescriptionScrutinizer.descBeginWithUppercase, EnglishDescriptionScrutinizer.descBeginWithArticle);
+    }
+}