update DescriptionScrutinizer & create EnglishDescriptionScrutinizer

This commit is contained in:
afkbrb 2020-03-02 14:35:42 +08:00
parent 9659157a0b
commit 82f95f2bbf
5 changed files with 175 additions and 119 deletions

View File

@ -158,13 +158,13 @@
"warnings-messages/early-gregorian-date/title": "Early dates in the Gregorian calendar", "warnings-messages/early-gregorian-date/title": "Early dates in the Gregorian calendar",
"warnings-messages/early-gregorian-date/body": "Dates earlier than October 1582 (such as in year {example_year}) are unlikely to be expressed using the Gregorian calendar. See the <a href=\"https://www.wikidata.org/wiki/Wikidata:Tools/OpenRefine/Editing/Schema_alignment#Dates\" target=\"_blank\">manual</a> to specify the appropriate calendar for your dates.", "warnings-messages/early-gregorian-date/body": "Dates earlier than October 1582 (such as in year {example_year}) are unlikely to be expressed using the Gregorian calendar. See the <a href=\"https://www.wikidata.org/wiki/Wikidata:Tools/OpenRefine/Editing/Schema_alignment#Dates\" target=\"_blank\">manual</a> to specify the appropriate calendar for your dates.",
"warnings-messages/item-description-too-long/title": "Description is too long", "warnings-messages/item-description-too-long/title": "Description is too long",
"warnings-messages/item-description-too-long/body": "Description length of items such as {example_entity} is too long (more that 250 characters).", "warnings-messages/item-description-too-long/body": "Description ({lang}) such as <span class=\"wb-issue-preformat\">{description}</span> on {example_entity} is too long. Its length is {length}, which is more than {max_length}. Descriptions are not full sentences, but small bits of information. In most cases, the proper length is between two and twelve words. See the <a href=\"https://www.wikidata.org/wiki/Help:Description#Length\" target=\"_blank\">manual</a> for more information.",
"warnings-messages/item-description-end-by-punctuation-sign/title": "Description ends by punctuation sign",
"warnings-messages/item-description-end-by-punctuation-sign/body": "Description of items such as {example_entity} ends by a punctuation sign.",
"warnings-messages/item-description-begin-with-uppercase/title": "Description begins with a uppercase letter",
"warnings-messages/item-description-begin-with-uppercase/body": "Description of items such as {example_entity} begins with a uppercase letter.",
"warnings-messages/item-description-begin-with-article/title": "Description begins with article (a, an or the)",
"warnings-messages/item-description-begin-with-article/body": "Description of items such as {example_entity} begins with article (a, an or the).",
"warnings-messages/item-description-identical-with-label/title": "Description is identical with label", "warnings-messages/item-description-identical-with-label/title": "Description is identical with label",
"warnings-messages/item-description-identical-with-label/body": "Description of items such as {example_entity} is identical with corresponding label." "warnings-messages/item-description-identical-with-label/body": "Both the description ({lang}) and the label ({label_lang}) on {example_entity} are <span class=\"wb-issue-preformat\">{description}</span>. Description are expected to be more specific than labels",
"warnings-messages/item-description-end-by-punctuation-sign/title": "Description ends by punctuation sign",
"warnings-messages/item-description-end-by-punctuation-sign/body": "Description ({lang}) such as <span class=\"wb-issue-preformat\">{description}</span> on {example_entity} ends by a punctuation sign \"{punctuation_sign}\". Description are not sentences, so the punctuation sign at the end should be avoided. See the <a href=\"https://www.wikidata.org/wiki/Help:Description#Length\" target=\"_blank\">manual</a> for more information.",
"warnings-messages/item-description-begin-with-uppercase/title": "Description begins with uppercase letter",
"warnings-messages/item-description-begin-with-uppercase/body": "Description ({lang}) such as <span class=\"wb-issue-preformat\">{description}</span> on {example_entity} begins with uppercase letter \"{uppercase_letter}\". Descriptions begin with a lowercase letter except when uppercase would normally be required or expected. See the <a href=\"https://www.wikidata.org/wiki/Help:Description#Capitalization\" target=\"_blank\">manual</a> for more information.",
"warnings-messages/item-description-begin-with-article/title": "Description begins with article (\"a\", \"an\" or \"the\")",
"warnings-messages/item-description-begin-with-article/body": "Description ({lang}) such as <span class=\"wb-issue-preformat\">{description}</span> on {example_entity} begins with article \"{article}\". Descriptions should not normally begin with initial articles (\"a\", \"an\", \"the\"). See the <a href=\"https://www.wikidata.org/wiki/Help:Description#No_initial_articles_(a,_an,_the)\" target=\"_blank\">manual</a> for more information."
} }

View File

@ -67,7 +67,7 @@ public class EditInspector {
register(new RestrictedValuesScrutinizer()); register(new RestrictedValuesScrutinizer());
register(new EntityTypeScrutinizer()); register(new EntityTypeScrutinizer());
register(new CalendarScrutinizer()); register(new CalendarScrutinizer());
register(new DescriptionScrutinizer()); register(new EnglishDescriptionScrutinizer());
} }
/** /**

View File

@ -8,91 +8,75 @@ import java.util.Set;
/** /**
* A scrutinizer that checks the description of an item. * A scrutinizer that checks the description of an item.
* * <p>
* The checks work well for English. * This abstract scrutinizer does the following checks:
* It's impossible to cover all languages, * 1. is the description too long
* but since most edited information is in English, * 2. is the description identical with the label in the same language
* merely focusing on English here should be enough. * <p>
* * We can easily implement a language-specific description scrutinizer
* To be more specific, it does the following checks: * by extending this class.
* 1. is a description too long
* 2. does a description end by punctuation signs
* 3. does a description begin with a uppercase letter
* 4. does a description begin with article ("a", "an" or "the")
* 5. is the description identical with corresponding label
* *
* @author Lu Liu * @author Lu Liu
*/ */
public class DescriptionScrutinizer extends EditScrutinizer { public abstract class DescriptionScrutinizer extends EditScrutinizer {
public static final String descTooLongType = "item-description-too-long"; public static final String descTooLongType = "item-description-too-long";
public static final String descEndsByPunctuationSign = "item-description-end-by-punctuation-sign";
public static final String descBeginWithUppercase = "item-description-begin-with-uppercase";
public static final String descBeginWithArticle = "item-description-begin-with-article";
public static final String descIdenticalWithLabel = "item-description-identical-with-label"; public static final String descIdenticalWithLabel = "item-description-identical-with-label";
private static final int descLengthThreshold = 250;
private static final String punctuationSigns = ".!?,'\"";
@Override @Override
public void scrutinize(ItemUpdate update) { public void scrutinize(ItemUpdate update) {
Set<MonolingualTextValue> descriptions = update.getDescriptions(); Set<MonolingualTextValue> descriptions = update.getDescriptions();
descriptions.addAll(update.getDescriptionsIfNew()); // merge descriptions.addAll(update.getDescriptionsIfNew()); // merge
for (MonolingualTextValue description : descriptions) { for (MonolingualTextValue description : descriptions) {
doScrutinize(update, description);
}
}
private void doScrutinize(ItemUpdate update, MonolingualTextValue description) {
String descText = description.getText(); String descText = description.getText();
if (descText == null) return; if (descText == null) continue;
descText = descText.trim(); descText = descText.trim();
if (descText.length() == 0) return; if (descText.length() == 0) continue; // avoid NullPointerException
// length check String lang = description.getLanguageCode();
if (descText.length() > descLengthThreshold) {
warningWithEntity(update, descTooLongType); checkLength(update, descText, lang);
checkLabel(update, descText, lang);
scrutinize(update, descText, lang);
}
} }
// punctuation sign check public abstract void scrutinize(ItemUpdate update, String descText, String lang);
char last = descText.charAt(descText.length() - 1);
if (punctuationSigns.indexOf(last) != -1) { // Descriptions are not full sentences, but small bits of information.
warningWithEntity(update, descEndsByPunctuationSign); // In most cases, the proper length is between two and twelve words.
protected void checkLength(ItemUpdate update, String descText, String lang) {
final int maxLength = 250;
if (descText.length() > maxLength) {
QAWarning issue = new QAWarning(descTooLongType, null, QAWarning.Severity.WARNING, 1);
issue.setProperty("example_entity", update.getItemId());
issue.setProperty("description", descText);
issue.setProperty("lang", lang);
issue.setProperty("length", descText.length());
issue.setProperty("max_length", maxLength);
addIssue(issue);
}
} }
// begin with uppercase letter check // Descriptions are expected to be more specific than labels.
char first = descText.charAt(0); protected void checkLabel(ItemUpdate update, String descText, String lang) {
if ('A' <= first && first <= 'Z') {
warningWithEntity(update, descBeginWithUppercase);
}
// article check
String firstWord = descText.split("\\s")[0].toLowerCase();
if ("a".equals(firstWord) || "an".equals(firstWord) || "the".equals(firstWord)) {
warningWithEntity(update, descBeginWithArticle);
}
// description-label check
Set<MonolingualTextValue> labels = update.getLabels(); Set<MonolingualTextValue> labels = update.getLabels();
labels.addAll(update.getLabelsIfNew()); // merge labels.addAll(update.getLabelsIfNew()); // merge
for (MonolingualTextValue label : labels) { for (MonolingualTextValue label : labels) {
if (label.getLanguageCode().equals(description.getLanguageCode())) {
String labelText = label.getText(); String labelText = label.getText();
if (labelText == null) break; if (labelText == null) continue;
labelText = labelText.trim(); labelText = labelText.trim();
if (labelText.equals(descText)) { if (labelText.equals(descText)) {
warningWithEntity(update, descIdenticalWithLabel); QAWarning issue = new QAWarning(descIdenticalWithLabel, null, QAWarning.Severity.WARNING, 1);
} issue.setProperty("example_entity", update.getItemId());
issue.setProperty("description", descText);
issue.setProperty("lang", lang);
issue.setProperty("label_lang", label.getLanguageCode());
addIssue(issue);
break; break;
} }
} }
} }
private void warningWithEntity(ItemUpdate update, String type) {
QAWarning issue = new QAWarning(type, null, QAWarning.Severity.WARNING, 1);
issue.setProperty("example_entity", update.getItemId());
addIssue(issue);
}
} }

View File

@ -0,0 +1,72 @@
package org.openrefine.wikidata.qa.scrutinizers;
import org.openrefine.wikidata.qa.QAWarning;
import org.openrefine.wikidata.updates.ItemUpdate;
/**
 * A description scrutinizer specialised for English ("en") descriptions.
 * <p>
 * Besides whatever {@link DescriptionScrutinizer} checks, it reports descriptions that:
 * 1. end with a punctuation sign
 * 2. begin with an uppercase letter (A-Z)
 * 3. begin with an article ("a", "an" or "the")
 * <p>
 * Descriptions whose language code is not "en" (compared case-insensitively)
 * are left untouched by these extra checks.
 *
 * @author Lu Liu
 */
public class EnglishDescriptionScrutinizer extends DescriptionScrutinizer {

    public static final String descEndsByPunctuationSign = "item-description-end-by-punctuation-sign";
    public static final String descBeginWithUppercase = "item-description-begin-with-uppercase";
    public static final String descBeginWithArticle = "item-description-begin-with-article";

    private static final String LANG = "en";

    /** Characters that are reported when they terminate a description. */
    private static final String PUNCTUATION_SIGNS = ".?!;:,'\"";

    /**
     * Runs the English-only checks on one description.
     *
     * @param update   the item update the description belongs to
     * @param descText the description text; callers are expected to pass a non-empty string
     * @param lang     the language code of the description
     */
    @Override
    public void scrutinize(ItemUpdate update, String descText, String lang) {
        if (LANG.equalsIgnoreCase(lang)) {
            checkPunctuationSign(update, descText);
            checkUppercase(update, descText);
            checkArticle(update, descText);
        }
    }

    /**
     * Raises a warning when the description ends with a punctuation sign.
     * Descriptions are not sentences, so trailing punctuation should be avoided.
     *
     * @param update   the item update the description belongs to
     * @param descText the non-empty description text
     */
    protected void checkPunctuationSign(ItemUpdate update, String descText) {
        assert descText.length() > 0;

        char last = descText.charAt(descText.length() - 1);
        if (PUNCTUATION_SIGNS.indexOf(last) == -1) {
            return;
        }

        QAWarning issue = newIssue(descEndsByPunctuationSign, QAWarning.Severity.WARNING, update, descText);
        issue.setProperty("punctuation_sign", last);
        addIssue(issue);
    }

    /**
     * Raises an informational issue when the description starts with an uppercase letter A-Z.
     * Descriptions begin with a lowercase letter except when uppercase would
     * normally be required or expected, hence the lower severity.
     *
     * @param update   the item update the description belongs to
     * @param descText the non-empty description text
     */
    protected void checkUppercase(ItemUpdate update, String descText) {
        assert descText.length() > 0;

        char first = descText.charAt(0);
        if (first < 'A' || first > 'Z') {
            return;
        }

        QAWarning issue = newIssue(descBeginWithUppercase, QAWarning.Severity.INFO, update, descText);
        issue.setProperty("uppercase_letter", first);
        addIssue(issue);
    }

    /**
     * Raises a warning when the first word of the description is "a", "an" or "the"
     * (compared after lowercasing). Descriptions should not normally begin with
     * initial articles.
     *
     * @param update   the item update the description belongs to
     * @param descText the non-empty description text
     */
    protected void checkArticle(ItemUpdate update, String descText) {
        assert descText.length() > 0;

        String leadingWord = descText.split("\\s")[0].toLowerCase();
        boolean startsWithArticle = "a".equals(leadingWord)
                || "an".equals(leadingWord)
                || "the".equals(leadingWord);
        if (!startsWithArticle) {
            return;
        }

        QAWarning issue = newIssue(descBeginWithArticle, QAWarning.Severity.WARNING, update, descText);
        issue.setProperty("article", leadingWord);
        addIssue(issue);
    }

    /**
     * Builds an issue pre-filled with the properties shared by every check of this class:
     * the example entity, the description text and the language.
     */
    private QAWarning newIssue(String type, QAWarning.Severity severity, ItemUpdate update, String descText) {
        QAWarning issue = new QAWarning(type, null, severity, 1);
        issue.setProperty("example_entity", update.getItemId());
        issue.setProperty("description", descText);
        issue.setProperty("lang", LANG);
        return issue;
    }
}

View File

@ -6,11 +6,21 @@ import org.openrefine.wikidata.updates.ItemUpdateBuilder;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import org.wikidata.wdtk.datamodel.helpers.Datamodel; import org.wikidata.wdtk.datamodel.helpers.Datamodel;
public class DescriptionScrutinizerTest extends ScrutinizerTest { public class EnglishDescriptionScrutinizerTest extends ScrutinizerTest {
@Override @Override
public EditScrutinizer getScrutinizer() { public EditScrutinizer getScrutinizer() {
return new DescriptionScrutinizer(); return new EnglishDescriptionScrutinizer();
}
@Test
public void testGoodDesc() {
String description = "good description";
ItemUpdate update = new ItemUpdateBuilder(TestingData.newIdA)
.addDescription(Datamodel.makeMonolingualTextValue(description, "en"), true)
.build();
scrutinize(update);
assertNoWarningRaised();
} }
@Test @Test
@ -23,48 +33,7 @@ public class DescriptionScrutinizerTest extends ScrutinizerTest {
.addDescription(Datamodel.makeMonolingualTextValue(description, "en"), true) .addDescription(Datamodel.makeMonolingualTextValue(description, "en"), true)
.build(); .build();
scrutinize(update); scrutinize(update);
assertWarningsRaised(DescriptionScrutinizer.descTooLongType); assertWarningsRaised(EnglishDescriptionScrutinizer.descTooLongType);
}
@Test
public void testEndWithPunctuationSign() {
String description = "description with punctuationSign.";
ItemUpdate update = new ItemUpdateBuilder(TestingData.newIdA)
.addDescription(Datamodel.makeMonolingualTextValue(description, "en"), false)
.build();
scrutinize(update);
assertWarningsRaised(DescriptionScrutinizer.descEndsByPunctuationSign);
}
@Test
public void testBeginWithUppercase() {
String description = "Begin with uppercase";
ItemUpdate update = new ItemUpdateBuilder(TestingData.newIdA)
.addDescription(Datamodel.makeMonolingualTextValue(description, "en"), true)
.build();
scrutinize(update);
assertWarningsRaised(DescriptionScrutinizer.descBeginWithUppercase);
}
@Test
public void testBeginWithArticle() {
String description = "an article test";
ItemUpdate update = new ItemUpdateBuilder(TestingData.newIdA)
.addDescription(Datamodel.makeMonolingualTextValue(description, "en"), false)
.build();
scrutinize(update);
assertWarningsRaised(DescriptionScrutinizer.descBeginWithArticle);
}
@Test
public void testIdenticalWithLabel() {
String description = "identical with label";
ItemUpdate update = new ItemUpdateBuilder(TestingData.newIdA)
.addDescription(Datamodel.makeMonolingualTextValue(description, "en"), true)
.addLabel(Datamodel.makeMonolingualTextValue(description, "en"), true)
.build();
scrutinize(update);
assertWarningsRaised(DescriptionScrutinizer.descIdenticalWithLabel);
} }
@Test @Test
@ -79,13 +48,44 @@ public class DescriptionScrutinizerTest extends ScrutinizerTest {
} }
@Test @Test
public void testGoodDesc() { public void testIdenticalWithLabel() {
String description = "good description"; String description = "identical with label";
ItemUpdate update = new ItemUpdateBuilder(TestingData.newIdA)
.addDescription(Datamodel.makeMonolingualTextValue(description, "en"), true)
.addLabel(Datamodel.makeMonolingualTextValue(description, "en"), true)
.build();
scrutinize(update);
assertWarningsRaised(EnglishDescriptionScrutinizer.descIdenticalWithLabel);
}
@Test
public void testEndWithPunctuationSign() {
String description = "description with punctuationSign.";
ItemUpdate update = new ItemUpdateBuilder(TestingData.newIdA)
.addDescription(Datamodel.makeMonolingualTextValue(description, "en"), false)
.build();
scrutinize(update);
assertWarningsRaised(EnglishDescriptionScrutinizer.descEndsByPunctuationSign);
}
@Test
public void testBeginWithUppercase() {
String description = "Begin with uppercase";
ItemUpdate update = new ItemUpdateBuilder(TestingData.newIdA) ItemUpdate update = new ItemUpdateBuilder(TestingData.newIdA)
.addDescription(Datamodel.makeMonolingualTextValue(description, "en"), true) .addDescription(Datamodel.makeMonolingualTextValue(description, "en"), true)
.build(); .build();
scrutinize(update); scrutinize(update);
assertNoWarningRaised(); assertWarningsRaised(EnglishDescriptionScrutinizer.descBeginWithUppercase);
}
@Test
public void testBeginWithArticle() {
String description = "an article test";
ItemUpdate update = new ItemUpdateBuilder(TestingData.newIdA)
.addDescription(Datamodel.makeMonolingualTextValue(description, "en"), false)
.build();
scrutinize(update);
assertWarningsRaised(EnglishDescriptionScrutinizer.descBeginWithArticle);
} }
@Test @Test
@ -99,7 +99,7 @@ public class DescriptionScrutinizerTest extends ScrutinizerTest {
.addLabel(Datamodel.makeMonolingualTextValue(description, "en"), true) .addLabel(Datamodel.makeMonolingualTextValue(description, "en"), true)
.build(); .build();
scrutinize(update); scrutinize(update);
assertWarningsRaised(DescriptionScrutinizer.descTooLongType, DescriptionScrutinizer.descEndsByPunctuationSign, assertWarningsRaised(EnglishDescriptionScrutinizer.descTooLongType, EnglishDescriptionScrutinizer.descEndsByPunctuationSign,
DescriptionScrutinizer.descBeginWithUppercase, DescriptionScrutinizer.descBeginWithArticle, DescriptionScrutinizer.descIdenticalWithLabel); EnglishDescriptionScrutinizer.descBeginWithUppercase, EnglishDescriptionScrutinizer.descBeginWithArticle, EnglishDescriptionScrutinizer.descIdenticalWithLabel);
} }
} }