Merge pull request #2137 from OpenRefine/issue-2136-wikidata-calendar
Improved support for Wikidata dates
This commit is contained in:
commit
29f6d1d14b
@ -154,5 +154,7 @@
|
|||||||
"warnings-messages/no-unit-provided/title": "Unit missing for {property_entity}",
|
"warnings-messages/no-unit-provided/title": "Unit missing for {property_entity}",
|
||||||
"warnings-messages/no-unit-provided/body": "Values such as <span class=\"wb-issue-preformat\">{example_value}</span> on {example_item_entity} are expected to have units.",
|
"warnings-messages/no-unit-provided/body": "Values such as <span class=\"wb-issue-preformat\">{example_value}</span> on {example_item_entity} are expected to have units.",
|
||||||
"warnings-messages/invalid-entity-type/title": "{property_entity} used on items",
|
"warnings-messages/invalid-entity-type/title": "{property_entity} used on items",
|
||||||
"warnings-messages/invalid-entity-type/body": "Uses of {property_entity} on items such as {example_entity} are invalid."
|
"warnings-messages/invalid-entity-type/body": "Uses of {property_entity} on items such as {example_entity} are invalid.",
|
||||||
|
"warnings-messages/early-gregorian-date/title": "Early dates in the Gregorian calendar",
|
||||||
|
"warnings-messages/early-gregorian-date/body": "Dates earlier than October 1582 (such as in year {example_year}) are unlikely to be expressed using the Gregorian calendar. See the <a href=\"https://www.wikidata.org/wiki/Wikidata:Tools/OpenRefine/Editing/Schema_alignment#Dates\" target=\"_blank\">manual</a> to specify the appropriate calendar for your dates."
|
||||||
}
|
}
|
||||||
|
@ -28,6 +28,7 @@ import java.util.List;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.openrefine.wikidata.qa.scrutinizers.CalendarScrutinizer;
|
||||||
import org.openrefine.wikidata.qa.scrutinizers.DistinctValuesScrutinizer;
|
import org.openrefine.wikidata.qa.scrutinizers.DistinctValuesScrutinizer;
|
||||||
import org.openrefine.wikidata.qa.scrutinizers.EditScrutinizer;
|
import org.openrefine.wikidata.qa.scrutinizers.EditScrutinizer;
|
||||||
import org.openrefine.wikidata.qa.scrutinizers.EntityTypeScrutinizer;
|
import org.openrefine.wikidata.qa.scrutinizers.EntityTypeScrutinizer;
|
||||||
@ -79,6 +80,7 @@ public class EditInspector {
|
|||||||
register(new QuantityScrutinizer());
|
register(new QuantityScrutinizer());
|
||||||
register(new RestrictedValuesScrutinizer());
|
register(new RestrictedValuesScrutinizer());
|
||||||
register(new EntityTypeScrutinizer());
|
register(new EntityTypeScrutinizer());
|
||||||
|
register(new CalendarScrutinizer());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -0,0 +1,31 @@
|
|||||||
|
package org.openrefine.wikidata.qa.scrutinizers;
|
||||||
|
|
||||||
|
import org.openrefine.wikidata.qa.QAWarning;
|
||||||
|
import org.wikidata.wdtk.datamodel.helpers.Datamodel;
|
||||||
|
import org.wikidata.wdtk.datamodel.interfaces.TimeValue;
|
||||||
|
import org.wikidata.wdtk.datamodel.interfaces.Value;
|
||||||
|
|
||||||
|
public class CalendarScrutinizer extends ValueScrutinizer {
|
||||||
|
|
||||||
|
public static final String earlyGregorianDateType = "early-gregorian-date";
|
||||||
|
|
||||||
|
public static final TimeValue earliestGregorian = Datamodel.makeTimeValue(
|
||||||
|
1582, (byte)10, (byte)15, (byte)0, (byte)0, (byte)0, (byte)11, 0, 0, 0, TimeValue.CM_GREGORIAN_PRO);
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void scrutinize(Value value) {
|
||||||
|
if(TimeValue.class.isInstance(value)) {
|
||||||
|
TimeValue time = (TimeValue)value;
|
||||||
|
if(time.getPreferredCalendarModel().equals(earliestGregorian.getPreferredCalendarModel()) &&
|
||||||
|
time.getPrecision() >= 10 &&
|
||||||
|
(time.getYear() < earliestGregorian.getYear() ||
|
||||||
|
time.getYear() == earliestGregorian.getYear() && time.getMonth() < earliestGregorian.getMonth() ||
|
||||||
|
time.getYear() == earliestGregorian.getYear() && time.getMonth() == earliestGregorian.getMonth() && time.getDay() < earliestGregorian.getDay())) {
|
||||||
|
QAWarning warning = new QAWarning(earlyGregorianDateType, null, QAWarning.Severity.WARNING, 1);
|
||||||
|
warning.setProperty("example_year", Long.toString(time.getYear()));
|
||||||
|
addIssue(warning);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -30,6 +30,8 @@ import java.util.Calendar;
|
|||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Map.Entry;
|
import java.util.Map.Entry;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import org.jsoup.helper.Validate;
|
import org.jsoup.helper.Validate;
|
||||||
import org.openrefine.wikidata.schema.exceptions.SkipSchemaExpressionException;
|
import org.openrefine.wikidata.schema.exceptions.SkipSchemaExpressionException;
|
||||||
@ -61,6 +63,8 @@ public class WbDateConstant implements WbExpression<TimeValue> {
|
|||||||
.put(new SimpleDateFormat("yyyy-MM"), 10)
|
.put(new SimpleDateFormat("yyyy-MM"), 10)
|
||||||
.put(new SimpleDateFormat("yyyy-MM-dd"), 11)
|
.put(new SimpleDateFormat("yyyy-MM-dd"), 11)
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
|
public static Pattern calendarSuffixPattern = Pattern.compile("_(Q[1-9][0-9]*)$");
|
||||||
|
|
||||||
private TimeValue parsed;
|
private TimeValue parsed;
|
||||||
private String origDatestamp;
|
private String origDatestamp;
|
||||||
@ -100,15 +104,46 @@ public class WbDateConstant implements WbExpression<TimeValue> {
|
|||||||
Date bestDate = null;
|
Date bestDate = null;
|
||||||
int precision = 0; // default precision (will be overridden if successfully parsed)
|
int precision = 0; // default precision (will be overridden if successfully parsed)
|
||||||
int maxLength = 0; // the maximum length parsed
|
int maxLength = 0; // the maximum length parsed
|
||||||
|
String calendarIri = TimeValue.CM_GREGORIAN_PRO; // Gregorian calendar is assumed by default
|
||||||
|
|
||||||
|
String trimmedDatestamp = datestamp.trim();
|
||||||
|
|
||||||
|
if("TODAY".equals(trimmedDatestamp)) {
|
||||||
|
Calendar calendar = Calendar.getInstance();
|
||||||
|
TimeValue todaysDate = Datamodel.makeTimeValue(
|
||||||
|
calendar.get(Calendar.YEAR),
|
||||||
|
(byte)calendar.get(Calendar.MONTH),
|
||||||
|
(byte)calendar.get(Calendar.DAY_OF_MONTH),
|
||||||
|
(byte)0, (byte)0, (byte)0, (byte)11, 0,0,0, TimeValue.CM_GREGORIAN_PRO);
|
||||||
|
return todaysDate;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
for (Entry<SimpleDateFormat, Integer> entry : acceptedFormats.entrySet()) {
|
for (Entry<SimpleDateFormat, Integer> entry : acceptedFormats.entrySet()) {
|
||||||
ParsePosition position = new ParsePosition(0);
|
ParsePosition position = new ParsePosition(0);
|
||||||
String trimmedDatestamp = datestamp.trim();
|
|
||||||
Date date = entry.getKey().parse(trimmedDatestamp, position);
|
Date date = entry.getKey().parse(trimmedDatestamp, position);
|
||||||
|
|
||||||
|
if (date == null) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Potentially parse the calendar Qid after the date
|
||||||
|
int consumedUntil = position.getIndex();
|
||||||
|
if(consumedUntil < trimmedDatestamp.length()) {
|
||||||
|
Matcher matcher = calendarSuffixPattern.matcher(
|
||||||
|
trimmedDatestamp.subSequence(position.getIndex(), trimmedDatestamp.length()));
|
||||||
|
if(matcher.find()) {
|
||||||
|
String calendarQid = matcher.group(1);
|
||||||
|
calendarIri = Datamodel.SITE_WIKIDATA + calendarQid;
|
||||||
|
consumedUntil = trimmedDatestamp.length();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Ignore parses which failed or do not consume all the input
|
// Ignore parses which failed or do not consume all the input
|
||||||
if (date != null && position.getIndex() > maxLength
|
if (date != null && position.getIndex() > maxLength
|
||||||
// only allow to partially consume the input if the precision is more than a year
|
// only allow to partially consume the input if the precision is day and followed by a T (as in ISO)
|
||||||
&& (entry.getValue() > 9 || position.getIndex() == trimmedDatestamp.length())) {
|
&& (consumedUntil == trimmedDatestamp.length()
|
||||||
|
|| (entry.getValue() == 11 && trimmedDatestamp.charAt(consumedUntil) == 'T'))) {
|
||||||
precision = entry.getValue();
|
precision = entry.getValue();
|
||||||
bestDate = date;
|
bestDate = date;
|
||||||
maxLength = position.getIndex();
|
maxLength = position.getIndex();
|
||||||
@ -123,7 +158,7 @@ public class WbDateConstant implements WbExpression<TimeValue> {
|
|||||||
return Datamodel.makeTimeValue(calendar.get(Calendar.YEAR), (byte) (calendar.get(Calendar.MONTH) + 1),
|
return Datamodel.makeTimeValue(calendar.get(Calendar.YEAR), (byte) (calendar.get(Calendar.MONTH) + 1),
|
||||||
(byte) calendar.get(Calendar.DAY_OF_MONTH), (byte) calendar.get(Calendar.HOUR_OF_DAY),
|
(byte) calendar.get(Calendar.DAY_OF_MONTH), (byte) calendar.get(Calendar.HOUR_OF_DAY),
|
||||||
(byte) calendar.get(Calendar.MINUTE), (byte) calendar.get(Calendar.SECOND), (byte) precision, 0, 0,
|
(byte) calendar.get(Calendar.MINUTE), (byte) calendar.get(Calendar.SECOND), (byte) precision, 0, 0,
|
||||||
0, TimeValue.CM_GREGORIAN_PRO);
|
0, calendarIri);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -0,0 +1,37 @@
|
|||||||
|
package org.openrefine.wikidata.qa.scrutinizers;
|
||||||
|
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
import org.wikidata.wdtk.datamodel.helpers.Datamodel;
|
||||||
|
import org.wikidata.wdtk.datamodel.interfaces.TimeValue;
|
||||||
|
|
||||||
|
public class CalendarScrutinizerTest extends ValueScrutinizerTest {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public EditScrutinizer getScrutinizer() {
|
||||||
|
return new CalendarScrutinizer();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testScrutinizeRecentValue() {
|
||||||
|
scrutinize(Datamodel.makeTimeValue(1978L, (byte)3, (byte)4, (byte)0, (byte)0, (byte)0, 11, TimeValue.CM_GREGORIAN_PRO));
|
||||||
|
assertNoWarningRaised();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testScrutinizeCloseValue() {
|
||||||
|
scrutinize(Datamodel.makeTimeValue(1582L, (byte)10, (byte)17, (byte)0, (byte)0, (byte)0, 11, TimeValue.CM_GREGORIAN_PRO));
|
||||||
|
assertNoWarningRaised();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testScrutinizeEarlyYear() {
|
||||||
|
scrutinize(Datamodel.makeTimeValue(1400L, (byte)1, (byte)1, (byte)0, (byte)0, (byte)0, (byte)9, 0, 0, 0, TimeValue.CM_GREGORIAN_PRO));
|
||||||
|
assertNoWarningRaised();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testScrutinizeEarlyDay() {
|
||||||
|
scrutinize(Datamodel.makeTimeValue(1440L, (byte)10, (byte)17, (byte)0, (byte)0, (byte)0, 11, TimeValue.CM_GREGORIAN_PRO));
|
||||||
|
assertWarningsRaised(CalendarScrutinizer.earlyGregorianDateType);
|
||||||
|
}
|
||||||
|
}
|
@ -23,6 +23,8 @@
|
|||||||
******************************************************************************/
|
******************************************************************************/
|
||||||
package org.openrefine.wikidata.schema;
|
package org.openrefine.wikidata.schema;
|
||||||
|
|
||||||
|
import java.util.Calendar;
|
||||||
|
|
||||||
import org.openrefine.wikidata.testing.JacksonSerializationTest;
|
import org.openrefine.wikidata.testing.JacksonSerializationTest;
|
||||||
import org.testng.annotations.Test;
|
import org.testng.annotations.Test;
|
||||||
import org.wikidata.wdtk.datamodel.helpers.Datamodel;
|
import org.wikidata.wdtk.datamodel.helpers.Datamodel;
|
||||||
@ -40,6 +42,12 @@ public class WbDateConstantTest extends WbExpressionTest<TimeValue> {
|
|||||||
private WbDateConstant second = new WbDateConstant("2017-01-03T04:12:45");
|
private WbDateConstant second = new WbDateConstant("2017-01-03T04:12:45");
|
||||||
private WbDateConstant secondz = new WbDateConstant("2017-01-03T04:12:45Z");
|
private WbDateConstant secondz = new WbDateConstant("2017-01-03T04:12:45Z");
|
||||||
|
|
||||||
|
private WbDateConstant julianDay = new WbDateConstant("1324-02-27_Q1985786");
|
||||||
|
private WbDateConstant julianMonth = new WbDateConstant("1324-02_Q1985786");
|
||||||
|
private WbDateConstant julianYear = new WbDateConstant("1324_Q1985786");
|
||||||
|
private WbDateConstant julianDecade = new WbDateConstant("1320D_Q1985786");
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testSerialize() {
|
public void testSerialize() {
|
||||||
JacksonSerializationTest.canonicalSerialization(WbExpression.class, year,
|
JacksonSerializationTest.canonicalSerialization(WbExpression.class, year,
|
||||||
@ -70,6 +78,26 @@ public class WbDateConstantTest extends WbExpressionTest<TimeValue> {
|
|||||||
|
|
||||||
evaluatesTo(Datamodel.makeTimeValue(2018, (byte) 2, (byte) 27, (byte) 0, (byte) 0, (byte) 0, (byte) 11, 0, 0, 0,
|
evaluatesTo(Datamodel.makeTimeValue(2018, (byte) 2, (byte) 27, (byte) 0, (byte) 0, (byte) 0, (byte) 11, 0, 0, 0,
|
||||||
TimeValue.CM_GREGORIAN_PRO), whitespace);
|
TimeValue.CM_GREGORIAN_PRO), whitespace);
|
||||||
|
|
||||||
|
evaluatesTo(Datamodel.makeTimeValue(1320, (byte) 1, (byte) 1, (byte) 0, (byte) 0, (byte) 0, (byte) 8, 0, 0, 0,
|
||||||
|
TimeValue.CM_JULIAN_PRO), julianDecade);
|
||||||
|
evaluatesTo(Datamodel.makeTimeValue(1324, (byte) 1, (byte) 1, (byte) 0, (byte) 0, (byte) 0, (byte) 9, 0, 0, 0,
|
||||||
|
TimeValue.CM_JULIAN_PRO), julianYear);
|
||||||
|
evaluatesTo(Datamodel.makeTimeValue(1324, (byte) 2, (byte) 1, (byte) 0, (byte) 0, (byte) 0, (byte) 10, 0, 0, 0,
|
||||||
|
TimeValue.CM_JULIAN_PRO), julianMonth);
|
||||||
|
evaluatesTo(Datamodel.makeTimeValue(1324, (byte) 2, (byte) 27, (byte) 0, (byte) 0, (byte) 0, (byte) 11, 0, 0, 0,
|
||||||
|
TimeValue.CM_JULIAN_PRO), julianDay);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testToday() {
|
||||||
|
Calendar calendar = Calendar.getInstance();
|
||||||
|
TimeValue expectedDate = Datamodel.makeTimeValue(
|
||||||
|
calendar.get(Calendar.YEAR),
|
||||||
|
(byte)calendar.get(Calendar.MONTH),
|
||||||
|
(byte)calendar.get(Calendar.DAY_OF_MONTH),
|
||||||
|
(byte)0, (byte)0, (byte)0, (byte)11, 0,0,0, TimeValue.CM_GREGORIAN_PRO);
|
||||||
|
evaluatesTo(expectedDate, new WbDateConstant("TODAY"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(expectedExceptions = IllegalArgumentException.class)
|
@Test(expectedExceptions = IllegalArgumentException.class)
|
||||||
@ -81,4 +109,9 @@ public class WbDateConstantTest extends WbExpressionTest<TimeValue> {
|
|||||||
public void testPartlyValid() {
|
public void testPartlyValid() {
|
||||||
new WbDateConstant("2018-partly valid");
|
new WbDateConstant("2018-partly valid");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test(expectedExceptions = IllegalArgumentException.class)
|
||||||
|
public void testInvalidCalendar() {
|
||||||
|
new WbDateConstant("2018-01-02_P234");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user