Add support for custom calendar in Wikidata date parsing
This commit is contained in:
parent
001dc584f0
commit
43980e69dd
@ -30,10 +30,13 @@ import java.util.Calendar;
|
||||
import java.util.Date;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.jsoup.helper.Validate;
|
||||
import org.openrefine.wikidata.schema.exceptions.SkipSchemaExpressionException;
|
||||
import org.wikidata.wdtk.datamodel.helpers.Datamodel;
|
||||
import org.wikidata.wdtk.datamodel.interfaces.EntityIdValue;
|
||||
import org.wikidata.wdtk.datamodel.interfaces.TimeValue;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonCreator;
|
||||
@ -61,6 +64,8 @@ public class WbDateConstant implements WbExpression<TimeValue> {
|
||||
.put(new SimpleDateFormat("yyyy-MM"), 10)
|
||||
.put(new SimpleDateFormat("yyyy-MM-dd"), 11)
|
||||
.build();
|
||||
|
||||
// Matches an optional calendar suffix at the very end of a datestamp: an underscore
// followed by a Q-identifier ("Q" plus a positive integer without leading zero).
// Capturing group 1 holds the Q-identifier itself, without the underscore.
public static Pattern calendarSuffixPattern = Pattern.compile("_(Q[1-9][0-9]*)$");
|
||||
|
||||
private TimeValue parsed;
|
||||
private String origDatestamp;
|
||||
@ -100,15 +105,33 @@ public class WbDateConstant implements WbExpression<TimeValue> {
|
||||
Date bestDate = null;
|
||||
int precision = 0; // default precision (will be overridden if successfully parsed)
|
||||
int maxLength = 0; // the maximum length parsed
|
||||
String calendarIri = TimeValue.CM_GREGORIAN_PRO; // Gregorian calendar is assumed by default
|
||||
for (Entry<SimpleDateFormat, Integer> entry : acceptedFormats.entrySet()) {
|
||||
ParsePosition position = new ParsePosition(0);
|
||||
String trimmedDatestamp = datestamp.trim();
|
||||
Date date = entry.getKey().parse(trimmedDatestamp, position);
|
||||
|
||||
if (date == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Potentially parse the calendar Qid after the date
|
||||
int consumedUntil = position.getIndex();
|
||||
if(consumedUntil < trimmedDatestamp.length()) {
|
||||
Matcher matcher = calendarSuffixPattern.matcher(
|
||||
trimmedDatestamp.subSequence(position.getIndex(), trimmedDatestamp.length()));
|
||||
if(matcher.find()) {
|
||||
String calendarQid = matcher.group(1);
|
||||
calendarIri = Datamodel.SITE_WIKIDATA + calendarQid;
|
||||
consumedUntil = trimmedDatestamp.length();
|
||||
}
|
||||
}
|
||||
|
||||
// Ignore parses which failed or do not consume all the input
|
||||
if (date != null && position.getIndex() > maxLength
|
||||
// only allow to partially consume the input if the precision is more than a year
|
||||
&& (entry.getValue() > 9 || position.getIndex() == trimmedDatestamp.length())) {
|
||||
// only allow to partially consume the input if the precision is day and followed by a T (as in ISO)
|
||||
&& (consumedUntil == trimmedDatestamp.length()
|
||||
|| (entry.getValue() == 11 && trimmedDatestamp.charAt(consumedUntil) == 'T'))) {
|
||||
precision = entry.getValue();
|
||||
bestDate = date;
|
||||
maxLength = position.getIndex();
|
||||
@ -123,7 +146,7 @@ public class WbDateConstant implements WbExpression<TimeValue> {
|
||||
return Datamodel.makeTimeValue(calendar.get(Calendar.YEAR), (byte) (calendar.get(Calendar.MONTH) + 1),
|
||||
(byte) calendar.get(Calendar.DAY_OF_MONTH), (byte) calendar.get(Calendar.HOUR_OF_DAY),
|
||||
(byte) calendar.get(Calendar.MINUTE), (byte) calendar.get(Calendar.SECOND), (byte) precision, 0, 0,
|
||||
0, TimeValue.CM_GREGORIAN_PRO);
|
||||
0, calendarIri);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -40,6 +40,12 @@ public class WbDateConstantTest extends WbExpressionTest<TimeValue> {
|
||||
private WbDateConstant second = new WbDateConstant("2017-01-03T04:12:45");
|
||||
private WbDateConstant secondz = new WbDateConstant("2017-01-03T04:12:45Z");
|
||||
|
||||
private WbDateConstant julianDay = new WbDateConstant("1324-02-27_Q1985786");
|
||||
private WbDateConstant julianMonth = new WbDateConstant("1324-02_Q1985786");
|
||||
private WbDateConstant julianYear = new WbDateConstant("1324_Q1985786");
|
||||
private WbDateConstant julianDecade = new WbDateConstant("1320D_Q1985786");
|
||||
|
||||
|
||||
@Test
|
||||
public void testSerialize() {
|
||||
JacksonSerializationTest.canonicalSerialization(WbExpression.class, year,
|
||||
@ -70,6 +76,15 @@ public class WbDateConstantTest extends WbExpressionTest<TimeValue> {
|
||||
|
||||
evaluatesTo(Datamodel.makeTimeValue(2018, (byte) 2, (byte) 27, (byte) 0, (byte) 0, (byte) 0, (byte) 11, 0, 0, 0,
|
||||
TimeValue.CM_GREGORIAN_PRO), whitespace);
|
||||
|
||||
evaluatesTo(Datamodel.makeTimeValue(1320, (byte) 1, (byte) 1, (byte) 0, (byte) 0, (byte) 0, (byte) 8, 0, 0, 0,
|
||||
TimeValue.CM_JULIAN_PRO), julianDecade);
|
||||
evaluatesTo(Datamodel.makeTimeValue(1324, (byte) 1, (byte) 1, (byte) 0, (byte) 0, (byte) 0, (byte) 9, 0, 0, 0,
|
||||
TimeValue.CM_JULIAN_PRO), julianYear);
|
||||
evaluatesTo(Datamodel.makeTimeValue(1324, (byte) 2, (byte) 1, (byte) 0, (byte) 0, (byte) 0, (byte) 10, 0, 0, 0,
|
||||
TimeValue.CM_JULIAN_PRO), julianMonth);
|
||||
evaluatesTo(Datamodel.makeTimeValue(1324, (byte) 2, (byte) 27, (byte) 0, (byte) 0, (byte) 0, (byte) 11, 0, 0, 0,
|
||||
TimeValue.CM_JULIAN_PRO), julianDay);
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = IllegalArgumentException.class)
|
||||
@ -81,4 +96,9 @@ public class WbDateConstantTest extends WbExpressionTest<TimeValue> {
|
||||
public void testPartlyValid() {
|
||||
new WbDateConstant("2018-partly valid");
|
||||
}
|
||||
|
||||
// A well-formed day-precision date followed by an underscore suffix that is not a
// Q-identifier ("P234" starts with P, not Q): constructing the constant is expected
// to fail with an IllegalArgumentException instead of silently ignoring the suffix.
@Test(expectedExceptions = IllegalArgumentException.class)
|
||||
public void testInvalidCalendar() {
|
||||
new WbDateConstant("2018-01-02_P234");
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user