1190 lines
38 KiB
Cython
1190 lines
38 KiB
Cython
|
"""
|
|||
|
Parsing functions for datetime and datetime-like strings.
|
|||
|
"""
|
|||
|
import re
|
|||
|
import time
|
|||
|
import warnings
|
|||
|
|
|||
|
from pandas.util._exceptions import find_stack_level
|
|||
|
|
|||
|
cimport cython
|
|||
|
from cpython.datetime cimport (
|
|||
|
datetime,
|
|||
|
datetime_new,
|
|||
|
import_datetime,
|
|||
|
timedelta,
|
|||
|
tzinfo,
|
|||
|
)
|
|||
|
|
|||
|
from datetime import timezone
|
|||
|
|
|||
|
from cpython.object cimport PyObject_Str
|
|||
|
from cython cimport Py_ssize_t
|
|||
|
from libc.string cimport strchr
|
|||
|
|
|||
|
import_datetime()
|
|||
|
|
|||
|
import numpy as np
|
|||
|
|
|||
|
cimport numpy as cnp
|
|||
|
from numpy cimport (
|
|||
|
PyArray_GETITEM,
|
|||
|
PyArray_ITER_DATA,
|
|||
|
PyArray_ITER_NEXT,
|
|||
|
PyArray_IterNew,
|
|||
|
flatiter,
|
|||
|
float64_t,
|
|||
|
)
|
|||
|
|
|||
|
cnp.import_array()
|
|||
|
|
|||
|
# dateutil compat
|
|||
|
|
|||
|
from decimal import InvalidOperation
|
|||
|
|
|||
|
from dateutil.parser import (
|
|||
|
DEFAULTPARSER,
|
|||
|
parse as du_parse,
|
|||
|
)
|
|||
|
from dateutil.relativedelta import relativedelta
|
|||
|
from dateutil.tz import (
|
|||
|
tzlocal as _dateutil_tzlocal,
|
|||
|
tzoffset,
|
|||
|
tzutc as _dateutil_tzutc,
|
|||
|
)
|
|||
|
|
|||
|
from pandas._config import get_option
|
|||
|
|
|||
|
from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS
|
|||
|
from pandas._libs.tslibs.dtypes cimport (
|
|||
|
attrname_to_npy_unit,
|
|||
|
npy_unit_to_attrname,
|
|||
|
)
|
|||
|
from pandas._libs.tslibs.nattype cimport (
|
|||
|
c_NaT as NaT,
|
|||
|
c_nat_strings as nat_strings,
|
|||
|
)
|
|||
|
|
|||
|
from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime
|
|||
|
|
|||
|
from pandas._libs.tslibs.np_datetime cimport (
|
|||
|
NPY_DATETIMEUNIT,
|
|||
|
npy_datetimestruct,
|
|||
|
string_to_dts,
|
|||
|
)
|
|||
|
|
|||
|
from pandas._libs.tslibs.strptime import array_strptime
|
|||
|
|
|||
|
from pandas._libs.tslibs.util cimport (
|
|||
|
get_c_string_buf_and_size,
|
|||
|
is_array,
|
|||
|
)
|
|||
|
|
|||
|
|
|||
|
cdef extern from "../src/headers/portable.h":
|
|||
|
int getdigit_ascii(char c, int default) nogil
|
|||
|
|
|||
|
cdef extern from "../src/parser/tokenizer.h":
|
|||
|
double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,
|
|||
|
int skip_trailing, int *error, int *maybe_int)
|
|||
|
|
|||
|
|
|||
|
# ----------------------------------------------------------------------
|
|||
|
# Constants
|
|||
|
|
|||
|
|
|||
|
class DateParseError(ValueError):
|
|||
|
pass
|
|||
|
|
|||
|
|
|||
|
_DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0,
|
|||
|
second=0, microsecond=0)
|
|||
|
|
|||
|
cdef:
|
|||
|
set _not_datelike_strings = {"a", "A", "m", "M", "p", "P", "t", "T"}
|
|||
|
|
|||
|
# _timestamp_units -> units that we round to nanos
|
|||
|
set _timestamp_units = {
|
|||
|
NPY_DATETIMEUNIT.NPY_FR_ns,
|
|||
|
NPY_DATETIMEUNIT.NPY_FR_ps,
|
|||
|
NPY_DATETIMEUNIT.NPY_FR_fs,
|
|||
|
NPY_DATETIMEUNIT.NPY_FR_as,
|
|||
|
}
|
|||
|
|
|||
|
# ----------------------------------------------------------------------
|
|||
|
cdef:
|
|||
|
const char* delimiters = " /-."
|
|||
|
int MAX_DAYS_IN_MONTH = 31, MAX_MONTH = 12
|
|||
|
|
|||
|
|
|||
|
cdef bint _is_delimiter(const char ch):
|
|||
|
return strchr(delimiters, ch) != NULL
|
|||
|
|
|||
|
|
|||
|
cdef int _parse_1digit(const char* s):
|
|||
|
cdef int result = 0
|
|||
|
result += getdigit_ascii(s[0], -10) * 1
|
|||
|
return result
|
|||
|
|
|||
|
|
|||
|
cdef int _parse_2digit(const char* s):
|
|||
|
cdef int result = 0
|
|||
|
result += getdigit_ascii(s[0], -10) * 10
|
|||
|
result += getdigit_ascii(s[1], -100) * 1
|
|||
|
return result
|
|||
|
|
|||
|
|
|||
|
cdef int _parse_4digit(const char* s):
|
|||
|
cdef int result = 0
|
|||
|
result += getdigit_ascii(s[0], -10) * 1000
|
|||
|
result += getdigit_ascii(s[1], -100) * 100
|
|||
|
result += getdigit_ascii(s[2], -1000) * 10
|
|||
|
result += getdigit_ascii(s[3], -10000) * 1
|
|||
|
return result
|
|||
|
|
|||
|
|
|||
|
cdef datetime _parse_delimited_date(
|
|||
|
str date_string, bint dayfirst, NPY_DATETIMEUNIT* out_bestunit
|
|||
|
):
|
|||
|
"""
|
|||
|
Parse special cases of dates: MM/DD/YYYY, DD/MM/YYYY, MM/YYYY.
|
|||
|
|
|||
|
At the beginning function tries to parse date in MM/DD/YYYY format, but
|
|||
|
if month > 12 - in DD/MM/YYYY (`dayfirst == False`).
|
|||
|
With `dayfirst == True` function makes an attempt to parse date in
|
|||
|
DD/MM/YYYY, if an attempt is wrong - in DD/MM/YYYY
|
|||
|
|
|||
|
For MM/DD/YYYY, DD/MM/YYYY: delimiter can be a space or one of /-.
|
|||
|
For MM/YYYY: delimiter can be a space or one of /-
|
|||
|
If `date_string` can't be converted to date, then function returns
|
|||
|
None, None
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
date_string : str
|
|||
|
dayfirst : bool
|
|||
|
out_bestunit : NPY_DATETIMEUNIT*
|
|||
|
For specifying identified resolution.
|
|||
|
|
|||
|
Returns:
|
|||
|
--------
|
|||
|
datetime or None
|
|||
|
"""
|
|||
|
cdef:
|
|||
|
const char* buf
|
|||
|
Py_ssize_t length
|
|||
|
int day = 1, month = 1, year
|
|||
|
bint can_swap = 0
|
|||
|
|
|||
|
buf = get_c_string_buf_and_size(date_string, &length)
|
|||
|
if length == 10 and _is_delimiter(buf[2]) and _is_delimiter(buf[5]):
|
|||
|
# parsing MM?DD?YYYY and DD?MM?YYYY dates
|
|||
|
month = _parse_2digit(buf)
|
|||
|
day = _parse_2digit(buf + 3)
|
|||
|
year = _parse_4digit(buf + 6)
|
|||
|
out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_D
|
|||
|
can_swap = 1
|
|||
|
elif length == 9 and _is_delimiter(buf[1]) and _is_delimiter(buf[4]):
|
|||
|
# parsing M?DD?YYYY and D?MM?YYYY dates
|
|||
|
month = _parse_1digit(buf)
|
|||
|
day = _parse_2digit(buf + 2)
|
|||
|
year = _parse_4digit(buf + 5)
|
|||
|
out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_D
|
|||
|
can_swap = 1
|
|||
|
elif length == 9 and _is_delimiter(buf[2]) and _is_delimiter(buf[4]):
|
|||
|
# parsing MM?D?YYYY and DD?M?YYYY dates
|
|||
|
month = _parse_2digit(buf)
|
|||
|
day = _parse_1digit(buf + 3)
|
|||
|
year = _parse_4digit(buf + 5)
|
|||
|
out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_D
|
|||
|
can_swap = 1
|
|||
|
elif length == 8 and _is_delimiter(buf[1]) and _is_delimiter(buf[3]):
|
|||
|
# parsing M?D?YYYY and D?M?YYYY dates
|
|||
|
month = _parse_1digit(buf)
|
|||
|
day = _parse_1digit(buf + 2)
|
|||
|
year = _parse_4digit(buf + 4)
|
|||
|
out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_D
|
|||
|
can_swap = 1
|
|||
|
elif length == 7 and _is_delimiter(buf[2]):
|
|||
|
# parsing MM?YYYY dates
|
|||
|
if buf[2] == b".":
|
|||
|
# we cannot reliably tell whether e.g. 10.2010 is a float
|
|||
|
# or a date, thus we refuse to parse it here
|
|||
|
return None
|
|||
|
month = _parse_2digit(buf)
|
|||
|
year = _parse_4digit(buf + 3)
|
|||
|
out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_M
|
|||
|
else:
|
|||
|
return None
|
|||
|
|
|||
|
if month < 0 or day < 0 or year < 1000:
|
|||
|
# some part is not an integer, so
|
|||
|
# date_string can't be converted to date, above format
|
|||
|
return None
|
|||
|
|
|||
|
if 1 <= month <= MAX_DAYS_IN_MONTH and 1 <= day <= MAX_DAYS_IN_MONTH \
|
|||
|
and (month <= MAX_MONTH or day <= MAX_MONTH):
|
|||
|
if (month > MAX_MONTH or (day <= MAX_MONTH and dayfirst)) and can_swap:
|
|||
|
day, month = month, day
|
|||
|
# In Python <= 3.6.0 there is no range checking for invalid dates
|
|||
|
# in C api, thus we call faster C version for 3.6.1 or newer
|
|||
|
return datetime_new(year, month, day, 0, 0, 0, 0, None)
|
|||
|
|
|||
|
raise DateParseError(f"Invalid date specified ({month}/{day})")
|
|||
|
|
|||
|
|
|||
|
cdef bint _does_string_look_like_time(str parse_string):
|
|||
|
"""
|
|||
|
Checks whether given string is a time: it has to start either from
|
|||
|
H:MM or from HH:MM, and hour and minute values must be valid.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
parse_string : str
|
|||
|
|
|||
|
Returns:
|
|||
|
--------
|
|||
|
bool
|
|||
|
Whether given string is potentially a time.
|
|||
|
"""
|
|||
|
cdef:
|
|||
|
const char* buf
|
|||
|
Py_ssize_t length
|
|||
|
int hour = -1, minute = -1
|
|||
|
|
|||
|
buf = get_c_string_buf_and_size(parse_string, &length)
|
|||
|
if length >= 4:
|
|||
|
if buf[1] == b":":
|
|||
|
# h:MM format
|
|||
|
hour = getdigit_ascii(buf[0], -1)
|
|||
|
minute = _parse_2digit(buf + 2)
|
|||
|
elif buf[2] == b":":
|
|||
|
# HH:MM format
|
|||
|
hour = _parse_2digit(buf)
|
|||
|
minute = _parse_2digit(buf + 3)
|
|||
|
|
|||
|
return 0 <= hour <= 23 and 0 <= minute <= 59
|
|||
|
|
|||
|
|
|||
|
def py_parse_datetime_string(
|
|||
|
str date_string, bint dayfirst=False, bint yearfirst=False
|
|||
|
):
|
|||
|
# Python-accessible version for testing (we can't just make
|
|||
|
# parse_datetime_string cpdef bc it has a pointer argument)
|
|||
|
cdef:
|
|||
|
NPY_DATETIMEUNIT out_bestunit
|
|||
|
|
|||
|
return parse_datetime_string(date_string, dayfirst, yearfirst, &out_bestunit)
|
|||
|
|
|||
|
|
|||
|
cdef datetime parse_datetime_string(
|
|||
|
# NB: This will break with np.str_ (GH#32264) even though
|
|||
|
# isinstance(npstrobj, str) evaluates to True, so caller must ensure
|
|||
|
# the argument is *exactly* 'str'
|
|||
|
str date_string,
|
|||
|
bint dayfirst,
|
|||
|
bint yearfirst,
|
|||
|
NPY_DATETIMEUNIT* out_bestunit
|
|||
|
):
|
|||
|
"""
|
|||
|
Parse datetime string, only returns datetime.
|
|||
|
Also cares special handling matching time patterns.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
datetime
|
|||
|
|
|||
|
Notes
|
|||
|
-----
|
|||
|
Does not handle "today" or "now", which caller is responsible for handling.
|
|||
|
"""
|
|||
|
|
|||
|
cdef:
|
|||
|
datetime dt
|
|||
|
bint is_quarter = 0
|
|||
|
|
|||
|
if not _does_string_look_like_datetime(date_string):
|
|||
|
raise ValueError(f'Given date string "{date_string}" not likely a datetime')
|
|||
|
|
|||
|
if _does_string_look_like_time(date_string):
|
|||
|
# use current datetime as default, not pass _DEFAULT_DATETIME
|
|||
|
dt = du_parse(date_string, dayfirst=dayfirst,
|
|||
|
yearfirst=yearfirst)
|
|||
|
return dt
|
|||
|
|
|||
|
dt = _parse_delimited_date(date_string, dayfirst, out_bestunit)
|
|||
|
if dt is not None:
|
|||
|
return dt
|
|||
|
|
|||
|
try:
|
|||
|
dt = _parse_dateabbr_string(
|
|||
|
date_string, _DEFAULT_DATETIME, None, out_bestunit, &is_quarter
|
|||
|
)
|
|||
|
return dt
|
|||
|
except DateParseError:
|
|||
|
raise
|
|||
|
except ValueError:
|
|||
|
pass
|
|||
|
|
|||
|
dt = dateutil_parse(date_string, default=_DEFAULT_DATETIME,
|
|||
|
dayfirst=dayfirst, yearfirst=yearfirst,
|
|||
|
ignoretz=False, out_bestunit=out_bestunit)
|
|||
|
return dt
|
|||
|
|
|||
|
|
|||
|
def parse_datetime_string_with_reso(
|
|||
|
str date_string, str freq=None, dayfirst=None, yearfirst=None
|
|||
|
):
|
|||
|
# NB: This will break with np.str_ (GH#45580) even though
|
|||
|
# isinstance(npstrobj, str) evaluates to True, so caller must ensure
|
|||
|
# the argument is *exactly* 'str'
|
|||
|
"""
|
|||
|
Try hard to parse datetime string, leveraging dateutil plus some extra
|
|||
|
goodies like quarter recognition.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
date_string : str
|
|||
|
freq : str or None, default None
|
|||
|
Helps with interpreting time string if supplied
|
|||
|
Corresponds to `offset.rule_code`
|
|||
|
dayfirst : bool, default None
|
|||
|
If None uses default from print_config
|
|||
|
yearfirst : bool, default None
|
|||
|
If None uses default from print_config
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
datetime
|
|||
|
str
|
|||
|
Describing resolution of parsed string.
|
|||
|
|
|||
|
Raises
|
|||
|
------
|
|||
|
ValueError : preliminary check suggests string is not datetime
|
|||
|
DateParseError : error within dateutil
|
|||
|
"""
|
|||
|
|
|||
|
if dayfirst is None:
|
|||
|
dayfirst = get_option("display.date_dayfirst")
|
|||
|
if yearfirst is None:
|
|||
|
yearfirst = get_option("display.date_yearfirst")
|
|||
|
|
|||
|
cdef:
|
|||
|
datetime parsed
|
|||
|
str reso
|
|||
|
bint string_to_dts_failed
|
|||
|
npy_datetimestruct dts
|
|||
|
NPY_DATETIMEUNIT out_bestunit
|
|||
|
int out_local = 0
|
|||
|
int out_tzoffset
|
|||
|
tzinfo tz
|
|||
|
bint is_quarter = 0
|
|||
|
|
|||
|
if not _does_string_look_like_datetime(date_string):
|
|||
|
raise ValueError(f'Given date string "{date_string}" not likely a datetime')
|
|||
|
|
|||
|
# Try iso8601 first, as it handles nanoseconds
|
|||
|
string_to_dts_failed = string_to_dts(
|
|||
|
date_string, &dts, &out_bestunit, &out_local,
|
|||
|
&out_tzoffset, False
|
|||
|
)
|
|||
|
if not string_to_dts_failed:
|
|||
|
# Match Timestamp and drop picoseconds, femtoseconds, attoseconds
|
|||
|
# The new resolution will just be nano
|
|||
|
# GH#50417
|
|||
|
if out_bestunit in _timestamp_units:
|
|||
|
out_bestunit = NPY_DATETIMEUNIT.NPY_FR_ns
|
|||
|
|
|||
|
if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns:
|
|||
|
# TODO: avoid circular import
|
|||
|
from pandas import Timestamp
|
|||
|
parsed = Timestamp(date_string)
|
|||
|
else:
|
|||
|
if out_local:
|
|||
|
tz = timezone(timedelta(minutes=out_tzoffset))
|
|||
|
else:
|
|||
|
tz = None
|
|||
|
parsed = datetime_new(
|
|||
|
dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz
|
|||
|
)
|
|||
|
|
|||
|
reso = npy_unit_to_attrname[out_bestunit]
|
|||
|
return parsed, reso
|
|||
|
|
|||
|
parsed = _parse_delimited_date(date_string, dayfirst, &out_bestunit)
|
|||
|
if parsed is not None:
|
|||
|
reso = npy_unit_to_attrname[out_bestunit]
|
|||
|
return parsed, reso
|
|||
|
|
|||
|
try:
|
|||
|
parsed = _parse_dateabbr_string(
|
|||
|
date_string, _DEFAULT_DATETIME, freq, &out_bestunit, &is_quarter
|
|||
|
)
|
|||
|
except DateParseError:
|
|||
|
raise
|
|||
|
except ValueError:
|
|||
|
pass
|
|||
|
else:
|
|||
|
if is_quarter:
|
|||
|
reso = "quarter"
|
|||
|
else:
|
|||
|
reso = npy_unit_to_attrname[out_bestunit]
|
|||
|
return parsed, reso
|
|||
|
|
|||
|
parsed = dateutil_parse(date_string, _DEFAULT_DATETIME,
|
|||
|
dayfirst=dayfirst, yearfirst=yearfirst,
|
|||
|
ignoretz=False, out_bestunit=&out_bestunit)
|
|||
|
reso = npy_unit_to_attrname[out_bestunit]
|
|||
|
return parsed, reso
|
|||
|
|
|||
|
|
|||
|
cpdef bint _does_string_look_like_datetime(str py_string):
|
|||
|
"""
|
|||
|
Checks whether given string is a datetime: it has to start with '0' or
|
|||
|
be greater than 1000.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
py_string: str
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
bool
|
|||
|
Whether given string is potentially a datetime.
|
|||
|
"""
|
|||
|
cdef:
|
|||
|
const char *buf
|
|||
|
char *endptr = NULL
|
|||
|
Py_ssize_t length = -1
|
|||
|
double converted_date
|
|||
|
char first
|
|||
|
int error = 0
|
|||
|
|
|||
|
buf = get_c_string_buf_and_size(py_string, &length)
|
|||
|
if length >= 1:
|
|||
|
first = buf[0]
|
|||
|
if first == b"0":
|
|||
|
# Strings starting with 0 are more consistent with a
|
|||
|
# date-like string than a number
|
|||
|
return True
|
|||
|
elif py_string in _not_datelike_strings:
|
|||
|
return False
|
|||
|
else:
|
|||
|
# xstrtod with such parameters copies behavior of python `float`
|
|||
|
# cast; for example, " 35.e-1 " is valid string for this cast so,
|
|||
|
# for correctly xstrtod call necessary to pass these params:
|
|||
|
# b'.' - a dot is used as separator, b'e' - an exponential form of
|
|||
|
# a float number can be used, b'\0' - not to use a thousand
|
|||
|
# separator, 1 - skip extra spaces before and after,
|
|||
|
converted_date = xstrtod(buf, &endptr,
|
|||
|
b".", b"e", b"\0", 1, &error, NULL)
|
|||
|
# if there were no errors and the whole line was parsed, then ...
|
|||
|
if error == 0 and endptr == buf + length:
|
|||
|
return converted_date >= 1000
|
|||
|
|
|||
|
return True
|
|||
|
|
|||
|
|
|||
|
cdef datetime _parse_dateabbr_string(str date_string, datetime default,
|
|||
|
str freq, NPY_DATETIMEUNIT* out_bestunit,
|
|||
|
bint* is_quarter):
|
|||
|
# special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1
|
|||
|
cdef:
|
|||
|
datetime ret
|
|||
|
# year initialized to prevent compiler warnings
|
|||
|
int year = -1, quarter = -1, month
|
|||
|
Py_ssize_t date_len
|
|||
|
const char* buf
|
|||
|
|
|||
|
if date_string in nat_strings:
|
|||
|
# default to nanos, could also reasonably do NPY_FR_GENERIC
|
|||
|
out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_ns
|
|||
|
return NaT
|
|||
|
|
|||
|
date_string = date_string.upper()
|
|||
|
date_len = len(date_string)
|
|||
|
|
|||
|
if date_len == 4:
|
|||
|
# parse year only like 2000
|
|||
|
try:
|
|||
|
ret = default.replace(year=int(date_string))
|
|||
|
out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_Y
|
|||
|
return ret
|
|||
|
except ValueError:
|
|||
|
pass
|
|||
|
|
|||
|
if 4 <= date_len <= 7:
|
|||
|
buf = get_c_string_buf_and_size(date_string, &date_len)
|
|||
|
try:
|
|||
|
i = date_string.index("Q", 1, 6)
|
|||
|
if i == 1:
|
|||
|
quarter = _parse_1digit(buf) # i.e. int(date_string[0])
|
|||
|
if date_len == 4 or (date_len == 5
|
|||
|
and date_string[i + 1] == "-"):
|
|||
|
# r'(\d)Q-?(\d\d)')
|
|||
|
year = 2000 + int(date_string[-2:])
|
|||
|
elif date_len == 6 or (date_len == 7
|
|||
|
and date_string[i + 1] == "-"):
|
|||
|
# r'(\d)Q-?(\d\d\d\d)')
|
|||
|
year = int(date_string[-4:])
|
|||
|
else:
|
|||
|
raise ValueError
|
|||
|
elif i == 2 or i == 3:
|
|||
|
# r'(\d\d)-?Q(\d)'
|
|||
|
if date_len == 4 or (date_len == 5
|
|||
|
and date_string[i - 1] == "-"):
|
|||
|
# i.e. quarter = int(date_string[-1])
|
|||
|
quarter = _parse_1digit(buf + date_len - 1)
|
|||
|
year = 2000 + int(date_string[:2])
|
|||
|
else:
|
|||
|
raise ValueError
|
|||
|
elif i == 4 or i == 5:
|
|||
|
if date_len == 6 or (date_len == 7
|
|||
|
and date_string[i - 1] == "-"):
|
|||
|
# r'(\d\d\d\d)-?Q(\d)'
|
|||
|
# i.e. quarter = int(date_string[-1])
|
|||
|
quarter = _parse_1digit(buf + date_len - 1)
|
|||
|
year = int(date_string[:4])
|
|||
|
else:
|
|||
|
raise ValueError
|
|||
|
|
|||
|
if not (1 <= quarter <= 4):
|
|||
|
raise DateParseError(f"Incorrect quarterly string is given, "
|
|||
|
f"quarter must be "
|
|||
|
f"between 1 and 4: {date_string}")
|
|||
|
|
|||
|
try:
|
|||
|
# GH#1228
|
|||
|
year, month = quarter_to_myear(year, quarter, freq)
|
|||
|
except KeyError:
|
|||
|
raise DateParseError("Unable to retrieve month "
|
|||
|
"information from given "
|
|||
|
f"freq: {freq}")
|
|||
|
|
|||
|
ret = default.replace(year=year, month=month)
|
|||
|
# Monthly is as close as we can get to a non-existent NPY_FR_Q
|
|||
|
out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_M
|
|||
|
is_quarter[0] = 1
|
|||
|
return ret
|
|||
|
|
|||
|
except DateParseError:
|
|||
|
raise
|
|||
|
except ValueError:
|
|||
|
# e.g. if "Q" is not in date_string and .index raised
|
|||
|
pass
|
|||
|
|
|||
|
if date_len == 6 and freq == "M":
|
|||
|
year = int(date_string[:4])
|
|||
|
month = int(date_string[4:6])
|
|||
|
try:
|
|||
|
ret = default.replace(year=year, month=month)
|
|||
|
out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_M
|
|||
|
return ret
|
|||
|
except ValueError as err:
|
|||
|
# We can infer that none of the patterns below will match
|
|||
|
raise ValueError(f"Unable to parse {date_string}") from err
|
|||
|
|
|||
|
for pat in ["%Y-%m", "%b %Y", "%b-%Y"]:
|
|||
|
try:
|
|||
|
ret = datetime.strptime(date_string, pat)
|
|||
|
out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_M
|
|||
|
return ret
|
|||
|
except ValueError:
|
|||
|
pass
|
|||
|
|
|||
|
raise ValueError(f"Unable to parse {date_string}")
|
|||
|
|
|||
|
|
|||
|
cpdef quarter_to_myear(int year, int quarter, str freq):
|
|||
|
"""
|
|||
|
A quarterly frequency defines a "year" which may not coincide with
|
|||
|
the calendar-year. Find the calendar-year and calendar-month associated
|
|||
|
with the given year and quarter under the `freq`-derived calendar.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
year : int
|
|||
|
quarter : int
|
|||
|
freq : str or None
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
year : int
|
|||
|
month : int
|
|||
|
|
|||
|
See Also
|
|||
|
--------
|
|||
|
Period.qyear
|
|||
|
"""
|
|||
|
if quarter <= 0 or quarter > 4:
|
|||
|
raise ValueError("Quarter must be 1 <= q <= 4")
|
|||
|
|
|||
|
if freq is not None:
|
|||
|
mnum = c_MONTH_NUMBERS[get_rule_month(freq)] + 1
|
|||
|
month = (mnum + (quarter - 1) * 3) % 12 + 1
|
|||
|
if month > mnum:
|
|||
|
year -= 1
|
|||
|
else:
|
|||
|
month = (quarter - 1) * 3 + 1
|
|||
|
|
|||
|
return year, month
|
|||
|
|
|||
|
|
|||
|
cdef datetime dateutil_parse(
|
|||
|
str timestr,
|
|||
|
datetime default,
|
|||
|
bint ignoretz,
|
|||
|
bint dayfirst,
|
|||
|
bint yearfirst,
|
|||
|
NPY_DATETIMEUNIT* out_bestunit
|
|||
|
):
|
|||
|
""" lifted from dateutil to get resolution"""
|
|||
|
|
|||
|
cdef:
|
|||
|
str attr
|
|||
|
datetime ret
|
|||
|
object res
|
|||
|
str reso = None
|
|||
|
dict repl = {}
|
|||
|
|
|||
|
try:
|
|||
|
res, _ = DEFAULTPARSER._parse(timestr, dayfirst=dayfirst, yearfirst=yearfirst)
|
|||
|
except InvalidOperation:
|
|||
|
# GH#51157 dateutil can raise decimal.InvalidOperation
|
|||
|
res = None
|
|||
|
|
|||
|
if res is None:
|
|||
|
raise DateParseError(
|
|||
|
f"Unknown datetime string format, unable to parse: {timestr}"
|
|||
|
)
|
|||
|
|
|||
|
for attr in ["year", "month", "day", "hour",
|
|||
|
"minute", "second", "microsecond"]:
|
|||
|
value = getattr(res, attr)
|
|||
|
if value is not None:
|
|||
|
repl[attr] = value
|
|||
|
reso = attr
|
|||
|
|
|||
|
if reso is None:
|
|||
|
raise DateParseError(f"Unable to parse datetime string: {timestr}")
|
|||
|
|
|||
|
if reso == "microsecond":
|
|||
|
if repl["microsecond"] == 0:
|
|||
|
reso = "second"
|
|||
|
elif repl["microsecond"] % 1000 == 0:
|
|||
|
reso = "millisecond"
|
|||
|
|
|||
|
try:
|
|||
|
ret = default.replace(**repl)
|
|||
|
except ValueError as err:
|
|||
|
# e.g. "day is out of range for month"
|
|||
|
# we re-raise to match dateutil's exception message
|
|||
|
raise DateParseError(str(err) + ": " + timestr) from err
|
|||
|
except OverflowError as err:
|
|||
|
# with e.g. "08335394550" dateutil raises when trying to pass
|
|||
|
# year=8335394550 to datetime.replace
|
|||
|
raise OutOfBoundsDatetime(
|
|||
|
f'Parsing "{timestr}" to datetime overflows'
|
|||
|
) from err
|
|||
|
|
|||
|
if res.weekday is not None and not res.day:
|
|||
|
ret = ret + relativedelta.relativedelta(weekday=res.weekday)
|
|||
|
if not ignoretz:
|
|||
|
if res.tzname and res.tzname in time.tzname:
|
|||
|
# GH#50791
|
|||
|
if res.tzname != "UTC":
|
|||
|
# If the system is localized in UTC (as many CI runs are)
|
|||
|
# we get tzlocal, once the deprecation is enforced will get
|
|||
|
# timezone.utc, not raise.
|
|||
|
warnings.warn(
|
|||
|
"Parsing '{res.tzname}' as tzlocal (dependent on system timezone) "
|
|||
|
"is deprecated and will raise in a future version. Pass the 'tz' "
|
|||
|
"keyword or call tz_localize after construction instead",
|
|||
|
FutureWarning,
|
|||
|
stacklevel=find_stack_level()
|
|||
|
)
|
|||
|
ret = ret.replace(tzinfo=_dateutil_tzlocal())
|
|||
|
elif res.tzoffset == 0:
|
|||
|
ret = ret.replace(tzinfo=_dateutil_tzutc())
|
|||
|
elif res.tzoffset:
|
|||
|
ret = ret.replace(tzinfo=tzoffset(res.tzname, res.tzoffset))
|
|||
|
|
|||
|
# dateutil can return a datetime with a tzoffset outside of (-24H, 24H)
|
|||
|
# bounds, which is invalid (can be constructed, but raises if we call
|
|||
|
# str(ret)). Check that and raise here if necessary.
|
|||
|
try:
|
|||
|
ret.utcoffset()
|
|||
|
except ValueError as err:
|
|||
|
# offset must be a timedelta strictly between -timedelta(hours=24)
|
|||
|
# and timedelta(hours=24)
|
|||
|
raise ValueError(
|
|||
|
f'Parsed string "{timestr}" gives an invalid tzoffset, '
|
|||
|
"which must be between -timedelta(hours=24) and timedelta(hours=24)"
|
|||
|
)
|
|||
|
|
|||
|
out_bestunit[0] = attrname_to_npy_unit[reso]
|
|||
|
return ret
|
|||
|
|
|||
|
|
|||
|
# ----------------------------------------------------------------------
|
|||
|
# Parsing for type-inference
|
|||
|
|
|||
|
|
|||
|
def try_parse_dates(object[:] values, parser) -> np.ndarray:
|
|||
|
cdef:
|
|||
|
Py_ssize_t i, n
|
|||
|
object[::1] result
|
|||
|
|
|||
|
n = len(values)
|
|||
|
result = np.empty(n, dtype="O")
|
|||
|
|
|||
|
for i in range(n):
|
|||
|
if values[i] == "":
|
|||
|
result[i] = np.nan
|
|||
|
else:
|
|||
|
result[i] = parser(values[i])
|
|||
|
|
|||
|
return result.base # .base to access underlying ndarray
|
|||
|
|
|||
|
|
|||
|
def try_parse_year_month_day(
|
|||
|
object[:] years, object[:] months, object[:] days
|
|||
|
) -> np.ndarray:
|
|||
|
cdef:
|
|||
|
Py_ssize_t i, n
|
|||
|
object[::1] result
|
|||
|
|
|||
|
n = len(years)
|
|||
|
# TODO(cython3): Use len instead of `shape[0]`
|
|||
|
if months.shape[0] != n or days.shape[0] != n:
|
|||
|
raise ValueError("Length of years/months/days must all be equal")
|
|||
|
result = np.empty(n, dtype="O")
|
|||
|
|
|||
|
for i in range(n):
|
|||
|
result[i] = datetime(int(years[i]), int(months[i]), int(days[i]))
|
|||
|
|
|||
|
return result.base # .base to access underlying ndarray
|
|||
|
|
|||
|
|
|||
|
# ----------------------------------------------------------------------
|
|||
|
# Miscellaneous
|
|||
|
|
|||
|
|
|||
|
# Class copied verbatim from https://github.com/dateutil/dateutil/pull/732
|
|||
|
#
|
|||
|
# We use this class to parse and tokenize date strings. However, as it is
|
|||
|
# a private class in the dateutil library, relying on backwards compatibility
|
|||
|
# is not practical. In fact, using this class issues warnings (xref gh-21322).
|
|||
|
# Thus, we port the class over so that both issues are resolved.
|
|||
|
#
|
|||
|
# Copyright (c) 2017 - dateutil contributors
|
|||
|
class _timelex:
|
|||
|
def __init__(self, instream):
|
|||
|
if getattr(instream, "decode", None) is not None:
|
|||
|
instream = instream.decode()
|
|||
|
|
|||
|
if isinstance(instream, str):
|
|||
|
self.stream = instream
|
|||
|
elif getattr(instream, "read", None) is None:
|
|||
|
raise TypeError(
|
|||
|
"Parser must be a string or character stream, not "
|
|||
|
f"{type(instream).__name__}")
|
|||
|
else:
|
|||
|
self.stream = instream.read()
|
|||
|
|
|||
|
def get_tokens(self):
|
|||
|
"""
|
|||
|
This function breaks the time string into lexical units (tokens), which
|
|||
|
can be parsed by the parser. Lexical units are demarcated by changes in
|
|||
|
the character set, so any continuous string of letters is considered
|
|||
|
one unit, any continuous string of numbers is considered one unit.
|
|||
|
The main complication arises from the fact that dots ('.') can be used
|
|||
|
both as separators (e.g. "Sep.20.2009") or decimal points (e.g.
|
|||
|
"4:30:21.447"). As such, it is necessary to read the full context of
|
|||
|
any dot-separated strings before breaking it into tokens; as such, this
|
|||
|
function maintains a "token stack", for when the ambiguous context
|
|||
|
demands that multiple tokens be parsed at once.
|
|||
|
"""
|
|||
|
cdef:
|
|||
|
Py_ssize_t n
|
|||
|
|
|||
|
stream = self.stream.replace("\x00", "")
|
|||
|
|
|||
|
# TODO: Change \s --> \s+ (this doesn't match existing behavior)
|
|||
|
# TODO: change the punctuation block to punc+ (does not match existing)
|
|||
|
# TODO: can we merge the two digit patterns?
|
|||
|
tokens = re.findall(r"\s|"
|
|||
|
r"(?<![\.\d])\d+\.\d+(?![\.\d])"
|
|||
|
r"|\d+"
|
|||
|
r"|[a-zA-Z]+"
|
|||
|
r"|[\./:]+"
|
|||
|
r"|[^\da-zA-Z\./:\s]+", stream)
|
|||
|
|
|||
|
# Re-combine token tuples of the form ["59", ",", "456"] because
|
|||
|
# in this context the "," is treated as a decimal
|
|||
|
# (e.g. in python's default logging format)
|
|||
|
for n, token in enumerate(tokens[:-2]):
|
|||
|
# Kludge to match ,-decimal behavior; it'd be better to do this
|
|||
|
# later in the process and have a simpler tokenization
|
|||
|
if (token is not None and token.isdigit() and
|
|||
|
tokens[n + 1] == "," and tokens[n + 2].isdigit()):
|
|||
|
# Have to check None b/c it might be replaced during the loop
|
|||
|
# TODO: I _really_ don't faking the value here
|
|||
|
tokens[n] = token + "." + tokens[n + 2]
|
|||
|
tokens[n + 1] = None
|
|||
|
tokens[n + 2] = None
|
|||
|
|
|||
|
tokens = [x for x in tokens if x is not None]
|
|||
|
return tokens
|
|||
|
|
|||
|
@classmethod
|
|||
|
def split(cls, s):
|
|||
|
return cls(s).get_tokens()
|
|||
|
|
|||
|
|
|||
|
_DATEUTIL_LEXER_SPLIT = _timelex.split
|
|||
|
|
|||
|
|
|||
|
def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None:
|
|||
|
"""
|
|||
|
Guess the datetime format of a given datetime string.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
dt_str : str
|
|||
|
Datetime string to guess the format of.
|
|||
|
dayfirst : bool, default False
|
|||
|
If True parses dates with the day first, eg 20/01/2005
|
|||
|
Warning: dayfirst=True is not strict, but will prefer to parse
|
|||
|
with day first (this is a known bug).
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
str or None : ret
|
|||
|
datetime format string (for `strftime` or `strptime`),
|
|||
|
or None if it can't be guessed.
|
|||
|
"""
|
|||
|
day_attribute_and_format = (("day",), "%d", 2)
|
|||
|
|
|||
|
# attr name, format, padding (if any)
|
|||
|
datetime_attrs_to_format = [
|
|||
|
(("year", "month", "day", "hour", "minute", "second"), "%Y%m%d%H%M%S", 0),
|
|||
|
(("year", "month", "day", "hour", "minute"), "%Y%m%d%H%M", 0),
|
|||
|
(("year", "month", "day", "hour"), "%Y%m%d%H", 0),
|
|||
|
(("year", "month", "day"), "%Y%m%d", 0),
|
|||
|
(("hour", "minute", "second"), "%H%M%S", 0),
|
|||
|
(("hour", "minute"), "%H%M", 0),
|
|||
|
(("year",), "%Y", 0),
|
|||
|
(("month",), "%B", 0),
|
|||
|
(("month",), "%b", 0),
|
|||
|
(("month",), "%m", 2),
|
|||
|
day_attribute_and_format,
|
|||
|
(("hour",), "%H", 2),
|
|||
|
(("minute",), "%M", 2),
|
|||
|
(("second",), "%S", 2),
|
|||
|
(("second", "microsecond"), "%S.%f", 0),
|
|||
|
(("tzinfo",), "%z", 0),
|
|||
|
(("tzinfo",), "%Z", 0),
|
|||
|
(("day_of_week",), "%a", 0),
|
|||
|
(("day_of_week",), "%A", 0),
|
|||
|
(("meridiem",), "%p", 0),
|
|||
|
]
|
|||
|
|
|||
|
if dayfirst:
|
|||
|
datetime_attrs_to_format.remove(day_attribute_and_format)
|
|||
|
datetime_attrs_to_format.insert(0, day_attribute_and_format)
|
|||
|
|
|||
|
try:
|
|||
|
parsed_datetime = du_parse(dt_str, dayfirst=dayfirst)
|
|||
|
except (ValueError, OverflowError, InvalidOperation):
|
|||
|
# In case the datetime can't be parsed, its format cannot be guessed
|
|||
|
return None
|
|||
|
|
|||
|
if parsed_datetime is None:
|
|||
|
return None
|
|||
|
|
|||
|
# _DATEUTIL_LEXER_SPLIT from dateutil will never raise here
|
|||
|
tokens = _DATEUTIL_LEXER_SPLIT(dt_str)
|
|||
|
|
|||
|
# Normalize offset part of tokens.
|
|||
|
# There are multiple formats for the timezone offset.
|
|||
|
# To pass the comparison condition between the output of `strftime` and
|
|||
|
# joined tokens, which is carried out at the final step of the function,
|
|||
|
# the offset part of the tokens must match the '%z' format like '+0900'
|
|||
|
# instead of ‘+09:00’.
|
|||
|
if parsed_datetime.tzinfo is not None:
|
|||
|
offset_index = None
|
|||
|
if len(tokens) > 0 and tokens[-1] == "Z":
|
|||
|
# the last 'Z' means zero offset
|
|||
|
offset_index = -1
|
|||
|
elif len(tokens) > 1 and tokens[-2] in ("+", "-"):
|
|||
|
# ex. [..., '+', '0900']
|
|||
|
offset_index = -2
|
|||
|
elif len(tokens) > 3 and tokens[-4] in ("+", "-"):
|
|||
|
# ex. [..., '+', '09', ':', '00']
|
|||
|
offset_index = -4
|
|||
|
|
|||
|
if offset_index is not None:
|
|||
|
# If the input string has a timezone offset like '+0900',
|
|||
|
# the offset is separated into two tokens, ex. ['+', '0900’].
|
|||
|
# This separation will prevent subsequent processing
|
|||
|
# from correctly parsing the time zone format.
|
|||
|
# So in addition to the format nomalization, we rejoin them here.
|
|||
|
try:
|
|||
|
tokens[offset_index] = parsed_datetime.strftime("%z")
|
|||
|
except ValueError:
|
|||
|
# Invalid offset might not have raised in du_parse
|
|||
|
# https://github.com/dateutil/dateutil/issues/188
|
|||
|
return None
|
|||
|
tokens = tokens[:offset_index + 1 or None]
|
|||
|
|
|||
|
format_guess = [None] * len(tokens)
|
|||
|
found_attrs = set()
|
|||
|
|
|||
|
for attrs, attr_format, padding in datetime_attrs_to_format:
|
|||
|
# If a given attribute has been placed in the format string, skip
|
|||
|
# over other formats for that same underlying attribute (IE, month
|
|||
|
# can be represented in multiple different ways)
|
|||
|
if set(attrs) & found_attrs:
|
|||
|
continue
|
|||
|
|
|||
|
if parsed_datetime.tzinfo is None and attr_format in ("%Z", "%z"):
|
|||
|
continue
|
|||
|
|
|||
|
parsed_formatted = parsed_datetime.strftime(attr_format)
|
|||
|
for i, token_format in enumerate(format_guess):
|
|||
|
token_filled = _fill_token(tokens[i], padding)
|
|||
|
if token_format is None and token_filled == parsed_formatted:
|
|||
|
format_guess[i] = attr_format
|
|||
|
tokens[i] = token_filled
|
|||
|
found_attrs.update(attrs)
|
|||
|
break
|
|||
|
|
|||
|
# Only consider it a valid guess if we have a year, month and day.
|
|||
|
# We make exceptions for %Y and %Y-%m (only with the `-` separator)
|
|||
|
# as they conform with ISO8601.
|
|||
|
if (
|
|||
|
len({"year", "month", "day"} & found_attrs) != 3
|
|||
|
and format_guess != ["%Y"]
|
|||
|
and not (
|
|||
|
format_guess == ["%Y", None, "%m"] and tokens[1] == "-"
|
|||
|
)
|
|||
|
):
|
|||
|
return None
|
|||
|
|
|||
|
output_format = []
|
|||
|
for i, guess in enumerate(format_guess):
|
|||
|
if guess is not None:
|
|||
|
# Either fill in the format placeholder (like %Y)
|
|||
|
output_format.append(guess)
|
|||
|
else:
|
|||
|
# Or just the token separate (IE, the dashes in "01-01-2013")
|
|||
|
try:
|
|||
|
# If the token is numeric, then we likely didn't parse it
|
|||
|
# properly, so our guess is wrong
|
|||
|
float(tokens[i])
|
|||
|
return None
|
|||
|
except ValueError:
|
|||
|
pass
|
|||
|
|
|||
|
output_format.append(tokens[i])
|
|||
|
|
|||
|
# if am/pm token present, replace 24-hour %H, with 12-hour %I
|
|||
|
if "%p" in output_format and "%H" in output_format:
|
|||
|
i = output_format.index("%H")
|
|||
|
output_format[i] = "%I"
|
|||
|
|
|||
|
guessed_format = "".join(output_format)
|
|||
|
|
|||
|
try:
|
|||
|
array_strptime(np.asarray([dt_str], dtype=object), guessed_format)
|
|||
|
except ValueError:
|
|||
|
# Doesn't parse, so this can't be the correct format.
|
|||
|
return None
|
|||
|
# rebuild string, capturing any inferred padding
|
|||
|
dt_str = "".join(tokens)
|
|||
|
if parsed_datetime.strftime(guessed_format) == dt_str:
|
|||
|
_maybe_warn_about_dayfirst(guessed_format, dayfirst)
|
|||
|
return guessed_format
|
|||
|
else:
|
|||
|
return None
|
|||
|
|
|||
|
|
|||
|
cdef str _fill_token(token: str, padding: int):
|
|||
|
cdef str token_filled
|
|||
|
if re.search(r"\d+\.\d+", token) is None:
|
|||
|
# For example: 98
|
|||
|
token_filled = token.zfill(padding)
|
|||
|
else:
|
|||
|
# For example: 00.123
|
|||
|
seconds, nanoseconds = token.split(".")
|
|||
|
seconds = f"{int(seconds):02d}"
|
|||
|
# right-pad so we get nanoseconds, then only take
|
|||
|
# first 6 digits (microseconds) as stdlib datetime
|
|||
|
# doesn't support nanoseconds
|
|||
|
nanoseconds = nanoseconds.ljust(9, "0")[:6]
|
|||
|
token_filled = f"{seconds}.{nanoseconds}"
|
|||
|
return token_filled
|
|||
|
|
|||
|
|
|||
|
cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst):
|
|||
|
"""Warn if guessed datetime format doesn't respect dayfirst argument."""
|
|||
|
cdef:
|
|||
|
int day_index = format.find("%d")
|
|||
|
int month_index = format.find("%m")
|
|||
|
|
|||
|
if (day_index != -1) and (month_index != -1):
|
|||
|
if (day_index > month_index) and dayfirst:
|
|||
|
warnings.warn(
|
|||
|
f"Parsing dates in {format} format when dayfirst=True was specified. "
|
|||
|
"Pass `dayfirst=False` or specify a format to silence this warning.",
|
|||
|
UserWarning,
|
|||
|
stacklevel=find_stack_level(),
|
|||
|
)
|
|||
|
if (day_index < month_index) and not dayfirst:
|
|||
|
warnings.warn(
|
|||
|
f"Parsing dates in {format} format when dayfirst=False (the default) "
|
|||
|
"was specified. "
|
|||
|
"Pass `dayfirst=True` or specify a format to silence this warning.",
|
|||
|
UserWarning,
|
|||
|
stacklevel=find_stack_level(),
|
|||
|
)
|
|||
|
|
|||
|
|
|||
|
@cython.wraparound(False)
|
|||
|
@cython.boundscheck(False)
|
|||
|
cdef object convert_to_unicode(object item, bint keep_trivial_numbers):
|
|||
|
"""
|
|||
|
Convert `item` to str.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
item : object
|
|||
|
keep_trivial_numbers : bool
|
|||
|
if True, then conversion (to string from integer/float zero)
|
|||
|
is not performed
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
str or int or float
|
|||
|
"""
|
|||
|
cdef:
|
|||
|
float64_t float_item
|
|||
|
|
|||
|
if keep_trivial_numbers:
|
|||
|
if isinstance(item, int):
|
|||
|
if <int>item == 0:
|
|||
|
return item
|
|||
|
elif isinstance(item, float):
|
|||
|
float_item = item
|
|||
|
if float_item == 0.0 or float_item != float_item:
|
|||
|
return item
|
|||
|
|
|||
|
if not isinstance(item, str):
|
|||
|
item = PyObject_Str(item)
|
|||
|
|
|||
|
return item
|
|||
|
|
|||
|
|
|||
|
@cython.wraparound(False)
|
|||
|
@cython.boundscheck(False)
|
|||
|
def concat_date_cols(tuple date_cols) -> np.ndarray:
|
|||
|
"""
|
|||
|
Concatenates elements from numpy arrays in `date_cols` into strings.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
date_cols : tuple[ndarray]
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
arr_of_rows : ndarray[object]
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> dates=np.array(['3/31/2019', '4/31/2019'], dtype=object)
|
|||
|
>>> times=np.array(['11:20', '10:45'], dtype=object)
|
|||
|
>>> result = concat_date_cols((dates, times))
|
|||
|
>>> result
|
|||
|
array(['3/31/2019 11:20', '4/31/2019 10:45'], dtype=object)
|
|||
|
"""
|
|||
|
cdef:
|
|||
|
Py_ssize_t rows_count = 0, col_count = len(date_cols)
|
|||
|
Py_ssize_t col_idx, row_idx
|
|||
|
list list_to_join
|
|||
|
cnp.ndarray[object] iters
|
|||
|
object[::1] iters_view
|
|||
|
flatiter it
|
|||
|
cnp.ndarray[object] result
|
|||
|
object[::1] result_view
|
|||
|
|
|||
|
if col_count == 0:
|
|||
|
return np.zeros(0, dtype=object)
|
|||
|
|
|||
|
if not all(is_array(array) for array in date_cols):
|
|||
|
raise ValueError("not all elements from date_cols are numpy arrays")
|
|||
|
|
|||
|
rows_count = min(len(array) for array in date_cols)
|
|||
|
result = np.zeros(rows_count, dtype=object)
|
|||
|
result_view = result
|
|||
|
|
|||
|
if col_count == 1:
|
|||
|
array = date_cols[0]
|
|||
|
it = <flatiter>PyArray_IterNew(array)
|
|||
|
for row_idx in range(rows_count):
|
|||
|
item = PyArray_GETITEM(array, PyArray_ITER_DATA(it))
|
|||
|
result_view[row_idx] = convert_to_unicode(item, True)
|
|||
|
PyArray_ITER_NEXT(it)
|
|||
|
else:
|
|||
|
# create fixed size list - more efficient memory allocation
|
|||
|
list_to_join = [None] * col_count
|
|||
|
iters = np.zeros(col_count, dtype=object)
|
|||
|
|
|||
|
# create memoryview of iters ndarray, that will contain some
|
|||
|
# flatiter's for each array in `date_cols` - more efficient indexing
|
|||
|
iters_view = iters
|
|||
|
for col_idx, array in enumerate(date_cols):
|
|||
|
iters_view[col_idx] = PyArray_IterNew(array)
|
|||
|
|
|||
|
# array elements that are on the same line are converted to one string
|
|||
|
for row_idx in range(rows_count):
|
|||
|
for col_idx, array in enumerate(date_cols):
|
|||
|
# this cast is needed, because we did not find a way
|
|||
|
# to efficiently store `flatiter` type objects in ndarray
|
|||
|
it = <flatiter>iters_view[col_idx]
|
|||
|
item = PyArray_GETITEM(array, PyArray_ITER_DATA(it))
|
|||
|
list_to_join[col_idx] = convert_to_unicode(item, False)
|
|||
|
PyArray_ITER_NEXT(it)
|
|||
|
result_view[row_idx] = " ".join(list_to_join)
|
|||
|
|
|||
|
return result
|
|||
|
|
|||
|
|
|||
|
cpdef str get_rule_month(str source):
|
|||
|
"""
|
|||
|
Return starting month of given freq, default is December.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
source : str
|
|||
|
Derived from `freq.rule_code` or `freq.freqstr`.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
rule_month: str
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> get_rule_month('D')
|
|||
|
'DEC'
|
|||
|
|
|||
|
>>> get_rule_month('A-JAN')
|
|||
|
'JAN'
|
|||
|
"""
|
|||
|
source = source.upper()
|
|||
|
if "-" not in source:
|
|||
|
return "DEC"
|
|||
|
else:
|
|||
|
return source.split("-")[1]
|