200 lines
6.6 KiB
Python
200 lines
6.6 KiB
Python
import numpy as np
|
|
|
|
from pandas._libs import lib
|
|
|
|
from pandas.core.dtypes.cast import maybe_downcast_to_dtype
|
|
from pandas.core.dtypes.common import (
|
|
ensure_object,
|
|
is_datetime_or_timedelta_dtype,
|
|
is_decimal,
|
|
is_number,
|
|
is_numeric_dtype,
|
|
is_scalar,
|
|
needs_i8_conversion,
|
|
)
|
|
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
|
|
|
|
import pandas as pd
|
|
|
|
|
|
def to_numeric(arg, errors="raise", downcast=None):
    """
    Convert argument to a numeric type.

    The default return dtype is `float64` or `int64`
    depending on the data supplied. Use the `downcast` parameter
    to obtain other dtypes.

    Please note that precision loss may occur if really large numbers
    are passed in. Due to the internal limitations of `ndarray`, if
    numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min)
    or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are
    passed in, it is very likely they will be converted to float so that
    they can be stored in an `ndarray`. These warnings apply similarly to
    `Series` since it internally leverages `ndarray`.

    Parameters
    ----------
    arg : scalar, list, tuple, 1-d array, or Series
        Argument to be converted.
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception.
        - If 'coerce', then invalid parsing will be set as NaN.
        - If 'ignore', then invalid parsing will return the input.
    downcast : {'integer', 'signed', 'unsigned', 'float'}, default None
        If not None, and if the data has been successfully cast to a
        numerical dtype (or if the data was numeric to begin with),
        downcast that resulting data to the smallest numerical dtype
        possible according to the following rules:

        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
        - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
        - 'float': smallest float dtype (min.: np.float32)

        As this behaviour is separate from the core conversion to
        numeric values, any errors raised during the downcasting
        will be surfaced regardless of the value of the 'errors' input.

        In addition, downcasting is only attempted for candidate dtypes
        whose size does not exceed the size of the resulting data's
        dtype, so if none of the dtypes checked satisfy that
        specification, no downcasting will be performed on the data.

    Returns
    -------
    ret
        Numeric if parsing succeeded.
        Return type depends on input: Series if Series, Index if Index,
        a scalar if scalar, otherwise ndarray.

    Raises
    ------
    ValueError
        If `downcast` or `errors` is not one of the accepted values, or
        if parsing fails while ``errors='raise'``.
    TypeError
        If `arg` has more than one dimension, or if parsing fails with a
        TypeError while ``errors='raise'``.

    See Also
    --------
    DataFrame.astype : Cast argument to a specified dtype.
    to_datetime : Convert argument to datetime.
    to_timedelta : Convert argument to timedelta.
    numpy.ndarray.astype : Cast a numpy array to a specified type.
    DataFrame.convert_dtypes : Convert dtypes.

    Examples
    --------
    Take separate series and convert to numeric, coercing when told to

    >>> s = pd.Series(['1.0', '2', -3])
    >>> pd.to_numeric(s)
    0    1.0
    1    2.0
    2   -3.0
    dtype: float64
    >>> pd.to_numeric(s, downcast='float')
    0    1.0
    1    2.0
    2   -3.0
    dtype: float32
    >>> pd.to_numeric(s, downcast='signed')
    0    1
    1    2
    2   -3
    dtype: int8
    >>> s = pd.Series(['apple', '1.0', '2', -3])
    >>> pd.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2        2
    3       -3
    dtype: object
    >>> pd.to_numeric(s, errors='coerce')
    0    NaN
    1    1.0
    2    2.0
    3   -3.0
    dtype: float64
    """
    # Validate the option arguments up front so that bad input fails
    # before any conversion work is attempted.
    if downcast not in (None, "integer", "signed", "unsigned", "float"):
        raise ValueError("invalid downcasting method provided")

    if errors not in ("ignore", "raise", "coerce"):
        raise ValueError("invalid error value specified")

    # Remember what kind of container came in so the result can be
    # wrapped in the same kind of object on the way out.
    is_series = False
    is_index = False
    is_scalars = False

    if isinstance(arg, ABCSeries):
        is_series = True
        values = arg.values
    elif isinstance(arg, ABCIndexClass):
        is_index = True
        if needs_i8_conversion(arg.dtype):
            # use the integer (i8) representation exposed by the index
            values = arg.asi8
        else:
            values = arg.values
    elif isinstance(arg, (list, tuple)):
        values = np.array(arg, dtype="O")
    elif is_scalar(arg):
        # Decimals and plain numbers short-circuit: no array round trip.
        if is_decimal(arg):
            return float(arg)
        if is_number(arg):
            return arg
        is_scalars = True
        values = np.array([arg], dtype="O")
    elif getattr(arg, "ndim", 1) > 1:
        raise TypeError("arg must be a list, tuple, 1-d array, or Series")
    else:
        values = arg

    values_dtype = getattr(values, "dtype", None)
    if is_numeric_dtype(values_dtype):
        # already numeric, nothing to parse
        pass
    elif is_datetime_or_timedelta_dtype(values_dtype):
        values = values.astype(np.int64)
    else:
        values = ensure_object(values)
        # `errors` was validated above, so anything other than
        # 'ignore'/'raise' can only be 'coerce'.
        coerce_numeric = errors == "coerce"
        try:
            values = lib.maybe_convert_numeric(
                values, set(), coerce_numeric=coerce_numeric
            )
        except (ValueError, TypeError):
            # With errors='ignore' the failure is deliberately swallowed
            # and the unconverted object values are returned.
            if errors == "raise":
                raise

    # attempt downcast only if the data has been successfully converted
    # to a numerical dtype and if a downcast method has been specified
    if downcast is not None and is_numeric_dtype(values.dtype):
        typecodes = None

        if downcast in ("integer", "signed"):
            typecodes = np.typecodes["Integer"]
        elif downcast == "unsigned" and (not len(values) or np.min(values) >= 0):
            # unsigned candidates are only considered when the data is
            # empty or has a non-negative minimum
            typecodes = np.typecodes["UnsignedInteger"]
        elif downcast == "float":
            typecodes = np.typecodes["Float"]

            # pandas support goes only to np.float32,
            # as float dtypes smaller than that are
            # extremely rare and not well supported
            float_32_char = np.dtype(np.float32).char
            float_32_ind = typecodes.index(float_32_char)
            typecodes = typecodes[float_32_ind:]

        if typecodes is not None:
            # from smallest to largest
            for dtype in typecodes:
                if np.dtype(dtype).itemsize <= values.dtype.itemsize:
                    values = maybe_downcast_to_dtype(values, dtype)

                    # successful conversion
                    if values.dtype == dtype:
                        break

    if is_series:
        return arg._constructor(values, index=arg.index, name=arg.name)
    elif is_index:
        # because we want to coerce to numeric if possible,
        # do not use _shallow_copy
        return pd.Index(values, name=arg.name)
    elif is_scalars:
        return values[0]
    else:
        return values
|