projektAI/venv/Lib/site-packages/pandas/core/computation/expr.py
2021-06-06 22:13:05 +02:00

830 lines
24 KiB
Python

"""
:func:`~pandas.eval` parsers.
"""
import ast
from functools import partial, reduce
from keyword import iskeyword
import tokenize
from typing import Callable, Optional, Set, Tuple, Type, TypeVar
import numpy as np
from pandas.compat import PY39
import pandas.core.common as com
from pandas.core.computation.ops import (
ARITH_OPS_SYMS,
BOOL_OPS_SYMS,
CMP_OPS_SYMS,
LOCAL_TAG,
MATHOPS,
REDUCTIONS,
UNARY_OPS_SYMS,
BinOp,
Constant,
Div,
FuncNode,
Op,
Term,
UnaryOp,
UndefinedVariableError,
is_term,
)
from pandas.core.computation.parsing import clean_backtick_quoted_toks, tokenize_string
from pandas.core.computation.scope import Scope
import pandas.io.formats.printing as printing
def _rewrite_assign(tok: Tuple[int, str]) -> Tuple[int, str]:
"""
Rewrite the assignment operator for PyTables expressions that use ``=``
as a substitute for ``==``.
Parameters
----------
tok : tuple of int, str
ints correspond to the all caps constants in the tokenize module
Returns
-------
tuple of int, str
Either the input or token or the replacement values
"""
toknum, tokval = tok
return toknum, "==" if tokval == "=" else tokval
def _replace_booleans(tok: Tuple[int, str]) -> Tuple[int, str]:
"""
Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise
precedence is changed to boolean precedence.
Parameters
----------
tok : tuple of int, str
ints correspond to the all caps constants in the tokenize module
Returns
-------
tuple of int, str
Either the input or token or the replacement values
"""
toknum, tokval = tok
if toknum == tokenize.OP:
if tokval == "&":
return tokenize.NAME, "and"
elif tokval == "|":
return tokenize.NAME, "or"
return toknum, tokval
return toknum, tokval
def _replace_locals(tok: Tuple[int, str]) -> Tuple[int, str]:
"""
Replace local variables with a syntactically valid name.
Parameters
----------
tok : tuple of int, str
ints correspond to the all caps constants in the tokenize module
Returns
-------
tuple of int, str
Either the input or token or the replacement values
Notes
-----
This is somewhat of a hack in that we rewrite a string such as ``'@a'`` as
``'__pd_eval_local_a'`` by telling the tokenizer that ``__pd_eval_local_``
is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it.
"""
toknum, tokval = tok
if toknum == tokenize.OP and tokval == "@":
return tokenize.OP, LOCAL_TAG
return toknum, tokval
def _compose2(f, g):
"""
Compose 2 callables.
"""
return lambda *args, **kwargs: f(g(*args, **kwargs))
def _compose(*funcs):
"""
Compose 2 or more callables.
"""
assert len(funcs) > 1, "At least 2 callables must be passed to compose"
return reduce(_compose2, funcs)
def _preparse(
source: str,
f=_compose(
_replace_locals, _replace_booleans, _rewrite_assign, clean_backtick_quoted_toks
),
) -> str:
"""
Compose a collection of tokenization functions.
Parameters
----------
source : str
A Python source code string
f : callable
This takes a tuple of (toknum, tokval) as its argument and returns a
tuple with the same structure but possibly different elements. Defaults
to the composition of ``_rewrite_assign``, ``_replace_booleans``, and
``_replace_locals``.
Returns
-------
str
Valid Python source code
Notes
-----
The `f` parameter can be any callable that takes *and* returns input of the
form ``(toknum, tokval)``, where ``toknum`` is one of the constants from
the ``tokenize`` module and ``tokval`` is a string.
"""
assert callable(f), "f must be callable"
return tokenize.untokenize(f(x) for x in tokenize_string(source))
def _is_type(t):
"""
Factory for a type checking function of type ``t`` or tuple of types.
"""
return lambda x: isinstance(x.value, t)
_is_list = _is_type(list)
_is_str = _is_type(str)
# partition all AST nodes
_all_nodes = frozenset(
node
for node in (getattr(ast, name) for name in dir(ast))
if isinstance(node, type) and issubclass(node, ast.AST)
)
def _filter_nodes(superclass, all_nodes=_all_nodes):
"""
Filter out AST nodes that are subclasses of ``superclass``.
"""
node_names = (node.__name__ for node in all_nodes if issubclass(node, superclass))
return frozenset(node_names)
_all_node_names = frozenset(map(lambda x: x.__name__, _all_nodes))
_mod_nodes = _filter_nodes(ast.mod)
_stmt_nodes = _filter_nodes(ast.stmt)
_expr_nodes = _filter_nodes(ast.expr)
_expr_context_nodes = _filter_nodes(ast.expr_context)
_boolop_nodes = _filter_nodes(ast.boolop)
_operator_nodes = _filter_nodes(ast.operator)
_unary_op_nodes = _filter_nodes(ast.unaryop)
_cmp_op_nodes = _filter_nodes(ast.cmpop)
_comprehension_nodes = _filter_nodes(ast.comprehension)
_handler_nodes = _filter_nodes(ast.excepthandler)
_arguments_nodes = _filter_nodes(ast.arguments)
_keyword_nodes = _filter_nodes(ast.keyword)
_alias_nodes = _filter_nodes(ast.alias)
if not PY39:
_slice_nodes = _filter_nodes(ast.slice)
# nodes that we don't support directly but are needed for parsing
_hacked_nodes = frozenset(["Assign", "Module", "Expr"])
_unsupported_expr_nodes = frozenset(
[
"Yield",
"GeneratorExp",
"IfExp",
"DictComp",
"SetComp",
"Repr",
"Lambda",
"Set",
"AST",
"Is",
"IsNot",
]
)
# these nodes are low priority or won't ever be supported (e.g., AST)
_unsupported_nodes = (
_stmt_nodes
| _mod_nodes
| _handler_nodes
| _arguments_nodes
| _keyword_nodes
| _alias_nodes
| _expr_context_nodes
| _unsupported_expr_nodes
) - _hacked_nodes
# we're adding a different assignment in some cases to be equality comparison
# and we don't want `stmt` and friends in their so get only the class whose
# names are capitalized
_base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes
intersection = _unsupported_nodes & _base_supported_nodes
_msg = f"cannot both support and not support {intersection}"
assert not intersection, _msg
# TODO: Python 3.6.2: replace Callable[..., None] with Callable[..., NoReturn]
def _node_not_implemented(node_name: str) -> Callable[..., None]:
"""
Return a function that raises a NotImplementedError with a passed node name.
"""
def f(self, *args, **kwargs):
raise NotImplementedError(f"'{node_name}' nodes are not implemented")
return f
_T = TypeVar("_T", bound="BaseExprVisitor")
def disallow(nodes: Set[str]) -> Callable[[Type[_T]], Type[_T]]:
"""
Decorator to disallow certain nodes from parsing. Raises a
NotImplementedError instead.
Returns
-------
callable
"""
def disallowed(cls: Type[_T]) -> Type[_T]:
cls.unsupported_nodes = ()
for node in nodes:
new_method = _node_not_implemented(node)
name = f"visit_{node}"
cls.unsupported_nodes += (name,)
setattr(cls, name, new_method)
return cls
return disallowed
def _op_maker(op_class, op_symbol):
"""
Return a function to create an op class with its symbol already passed.
Returns
-------
callable
"""
def f(self, node, *args, **kwargs):
"""
Return a partial function with an Op subclass with an operator already passed.
Returns
-------
callable
"""
return partial(op_class, op_symbol, *args, **kwargs)
return f
_op_classes = {"binary": BinOp, "unary": UnaryOp}
def add_ops(op_classes):
"""
Decorator to add default implementation of ops.
"""
def f(cls):
for op_attr_name, op_class in op_classes.items():
ops = getattr(cls, f"{op_attr_name}_ops")
ops_map = getattr(cls, f"{op_attr_name}_op_nodes_map")
for op in ops:
op_node = ops_map[op]
if op_node is not None:
made_op = _op_maker(op_class, op)
setattr(cls, f"visit_{op_node}", made_op)
return cls
return f
@disallow(_unsupported_nodes)
@add_ops(_op_classes)
class BaseExprVisitor(ast.NodeVisitor):
"""
Custom ast walker. Parsers of other engines should subclass this class
if necessary.
Parameters
----------
env : Scope
engine : str
parser : str
preparser : callable
"""
const_type: Type[Term] = Constant
term_type = Term
binary_ops = CMP_OPS_SYMS + BOOL_OPS_SYMS + ARITH_OPS_SYMS
binary_op_nodes = (
"Gt",
"Lt",
"GtE",
"LtE",
"Eq",
"NotEq",
"In",
"NotIn",
"BitAnd",
"BitOr",
"And",
"Or",
"Add",
"Sub",
"Mult",
None,
"Pow",
"FloorDiv",
"Mod",
)
binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes))
unary_ops = UNARY_OPS_SYMS
unary_op_nodes = "UAdd", "USub", "Invert", "Not"
unary_op_nodes_map = {k: v for k, v in zip(unary_ops, unary_op_nodes)}
rewrite_map = {
ast.Eq: ast.In,
ast.NotEq: ast.NotIn,
ast.In: ast.In,
ast.NotIn: ast.NotIn,
}
unsupported_nodes: Tuple[str, ...]
def __init__(self, env, engine, parser, preparser=_preparse):
self.env = env
self.engine = engine
self.parser = parser
self.preparser = preparser
self.assigner = None
def visit(self, node, **kwargs):
if isinstance(node, str):
clean = self.preparser(node)
try:
node = ast.fix_missing_locations(ast.parse(clean))
except SyntaxError as e:
if any(iskeyword(x) for x in clean.split()):
e.msg = "Python keyword not valid identifier in numexpr query"
raise e
method = "visit_" + type(node).__name__
visitor = getattr(self, method)
return visitor(node, **kwargs)
def visit_Module(self, node, **kwargs):
if len(node.body) != 1:
raise SyntaxError("only a single expression is allowed")
expr = node.body[0]
return self.visit(expr, **kwargs)
def visit_Expr(self, node, **kwargs):
return self.visit(node.value, **kwargs)
def _rewrite_membership_op(self, node, left, right):
# the kind of the operator (is actually an instance)
op_instance = node.op
op_type = type(op_instance)
# must be two terms and the comparison operator must be ==/!=/in/not in
if is_term(left) and is_term(right) and op_type in self.rewrite_map:
left_list, right_list = map(_is_list, (left, right))
left_str, right_str = map(_is_str, (left, right))
# if there are any strings or lists in the expression
if left_list or right_list or left_str or right_str:
op_instance = self.rewrite_map[op_type]()
# pop the string variable out of locals and replace it with a list
# of one string, kind of a hack
if right_str:
name = self.env.add_tmp([right.value])
right = self.term_type(name, self.env)
if left_str:
name = self.env.add_tmp([left.value])
left = self.term_type(name, self.env)
op = self.visit(op_instance)
return op, op_instance, left, right
def _maybe_transform_eq_ne(self, node, left=None, right=None):
if left is None:
left = self.visit(node.left, side="left")
if right is None:
right = self.visit(node.right, side="right")
op, op_class, left, right = self._rewrite_membership_op(node, left, right)
return op, op_class, left, right
def _maybe_downcast_constants(self, left, right):
f32 = np.dtype(np.float32)
if (
left.is_scalar
and hasattr(left, "value")
and not right.is_scalar
and right.return_type == f32
):
# right is a float32 array, left is a scalar
name = self.env.add_tmp(np.float32(left.value))
left = self.term_type(name, self.env)
if (
right.is_scalar
and hasattr(right, "value")
and not left.is_scalar
and left.return_type == f32
):
# left is a float32 array, right is a scalar
name = self.env.add_tmp(np.float32(right.value))
right = self.term_type(name, self.env)
return left, right
def _maybe_eval(self, binop, eval_in_python):
# eval `in` and `not in` (for now) in "partial" python space
# things that can be evaluated in "eval" space will be turned into
# temporary variables. for example,
# [1,2] in a + 2 * b
# in that case a + 2 * b will be evaluated using numexpr, and the "in"
# call will be evaluated using isin (in python space)
return binop.evaluate(
self.env, self.engine, self.parser, self.term_type, eval_in_python
)
def _maybe_evaluate_binop(
self,
op,
op_class,
lhs,
rhs,
eval_in_python=("in", "not in"),
maybe_eval_in_python=("==", "!=", "<", ">", "<=", ">="),
):
res = op(lhs, rhs)
if res.has_invalid_return_type:
raise TypeError(
f"unsupported operand type(s) for {res.op}: "
f"'{lhs.type}' and '{rhs.type}'"
)
if self.engine != "pytables" and (
res.op in CMP_OPS_SYMS
and getattr(lhs, "is_datetime", False)
or getattr(rhs, "is_datetime", False)
):
# all date ops must be done in python bc numexpr doesn't work
# well with NaT
return self._maybe_eval(res, self.binary_ops)
if res.op in eval_in_python:
# "in"/"not in" ops are always evaluated in python
return self._maybe_eval(res, eval_in_python)
elif self.engine != "pytables":
if (
getattr(lhs, "return_type", None) == object
or getattr(rhs, "return_type", None) == object
):
# evaluate "==" and "!=" in python if either of our operands
# has an object return type
return self._maybe_eval(res, eval_in_python + maybe_eval_in_python)
return res
def visit_BinOp(self, node, **kwargs):
op, op_class, left, right = self._maybe_transform_eq_ne(node)
left, right = self._maybe_downcast_constants(left, right)
return self._maybe_evaluate_binop(op, op_class, left, right)
def visit_Div(self, node, **kwargs):
return lambda lhs, rhs: Div(lhs, rhs)
def visit_UnaryOp(self, node, **kwargs):
op = self.visit(node.op)
operand = self.visit(node.operand)
return op(operand)
def visit_Name(self, node, **kwargs):
return self.term_type(node.id, self.env, **kwargs)
def visit_NameConstant(self, node, **kwargs):
return self.const_type(node.value, self.env)
def visit_Num(self, node, **kwargs):
return self.const_type(node.n, self.env)
def visit_Constant(self, node, **kwargs):
return self.const_type(node.n, self.env)
def visit_Str(self, node, **kwargs):
name = self.env.add_tmp(node.s)
return self.term_type(name, self.env)
def visit_List(self, node, **kwargs):
name = self.env.add_tmp([self.visit(e)(self.env) for e in node.elts])
return self.term_type(name, self.env)
visit_Tuple = visit_List
def visit_Index(self, node, **kwargs):
""" df.index[4] """
return self.visit(node.value)
def visit_Subscript(self, node, **kwargs):
import pandas as pd
value = self.visit(node.value)
slobj = self.visit(node.slice)
result = pd.eval(
slobj, local_dict=self.env, engine=self.engine, parser=self.parser
)
try:
# a Term instance
v = value.value[result]
except AttributeError:
# an Op instance
lhs = pd.eval(
value, local_dict=self.env, engine=self.engine, parser=self.parser
)
v = lhs[result]
name = self.env.add_tmp(v)
return self.term_type(name, env=self.env)
def visit_Slice(self, node, **kwargs):
""" df.index[slice(4,6)] """
lower = node.lower
if lower is not None:
lower = self.visit(lower).value
upper = node.upper
if upper is not None:
upper = self.visit(upper).value
step = node.step
if step is not None:
step = self.visit(step).value
return slice(lower, upper, step)
def visit_Assign(self, node, **kwargs):
"""
support a single assignment node, like
c = a + b
set the assigner at the top level, must be a Name node which
might or might not exist in the resolvers
"""
if len(node.targets) != 1:
raise SyntaxError("can only assign a single expression")
if not isinstance(node.targets[0], ast.Name):
raise SyntaxError("left hand side of an assignment must be a single name")
if self.env.target is None:
raise ValueError("cannot assign without a target object")
try:
assigner = self.visit(node.targets[0], **kwargs)
except UndefinedVariableError:
assigner = node.targets[0].id
self.assigner = getattr(assigner, "name", assigner)
if self.assigner is None:
raise SyntaxError(
"left hand side of an assignment must be a single resolvable name"
)
return self.visit(node.value, **kwargs)
def visit_Attribute(self, node, **kwargs):
attr = node.attr
value = node.value
ctx = node.ctx
if isinstance(ctx, ast.Load):
# resolve the value
resolved = self.visit(value).value
try:
v = getattr(resolved, attr)
name = self.env.add_tmp(v)
return self.term_type(name, self.env)
except AttributeError:
# something like datetime.datetime where scope is overridden
if isinstance(value, ast.Name) and value.id == attr:
return resolved
raise
raise ValueError(f"Invalid Attribute context {type(ctx).__name__}")
def visit_Call(self, node, side=None, **kwargs):
if isinstance(node.func, ast.Attribute) and node.func.attr != "__call__":
res = self.visit_Attribute(node.func)
elif not isinstance(node.func, ast.Name):
raise TypeError("Only named functions are supported")
else:
try:
res = self.visit(node.func)
except UndefinedVariableError:
# Check if this is a supported function name
try:
res = FuncNode(node.func.id)
except ValueError:
# Raise original error
raise
if res is None:
# pandas\core\computation\expr.py:663: error: "expr" has no
# attribute "id" [attr-defined]
raise ValueError(
f"Invalid function call {node.func.id}" # type: ignore[attr-defined]
)
if hasattr(res, "value"):
res = res.value
if isinstance(res, FuncNode):
new_args = [self.visit(arg) for arg in node.args]
if node.keywords:
raise TypeError(
f'Function "{res.name}" does not support keyword arguments'
)
return res(*new_args)
else:
new_args = [self.visit(arg).value for arg in node.args]
for key in node.keywords:
if not isinstance(key, ast.keyword):
# pandas\core\computation\expr.py:684: error: "expr" has no
# attribute "id" [attr-defined]
raise ValueError(
"keyword error in function call " # type: ignore[attr-defined]
f"'{node.func.id}'"
)
if key.arg:
kwargs[key.arg] = self.visit(key.value).value
return self.const_type(res(*new_args, **kwargs), self.env)
def translate_In(self, op):
return op
def visit_Compare(self, node, **kwargs):
ops = node.ops
comps = node.comparators
# base case: we have something like a CMP b
if len(comps) == 1:
op = self.translate_In(ops[0])
binop = ast.BinOp(op=op, left=node.left, right=comps[0])
return self.visit(binop)
# recursive case: we have a chained comparison, a CMP b CMP c, etc.
left = node.left
values = []
for op, comp in zip(ops, comps):
new_node = self.visit(
ast.Compare(comparators=[comp], left=left, ops=[self.translate_In(op)])
)
left = comp
values.append(new_node)
return self.visit(ast.BoolOp(op=ast.And(), values=values))
def _try_visit_binop(self, bop):
if isinstance(bop, (Op, Term)):
return bop
return self.visit(bop)
def visit_BoolOp(self, node, **kwargs):
def visitor(x, y):
lhs = self._try_visit_binop(x)
rhs = self._try_visit_binop(y)
op, op_class, lhs, rhs = self._maybe_transform_eq_ne(node, lhs, rhs)
return self._maybe_evaluate_binop(op, node.op, lhs, rhs)
operands = node.values
return reduce(visitor, operands)
_python_not_supported = frozenset(["Dict", "BoolOp", "In", "NotIn"])
_numexpr_supported_calls = frozenset(REDUCTIONS + MATHOPS)
@disallow(
(_unsupported_nodes | _python_not_supported)
- (_boolop_nodes | frozenset(["BoolOp", "Attribute", "In", "NotIn", "Tuple"]))
)
class PandasExprVisitor(BaseExprVisitor):
def __init__(
self,
env,
engine,
parser,
preparser=partial(
_preparse,
f=_compose(_replace_locals, _replace_booleans, clean_backtick_quoted_toks),
),
):
super().__init__(env, engine, parser, preparser)
@disallow(_unsupported_nodes | _python_not_supported | frozenset(["Not"]))
class PythonExprVisitor(BaseExprVisitor):
def __init__(self, env, engine, parser, preparser=lambda x: x):
super().__init__(env, engine, parser, preparser=preparser)
class Expr:
"""
Object encapsulating an expression.
Parameters
----------
expr : str
engine : str, optional, default 'numexpr'
parser : str, optional, default 'pandas'
env : Scope, optional, default None
level : int, optional, default 2
"""
env: Scope
engine: str
parser: str
def __init__(
self,
expr,
engine: str = "numexpr",
parser: str = "pandas",
env: Optional[Scope] = None,
level: int = 0,
):
self.expr = expr
self.env = env or Scope(level=level + 1)
self.engine = engine
self.parser = parser
self._visitor = PARSERS[parser](self.env, self.engine, self.parser)
self.terms = self.parse()
@property
def assigner(self):
return getattr(self._visitor, "assigner", None)
def __call__(self):
return self.terms(self.env)
def __repr__(self) -> str:
return printing.pprint_thing(self.terms)
def __len__(self) -> int:
return len(self.expr)
def parse(self):
"""
Parse an expression.
"""
return self._visitor.visit(self.expr)
@property
def names(self):
"""
Get the names in an expression.
"""
if is_term(self.terms):
return frozenset([self.terms.name])
return frozenset(term.name for term in com.flatten(self.terms))
PARSERS = {"python": PythonExprVisitor, "pandas": PandasExprVisitor}