Inzynierka/Lib/site-packages/pandas/core/computation/parsing.py

196 lines
6.2 KiB
Python
Raw Permalink Normal View History

2023-06-02 12:51:02 +02:00
"""
:func:`~pandas.eval` source string parsing functions
"""
from __future__ import annotations
from io import StringIO
from keyword import iskeyword
import token
import tokenize
from typing import (
Hashable,
Iterator,
)
# Sentinel token type used in this module to tag backtick quoted strings.
# 100 is a token value Python's tokenizer probably will never use.
BACKTICK_QUOTED_STRING = 100
def create_valid_python_identifier(name: str) -> str:
    """
    Create valid Python identifiers from any string.

    Check if name contains any special characters. If it contains any
    special characters, the special characters will be replaced by
    a special string and a prefix is added.

    Parameters
    ----------
    name : str
        The string to convert.

    Returns
    -------
    str
        ``name`` unchanged when it already is an identifier that is not a
        keyword; otherwise the name with its special characters replaced and
        prefixed with ``BACKTICK_QUOTED_STRING_``.

    Raises
    ------
    SyntaxError
        If the returned name is not a Python valid identifier, raise an exception.
        This can happen if there is a hashtag in the name, as the tokenizer will
        then terminate and not find the backtick.
        But also for characters that fall out of the range of (U+0001..U+007F).
    """
    if name.isidentifier() and not iskeyword(name):
        return name

    # Create a dict with the special characters and their replacement string.
    # EXACT_TOKEN_TYPES contains these special characters
    # token.tok_name contains a readable description of the replacement string.
    special_characters_replacements = {
        char: f"_{token.tok_name[tokval]}_"
        for char, tokval in tokenize.EXACT_TOKEN_TYPES.items()
    }
    special_characters_replacements.update(
        {
            " ": "_",
            "?": "_QUESTIONMARK_",
            "!": "_EXCLAMATIONMARK_",
            "$": "_DOLLARSIGN_",
            # EURO SIGN, written as an escape so the key cannot be lost to a
            # source-encoding problem (an empty-string key never matches).
            "\u20ac": "_EUROSIGN_",
            "°": "_DEGREESIGN_",
            # Including quotes works, but there are exceptions.
            "'": "_SINGLEQUOTE_",
            '"': "_DOUBLEQUOTE_",
            # Currently not possible. Terminates parser and won't find backtick.
            # "#": "_HASH_",
        }
    )

    # Replace character by character; anything without a replacement is kept.
    name = "".join(special_characters_replacements.get(char, char) for char in name)
    name = f"BACKTICK_QUOTED_STRING_{name}"

    if not name.isidentifier():
        raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.")

    return name
def clean_backtick_quoted_toks(tok: tuple[int, str]) -> tuple[int, str]:
    """
    Turn a backtick quoted token into a regular NAME token.

    A token whose type equals ``BACKTICK_QUOTED_STRING`` has its value passed
    through :func:`create_valid_python_identifier` and is re-tagged as
    ``tokenize.NAME``. Any other token is returned with the same type and
    value.

    Parameters
    ----------
    tok : tuple of int, str
        Token type and token value.

    Returns
    -------
    tok : tuple of int, str
        Either the input token or the replacement values.
    """
    kind, text = tok
    if kind != BACKTICK_QUOTED_STRING:
        return kind, text
    return tokenize.NAME, create_valid_python_identifier(text)
def clean_column_name(name: Hashable) -> Hashable:
    """
    Emulate the cleaning of a backtick quoted name.

    The name is wrapped in backticks, tokenized with :func:`tokenize_string`,
    and the value of the first resulting token is converted with
    :func:`create_valid_python_identifier`.

    Parameters
    ----------
    name : hashable
        Name to be cleaned.

    Returns
    -------
    name : hashable
        The name after tokenizing and cleaning, or the unmodified input when
        a SyntaxError is raised along the way.

    Notes
    -----
    Only SyntaxError is handled here; in that case the name is returned
    unchanged instead of raising.
    """
    try:
        first_token = next(tokenize_string(f"`{name}`"))
        return create_valid_python_identifier(first_token[1])
    except SyntaxError:
        return name
def tokenize_backtick_quoted_string(
    token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int
) -> tuple[int, str]:
    """
    Creates a token from a backtick quoted string.

    Moves the token_generator forwards till right after the next backtick.

    Parameters
    ----------
    token_generator : Iterator[tokenize.TokenInfo]
        The generator that yields the tokens of the source string.
        The generator is at the first token after the backtick (`)
    source : str
        The Python source code string.
    string_start : int
        This is the start of backtick quoted string inside the source string.

    Returns
    -------
    tok : Tuple[int, str]
        The token that represents the backtick quoted string.
        The integer is equal to BACKTICK_QUOTED_STRING (100).

    Raises
    ------
    SyntaxError
        If the token generator is exhausted before a closing backtick is found.
    """
    for _, tokval, start, _, _ in token_generator:
        if tokval == "`":
            # start is a (row, column) pair; the column of the closing
            # backtick is the exclusive end of the quoted text.
            # NOTE(review): slicing ``source`` by a column offset presumably
            # assumes a single-line source string - confirm with callers.
            string_end = start[1]
            break
    else:
        # Fail explicitly rather than through an unbound ``string_end``.
        raise SyntaxError(f"No closing backtick found in '{source}'.")

    return BACKTICK_QUOTED_STRING, source[string_start:string_end]
def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
    """
    Tokenize a Python source code string.

    Tokens are yielded as they come from the tokenizer, except that
    everything between a pair of backticks is collapsed into a single
    backtick quoted string token.

    Parameters
    ----------
    source : str
        The Python source code string.

    Returns
    -------
    tok_generator : Iterator[Tuple[int, str]]
        An iterator yielding all tokens with only toknum and tokval
        (Tuple[int, str]).

    Raises
    ------
    SyntaxError
        If building a backtick quoted string token fails for any reason.
    """
    raw_tokens = tokenize.generate_tokens(StringIO(source).readline)

    for tok_type, tok_text, tok_start, _, _ in raw_tokens:
        # Everything that is not an opening backtick passes straight through.
        if tok_text != "`":
            yield tok_type, tok_text
            continue

        # Opening backtick: consume tokens up to the closing backtick and
        # emit them as one backtick quoted string token.
        try:
            yield tokenize_backtick_quoted_string(
                raw_tokens, source, string_start=tok_start[1] + 1
            )
        except Exception as err:
            raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err