Inzynierka/Lib/site-packages/pandas/io/gbq.py

228 lines
8.1 KiB
Python
Raw Normal View History

2023-06-02 12:51:02 +02:00
""" Google BigQuery support """
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
)
from pandas.compat._optional import import_optional_dependency
if TYPE_CHECKING:
from pandas import DataFrame
def _try_import():
# since pandas is a dependency of pandas-gbq
# we need to import on first use
msg = (
"pandas-gbq is required to load data from Google BigQuery. "
"See the docs: https://pandas-gbq.readthedocs.io."
)
pandas_gbq = import_optional_dependency("pandas_gbq", extra=msg)
return pandas_gbq
def read_gbq(
query: str,
project_id: str | None = None,
index_col: str | None = None,
col_order: list[str] | None = None,
reauth: bool = False,
auth_local_webserver: bool = True,
dialect: str | None = None,
location: str | None = None,
configuration: dict[str, Any] | None = None,
credentials=None,
use_bqstorage_api: bool | None = None,
max_results: int | None = None,
progress_bar_type: str | None = None,
) -> DataFrame:
"""
Load data from Google BigQuery.
This function requires the `pandas-gbq package
<https://pandas-gbq.readthedocs.io>`__.
See the `How to authenticate with Google BigQuery
<https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
guide for authentication instructions.
Parameters
----------
query : str
SQL-Like Query to return data values.
project_id : str, optional
Google BigQuery Account project ID. Optional when available from
the environment.
index_col : str, optional
Name of result column to use for index in results DataFrame.
col_order : list(str), optional
List of BigQuery column names in the desired order for results
DataFrame.
reauth : bool, default False
Force Google BigQuery to re-authenticate the user. This is useful
if multiple accounts are used.
auth_local_webserver : bool, default True
Use the `local webserver flow`_ instead of the `console flow`_
when getting user credentials.
.. _local webserver flow:
https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
.. _console flow:
https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
*New in version 0.2.0 of pandas-gbq*.
.. versionchanged:: 1.5.0
Default value is changed to ``True``. Google has deprecated the
``auth_local_webserver = False`` `"out of band" (copy-paste)
flow
<https://developers.googleblog.com/2022/02/making-oauth-flows-safer.html?m=1#disallowed-oob>`_.
dialect : str, default 'legacy'
Note: The default value is changing to 'standard' in a future version.
SQL syntax dialect to use. Value can be one of:
``'legacy'``
Use BigQuery's legacy SQL dialect. For more information see
`BigQuery Legacy SQL Reference
<https://cloud.google.com/bigquery/docs/reference/legacy-sql>`__.
``'standard'``
Use BigQuery's standard SQL, which is
compliant with the SQL 2011 standard. For more information
see `BigQuery Standard SQL Reference
<https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__.
location : str, optional
Location where the query job should run. See the `BigQuery locations
documentation
<https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
list of available locations. The location must match that of any
datasets used in the query.
*New in version 0.5.0 of pandas-gbq*.
configuration : dict, optional
Query config parameters for job processing.
For example:
configuration = {'query': {'useQueryCache': False}}
For more information see `BigQuery REST API Reference
<https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__.
credentials : google.auth.credentials.Credentials, optional
Credentials for accessing Google APIs. Use this parameter to override
default credentials, such as to use Compute Engine
:class:`google.auth.compute_engine.Credentials` or Service Account
:class:`google.oauth2.service_account.Credentials` directly.
*New in version 0.8.0 of pandas-gbq*.
use_bqstorage_api : bool, default False
Use the `BigQuery Storage API
<https://cloud.google.com/bigquery/docs/reference/storage/>`__ to
download query results quickly, but at an increased cost. To use this
API, first `enable it in the Cloud Console
<https://console.cloud.google.com/apis/library/bigquerystorage.googleapis.com>`__.
You must also have the `bigquery.readsessions.create
<https://cloud.google.com/bigquery/docs/access-control#roles>`__
permission on the project you are billing queries to.
This feature requires version 0.10.0 or later of the ``pandas-gbq``
package. It also requires the ``google-cloud-bigquery-storage`` and
``fastavro`` packages.
max_results : int, optional
If set, limit the maximum number of rows to fetch from the query
results.
*New in version 0.12.0 of pandas-gbq*.
.. versionadded:: 1.1.0
progress_bar_type : Optional, str
If set, use the `tqdm <https://tqdm.github.io/>`__ library to
display a progress bar while the data downloads. Install the
``tqdm`` package to use this feature.
Possible values of ``progress_bar_type`` include:
``None``
No progress bar.
``'tqdm'``
Use the :func:`tqdm.tqdm` function to print a progress bar
to :data:`sys.stderr`.
``'tqdm_notebook'``
Use the :func:`tqdm.tqdm_notebook` function to display a
progress bar as a Jupyter notebook widget.
``'tqdm_gui'``
Use the :func:`tqdm.tqdm_gui` function to display a
progress bar as a graphical dialog box.
Note that this feature requires version 0.12.0 or later of the
``pandas-gbq`` package. And it requires the ``tqdm`` package. Slightly
different than ``pandas-gbq``, here the default is ``None``.
Returns
-------
df: DataFrame
DataFrame representing results of query.
See Also
--------
pandas_gbq.read_gbq : This function in the pandas-gbq library.
DataFrame.to_gbq : Write a DataFrame to Google BigQuery.
"""
pandas_gbq = _try_import()
kwargs: dict[str, str | bool | int | None] = {}
# START: new kwargs. Don't populate unless explicitly set.
if use_bqstorage_api is not None:
kwargs["use_bqstorage_api"] = use_bqstorage_api
if max_results is not None:
kwargs["max_results"] = max_results
kwargs["progress_bar_type"] = progress_bar_type
# END: new kwargs
return pandas_gbq.read_gbq(
query,
project_id=project_id,
index_col=index_col,
col_order=col_order,
reauth=reauth,
auth_local_webserver=auth_local_webserver,
dialect=dialect,
location=location,
configuration=configuration,
credentials=credentials,
**kwargs,
)
def to_gbq(
dataframe: DataFrame,
destination_table: str,
project_id: str | None = None,
chunksize: int | None = None,
reauth: bool = False,
if_exists: str = "fail",
auth_local_webserver: bool = True,
table_schema: list[dict[str, str]] | None = None,
location: str | None = None,
progress_bar: bool = True,
credentials=None,
) -> None:
pandas_gbq = _try_import()
pandas_gbq.to_gbq(
dataframe,
destination_table,
project_id=project_id,
chunksize=chunksize,
reauth=reauth,
if_exists=if_exists,
auth_local_webserver=auth_local_webserver,
table_schema=table_schema,
location=location,
progress_bar=progress_bar,
credentials=credentials,
)