Inzynierka/Lib/site-packages/pandas/tests/io/test_orc.py

""" test orc compat """
import datetime
from decimal import Decimal
from io import BytesIO
import os

import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas import read_orc
import pandas._testing as tm
from pandas.core.arrays import StringArray

pytest.importorskip("pyarrow.orc")

import pyarrow as pa


@pytest.fixture
def dirpath(datapath):
    # Result of ``datapath("io", "data", "orc")``: the directory that the
    # reader tests in this module join with their ORC sample file names.
    orc_data_dir = datapath("io", "data", "orc")
    return orc_data_dir


# One single-column ("unimpl") frame per dtype for which conversion to ORC
# hasn't been implemented yet: unsigned integers, category, interval,
# period and sparse.
orc_writer_dtypes_not_supported = [
    # unsigned integers
    pd.DataFrame({"unimpl": np.array([1, 20], dtype=np.uint64)}),
    # category
    pd.DataFrame({"unimpl": pd.Series(list("aba"), dtype="category")}),
    # interval
    pd.DataFrame({"unimpl": [pd.Interval(0, right) for right in (2, 5)]}),
    # period
    pd.DataFrame(
        {"unimpl": [pd.Period(day, freq="D") for day in ("2022-01-03", "2022-01-04")]}
    ),
    # sparse
    pd.DataFrame({"unimpl": [np.nan] * 50}).astype(pd.SparseDtype("float", np.nan)),
]


def test_orc_reader_empty(dirpath):
    # Reading the empty sample file with an explicit column selection must
    # give a zero-row frame whose columns carry the dtypes listed here.
    # Mapping: requested column name -> expected dtype.
    schema = {
        "boolean1": "bool",
        "byte1": "int8",
        "short1": "int16",
        "int1": "int32",
        "long1": "int64",
        "float1": "float32",
        "double1": "float64",
        "bytes1": "object",
        "string1": "object",
    }
    columns = list(schema)

    # Start from an empty RangeIndex and add the typed, empty columns one by one.
    expected = pd.DataFrame(index=pd.RangeIndex(0))
    for colname in columns:
        expected[colname] = pd.Series(dtype=schema[colname])

    orc_file = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
    got = read_orc(orc_file, columns=columns)

    tm.assert_equal(expected, got)


def test_orc_reader_basic(dirpath):
    # Read the sample file, selecting one column per primitive type, and
    # compare the result with the two hand-written expected rows.
    int64_max = np.iinfo(np.int64).max
    data = {
        "boolean1": np.array([False, True], dtype=np.bool_),
        "byte1": np.array([1, 100], dtype=np.int8),
        "short1": np.array([1024, 2048], dtype=np.int16),
        "int1": np.array([65536, 65536], dtype=np.int32),
        "long1": np.array([int64_max, int64_max], dtype=np.int64),
        "float1": np.array([1.0, 2.0], dtype=np.float32),
        "double1": np.array([-15.0, -5.0], dtype=np.float64),
        "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype=object),
        "string1": np.array(["hi", "bye"], dtype=object),
    }
    expected = pd.DataFrame(data)

    orc_file = os.path.join(dirpath, "TestOrcFile.test1.orc")
    # The column selection is passed as the dict's key view, not as a list.
    got = read_orc(orc_file, columns=data.keys())

    tm.assert_equal(expected, got)


def test_orc_reader_decimal(dirpath):
    # Only the first 10 rows of the decimal sample file are compared.
    first_ten = [
        "-1000.50000",
        "-999.60000",
        "-998.70000",
        "-997.80000",
        "-996.90000",
        "-995.10000",
        "-994.11000",
        "-993.12000",
        "-992.13000",
        "-991.14000",
    ]
    # The expected column holds Decimal objects in an object-dtype array.
    decimals = np.array([Decimal(text) for text in first_ten], dtype=object)
    expected = pd.DataFrame({"_col0": decimals})

    whole_file = read_orc(os.path.join(dirpath, "TestOrcFile.decimal.orc"))
    got = whole_file.iloc[:10]

    tm.assert_equal(expected, got)


def test_orc_reader_date_low(dirpath):
    # Timestamps and dates from the year 1900; only the first 10 rows of the
    # sample file are compared.
    n_rows = 10
    # Fractional seconds run .100000, .100100, ..., .100900.
    timestamps = [f"1900-05-05 12:34:56.100{step}00" for step in range(n_rows)]
    # The date column repeats the same day on every row.
    dates = [datetime.date(1900, 12, 25)] * n_rows
    expected = pd.DataFrame(
        {
            "time": np.array(timestamps, dtype="datetime64[ns]"),
            "date": np.array(dates, dtype=object),
        }
    )

    orc_file = os.path.join(dirpath, "TestOrcFile.testDate1900.orc")
    got = read_orc(orc_file).iloc[:n_rows]

    tm.assert_equal(expected, got)


def test_orc_reader_date_high(dirpath):
    # Timestamps and dates from the year 2038; only the first 10 rows of the
    # sample file are compared.
    n_rows = 10
    # Fractional seconds run .100000, .100100, ..., .100900.
    timestamps = [f"2038-05-05 12:34:56.100{step}00" for step in range(n_rows)]
    # The date column repeats the same day on every row.
    dates = [datetime.date(2038, 12, 25)] * n_rows
    expected = pd.DataFrame(
        {
            "time": np.array(timestamps, dtype="datetime64[ns]"),
            "date": np.array(dates, dtype=object),
        }
    )

    orc_file = os.path.join(dirpath, "TestOrcFile.testDate2038.orc")
    got = read_orc(orc_file).iloc[:n_rows]

    tm.assert_equal(expected, got)


def test_orc_reader_snappy_compressed(dirpath):
    # Compare the first 10 rows read from the snappy sample file with the
    # hand-written (int1, string1) pairs below.
    first_rows = [
        (-1160101563, "f50dcb8"),
        (1181413113, "382fdaaa"),
        (2065821249, "90758c6"),
        (-267157795, "9e8caf3f"),
        (172111193, "ee97332b"),
        (1752363137, "d634da1"),
        (1406072123, "2bea4396"),
        (1911809390, "d67d89e8"),
        (-1308542224, "ad71007e"),
        (-467100286, "e8c82066"),
    ]
    # Split the row pairs back into one sequence per column.
    int_values, string_values = zip(*first_rows)
    expected = pd.DataFrame(
        {
            "int1": np.array(int_values, dtype="int32"),
            "string1": np.array(string_values, dtype=object),
        }
    )

    orc_file = os.path.join(dirpath, "TestOrcFile.testSnappy.orc")
    got = read_orc(orc_file).iloc[:10]

    tm.assert_equal(expected, got)


@td.skip_if_no("pyarrow", min_version="7.0.0")
def test_orc_roundtrip_file(dirpath):
    # GH44554
    # PyArrow gained ORC write support with the current argument order
    #
    # Write a frame to a temporary path with to_orc, read it back with
    # read_orc and require the two frames to be equal.
    int64_max = np.iinfo(np.int64).max
    expected = pd.DataFrame(
        {
            "boolean1": np.array([False, True], dtype=np.bool_),
            "byte1": np.array([1, 100], dtype=np.int8),
            "short1": np.array([1024, 2048], dtype=np.int16),
            "int1": np.array([65536, 65536], dtype=np.int32),
            "long1": np.array([int64_max, int64_max], dtype=np.int64),
            "float1": np.array([1.0, 2.0], dtype=np.float32),
            "double1": np.array([-15.0, -5.0], dtype=np.float64),
            "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype=object),
            "string1": np.array(["hi", "bye"], dtype=object),
        }
    )

    with tm.ensure_clean() as tmp_path:
        expected.to_orc(tmp_path)
        got = read_orc(tmp_path)

        tm.assert_equal(expected, got)


@td.skip_if_no("pyarrow", min_version="7.0.0")
def test_orc_roundtrip_bytesio():
    # GH44554
    # PyArrow gained ORC write support with the current argument order
    #
    # to_orc() is called without a path and its return value is wrapped in
    # BytesIO; reading that buffer back must reproduce the original frame.
    data = {
        "boolean1": np.array([False, True], dtype="bool"),
        "byte1": np.array([1, 100], dtype="int8"),
        "short1": np.array([1024, 2048], dtype="int16"),
        "int1": np.array([65536, 65536], dtype="int32"),
        "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
        "float1": np.array([1.0, 2.0], dtype="float32"),
        "double1": np.array([-15.0, -5.0], dtype="float64"),
        "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
        "string1": np.array(["hi", "bye"], dtype="object"),
    }
    expected = pd.DataFrame.from_dict(data)

    # Named ``bytes_data`` (as in the dtype_backend tests) so the builtin
    # ``bytes`` is not shadowed inside this function.
    bytes_data = expected.to_orc()
    got = read_orc(BytesIO(bytes_data))

    tm.assert_equal(expected, got)


@td.skip_if_no("pyarrow", min_version="7.0.0")
@pytest.mark.parametrize("df_not_supported", orc_writer_dtypes_not_supported)
def test_orc_writer_dtypes_not_supported(df_not_supported):
    # GH44554
    # PyArrow gained ORC write support with the current argument order
    #
    # Each parametrized frame must make to_orc raise NotImplementedError
    # with the message matched below.
    expected_error = pytest.raises(
        NotImplementedError,
        match="The dtype of one or more columns is not supported yet.",
    )
    with expected_error:
        df_not_supported.to_orc()


@td.skip_if_no("pyarrow", min_version="7.0.0")
def test_orc_dtype_backend_pyarrow():
    # Serialize a mixed-dtype frame with to_orc, read it back with
    # dtype_backend="pyarrow" and compare against the same columns wrapped
    # in ArrowExtensionArray.
    first_day = pd.Timestamp("20130101")
    third_day = pd.Timestamp("20130103")
    df = pd.DataFrame(
        {
            "string": ["a", "b", "c"],
            "string_with_nan": ["a", np.nan, "c"],
            "string_with_none": ["a", None, "c"],
            "bytes": [b"foo", b"bar", None],
            "int": [1, 2, 3],
            "float": np.array([4.0, 5.0, 6.0], dtype="float64"),
            "float_with_nan": [2.0, np.nan, 3.0],
            "bool": [True, False, True],
            "bool_with_na": [True, False, None],
            "datetime": pd.date_range("20130101", periods=3),
            "datetime_with_nat": [first_day, pd.NaT, third_day],
        }
    )

    # A copy of the frame is serialized; ``df`` itself is reused below to
    # build the expected result.
    frame_to_write = df.copy()
    bytes_data = frame_to_write.to_orc()
    result = read_orc(BytesIO(bytes_data), dtype_backend="pyarrow")

    arrow_columns = {}
    for col in df.columns:
        arrow_columns[col] = pd.arrays.ArrowExtensionArray(
            pa.array(df[col], from_pandas=True)
        )
    expected = pd.DataFrame(arrow_columns)

    tm.assert_frame_equal(result, expected)


@td.skip_if_no("pyarrow", min_version="7.0.0")
def test_orc_dtype_backend_numpy_nullable():
    # GH#50503
    # Serialize a frame with to_orc, read it back with
    # dtype_backend="numpy_nullable" and require StringArray / Int64 /
    # Float64 / boolean columns with pd.NA where the input had a missing value.

    def string_array(values):
        # Build a StringArray from an object-dtype ndarray of the values.
        return StringArray(np.array(values, dtype=np.object_))

    df = pd.DataFrame(
        {
            "string": ["a", "b", "c"],
            "string_with_nan": ["a", np.nan, "c"],
            "string_with_none": ["a", None, "c"],
            "int": [1, 2, 3],
            "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
            "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
            "float": np.array([4.0, 5.0, 6.0], dtype="float64"),
            "float_with_nan": [2.0, np.nan, 3.0],
            "bool": [True, False, True],
            "bool_with_na": [True, False, None],
        }
    )

    frame_to_write = df.copy()
    bytes_data = frame_to_write.to_orc()
    result = read_orc(BytesIO(bytes_data), dtype_backend="numpy_nullable")

    expected = pd.DataFrame(
        {
            "string": string_array(["a", "b", "c"]),
            "string_with_nan": string_array(["a", pd.NA, "c"]),
            "string_with_none": string_array(["a", pd.NA, "c"]),
            "int": pd.Series([1, 2, 3], dtype="Int64"),
            "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
            "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
            "float": pd.Series([4.0, 5.0, 6.0], dtype="Float64"),
            "float_with_nan": pd.Series([2.0, pd.NA, 3.0], dtype="Float64"),
            "bool": pd.Series([True, False, True], dtype="boolean"),
            "bool_with_na": pd.Series([True, False, pd.NA], dtype="boolean"),
        }
    )

    tm.assert_frame_equal(result, expected)


@td.skip_if_no("pyarrow", min_version="7.0.0")
def test_invalid_dtype_backend():
    # The frame is written with to_orc before the reader is exercised, so this
    # test carries the same pyarrow version guard as the other tests in this
    # module that call to_orc.
    #
    # read_orc must reject a dtype_backend other than "numpy_nullable" or
    # "pyarrow" with a ValueError carrying the message below.
    msg = (
        "dtype_backend numpy is invalid, only 'numpy_nullable' and "
        "'pyarrow' are allowed."
    )
    df = pd.DataFrame({"int": list(range(1, 4))})
    with tm.ensure_clean("tmp.orc") as path:
        df.to_orc(path)
        with pytest.raises(ValueError, match=msg):
            read_orc(path, dtype_backend="numpy")
first commit 2023-06-02 12:51:02 +02:00			`""" test orc compat """`
			`import datetime`
			`from decimal import Decimal`
			`from io import BytesIO`
			`import os`

			`import numpy as np`
			`import pytest`

			`import pandas.util._test_decorators as td`

			`import pandas as pd`
			`from pandas import read_orc`
			`import pandas._testing as tm`
			`from pandas.core.arrays import StringArray`

			`pytest.importorskip("pyarrow.orc")`

			`import pyarrow as pa`


			`@pytest.fixture`
			`def dirpath(datapath):`
			`return datapath("io", "data", "orc")`


			`# Examples of dataframes with dtypes for which conversion to ORC`
			`# hasn't been implemented yet, that is, Category, unsigned integers,`
			`# interval, period and sparse.`
			`orc_writer_dtypes_not_supported = [`
			`pd.DataFrame({"unimpl": np.array([1, 20], dtype="uint64")}),`
			`pd.DataFrame({"unimpl": pd.Series(["a", "b", "a"], dtype="category")}),`
			`pd.DataFrame(`
			`{"unimpl": [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)]}`
			`),`
			`pd.DataFrame(`
			`{`
			`"unimpl": [`
			`pd.Period("2022-01-03", freq="D"),`
			`pd.Period("2022-01-04", freq="D"),`
			`]`
			`}`
			`),`
			`pd.DataFrame({"unimpl": [np.nan] * 50}).astype(pd.SparseDtype("float", np.nan)),`
			`]`


			`def test_orc_reader_empty(dirpath):`
			`columns = [`
			`"boolean1",`
			`"byte1",`
			`"short1",`
			`"int1",`
			`"long1",`
			`"float1",`
			`"double1",`
			`"bytes1",`
			`"string1",`
			`]`
			`dtypes = [`
			`"bool",`
			`"int8",`
			`"int16",`
			`"int32",`
			`"int64",`
			`"float32",`
			`"float64",`
			`"object",`
			`"object",`
			`]`
			`expected = pd.DataFrame(index=pd.RangeIndex(0))`
			`for colname, dtype in zip(columns, dtypes):`
			`expected[colname] = pd.Series(dtype=dtype)`

			`inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")`
			`got = read_orc(inputfile, columns=columns)`

			`tm.assert_equal(expected, got)`


			`def test_orc_reader_basic(dirpath):`
			`data = {`
			`"boolean1": np.array([False, True], dtype="bool"),`
			`"byte1": np.array([1, 100], dtype="int8"),`
			`"short1": np.array([1024, 2048], dtype="int16"),`
			`"int1": np.array([65536, 65536], dtype="int32"),`
			`"long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),`
			`"float1": np.array([1.0, 2.0], dtype="float32"),`
			`"double1": np.array([-15.0, -5.0], dtype="float64"),`
			`"bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),`
			`"string1": np.array(["hi", "bye"], dtype="object"),`
			`}`
			`expected = pd.DataFrame.from_dict(data)`

			`inputfile = os.path.join(dirpath, "TestOrcFile.test1.orc")`
			`got = read_orc(inputfile, columns=data.keys())`

			`tm.assert_equal(expected, got)`


			`def test_orc_reader_decimal(dirpath):`
			`# Only testing the first 10 rows of data`
			`data = {`
			`"_col0": np.array(`
			`[`
			`Decimal("-1000.50000"),`
			`Decimal("-999.60000"),`
			`Decimal("-998.70000"),`
			`Decimal("-997.80000"),`
			`Decimal("-996.90000"),`
			`Decimal("-995.10000"),`
			`Decimal("-994.11000"),`
			`Decimal("-993.12000"),`
			`Decimal("-992.13000"),`
			`Decimal("-991.14000"),`
			`],`
			`dtype="object",`
			`)`
			`}`
			`expected = pd.DataFrame.from_dict(data)`

			`inputfile = os.path.join(dirpath, "TestOrcFile.decimal.orc")`
			`got = read_orc(inputfile).iloc[:10]`

			`tm.assert_equal(expected, got)`


			`def test_orc_reader_date_low(dirpath):`
			`data = {`
			`"time": np.array(`
			`[`
			`"1900-05-05 12:34:56.100000",`
			`"1900-05-05 12:34:56.100100",`
			`"1900-05-05 12:34:56.100200",`
			`"1900-05-05 12:34:56.100300",`
			`"1900-05-05 12:34:56.100400",`
			`"1900-05-05 12:34:56.100500",`
			`"1900-05-05 12:34:56.100600",`
			`"1900-05-05 12:34:56.100700",`
			`"1900-05-05 12:34:56.100800",`
			`"1900-05-05 12:34:56.100900",`
			`],`
			`dtype="datetime64[ns]",`
			`),`
			`"date": np.array(`
			`[`
			`datetime.date(1900, 12, 25),`
			`datetime.date(1900, 12, 25),`
			`datetime.date(1900, 12, 25),`
			`datetime.date(1900, 12, 25),`
			`datetime.date(1900, 12, 25),`
			`datetime.date(1900, 12, 25),`
			`datetime.date(1900, 12, 25),`
			`datetime.date(1900, 12, 25),`
			`datetime.date(1900, 12, 25),`
			`datetime.date(1900, 12, 25),`
			`],`
			`dtype="object",`
			`),`
			`}`
			`expected = pd.DataFrame.from_dict(data)`

			`inputfile = os.path.join(dirpath, "TestOrcFile.testDate1900.orc")`
			`got = read_orc(inputfile).iloc[:10]`

			`tm.assert_equal(expected, got)`


			`def test_orc_reader_date_high(dirpath):`
			`data = {`
			`"time": np.array(`
			`[`
			`"2038-05-05 12:34:56.100000",`
			`"2038-05-05 12:34:56.100100",`
			`"2038-05-05 12:34:56.100200",`
			`"2038-05-05 12:34:56.100300",`
			`"2038-05-05 12:34:56.100400",`
			`"2038-05-05 12:34:56.100500",`
			`"2038-05-05 12:34:56.100600",`
			`"2038-05-05 12:34:56.100700",`
			`"2038-05-05 12:34:56.100800",`
			`"2038-05-05 12:34:56.100900",`
			`],`
			`dtype="datetime64[ns]",`
			`),`
			`"date": np.array(`
			`[`
			`datetime.date(2038, 12, 25),`
			`datetime.date(2038, 12, 25),`
			`datetime.date(2038, 12, 25),`
			`datetime.date(2038, 12, 25),`
			`datetime.date(2038, 12, 25),`
			`datetime.date(2038, 12, 25),`
			`datetime.date(2038, 12, 25),`
			`datetime.date(2038, 12, 25),`
			`datetime.date(2038, 12, 25),`
			`datetime.date(2038, 12, 25),`
			`],`
			`dtype="object",`
			`),`
			`}`
			`expected = pd.DataFrame.from_dict(data)`

			`inputfile = os.path.join(dirpath, "TestOrcFile.testDate2038.orc")`
			`got = read_orc(inputfile).iloc[:10]`

			`tm.assert_equal(expected, got)`


			`def test_orc_reader_snappy_compressed(dirpath):`
			`data = {`
			`"int1": np.array(`
			`[`
			`-1160101563,`
			`1181413113,`
			`2065821249,`
			`-267157795,`
			`172111193,`
			`1752363137,`
			`1406072123,`
			`1911809390,`
			`-1308542224,`
			`-467100286,`
			`],`
			`dtype="int32",`
			`),`
			`"string1": np.array(`
			`[`
			`"f50dcb8",`
			`"382fdaaa",`
			`"90758c6",`
			`"9e8caf3f",`
			`"ee97332b",`
			`"d634da1",`
			`"2bea4396",`
			`"d67d89e8",`
			`"ad71007e",`
			`"e8c82066",`
			`],`
			`dtype="object",`
			`),`
			`}`
			`expected = pd.DataFrame.from_dict(data)`

			`inputfile = os.path.join(dirpath, "TestOrcFile.testSnappy.orc")`
			`got = read_orc(inputfile).iloc[:10]`

			`tm.assert_equal(expected, got)`


			`@td.skip_if_no("pyarrow", min_version="7.0.0")`
			`def test_orc_roundtrip_file(dirpath):`
			`# GH44554`
			`# PyArrow gained ORC write support with the current argument order`
			`data = {`
			`"boolean1": np.array([False, True], dtype="bool"),`
			`"byte1": np.array([1, 100], dtype="int8"),`
			`"short1": np.array([1024, 2048], dtype="int16"),`
			`"int1": np.array([65536, 65536], dtype="int32"),`
			`"long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),`
			`"float1": np.array([1.0, 2.0], dtype="float32"),`
			`"double1": np.array([-15.0, -5.0], dtype="float64"),`
			`"bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),`
			`"string1": np.array(["hi", "bye"], dtype="object"),`
			`}`
			`expected = pd.DataFrame.from_dict(data)`

			`with tm.ensure_clean() as path:`
			`expected.to_orc(path)`
			`got = read_orc(path)`

			`tm.assert_equal(expected, got)`


			`@td.skip_if_no("pyarrow", min_version="7.0.0")`
			`def test_orc_roundtrip_bytesio():`
			`# GH44554`
			`# PyArrow gained ORC write support with the current argument order`
			`data = {`
			`"boolean1": np.array([False, True], dtype="bool"),`
			`"byte1": np.array([1, 100], dtype="int8"),`
			`"short1": np.array([1024, 2048], dtype="int16"),`
			`"int1": np.array([65536, 65536], dtype="int32"),`
			`"long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),`
			`"float1": np.array([1.0, 2.0], dtype="float32"),`
			`"double1": np.array([-15.0, -5.0], dtype="float64"),`
			`"bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),`
			`"string1": np.array(["hi", "bye"], dtype="object"),`
			`}`
			`expected = pd.DataFrame.from_dict(data)`

			`bytes = expected.to_orc()`
			`got = read_orc(BytesIO(bytes))`

			`tm.assert_equal(expected, got)`


			`@td.skip_if_no("pyarrow", min_version="7.0.0")`
			`@pytest.mark.parametrize("df_not_supported", orc_writer_dtypes_not_supported)`
			`def test_orc_writer_dtypes_not_supported(df_not_supported):`
			`# GH44554`
			`# PyArrow gained ORC write support with the current argument order`
			`msg = "The dtype of one or more columns is not supported yet."`
			`with pytest.raises(NotImplementedError, match=msg):`
			`df_not_supported.to_orc()`


			`@td.skip_if_no("pyarrow", min_version="7.0.0")`
			`def test_orc_dtype_backend_pyarrow():`
			`df = pd.DataFrame(`
			`{`
			`"string": list("abc"),`
			`"string_with_nan": ["a", np.nan, "c"],`
			`"string_with_none": ["a", None, "c"],`
			`"bytes": [b"foo", b"bar", None],`
			`"int": list(range(1, 4)),`
			`"float": np.arange(4.0, 7.0, dtype="float64"),`
			`"float_with_nan": [2.0, np.nan, 3.0],`
			`"bool": [True, False, True],`
			`"bool_with_na": [True, False, None],`
			`"datetime": pd.date_range("20130101", periods=3),`
			`"datetime_with_nat": [`
			`pd.Timestamp("20130101"),`
			`pd.NaT,`
			`pd.Timestamp("20130103"),`
			`],`
			`}`
			`)`

			`bytes_data = df.copy().to_orc()`
			`result = read_orc(BytesIO(bytes_data), dtype_backend="pyarrow")`

			`expected = pd.DataFrame(`
			`{`
			`col: pd.arrays.ArrowExtensionArray(pa.array(df[col], from_pandas=True))`
			`for col in df.columns`
			`}`
			`)`

			`tm.assert_frame_equal(result, expected)`


			`@td.skip_if_no("pyarrow", min_version="7.0.0")`
			`def test_orc_dtype_backend_numpy_nullable():`
			`# GH#50503`
			`df = pd.DataFrame(`
			`{`
			`"string": list("abc"),`
			`"string_with_nan": ["a", np.nan, "c"],`
			`"string_with_none": ["a", None, "c"],`
			`"int": list(range(1, 4)),`
			`"int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),`
			`"na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),`
			`"float": np.arange(4.0, 7.0, dtype="float64"),`
			`"float_with_nan": [2.0, np.nan, 3.0],`
			`"bool": [True, False, True],`
			`"bool_with_na": [True, False, None],`
			`}`
			`)`

			`bytes_data = df.copy().to_orc()`
			`result = read_orc(BytesIO(bytes_data), dtype_backend="numpy_nullable")`

			`expected = pd.DataFrame(`
			`{`
			`"string": StringArray(np.array(["a", "b", "c"], dtype=np.object_)),`
			`"string_with_nan": StringArray(`
			`np.array(["a", pd.NA, "c"], dtype=np.object_)`
			`),`
			`"string_with_none": StringArray(`
			`np.array(["a", pd.NA, "c"], dtype=np.object_)`
			`),`
			`"int": pd.Series([1, 2, 3], dtype="Int64"),`
			`"int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),`
			`"na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),`
			`"float": pd.Series([4.0, 5.0, 6.0], dtype="Float64"),`
			`"float_with_nan": pd.Series([2.0, pd.NA, 3.0], dtype="Float64"),`
			`"bool": pd.Series([True, False, True], dtype="boolean"),`
			`"bool_with_na": pd.Series([True, False, pd.NA], dtype="boolean"),`
			`}`
			`)`

			`tm.assert_frame_equal(result, expected)`


			`def test_invalid_dtype_backend():`
			`msg = (`
			`"dtype_backend numpy is invalid, only 'numpy_nullable' and "`
			`"'pyarrow' are allowed."`
			`)`
			`df = pd.DataFrame({"int": list(range(1, 4))})`
			`with tm.ensure_clean("tmp.orc") as path:`
			`df.to_orc(path)`
			`with pytest.raises(ValueError, match=msg):`
			`read_orc(path, dtype_backend="numpy")`