projektAI/venv/Lib/site-packages/pandas/tests/io/test_orc.py
2021-06-06 22:13:05 +02:00

228 lines
6.3 KiB
Python

""" test orc compat """
import datetime
import os
import numpy as np
import pytest
import pandas as pd
from pandas import read_orc
import pandas._testing as tm
pytest.importorskip("pyarrow", minversion="0.13.0")
pytest.importorskip("pyarrow.orc")
pytestmark = pytest.mark.filterwarnings(
"ignore:RangeIndex.* is deprecated:DeprecationWarning"
)
@pytest.fixture
def dirpath(datapath):
return datapath("io", "data", "orc")
def test_orc_reader_empty(dirpath):
columns = [
"boolean1",
"byte1",
"short1",
"int1",
"long1",
"float1",
"double1",
"bytes1",
"string1",
]
dtypes = [
"bool",
"int8",
"int16",
"int32",
"int64",
"float32",
"float64",
"object",
"object",
]
expected = pd.DataFrame(index=pd.RangeIndex(0))
for colname, dtype in zip(columns, dtypes):
expected[colname] = pd.Series(dtype=dtype)
inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
got = read_orc(inputfile, columns=columns)
tm.assert_equal(expected, got)
def test_orc_reader_basic(dirpath):
data = {
"boolean1": np.array([False, True], dtype="bool"),
"byte1": np.array([1, 100], dtype="int8"),
"short1": np.array([1024, 2048], dtype="int16"),
"int1": np.array([65536, 65536], dtype="int32"),
"long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
"float1": np.array([1.0, 2.0], dtype="float32"),
"double1": np.array([-15.0, -5.0], dtype="float64"),
"bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
"string1": np.array(["hi", "bye"], dtype="object"),
}
expected = pd.DataFrame.from_dict(data)
inputfile = os.path.join(dirpath, "TestOrcFile.test1.orc")
got = read_orc(inputfile, columns=data.keys())
tm.assert_equal(expected, got)
def test_orc_reader_decimal(dirpath):
from decimal import Decimal
# Only testing the first 10 rows of data
data = {
"_col0": np.array(
[
Decimal("-1000.50000"),
Decimal("-999.60000"),
Decimal("-998.70000"),
Decimal("-997.80000"),
Decimal("-996.90000"),
Decimal("-995.10000"),
Decimal("-994.11000"),
Decimal("-993.12000"),
Decimal("-992.13000"),
Decimal("-991.14000"),
],
dtype="object",
)
}
expected = pd.DataFrame.from_dict(data)
inputfile = os.path.join(dirpath, "TestOrcFile.decimal.orc")
got = read_orc(inputfile).iloc[:10]
tm.assert_equal(expected, got)
def test_orc_reader_date_low(dirpath):
data = {
"time": np.array(
[
"1900-05-05 12:34:56.100000",
"1900-05-05 12:34:56.100100",
"1900-05-05 12:34:56.100200",
"1900-05-05 12:34:56.100300",
"1900-05-05 12:34:56.100400",
"1900-05-05 12:34:56.100500",
"1900-05-05 12:34:56.100600",
"1900-05-05 12:34:56.100700",
"1900-05-05 12:34:56.100800",
"1900-05-05 12:34:56.100900",
],
dtype="datetime64[ns]",
),
"date": np.array(
[
datetime.date(1900, 12, 25),
datetime.date(1900, 12, 25),
datetime.date(1900, 12, 25),
datetime.date(1900, 12, 25),
datetime.date(1900, 12, 25),
datetime.date(1900, 12, 25),
datetime.date(1900, 12, 25),
datetime.date(1900, 12, 25),
datetime.date(1900, 12, 25),
datetime.date(1900, 12, 25),
],
dtype="object",
),
}
expected = pd.DataFrame.from_dict(data)
inputfile = os.path.join(dirpath, "TestOrcFile.testDate1900.orc")
got = read_orc(inputfile).iloc[:10]
tm.assert_equal(expected, got)
def test_orc_reader_date_high(dirpath):
data = {
"time": np.array(
[
"2038-05-05 12:34:56.100000",
"2038-05-05 12:34:56.100100",
"2038-05-05 12:34:56.100200",
"2038-05-05 12:34:56.100300",
"2038-05-05 12:34:56.100400",
"2038-05-05 12:34:56.100500",
"2038-05-05 12:34:56.100600",
"2038-05-05 12:34:56.100700",
"2038-05-05 12:34:56.100800",
"2038-05-05 12:34:56.100900",
],
dtype="datetime64[ns]",
),
"date": np.array(
[
datetime.date(2038, 12, 25),
datetime.date(2038, 12, 25),
datetime.date(2038, 12, 25),
datetime.date(2038, 12, 25),
datetime.date(2038, 12, 25),
datetime.date(2038, 12, 25),
datetime.date(2038, 12, 25),
datetime.date(2038, 12, 25),
datetime.date(2038, 12, 25),
datetime.date(2038, 12, 25),
],
dtype="object",
),
}
expected = pd.DataFrame.from_dict(data)
inputfile = os.path.join(dirpath, "TestOrcFile.testDate2038.orc")
got = read_orc(inputfile).iloc[:10]
tm.assert_equal(expected, got)
def test_orc_reader_snappy_compressed(dirpath):
data = {
"int1": np.array(
[
-1160101563,
1181413113,
2065821249,
-267157795,
172111193,
1752363137,
1406072123,
1911809390,
-1308542224,
-467100286,
],
dtype="int32",
),
"string1": np.array(
[
"f50dcb8",
"382fdaaa",
"90758c6",
"9e8caf3f",
"ee97332b",
"d634da1",
"2bea4396",
"d67d89e8",
"ad71007e",
"e8c82066",
],
dtype="object",
),
}
expected = pd.DataFrame.from_dict(data)
inputfile = os.path.join(dirpath, "TestOrcFile.testSnappy.orc")
got = read_orc(inputfile).iloc[:10]
tm.assert_equal(expected, got)