# Inzynierka/Lib/site-packages/pandas/tests/io/json/test_pandas.py
import datetime
from datetime import timedelta
from decimal import Decimal
from io import StringIO
import json
import os
import sys
import time
import numpy as np
import pytest
from pandas.compat import IS64
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
NA,
DataFrame,
DatetimeIndex,
Series,
Timestamp,
read_json,
)
import pandas._testing as tm
from pandas.core.arrays import (
ArrowStringArray,
StringArray,
)
def assert_json_roundtrip_equal(result, expected, orient):
if orient in ("records", "values"):
expected = expected.reset_index(drop=True)
if orient == "values":
expected.columns = range(len(expected.columns))
tm.assert_frame_equal(result, expected)
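# A minimal illustration (values are hypothetical, not used by the tests):
# DataFrame({"a": [1, 2]}, index=["x", "y"]).to_json(orient="records")
# produces '[{"a":1},{"a":2}]' -- the "records" and "values" orients do not
# serialize the index at all, so a round trip comes back with a fresh
# RangeIndex and the helper above resets the expected index to match.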
class TestPandasContainer:
@pytest.fixture
def categorical_frame(self):
_seriesd = tm.getSeriesData()
_cat_frame = DataFrame(_seriesd)
cat = ["bah"] * 5 + ["bar"] * 5 + ["baz"] * 5 + ["foo"] * (len(_cat_frame) - 15)
_cat_frame.index = pd.CategoricalIndex(cat, name="E")
_cat_frame["E"] = list(reversed(cat))
_cat_frame["sort"] = np.arange(len(_cat_frame), dtype="int64")
return _cat_frame
@pytest.fixture
def datetime_series(self):
# Same as usual datetime_series, but with index freq set to None,
# since that doesn't round-trip, see GH#33711
ser = tm.makeTimeSeries()
ser.name = "ts"
ser.index = ser.index._with_freq(None)
return ser
@pytest.fixture
def datetime_frame(self):
# Same as usual datetime_frame, but with index freq set to None,
# since that doesn't round-trip, see GH#33711
df = DataFrame(tm.getTimeSeriesData())
df.index = df.index._with_freq(None)
return df
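# Background for the two fixtures above: a date_range index carries a freq
# attribute (e.g. pd.date_range("2000", periods=3).freq is <Day>), but JSON
# has nowhere to store it, so read_json(df.to_json()) returns an index with
# freq=None (GH#33711). Stripping it up front with _with_freq(None) keeps
# the round-trip comparisons exact.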
def test_frame_double_encoded_labels(self, orient):
df = DataFrame(
[["a", "b"], ["c", "d"]],
index=['index " 1', "index / 2"],
columns=["a \\ b", "y / z"],
)
result = read_json(df.to_json(orient=orient), orient=orient)
expected = df.copy()
assert_json_roundtrip_equal(result, expected, orient)
@pytest.mark.parametrize("orient", ["split", "records", "values"])
def test_frame_non_unique_index(self, orient):
df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 1], columns=["x", "y"])
result = read_json(df.to_json(orient=orient), orient=orient)
expected = df.copy()
assert_json_roundtrip_equal(result, expected, orient)
@pytest.mark.parametrize("orient", ["index", "columns"])
def test_frame_non_unique_index_raises(self, orient):
df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 1], columns=["x", "y"])
msg = f"DataFrame index must be unique for orient='{orient}'"
with pytest.raises(ValueError, match=msg):
df.to_json(orient=orient)
@pytest.mark.parametrize("orient", ["split", "values"])
@pytest.mark.parametrize(
"data",
[
[["a", "b"], ["c", "d"]],
[[1.5, 2.5], [3.5, 4.5]],
[[1, 2.5], [3, 4.5]],
[[Timestamp("20130101"), 3.5], [Timestamp("20130102"), 4.5]],
],
)
def test_frame_non_unique_columns(self, orient, data):
df = DataFrame(data, index=[1, 2], columns=["x", "x"])
result = read_json(
df.to_json(orient=orient), orient=orient, convert_dates=["x"]
)
if orient == "values":
expected = DataFrame(data)
if expected.iloc[:, 0].dtype == "datetime64[ns]":
# orient == "values" by default writes Timestamp objects out in
# milliseconds; they are stored internally in nanoseconds, so divide
# by 1_000_000 to get the expected millisecond values
# TODO: a to_epoch method would also solve; see GH 14772
expected.iloc[:, 0] = expected.iloc[:, 0].view(np.int64) // 1000000
elif orient == "split":
expected = df
expected.columns = ["x", "x.1"]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("orient", ["index", "columns", "records"])
def test_frame_non_unique_columns_raises(self, orient):
df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 2], columns=["x", "x"])
msg = f"DataFrame columns must be unique for orient='{orient}'"
with pytest.raises(ValueError, match=msg):
df.to_json(orient=orient)
def test_frame_default_orient(self, float_frame):
assert float_frame.to_json() == float_frame.to_json(orient="columns")
@pytest.mark.parametrize("dtype", [False, float])
@pytest.mark.parametrize("convert_axes", [True, False])
def test_roundtrip_simple(self, orient, convert_axes, dtype, float_frame):
data = float_frame.to_json(orient=orient)
result = read_json(data, orient=orient, convert_axes=convert_axes, dtype=dtype)
expected = float_frame
assert_json_roundtrip_equal(result, expected, orient)
@pytest.mark.parametrize("dtype", [False, np.int64])
@pytest.mark.parametrize("convert_axes", [True, False])
def test_roundtrip_intframe(self, orient, convert_axes, dtype, int_frame):
data = int_frame.to_json(orient=orient)
result = read_json(data, orient=orient, convert_axes=convert_axes, dtype=dtype)
expected = int_frame
assert_json_roundtrip_equal(result, expected, orient)
@pytest.mark.parametrize("dtype", [None, np.float64, int, "U3"])
@pytest.mark.parametrize("convert_axes", [True, False])
def test_roundtrip_str_axes(self, orient, convert_axes, dtype):
df = DataFrame(
np.zeros((200, 4)),
columns=[str(i) for i in range(4)],
index=[str(i) for i in range(200)],
dtype=dtype,
)
data = df.to_json(orient=orient)
result = read_json(data, orient=orient, convert_axes=convert_axes, dtype=dtype)
expected = df.copy()
if not dtype:
expected = expected.astype(np.int64)
# The index, columns, and records orients cannot fully preserve the string
# dtype for axes, because the index and column labels are used as keys in
# JSON objects. JSON keys are by definition strings, so there is no way to
# tell whether a key was originally a string or a number, and numeric wins
# out (see the worked example just after this test).
if convert_axes and (orient in ("index", "columns")):
expected.columns = expected.columns.astype(np.int64)
expected.index = expected.index.astype(np.int64)
elif orient == "records" and convert_axes:
expected.columns = expected.columns.astype(np.int64)
elif convert_axes and orient == "split":
expected.columns = expected.columns.astype(np.int64)
assert_json_roundtrip_equal(result, expected, orient)
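# Worked example of the key coercion described above:
# DataFrame({"0": [1]}, index=["0"]).to_json(orient="columns") gives
# '{"0":{"0":1}}', where the string label "0" is indistinguishable from a
# numeric 0 once it becomes a JSON object key; read_json with
# convert_axes=True therefore parses such keys back as numeric.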
@pytest.mark.parametrize("convert_axes", [True, False])
def test_roundtrip_categorical(
self, request, orient, categorical_frame, convert_axes
):
# TODO: create a better frame to test with and improve coverage
if orient in ("index", "columns"):
request.node.add_marker(
pytest.mark.xfail(
reason=f"Can't have duplicate index values for orient '{orient}'"
)
)
data = categorical_frame.to_json(orient=orient)
result = read_json(data, orient=orient, convert_axes=convert_axes)
expected = categorical_frame.copy()
expected.index = expected.index.astype(str) # Categorical not preserved
expected.index.name = None # index names aren't preserved in JSON
assert_json_roundtrip_equal(result, expected, orient)
@pytest.mark.parametrize("convert_axes", [True, False])
def test_roundtrip_empty(self, orient, convert_axes):
empty_frame = DataFrame()
data = empty_frame.to_json(orient=orient)
result = read_json(data, orient=orient, convert_axes=convert_axes)
if orient == "split":
idx = pd.Index([], dtype=(float if convert_axes else object))
expected = DataFrame(index=idx, columns=idx)
elif orient in ["index", "columns"]:
expected = DataFrame()
else:
expected = empty_frame.copy()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("convert_axes", [True, False])
def test_roundtrip_timestamp(self, orient, convert_axes, datetime_frame):
# TODO: improve coverage with date_format parameter
data = datetime_frame.to_json(orient=orient)
result = read_json(data, orient=orient, convert_axes=convert_axes)
expected = datetime_frame.copy()
if not convert_axes:  # one-off for ts handling
# DTI gets converted to epoch values
idx = expected.index.view(np.int64) // 1000000
if orient != "split": # TODO: handle consistently across orients
idx = idx.astype(str)
expected.index = idx
assert_json_roundtrip_equal(result, expected, orient)
@pytest.mark.parametrize("convert_axes", [True, False])
def test_roundtrip_mixed(self, orient, convert_axes):
index = pd.Index(["a", "b", "c", "d", "e"])
values = {
"A": [0.0, 1.0, 2.0, 3.0, 4.0],
"B": [0.0, 1.0, 0.0, 1.0, 0.0],
"C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
"D": [True, False, True, False, True],
}
df = DataFrame(data=values, index=index)
data = df.to_json(orient=orient)
result = read_json(data, orient=orient, convert_axes=convert_axes)
expected = df.copy()
expected = expected.assign(**expected.select_dtypes("number").astype(np.int64))
assert_json_roundtrip_equal(result, expected, orient)
@pytest.mark.xfail(
reason="#50456 Column multiindex is stored and loaded differently",
raises=AssertionError,
)
@pytest.mark.parametrize(
"columns",
[
[["2022", "2022"], ["JAN", "FEB"]],
[["2022", "2023"], ["JAN", "JAN"]],
[["2022", "2022"], ["JAN", "JAN"]],
],
)
def test_roundtrip_multiindex(self, columns):
df = DataFrame(
[[1, 2], [3, 4]],
columns=pd.MultiIndex.from_arrays(columns),
)
result = read_json(df.to_json(orient="split"), orient="split")
tm.assert_frame_equal(result, df)
@pytest.mark.parametrize(
"data,msg,orient",
[
('{"key":b:a:d}', "Expected object or value", "columns"),
# too few indices
(
'{"columns":["A","B"],'
'"index":["2","3"],'
'"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}',
"|".join(
[
r"Length of values \(3\) does not match length of index \(2\)",
]
),
"split",
),
# too many columns
(
'{"columns":["A","B","C"],'
'"index":["1","2","3"],'
'"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}',
"3 columns passed, passed data had 2 columns",
"split",
),
# bad key
(
'{"badkey":["A","B"],'
'"index":["2","3"],'
'"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}',
r"unexpected key\(s\): badkey",
"split",
),
],
)
def test_frame_from_json_bad_data_raises(self, data, msg, orient):
with pytest.raises(ValueError, match=msg):
read_json(StringIO(data), orient=orient)
@pytest.mark.parametrize("dtype", [True, False])
@pytest.mark.parametrize("convert_axes", [True, False])
def test_frame_from_json_missing_data(self, orient, convert_axes, dtype):
num_df = DataFrame([[1, 2], [4, 5, 6]])
result = read_json(
num_df.to_json(orient=orient),
orient=orient,
convert_axes=convert_axes,
dtype=dtype,
)
assert np.isnan(result.iloc[0, 2])
obj_df = DataFrame([["1", "2"], ["4", "5", "6"]])
result = read_json(
obj_df.to_json(orient=orient),
orient=orient,
convert_axes=convert_axes,
dtype=dtype,
)
assert np.isnan(result.iloc[0, 2])
@pytest.mark.parametrize("dtype", [True, False])
def test_frame_read_json_dtype_missing_value(self, dtype):
# GH28501: with dtype=False, read_json should parse missing values
# as NaN rather than None
result = read_json("[null]", dtype=dtype)
expected = DataFrame([np.nan])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("inf", [np.inf, np.NINF])
@pytest.mark.parametrize("dtype", [True, False])
def test_frame_infinity(self, inf, dtype):
# infinities get mapped to nulls which get mapped to NaNs during
# deserialisation
df = DataFrame([[1, 2], [4, 5, 6]])
df.loc[0, 2] = inf
result = read_json(df.to_json(), dtype=dtype)
assert np.isnan(result.iloc[0, 2])
@pytest.mark.skipif(not IS64, reason="not compliant on 32-bit, xref #15865")
@pytest.mark.parametrize(
"value,precision,expected_val",
[
(0.95, 1, 1.0),
(1.95, 1, 2.0),
(-1.95, 1, -2.0),
(0.995, 2, 1.0),
(0.9995, 3, 1.0),
(0.99999999999999944, 15, 1.0),
],
)
def test_frame_to_json_float_precision(self, value, precision, expected_val):
df = DataFrame([{"a_float": value}])
encoded = df.to_json(double_precision=precision)
assert encoded == f'{{"a_float":{{"0":{expected_val}}}}}'
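# For example (an illustrative value, not one of the cases above):
# DataFrame([{"a_float": 0.123456789}]).to_json(double_precision=3) returns
# '{"a_float":{"0":0.123}}' -- double_precision bounds the number of decimal
# places written, which is what drives the rounding asserted above.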
def test_frame_to_json_except(self):
df = DataFrame([1, 2, 3])
msg = "Invalid value 'garbage' for option 'orient'"
with pytest.raises(ValueError, match=msg):
df.to_json(orient="garbage")
def test_frame_empty(self):
df = DataFrame(columns=["jim", "joe"])
assert not df._is_mixed_type
tm.assert_frame_equal(
read_json(df.to_json(), dtype=dict(df.dtypes)), df, check_index_type=False
)
# GH 7445
result = DataFrame({"test": []}, index=[]).to_json(orient="columns")
expected = '{"test":{}}'
assert result == expected
def test_frame_empty_mixedtype(self):
# mixed type
df = DataFrame(columns=["jim", "joe"])
df["joe"] = df["joe"].astype("i8")
assert df._is_mixed_type
tm.assert_frame_equal(
read_json(df.to_json(), dtype=dict(df.dtypes)), df, check_index_type=False
)
def test_frame_mixedtype_orient(self): # GH10289
vals = [
[10, 1, "foo", 0.1, 0.01],
[20, 2, "bar", 0.2, 0.02],
[30, 3, "baz", 0.3, 0.03],
[40, 4, "qux", 0.4, 0.04],
]
df = DataFrame(
vals, index=list("abcd"), columns=["1st", "2nd", "3rd", "4th", "5th"]
)
assert df._is_mixed_type
right = df.copy()
for orient in ["split", "index", "columns"]:
inp = df.to_json(orient=orient)
left = read_json(inp, orient=orient, convert_axes=False)
tm.assert_frame_equal(left, right)
right.index = pd.RangeIndex(len(df))
inp = df.to_json(orient="records")
left = read_json(inp, orient="records", convert_axes=False)
tm.assert_frame_equal(left, right)
right.columns = pd.RangeIndex(df.shape[1])
inp = df.to_json(orient="values")
left = read_json(inp, orient="values", convert_axes=False)
tm.assert_frame_equal(left, right)
def test_v12_compat(self, datapath):
dti = pd.date_range("2000-01-03", "2000-01-07")
# freq doesn't roundtrip
dti = DatetimeIndex(np.asarray(dti), freq=None)
df = DataFrame(
[
[1.56808523, 0.65727391, 1.81021139, -0.17251653],
[-0.2550111, -0.08072427, -0.03202878, -0.17581665],
[1.51493992, 0.11805825, 1.629455, -1.31506612],
[-0.02765498, 0.44679743, 0.33192641, -0.27885413],
[0.05951614, -2.69652057, 1.28163262, 0.34703478],
],
columns=["A", "B", "C", "D"],
index=dti,
)
df["date"] = Timestamp("19920106 18:21:32.12")
df.iloc[3, df.columns.get_loc("date")] = Timestamp("20130101")
df["modified"] = df["date"]
df.iloc[1, df.columns.get_loc("modified")] = pd.NaT
dirpath = datapath("io", "json", "data")
v12_json = os.path.join(dirpath, "tsframe_v012.json")
df_unser = read_json(v12_json)
tm.assert_frame_equal(df, df_unser)
df_iso = df.drop(["modified"], axis=1)
v12_iso_json = os.path.join(dirpath, "tsframe_iso_v012.json")
df_unser_iso = read_json(v12_iso_json)
tm.assert_frame_equal(df_iso, df_unser_iso)
def test_blocks_compat_GH9037(self):
index = pd.date_range("20000101", periods=10, freq="H")
# freq doesn't round-trip
index = DatetimeIndex(list(index), freq=None)
df_mixed = DataFrame(
{
"float_1": [
-0.92077639,
0.77434435,
1.25234727,
0.61485564,
-0.60316077,
0.24653374,
0.28668979,
-2.51969012,
0.95748401,
-1.02970536,
],
"int_1": [
19680418,
75337055,
99973684,
65103179,
79373900,
40314334,
21290235,
4991321,
41903419,
16008365,
],
"str_1": [
"78c608f1",
"64a99743",
"13d2ff52",
"ca7f4af2",
"97236474",
"bde7e214",
"1a6bde47",
"b1190be5",
"7a669144",
"8d64d068",
],
"float_2": [
-0.0428278,
-1.80872357,
3.36042349,
-0.7573685,
-0.48217572,
0.86229683,
1.08935819,
0.93898739,
-0.03030452,
1.43366348,
],
"str_2": [
"14f04af9",
"d085da90",
"4bcfac83",
"81504caf",
"2ffef4a9",
"08e2f5c4",
"07e1af03",
"addbd4a7",
"1f6a09ba",
"4bfc4d87",
],
"int_2": [
86967717,
98098830,
51927505,
20372254,
12601730,
20884027,
34193846,
10561746,
24867120,
76131025,
],
},
index=index,
)
# JSON deserialisation always creates unicode strings
df_mixed.columns = df_mixed.columns.astype("unicode")
df_roundtrip = read_json(df_mixed.to_json(orient="split"), orient="split")
tm.assert_frame_equal(
df_mixed,
df_roundtrip,
check_index_type=True,
check_column_type=True,
by_blocks=True,
check_exact=True,
)
def test_frame_nonprintable_bytes(self):
# GH14256: a failing column caused segfaults if it was not the last one
class BinaryThing:
def __init__(self, hexed) -> None:
self.hexed = hexed
self.binary = bytes.fromhex(hexed)
def __str__(self) -> str:
return self.hexed
hexed = "574b4454ba8c5eb4f98a8f45"
binthing = BinaryThing(hexed)
# verify the proper conversion of printable content
df_printable = DataFrame({"A": [binthing.hexed]})
assert df_printable.to_json() == f'{{"A":{{"0":"{hexed}"}}}}'
# check if non-printable content throws appropriate Exception
df_nonprintable = DataFrame({"A": [binthing]})
msg = "Unsupported UTF-8 sequence length when encoding string"
with pytest.raises(OverflowError, match=msg):
df_nonprintable.to_json()
# the same conversion with multiple columns used to segfault
df_mixed = DataFrame({"A": [binthing], "B": [1]}, columns=["A", "B"])
with pytest.raises(OverflowError, match=msg):
df_mixed.to_json()
# default_handler should resolve exceptions for non-string types
result = df_nonprintable.to_json(default_handler=str)
expected = f'{{"A":{{"0":"{hexed}"}}}}'
assert result == expected
assert (
df_mixed.to_json(default_handler=str)
== f'{{"A":{{"0":"{hexed}"}},"B":{{"0":1}}}}'
)
def test_label_overflow(self):
# GH14256: buffer length not checked when writing label
result = DataFrame({"bar" * 100000: [1], "foo": [1337]}).to_json()
expected = f'{{"{"bar" * 100000}":{{"0":1}},"foo":{{"0":1337}}}}'
assert result == expected
def test_series_non_unique_index(self):
s = Series(["a", "b"], index=[1, 1])
msg = "Series index must be unique for orient='index'"
with pytest.raises(ValueError, match=msg):
s.to_json(orient="index")
tm.assert_series_equal(
s, read_json(s.to_json(orient="split"), orient="split", typ="series")
)
unserialized = read_json(
s.to_json(orient="records"), orient="records", typ="series"
)
tm.assert_numpy_array_equal(s.values, unserialized.values)
def test_series_default_orient(self, string_series):
assert string_series.to_json() == string_series.to_json(orient="index")
def test_series_roundtrip_simple(self, orient, string_series):
data = string_series.to_json(orient=orient)
result = read_json(data, typ="series", orient=orient)
expected = string_series
if orient in ("values", "records"):
expected = expected.reset_index(drop=True)
if orient != "split":
expected.name = None
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("dtype", [False, None])
def test_series_roundtrip_object(self, orient, dtype, object_series):
data = object_series.to_json(orient=orient)
result = read_json(data, typ="series", orient=orient, dtype=dtype)
expected = object_series
if orient in ("values", "records"):
expected = expected.reset_index(drop=True)
if orient != "split":
expected.name = None
tm.assert_series_equal(result, expected)
def test_series_roundtrip_empty(self, orient):
empty_series = Series([], index=[], dtype=np.float64)
data = empty_series.to_json(orient=orient)
result = read_json(data, typ="series", orient=orient)
expected = empty_series.reset_index(drop=True)
if orient == "split":
expected.index = expected.index.astype(np.float64)
tm.assert_series_equal(result, expected)
def test_series_roundtrip_timeseries(self, orient, datetime_series):
data = datetime_series.to_json(orient=orient)
result = read_json(data, typ="series", orient=orient)
expected = datetime_series
if orient in ("values", "records"):
expected = expected.reset_index(drop=True)
if orient != "split":
expected.name = None
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("dtype", [np.float64, int])
def test_series_roundtrip_numeric(self, orient, dtype):
s = Series(range(6), index=["a", "b", "c", "d", "e", "f"])
data = s.to_json(orient=orient)
result = read_json(data, typ="series", orient=orient)
expected = s.copy()
if orient in ("values", "records"):
expected = expected.reset_index(drop=True)
tm.assert_series_equal(result, expected)
def test_series_to_json_except(self):
s = Series([1, 2, 3])
msg = "Invalid value 'garbage' for option 'orient'"
with pytest.raises(ValueError, match=msg):
s.to_json(orient="garbage")
def test_series_from_json_precise_float(self):
s = Series([4.56, 4.56, 4.56])
result = read_json(s.to_json(), typ="series", precise_float=True)
tm.assert_series_equal(result, s, check_index_type=False)
def test_series_with_dtype(self):
# GH 21986
s = Series([4.56, 4.56, 4.56])
result = read_json(s.to_json(), typ="series", dtype=np.int64)
expected = Series([4] * 3)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"dtype,expected",
[
(True, Series(["2000-01-01"], dtype="datetime64[ns]")),
(False, Series([946684800000])),
],
)
def test_series_with_dtype_datetime(self, dtype, expected):
s = Series(["2000-01-01"], dtype="datetime64[ns]")
data = s.to_json()
result = read_json(data, typ="series", dtype=dtype)
tm.assert_series_equal(result, expected)
def test_frame_from_json_precise_float(self):
df = DataFrame([[4.56, 4.56, 4.56], [4.56, 4.56, 4.56]])
result = read_json(df.to_json(), precise_float=True)
tm.assert_frame_equal(result, df)
def test_typ(self):
s = Series(range(6), index=["a", "b", "c", "d", "e", "f"], dtype="int64")
result = read_json(s.to_json(), typ=None)
tm.assert_series_equal(result, s)
def test_reconstruction_index(self):
df = DataFrame([[1, 2, 3], [4, 5, 6]])
result = read_json(df.to_json())
tm.assert_frame_equal(result, df)
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=["A", "B", "C"])
result = read_json(df.to_json())
tm.assert_frame_equal(result, df)
def test_path(self, float_frame, int_frame, datetime_frame):
with tm.ensure_clean("test.json") as path:
for df in [float_frame, int_frame, datetime_frame]:
df.to_json(path)
read_json(path)
def test_axis_dates(self, datetime_series, datetime_frame):
# frame
json = datetime_frame.to_json()
result = read_json(json)
tm.assert_frame_equal(result, datetime_frame)
# series
json = datetime_series.to_json()
result = read_json(json, typ="series")
tm.assert_series_equal(result, datetime_series, check_names=False)
assert result.name is None
def test_convert_dates(self, datetime_series, datetime_frame):
# frame
df = datetime_frame
df["date"] = Timestamp("20130101")
json = df.to_json()
result = read_json(json)
tm.assert_frame_equal(result, df)
df["foo"] = 1.0
json = df.to_json(date_unit="ns")
result = read_json(json, convert_dates=False)
expected = df.copy()
expected["date"] = expected["date"].values.view("i8")
expected["foo"] = expected["foo"].astype("int64")
tm.assert_frame_equal(result, expected)
# series
ts = Series(Timestamp("20130101"), index=datetime_series.index)
json = ts.to_json()
result = read_json(json, typ="series")
tm.assert_series_equal(result, ts)
@pytest.mark.parametrize("date_format", ["epoch", "iso"])
@pytest.mark.parametrize("as_object", [True, False])
@pytest.mark.parametrize("date_typ", [datetime.date, datetime.datetime, Timestamp])
def test_date_index_and_values(self, date_format, as_object, date_typ):
data = [date_typ(year=2020, month=1, day=1), pd.NaT]
if as_object:
data.append("a")
ser = Series(data, index=data)
result = ser.to_json(date_format=date_format)
if date_format == "epoch":
expected = '{"1577836800000":1577836800000,"null":null}'
else:
expected = (
'{"2020-01-01T00:00:00.000":"2020-01-01T00:00:00.000","null":null}'
)
if as_object:
expected = expected.replace("}", ',"a":"a"}')
assert result == expected
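# Sanity check for the epoch expectation above: Timestamp("2020-01-01").value
# is 1577836800000000000 (nanoseconds since the epoch), and date_format="epoch"
# writes milliseconds, i.e. 1577836800000000000 // 10**6 == 1577836800000.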
@pytest.mark.parametrize(
"infer_word",
[
"trade_time",
"date",
"datetime",
"sold_at",
"modified",
"timestamp",
"timestamps",
],
)
def test_convert_dates_infer(self, infer_word):
# GH10747
from pandas.io.json import dumps
data = [{"id": 1, infer_word: 1036713600000}, {"id": 2}]
expected = DataFrame(
[[1, Timestamp("2002-11-08")], [2, pd.NaT]], columns=["id", infer_word]
)
result = read_json(dumps(data))[["id", infer_word]]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"date,date_unit",
[
("20130101 20:43:42.123", None),
("20130101 20:43:42", "s"),
("20130101 20:43:42.123", "ms"),
("20130101 20:43:42.123456", "us"),
("20130101 20:43:42.123456789", "ns"),
],
)
def test_date_format_frame(self, date, date_unit, datetime_frame):
df = datetime_frame
df["date"] = Timestamp(date)
df.iloc[1, df.columns.get_loc("date")] = pd.NaT
df.iloc[5, df.columns.get_loc("date")] = pd.NaT
if date_unit:
json = df.to_json(date_format="iso", date_unit=date_unit)
else:
json = df.to_json(date_format="iso")
result = read_json(json)
expected = df.copy()
tm.assert_frame_equal(result, expected)
def test_date_format_frame_raises(self, datetime_frame):
df = datetime_frame
msg = "Invalid value 'foo' for option 'date_unit'"
with pytest.raises(ValueError, match=msg):
df.to_json(date_format="iso", date_unit="foo")
@pytest.mark.parametrize(
"date,date_unit",
[
("20130101 20:43:42.123", None),
("20130101 20:43:42", "s"),
("20130101 20:43:42.123", "ms"),
("20130101 20:43:42.123456", "us"),
("20130101 20:43:42.123456789", "ns"),
],
)
def test_date_format_series(self, date, date_unit, datetime_series):
ts = Series(Timestamp(date), index=datetime_series.index)
ts.iloc[1] = pd.NaT
ts.iloc[5] = pd.NaT
if date_unit:
json = ts.to_json(date_format="iso", date_unit=date_unit)
else:
json = ts.to_json(date_format="iso")
result = read_json(json, typ="series")
expected = ts.copy()
tm.assert_series_equal(result, expected)
def test_date_format_series_raises(self, datetime_series):
ts = Series(Timestamp("20130101 20:43:42.123"), index=datetime_series.index)
msg = "Invalid value 'foo' for option 'date_unit'"
with pytest.raises(ValueError, match=msg):
ts.to_json(date_format="iso", date_unit="foo")
@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"])
def test_date_unit(self, unit, datetime_frame):
df = datetime_frame
df["date"] = Timestamp("20130101 20:43:42")
dl = df.columns.get_loc("date")
df.iloc[1, dl] = Timestamp("19710101 20:43:42")
df.iloc[2, dl] = Timestamp("21460101 20:43:42")
df.iloc[4, dl] = pd.NaT
json = df.to_json(date_format="epoch", date_unit=unit)
# force date unit
result = read_json(json, date_unit=unit)
tm.assert_frame_equal(result, df)
# detect date unit
result = read_json(json, date_unit=None)
tm.assert_frame_equal(result, df)
def test_weird_nested_json(self):
# this used to core dump the parser
s = r"""{
"status": "success",
"data": {
"posts": [
{
"id": 1,
"title": "A blog post",
"body": "Some useful content"
},
{
"id": 2,
"title": "Another blog post",
"body": "More content"
}
]
}
}"""
read_json(s)
def test_doc_example(self):
dfj2 = DataFrame(np.random.randn(5, 2), columns=list("AB"))
dfj2["date"] = Timestamp("20130101")
dfj2["ints"] = range(5)
dfj2["bools"] = True
# freq doesn't round-trip, so strip it before comparing (cf. fixtures above)
dfj2.index = pd.date_range("20130101", periods=5)._with_freq(None)
json = dfj2.to_json()
result = read_json(json, dtype={"ints": np.int64, "bools": np.bool_})
tm.assert_frame_equal(result, dfj2)
def test_round_trip_exception_(self, datapath):
# GH 3867
path = datapath("io", "json", "data", "teams.csv")
df = pd.read_csv(path)
s = df.to_json()
result = read_json(s)
tm.assert_frame_equal(result.reindex(index=df.index, columns=df.columns), df)
@pytest.mark.network
@tm.network(
url="https://api.github.com/repos/pandas-dev/pandas/issues?per_page=5",
check_before_test=True,
)
@pytest.mark.parametrize(
"field,dtype",
[
["created_at", pd.DatetimeTZDtype(tz="UTC")],
["closed_at", "datetime64[ns]"],
["updated_at", pd.DatetimeTZDtype(tz="UTC")],
],
)
def test_url(self, field, dtype):
url = "https://api.github.com/repos/pandas-dev/pandas/issues?per_page=5"
result = read_json(url, convert_dates=True)
assert result[field].dtype == dtype
def test_timedelta(self):
converter = lambda x: pd.to_timedelta(x, unit="ms")
ser = Series([timedelta(23), timedelta(seconds=5)])
assert ser.dtype == "timedelta64[ns]"
result = read_json(ser.to_json(), typ="series").apply(converter)
tm.assert_series_equal(result, ser)
ser = Series([timedelta(23), timedelta(seconds=5)], index=pd.Index([0, 1]))
assert ser.dtype == "timedelta64[ns]"
result = read_json(ser.to_json(), typ="series").apply(converter)
tm.assert_series_equal(result, ser)
frame = DataFrame([timedelta(23), timedelta(seconds=5)])
assert frame[0].dtype == "timedelta64[ns]"
tm.assert_frame_equal(frame, read_json(frame.to_json()).apply(converter))
def test_timedelta2(self):
frame = DataFrame(
{
"a": [timedelta(days=23), timedelta(seconds=5)],
"b": [1, 2],
"c": pd.date_range(start="20130101", periods=2),
}
)
result = read_json(frame.to_json(date_unit="ns"))
result["a"] = pd.to_timedelta(result.a, unit="ns")
result["c"] = pd.to_datetime(result.c)
tm.assert_frame_equal(frame, result)
def test_mixed_timedelta_datetime(self):
td = timedelta(23)
ts = Timestamp("20130101")
frame = DataFrame({"a": [td, ts]}, dtype=object)
expected = DataFrame(
{"a": [pd.Timedelta(td).as_unit("ns")._value, ts.as_unit("ns")._value]}
)
result = read_json(frame.to_json(date_unit="ns"), dtype={"a": "int64"})
tm.assert_frame_equal(result, expected, check_index_type=False)
@pytest.mark.parametrize("as_object", [True, False])
@pytest.mark.parametrize("date_format", ["iso", "epoch"])
@pytest.mark.parametrize("timedelta_typ", [pd.Timedelta, timedelta])
def test_timedelta_to_json(self, as_object, date_format, timedelta_typ):
# GH28156: to_json not correctly formatting Timedelta
data = [timedelta_typ(days=1), timedelta_typ(days=2), pd.NaT]
if as_object:
data.append("a")
ser = Series(data, index=data)
if date_format == "iso":
expected = (
'{"P1DT0H0M0S":"P1DT0H0M0S","P2DT0H0M0S":"P2DT0H0M0S","null":null}'
)
else:
expected = '{"86400000":86400000,"172800000":172800000,"null":null}'
if as_object:
expected = expected.replace("}", ',"a":"a"}')
result = ser.to_json(date_format=date_format)
assert result == expected
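# The "iso" strings above are ISO 8601 durations; as a reference point,
# pd.Timedelta(days=1).isoformat() returns "P1DT0H0M0S", while the "epoch"
# form is plain milliseconds (1 day == 86400000 ms).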
def test_default_handler(self):
value = object()
frame = DataFrame({"a": [7, value]})
expected = DataFrame({"a": [7, str(value)]})
result = read_json(frame.to_json(default_handler=str))
tm.assert_frame_equal(expected, result, check_index_type=False)
def test_default_handler_indirect(self):
from pandas.io.json import dumps
def default(obj):
if isinstance(obj, complex):
return [("mathjs", "Complex"), ("re", obj.real), ("im", obj.imag)]
return str(obj)
df_list = [
9,
DataFrame(
{"a": [1, "STR", complex(4, -5)], "b": [float("nan"), None, "N/A"]},
columns=["a", "b"],
),
]
expected = (
'[9,[[1,null],["STR",null],[[["mathjs","Complex"],'
'["re",4.0],["im",-5.0]],"N\\/A"]]]'
)
assert dumps(df_list, default_handler=default, orient="values") == expected
def test_default_handler_numpy_unsupported_dtype(self):
# GH12554 to_json raises 'Unhandled numpy dtype 15'
df = DataFrame(
{"a": [1, 2.3, complex(4, -5)], "b": [float("nan"), None, complex(1.2, 0)]},
columns=["a", "b"],
)
expected = (
'[["(1+0j)","(nan+0j)"],'
'["(2.3+0j)","(nan+0j)"],'
'["(4-5j)","(1.2+0j)"]]'
)
assert df.to_json(default_handler=str, orient="values") == expected
def test_default_handler_raises(self):
msg = "raisin"
def my_handler_raises(obj):
raise TypeError(msg)
with pytest.raises(TypeError, match=msg):
DataFrame({"a": [1, 2, object()]}).to_json(
default_handler=my_handler_raises
)
with pytest.raises(TypeError, match=msg):
DataFrame({"a": [1, 2, complex(4, -5)]}).to_json(
default_handler=my_handler_raises
)
def test_categorical(self):
# GH4377 df.to_json segfaults with non-ndarray blocks
df = DataFrame({"A": ["a", "b", "c", "a", "b", "b", "a"]})
df["B"] = df["A"]
expected = df.to_json()
df["B"] = df["A"].astype("category")
assert expected == df.to_json()
s = df["A"]
sc = df["B"]
assert s.to_json() == sc.to_json()
def test_datetime_tz(self):
# GH4377 df.to_json segfaults with non-ndarray blocks
tz_range = pd.date_range("20130101", periods=3, tz="US/Eastern")
tz_naive = tz_range.tz_convert("utc").tz_localize(None)
df = DataFrame({"A": tz_range, "B": pd.date_range("20130101", periods=3)})
df_naive = df.copy()
df_naive["A"] = tz_naive
expected = df_naive.to_json()
assert expected == df.to_json()
stz = Series(tz_range)
s_naive = Series(tz_naive)
assert stz.to_json() == s_naive.to_json()
def test_sparse(self):
# GH4377 df.to_json segfaults with non-ndarray blocks
df = DataFrame(np.random.randn(10, 4))
df.loc[:8] = np.nan
sdf = df.astype("Sparse")
expected = df.to_json()
assert expected == sdf.to_json()
s = Series(np.random.randn(10))
s.loc[:8] = np.nan
ss = s.astype("Sparse")
expected = s.to_json()
assert expected == ss.to_json()
@pytest.mark.parametrize(
"ts",
[
Timestamp("2013-01-10 05:00:00Z"),
Timestamp("2013-01-10 00:00:00", tz="US/Eastern"),
Timestamp("2013-01-10 00:00:00-0500"),
],
)
def test_tz_is_utc(self, ts):
from pandas.io.json import dumps
exp = '"2013-01-10T05:00:00.000Z"'
assert dumps(ts, iso_dates=True) == exp
dt = ts.to_pydatetime()
assert dumps(dt, iso_dates=True) == exp
def test_tz_is_naive(self):
from pandas.io.json import dumps
ts = Timestamp("2013-01-10 05:00:00")
exp = '"2013-01-10T05:00:00.000"'
assert dumps(ts, iso_dates=True) == exp
dt = ts.to_pydatetime()
assert dumps(dt, iso_dates=True) == exp
@pytest.mark.parametrize(
"tz_range",
[
pd.date_range("2013-01-01 05:00:00Z", periods=2),
pd.date_range("2013-01-01 00:00:00", periods=2, tz="US/Eastern"),
pd.date_range("2013-01-01 00:00:00-0500", periods=2),
],
)
def test_tz_range_is_utc(self, tz_range):
from pandas.io.json import dumps
exp = '["2013-01-01T05:00:00.000Z","2013-01-02T05:00:00.000Z"]'
dfexp = (
'{"DT":{'
'"0":"2013-01-01T05:00:00.000Z",'
'"1":"2013-01-02T05:00:00.000Z"}}'
)
assert dumps(tz_range, iso_dates=True) == exp
dti = DatetimeIndex(tz_range)
# Ensure datetimes in object array are serialized correctly
# in addition to the normal DTI case
assert dumps(dti, iso_dates=True) == exp
assert dumps(dti.astype(object), iso_dates=True) == exp
df = DataFrame({"DT": dti})
result = dumps(df, iso_dates=True)
assert result == dfexp
assert dumps(df.astype({"DT": object}), iso_dates=True)
def test_tz_range_is_naive(self):
from pandas.io.json import dumps
dti = pd.date_range("2013-01-01 05:00:00", periods=2)
exp = '["2013-01-01T05:00:00.000","2013-01-02T05:00:00.000"]'
dfexp = '{"DT":{"0":"2013-01-01T05:00:00.000","1":"2013-01-02T05:00:00.000"}}'
# Ensure datetimes in object array are serialized correctly
# in addition to the normal DTI case
assert dumps(dti, iso_dates=True) == exp
assert dumps(dti.astype(object), iso_dates=True) == exp
df = DataFrame({"DT": dti})
result = dumps(df, iso_dates=True)
assert result == dfexp
assert dumps(df.astype({"DT": object}), iso_dates=True)
def test_read_inline_jsonl(self):
# GH9180
result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
@pytest.mark.single_cpu
@td.skip_if_not_us_locale
def test_read_s3_jsonl(self, s3_resource, s3so):
# GH17200
result = read_json(
"s3n://pandas-test/items.jsonl", lines=True, storage_options=s3so
)
expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
def test_read_local_jsonl(self):
# GH17200
with tm.ensure_clean("tmp_items.json") as path:
with open(path, "w") as infile:
infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n')
result = read_json(path, lines=True)
expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
def test_read_jsonl_unicode_chars(self):
# GH15132: non-ascii unicode characters
# \u201d == RIGHT DOUBLE QUOTATION MARK
# simulate file handle
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
json = StringIO(json)
result = read_json(json, lines=True)
expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
# simulate string
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
result = read_json(json, lines=True)
expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)])
def test_to_json_large_numbers(self, bigNum):
# GH34473
series = Series(bigNum, dtype=object, index=["articleId"])
json = series.to_json()
expected = '{"articleId":' + str(bigNum) + "}"
assert json == expected
df = DataFrame(bigNum, dtype=object, index=["articleId"], columns=[0])
json = df.to_json()
expected = '{"0":{"articleId":' + str(bigNum) + "}}"
assert json == expected
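# Note on the object dtype above: on 64-bit builds sys.maxsize + 1 == 2**63,
# which overflows int64, so the values must be held as Python ints in an
# object column; JSON itself imposes no integer width limit, so to_json can
# still emit them as bare numeric literals.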
@pytest.mark.parametrize("bigNum", [-(2**63) - 1, 2**64])
def test_read_json_large_numbers(self, bigNum):
# GH20599, 26068
json = StringIO('{"articleId":' + str(bigNum) + "}")
msg = r"Value is too small|Value is too big"
with pytest.raises(ValueError, match=msg):
read_json(json)
json = StringIO('{"0":{"articleId":' + str(bigNum) + "}}")
with pytest.raises(ValueError, match=msg):
read_json(json)
def test_read_json_large_numbers2(self):
# GH18842
json = '{"articleId": "1404366058080022500245"}'
json = StringIO(json)
result = read_json(json, typ="series")
expected = Series(1.404366e21, index=["articleId"])
tm.assert_series_equal(result, expected)
json = '{"0": {"articleId": "1404366058080022500245"}}'
json = StringIO(json)
result = read_json(json)
expected = DataFrame(1.404366e21, index=["articleId"], columns=[0])
tm.assert_frame_equal(result, expected)
def test_to_jsonl(self):
# GH9180
df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
result = df.to_json(orient="records", lines=True)
expected = '{"a":1,"b":2}\n{"a":1,"b":2}\n'
assert result == expected
df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=["a", "b"])
result = df.to_json(orient="records", lines=True)
expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n'
assert result == expected
tm.assert_frame_equal(read_json(result, lines=True), df)
# GH15096: escaped characters in columns and data
df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"])
result = df.to_json(orient="records", lines=True)
expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n'
assert result == expected
tm.assert_frame_equal(read_json(result, lines=True), df)
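# Note: on the writing side lines=True is only valid together with
# orient="records"; each output line is then a self-contained JSON document,
# which is what lets read_json(..., lines=True) parse them back row by row.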
# TODO: there is a near-identical test for pytables; can we share?
@pytest.mark.xfail(reason="GH#13774 encoding kwarg not supported", raises=TypeError)
def test_latin_encoding(self):
# GH 13774
values = [
[b"E\xc9, 17", b"", b"a", b"b", b"c"],
[b"E\xc9, 17", b"a", b"b", b"c"],
[b"EE, 17", b"", b"a", b"b", b"c"],
[b"E\xc9, 17", b"\xf8\xfc", b"a", b"b", b"c"],
[b"", b"a", b"b", b"c"],
[b"\xf8\xfc", b"a", b"b", b"c"],
[b"A\xf8\xfc", b"", b"a", b"b", b"c"],
[np.nan, b"", b"b", b"c"],
[b"A\xf8\xfc", np.nan, b"", b"b", b"c"],
]
values = [
[x.decode("latin-1") if isinstance(x, bytes) else x for x in y]
for y in values
]
examples = []
for dtype in ["category", object]:
for val in values:
examples.append(Series(val, dtype=dtype))
def roundtrip(s, encoding="latin-1"):
with tm.ensure_clean("test.json") as path:
s.to_json(path, encoding=encoding)
retr = read_json(path, encoding=encoding)
tm.assert_series_equal(s, retr, check_categorical=False)
for s in examples:
roundtrip(s)
def test_data_frame_size_after_to_json(self):
# GH15344
df = DataFrame({"a": [str(1)]})
size_before = df.memory_usage(index=True, deep=True).sum()
df.to_json()
size_after = df.memory_usage(index=True, deep=True).sum()
assert size_before == size_after
@pytest.mark.parametrize(
"index", [None, [1, 2], [1.0, 2.0], ["a", "b"], ["1", "2"], ["1.", "2."]]
)
@pytest.mark.parametrize("columns", [["a", "b"], ["1", "2"], ["1.", "2."]])
def test_from_json_to_json_table_index_and_columns(self, index, columns):
# GH25433 GH25435
expected = DataFrame([[1, 2], [3, 4]], index=index, columns=columns)
dfjson = expected.to_json(orient="table")
result = read_json(dfjson, orient="table")
tm.assert_frame_equal(result, expected)
def test_from_json_to_json_table_dtypes(self):
# GH21345
expected = DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]})
dfjson = expected.to_json(orient="table")
result = read_json(dfjson, orient="table")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("orient", ["split", "records", "index", "columns"])
def test_to_json_from_json_columns_dtypes(self, orient):
# GH21892 GH33205
expected = DataFrame.from_dict(
{
"Integer": Series([1, 2, 3], dtype="int64"),
"Float": Series([None, 2.0, 3.0], dtype="float64"),
"Object": Series([None, "", "c"], dtype="object"),
"Bool": Series([True, False, True], dtype="bool"),
"Category": Series(["a", "b", None], dtype="category"),
"Datetime": Series(
["2020-01-01", None, "2020-01-03"], dtype="datetime64[ns]"
),
}
)
dfjson = expected.to_json(orient=orient)
result = read_json(
dfjson,
orient=orient,
dtype={
"Integer": "int64",
"Float": "float64",
"Object": "object",
"Bool": "bool",
"Category": "category",
"Datetime": "datetime64[ns]",
},
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype", [True, {"b": int, "c": int}])
def test_read_json_table_dtype_raises(self, dtype):
# GH21345
df = DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]})
dfjson = df.to_json(orient="table")
msg = "cannot pass both dtype and orient='table'"
with pytest.raises(ValueError, match=msg):
read_json(dfjson, orient="table", dtype=dtype)
def test_read_json_table_convert_axes_raises(self):
# GH25433 GH25435
df = DataFrame([[1, 2], [3, 4]], index=[1.0, 2.0], columns=["1.", "2."])
dfjson = df.to_json(orient="table")
msg = "cannot pass both convert_axes and orient='table'"
with pytest.raises(ValueError, match=msg):
read_json(dfjson, orient="table", convert_axes=True)
@pytest.mark.parametrize(
"data, expected",
[
(
DataFrame([[1, 2], [4, 5]], columns=["a", "b"]),
{"columns": ["a", "b"], "data": [[1, 2], [4, 5]]},
),
(
DataFrame([[1, 2], [4, 5]], columns=["a", "b"]).rename_axis("foo"),
{"columns": ["a", "b"], "data": [[1, 2], [4, 5]]},
),
(
DataFrame(
[[1, 2], [4, 5]], columns=["a", "b"], index=[["a", "b"], ["c", "d"]]
),
{"columns": ["a", "b"], "data": [[1, 2], [4, 5]]},
),
(Series([1, 2, 3], name="A"), {"name": "A", "data": [1, 2, 3]}),
(
Series([1, 2, 3], name="A").rename_axis("foo"),
{"name": "A", "data": [1, 2, 3]},
),
(
Series([1, 2], name="A", index=[["a", "b"], ["c", "d"]]),
{"name": "A", "data": [1, 2]},
),
],
)
def test_index_false_to_json_split(self, data, expected):
# GH 17394
# Testing index=False in to_json with orient='split'
result = data.to_json(orient="split", index=False)
result = json.loads(result)
assert result == expected
@pytest.mark.parametrize(
"data",
[
(DataFrame([[1, 2], [4, 5]], columns=["a", "b"])),
(DataFrame([[1, 2], [4, 5]], columns=["a", "b"]).rename_axis("foo")),
(
DataFrame(
[[1, 2], [4, 5]], columns=["a", "b"], index=[["a", "b"], ["c", "d"]]
)
),
(Series([1, 2, 3], name="A")),
(Series([1, 2, 3], name="A").rename_axis("foo")),
(Series([1, 2], name="A", index=[["a", "b"], ["c", "d"]])),
],
)
def test_index_false_to_json_table(self, data):
# GH 17394
# Testing index=False in to_json with orient='table'
result = data.to_json(orient="table", index=False)
result = json.loads(result)
expected = {
"schema": pd.io.json.build_table_schema(data, index=False),
"data": DataFrame(data).to_dict(orient="records"),
}
assert result == expected
@pytest.mark.parametrize("orient", ["records", "index", "columns", "values"])
def test_index_false_error_to_json(self, orient):
# GH 17394
# Testing error message from to_json with index=False
df = DataFrame([[1, 2], [4, 5]], columns=["a", "b"])
msg = "'index=False' is only valid when 'orient' is 'split' or 'table'"
with pytest.raises(ValueError, match=msg):
df.to_json(orient=orient, index=False)
@pytest.mark.parametrize("orient", ["split", "table"])
@pytest.mark.parametrize("index", [True, False])
def test_index_false_from_json_to_json(self, orient, index):
# GH25170
# Test index=False in from_json to_json
expected = DataFrame({"a": [1, 2], "b": [3, 4]})
dfjson = expected.to_json(orient=orient, index=index)
result = read_json(dfjson, orient=orient)
tm.assert_frame_equal(result, expected)
def test_read_timezone_information(self):
# GH 25546
result = read_json(
'{"2019-01-01T11:00:00.000Z":88}', typ="series", orient="index"
)
expected = Series([88], index=DatetimeIndex(["2019-01-01 11:00:00"], tz="UTC"))
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"url",
[
"s3://example-fsspec/",
"gcs://another-fsspec/file.json",
"https://example-site.com/data",
"some-protocol://data.txt",
],
)
def test_read_json_with_url_value(self, url):
# GH 36271
result = read_json(f'{{"url":{{"0":"{url}"}}}}')
expected = DataFrame({"url": [url]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"compression",
["", ".gz", ".bz2", ".tar"],
)
def test_read_json_with_very_long_file_path(self, compression):
# GH 46718
long_json_path = f'{"a" * 1000}.json{compression}'
with pytest.raises(
FileNotFoundError, match=f"File {long_json_path} does not exist"
):
# path too long for Windows is handled in file_exists() but raises in
# _get_data_from_filepath()
read_json(long_json_path)
@pytest.mark.parametrize(
"date_format,key", [("epoch", 86400000), ("iso", "P1DT0H0M0S")]
)
def test_timedelta_as_label(self, date_format, key):
df = DataFrame([[1]], columns=[pd.Timedelta("1D")])
expected = f'{{"{key}":{{"0":1}}}}'
result = df.to_json(date_format=date_format)
assert result == expected
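# The epoch key above is simply the label in milliseconds
# (pd.Timedelta("1D") is 86400 seconds == 86400000 ms), while the iso key
# uses the ISO 8601 duration spelling "P1DT0H0M0S".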
@pytest.mark.parametrize(
"orient,expected",
[
("index", "{\"('a', 'b')\":{\"('c', 'd')\":1}}"),
("columns", "{\"('c', 'd')\":{\"('a', 'b')\":1}}"),
# TODO: the below have separate encoding procedures
pytest.param(
"split",
"",
marks=pytest.mark.xfail(
reason="Produces JSON but not in a consistent manner"
),
),
pytest.param(
"table",
"",
marks=pytest.mark.xfail(
reason="Produces JSON but not in a consistent manner"
),
),
],
)
def test_tuple_labels(self, orient, expected):
# GH 20500
df = DataFrame([[1]], index=[("a", "b")], columns=[("c", "d")])
result = df.to_json(orient=orient)
assert result == expected
@pytest.mark.parametrize("indent", [1, 2, 4])
def test_to_json_indent(self, indent):
# GH 12004
df = DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["a", "b"])
result = df.to_json(indent=indent)
spaces = " " * indent
expected = f"""{{
{spaces}"a":{{
{spaces}{spaces}"0":"foo",
{spaces}{spaces}"1":"baz"
{spaces}}},
{spaces}"b":{{
{spaces}{spaces}"0":"bar",
{spaces}{spaces}"1":"qux"
{spaces}}}
}}"""
assert result == expected
@pytest.mark.parametrize(
"orient,expected",
[
(
"split",
"""{
"columns":[
"a",
"b"
],
"index":[
0,
1
],
"data":[
[
"foo",
"bar"
],
[
"baz",
"qux"
]
]
}""",
),
(
"records",
"""[
{
"a":"foo",
"b":"bar"
},
{
"a":"baz",
"b":"qux"
}
]""",
),
(
"index",
"""{
"0":{
"a":"foo",
"b":"bar"
},
"1":{
"a":"baz",
"b":"qux"
}
}""",
),
(
"columns",
"""{
"a":{
"0":"foo",
"1":"baz"
},
"b":{
"0":"bar",
"1":"qux"
}
}""",
),
(
"values",
"""[
[
"foo",
"bar"
],
[
"baz",
"qux"
]
]""",
),
(
"table",
"""{
"schema":{
"fields":[
{
"name":"index",
"type":"integer"
},
{
"name":"a",
"type":"string"
},
{
"name":"b",
"type":"string"
}
],
"primaryKey":[
"index"
],
"pandas_version":"1.4.0"
},
"data":[
{
"index":0,
"a":"foo",
"b":"bar"
},
{
"index":1,
"a":"baz",
"b":"qux"
}
]
}""",
),
],
)
def test_json_indent_all_orients(self, orient, expected):
# GH 12004
df = DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["a", "b"])
result = df.to_json(orient=orient, indent=4)
assert result == expected
def test_json_negative_indent_raises(self):
with pytest.raises(ValueError, match="must be a nonnegative integer"):
DataFrame().to_json(indent=-1)
def test_ecma_262_nan_inf_support(self):
# GH 12213
data = '["a", NaN, "NaN", Infinity, "Infinity", -Infinity, "-Infinity"]'
result = read_json(data)
expected = DataFrame(
["a", np.nan, "NaN", np.inf, "Infinity", -np.inf, "-Infinity"]
)
tm.assert_frame_equal(result, expected)
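# Strict JSON (RFC 8259) has no NaN/Infinity literals; this test relies on
# the parser accepting the ECMAScript-style bare tokens, while the quoted
# "NaN"/"Infinity" variants stay ordinary strings.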
def test_frame_int_overflow(self):
# GH 30320
encoded_json = json.dumps([{"col": "31900441201190696999"}, {"col": "Text"}])
expected = DataFrame({"col": ["31900441201190696999", "Text"]})
result = read_json(encoded_json)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"dataframe,expected",
[
(
DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]}),
'{"(0, \'x\')":1,"(0, \'y\')":"a","(1, \'x\')":2,'
'"(1, \'y\')":"b","(2, \'x\')":3,"(2, \'y\')":"c"}',
)
],
)
def test_json_multiindex(self, dataframe, expected):
series = dataframe.stack()
result = series.to_json(orient="index")
assert result == expected
@pytest.mark.single_cpu
def test_to_s3(self, s3_resource, s3so):
# GH 28375
mock_bucket_name, target_file = "pandas-test", "test.json"
df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
df.to_json(f"s3://{mock_bucket_name}/{target_file}", storage_options=s3so)
timeout = 5
while True:
if target_file in (
obj.key for obj in s3_resource.Bucket("pandas-test").objects.all()
):
break
time.sleep(0.1)
timeout -= 0.1
assert timeout > 0, "Timed out waiting for file to appear on moto"
def test_json_pandas_nulls(self, nulls_fixture, request):
# GH 31615
if isinstance(nulls_fixture, Decimal):
mark = pytest.mark.xfail(reason="not implemented")
request.node.add_marker(mark)
result = DataFrame([[nulls_fixture]]).to_json()
assert result == '{"0":{"0":null}}'
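# nulls_fixture iterates over pandas' missing-value sentinels (None, np.nan,
# pd.NaT, pd.NA, Decimal("NaN"), ...); apart from the xfailed Decimal case,
# each of them is expected to serialize to the single JSON literal null.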
def test_readjson_bool_series(self):
# GH31464
result = read_json("[true, true, false]", typ="series")
expected = Series([True, True, False])
tm.assert_series_equal(result, expected)
def test_to_json_multiindex_escape(self):
# GH 15273
df = DataFrame(
True,
index=pd.date_range("2017-01-20", "2017-01-23"),
columns=["foo", "bar"],
).stack()
result = df.to_json()
expected = (
"{\"(Timestamp('2017-01-20 00:00:00'), 'foo')\":true,"
"\"(Timestamp('2017-01-20 00:00:00'), 'bar')\":true,"
"\"(Timestamp('2017-01-21 00:00:00'), 'foo')\":true,"
"\"(Timestamp('2017-01-21 00:00:00'), 'bar')\":true,"
"\"(Timestamp('2017-01-22 00:00:00'), 'foo')\":true,"
"\"(Timestamp('2017-01-22 00:00:00'), 'bar')\":true,"
"\"(Timestamp('2017-01-23 00:00:00'), 'foo')\":true,"
"\"(Timestamp('2017-01-23 00:00:00'), 'bar')\":true}"
)
assert result == expected
def test_to_json_series_of_objects(self):
class _TestObject:
def __init__(self, a, b, _c, d) -> None:
self.a = a
self.b = b
self._c = _c
self.d = d
def e(self):
return 5
# JSON keys should be all non-callable non-underscore attributes, see GH-42768
series = Series([_TestObject(a=1, b=2, _c=3, d=4)])
assert json.loads(series.to_json()) == {"0": {"a": 1, "b": 2, "d": 4}}
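# In other words, arbitrary objects are serialized from their public,
# non-callable attributes only: _c is dropped for its leading underscore
# and e is dropped because it is a method.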
@pytest.mark.parametrize(
"data,expected",
[
(
Series({0: -6 + 8j, 1: 0 + 1j, 2: 9 - 5j}),
'{"0":{"imag":8.0,"real":-6.0},'
'"1":{"imag":1.0,"real":0.0},'
'"2":{"imag":-5.0,"real":9.0}}',
),
(
Series({0: -9.39 + 0.66j, 1: 3.95 + 9.32j, 2: 4.03 - 0.17j}),
'{"0":{"imag":0.66,"real":-9.39},'
'"1":{"imag":9.32,"real":3.95},'
'"2":{"imag":-0.17,"real":4.03}}',
),
(
DataFrame([[-2 + 3j, -1 - 0j], [4 - 3j, -0 - 10j]]),
'{"0":{"0":{"imag":3.0,"real":-2.0},'
'"1":{"imag":-3.0,"real":4.0}},'
'"1":{"0":{"imag":0.0,"real":-1.0},'
'"1":{"imag":-10.0,"real":0.0}}}',
),
(
DataFrame(
[[-0.28 + 0.34j, -1.08 - 0.39j], [0.41 - 0.34j, -0.78 - 1.35j]]
),
'{"0":{"0":{"imag":0.34,"real":-0.28},'
'"1":{"imag":-0.34,"real":0.41}},'
'"1":{"0":{"imag":-0.39,"real":-1.08},'
'"1":{"imag":-1.35,"real":-0.78}}}',
),
],
)
def test_complex_data_tojson(self, data, expected):
# GH41174
result = data.to_json()
assert result == expected
def test_json_uint64(self):
# GH21073
expected = (
'{"columns":["col1"],"index":[0,1],'
'"data":[[13342205958987758245],[12388075603347835679]]}'
)
df = DataFrame(data={"col1": [13342205958987758245, 12388075603347835679]})
result = df.to_json(orient="split")
assert result == expected
@pytest.mark.parametrize(
"orient", ["split", "records", "values", "index", "columns"]
)
def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient):
# GH#50750
pa = pytest.importorskip("pyarrow")
df = DataFrame(
{
"a": Series([1, np.nan, 3], dtype="Int64"),
"b": Series([1, 2, 3], dtype="Int64"),
"c": Series([1.5, np.nan, 2.5], dtype="Float64"),
"d": Series([1.5, 2.0, 2.5], dtype="Float64"),
"e": [True, False, None],
"f": [True, False, True],
"g": ["a", "b", "c"],
"h": ["a", "b", None],
}
)
if string_storage == "python":
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
else:
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
out = df.to_json(orient=orient)
with pd.option_context("mode.string_storage", string_storage):
result = read_json(out, dtype_backend=dtype_backend, orient=orient)
expected = DataFrame(
{
"a": Series([1, np.nan, 3], dtype="Int64"),
"b": Series([1, 2, 3], dtype="Int64"),
"c": Series([1.5, np.nan, 2.5], dtype="Float64"),
"d": Series([1.5, 2.0, 2.5], dtype="Float64"),
"e": Series([True, False, NA], dtype="boolean"),
"f": Series([True, False, True], dtype="boolean"),
"g": string_array,
"h": string_array_na,
}
)
if dtype_backend == "pyarrow":
from pandas.arrays import ArrowExtensionArray
expected = DataFrame(
{
col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
for col in expected.columns
}
)
if orient == "values":
expected.columns = list(range(0, 8))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("orient", ["split", "records", "index"])
def test_read_json_nullable_series(self, string_storage, dtype_backend, orient):
# GH#50750
pa = pytest.importorskip("pyarrow")
ser = Series([1, np.nan, 3], dtype="Int64")
out = ser.to_json(orient=orient)
with pd.option_context("mode.string_storage", string_storage):
result = read_json(
out, dtype_backend=dtype_backend, orient=orient, typ="series"
)
expected = Series([1, np.nan, 3], dtype="Int64")
if dtype_backend == "pyarrow":
from pandas.arrays import ArrowExtensionArray
expected = Series(ArrowExtensionArray(pa.array(expected, from_pandas=True)))
tm.assert_series_equal(result, expected)
def test_invalid_dtype_backend(self):
msg = (
"dtype_backend numpy is invalid, only 'numpy_nullable' and "
"'pyarrow' are allowed."
)
with pytest.raises(ValueError, match=msg):
read_json("test", dtype_backend="numpy")
def test_invalid_engine():
# GH 48893
ser = Series(range(1))
out = ser.to_json()
with pytest.raises(ValueError, match="The engine type foo"):
read_json(out, engine="foo")
def test_pyarrow_engine_lines_false():
# GH 48893
ser = Series(range(1))
out = ser.to_json()
with pytest.raises(ValueError, match="currently pyarrow engine only supports"):
read_json(out, engine="pyarrow", lines=False)