import datetime
from datetime import timedelta
from decimal import Decimal
from io import StringIO
import json
import os
import sys
import time

import numpy as np
import pytest

from pandas.compat import IS64
import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
    NA,
    DataFrame,
    DatetimeIndex,
    Series,
    Timestamp,
    read_json,
)
import pandas._testing as tm
from pandas.core.arrays import (
    ArrowStringArray,
    StringArray,
)


def assert_json_roundtrip_equal(result, expected, orient):
    if orient in ("records", "values"):
        expected = expected.reset_index(drop=True)
    if orient == "values":
        expected.columns = range(len(expected.columns))
    tm.assert_frame_equal(result, expected)


class TestPandasContainer:
    @pytest.fixture
    def categorical_frame(self):
        _seriesd = tm.getSeriesData()

        _cat_frame = DataFrame(_seriesd)

        cat = (
            ["bah"] * 5 + ["bar"] * 5 + ["baz"] * 5 + ["foo"] * (len(_cat_frame) - 15)
        )
        _cat_frame.index = pd.CategoricalIndex(cat, name="E")
        _cat_frame["E"] = list(reversed(cat))
        _cat_frame["sort"] = np.arange(len(_cat_frame), dtype="int64")
        return _cat_frame

    @pytest.fixture
    def datetime_series(self):
        # Same as usual datetime_series, but with index freq set to None,
        # since that doesn't round-trip, see GH#33711
        ser = tm.makeTimeSeries()
        ser.name = "ts"
        ser.index = ser.index._with_freq(None)
        return ser

    @pytest.fixture
    def datetime_frame(self):
        # Same as usual datetime_frame, but with index freq set to None,
        # since that doesn't round-trip, see GH#33711
        df = DataFrame(tm.getTimeSeriesData())
        df.index = df.index._with_freq(None)
        return df

    def test_frame_double_encoded_labels(self, orient):
        df = DataFrame(
            [["a", "b"], ["c", "d"]],
            index=['index " 1', "index / 2"],
            columns=["a \\ b", "y / z"],
        )

        result = read_json(df.to_json(orient=orient), orient=orient)
        expected = df.copy()

        assert_json_roundtrip_equal(result, expected, orient)

    @pytest.mark.parametrize("orient", ["split", "records", "values"])
    def test_frame_non_unique_index(self, orient):
        df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 1], columns=["x", "y"])

        result = read_json(df.to_json(orient=orient), orient=orient)
        expected = df.copy()

        assert_json_roundtrip_equal(result, expected, orient)

    @pytest.mark.parametrize("orient", ["index", "columns"])
    def test_frame_non_unique_index_raises(self, orient):
        df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 1], columns=["x", "y"])
        msg = f"DataFrame index must be unique for orient='{orient}'"
        with pytest.raises(ValueError, match=msg):
            df.to_json(orient=orient)

    @pytest.mark.parametrize("orient", ["split", "values"])
    @pytest.mark.parametrize(
        "data",
        [
            [["a", "b"], ["c", "d"]],
            [[1.5, 2.5], [3.5, 4.5]],
            [[1, 2.5], [3, 4.5]],
            [[Timestamp("20130101"), 3.5], [Timestamp("20130102"), 4.5]],
        ],
    )
    def test_frame_non_unique_columns(self, orient, data):
        df = DataFrame(data, index=[1, 2], columns=["x", "x"])

        result = read_json(
            df.to_json(orient=orient), orient=orient, convert_dates=["x"]
        )
        if orient == "values":
            expected = DataFrame(data)
            if expected.iloc[:, 0].dtype == "datetime64[ns]":
                # orient == "values" by default will write Timestamp objects out
                # in milliseconds; these are internally stored in nanosecond,
                # so divide to get where we need
                # TODO: a to_epoch method would also solve; see GH 14772
                expected.iloc[:, 0] = expected.iloc[:, 0].view(np.int64) // 1000000
        elif orient == "split":
            expected = df
            expected.columns = ["x", "x.1"]

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("orient", ["index", "columns", "records"])
    def test_frame_non_unique_columns_raises(self, orient):
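        # duplicate column labels can't be represented as JSON object keys for
        # these orients, so to_json refuses them up front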
df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 2], columns=["x", "x"]) msg = f"DataFrame columns must be unique for orient='{orient}'" with pytest.raises(ValueError, match=msg): df.to_json(orient=orient) def test_frame_default_orient(self, float_frame): assert float_frame.to_json() == float_frame.to_json(orient="columns") @pytest.mark.parametrize("dtype", [False, float]) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_simple(self, orient, convert_axes, dtype, float_frame): data = float_frame.to_json(orient=orient) result = read_json(data, orient=orient, convert_axes=convert_axes, dtype=dtype) expected = float_frame assert_json_roundtrip_equal(result, expected, orient) @pytest.mark.parametrize("dtype", [False, np.int64]) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_intframe(self, orient, convert_axes, dtype, int_frame): data = int_frame.to_json(orient=orient) result = read_json(data, orient=orient, convert_axes=convert_axes, dtype=dtype) expected = int_frame assert_json_roundtrip_equal(result, expected, orient) @pytest.mark.parametrize("dtype", [None, np.float64, int, "U3"]) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_str_axes(self, orient, convert_axes, dtype): df = DataFrame( np.zeros((200, 4)), columns=[str(i) for i in range(4)], index=[str(i) for i in range(200)], dtype=dtype, ) data = df.to_json(orient=orient) result = read_json(data, orient=orient, convert_axes=convert_axes, dtype=dtype) expected = df.copy() if not dtype: expected = expected.astype(np.int64) # index columns, and records orients cannot fully preserve the string # dtype for axes as the index and column labels are used as keys in # JSON objects. JSON keys are by definition strings, so there's no way # to disambiguate whether those keys actually were strings or numeric # beforehand and numeric wins out. 
        if convert_axes and (orient in ("index", "columns")):
            expected.columns = expected.columns.astype(np.int64)
            expected.index = expected.index.astype(np.int64)
        elif orient == "records" and convert_axes:
            expected.columns = expected.columns.astype(np.int64)
        elif convert_axes and orient == "split":
            expected.columns = expected.columns.astype(np.int64)

        assert_json_roundtrip_equal(result, expected, orient)

    @pytest.mark.parametrize("convert_axes", [True, False])
    def test_roundtrip_categorical(
        self, request, orient, categorical_frame, convert_axes
    ):
        # TODO: create a better frame to test with and improve coverage
        if orient in ("index", "columns"):
            request.node.add_marker(
                pytest.mark.xfail(
                    reason=f"Can't have duplicate index values for orient '{orient}'"
                )
            )

        data = categorical_frame.to_json(orient=orient)
        result = read_json(data, orient=orient, convert_axes=convert_axes)

        expected = categorical_frame.copy()
        expected.index = expected.index.astype(str)  # Categorical not preserved
        expected.index.name = None  # index names aren't preserved in JSON

        assert_json_roundtrip_equal(result, expected, orient)

    @pytest.mark.parametrize("convert_axes", [True, False])
    def test_roundtrip_empty(self, orient, convert_axes):
        empty_frame = DataFrame()
        data = empty_frame.to_json(orient=orient)
        result = read_json(data, orient=orient, convert_axes=convert_axes)

        if orient == "split":
            idx = pd.Index([], dtype=(float if convert_axes else object))
            expected = DataFrame(index=idx, columns=idx)
        elif orient in ["index", "columns"]:
            expected = DataFrame()
        else:
            expected = empty_frame.copy()

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("convert_axes", [True, False])
    def test_roundtrip_timestamp(self, orient, convert_axes, datetime_frame):
        # TODO: improve coverage with date_format parameter
        data = datetime_frame.to_json(orient=orient)
        result = read_json(data, orient=orient, convert_axes=convert_axes)
        expected = datetime_frame.copy()

        if not convert_axes:  # one off for ts handling
            # DTI gets converted to epoch values
            idx = expected.index.view(np.int64) // 1000000
            if orient != "split":  # TODO: handle consistently across orients
                idx = idx.astype(str)

            expected.index = idx

        assert_json_roundtrip_equal(result, expected, orient)

    @pytest.mark.parametrize("convert_axes", [True, False])
    def test_roundtrip_mixed(self, orient, convert_axes):
        index = pd.Index(["a", "b", "c", "d", "e"])
        values = {
            "A": [0.0, 1.0, 2.0, 3.0, 4.0],
            "B": [0.0, 1.0, 0.0, 1.0, 0.0],
            "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
            "D": [True, False, True, False, True],
        }

        df = DataFrame(data=values, index=index)

        data = df.to_json(orient=orient)
        result = read_json(data, orient=orient, convert_axes=convert_axes)

        expected = df.copy()
        expected = expected.assign(**expected.select_dtypes("number").astype(np.int64))

        assert_json_roundtrip_equal(result, expected, orient)

    @pytest.mark.xfail(
        reason="#50456 Column multiindex is stored and loaded differently",
        raises=AssertionError,
    )
    @pytest.mark.parametrize(
        "columns",
        [
            [["2022", "2022"], ["JAN", "FEB"]],
            [["2022", "2023"], ["JAN", "JAN"]],
            [["2022", "2022"], ["JAN", "JAN"]],
        ],
    )
    def test_roundtrip_multiindex(self, columns):
        df = DataFrame(
            [[1, 2], [3, 4]],
            columns=pd.MultiIndex.from_arrays(columns),
        )

        result = read_json(df.to_json(orient="split"), orient="split")

        tm.assert_frame_equal(result, df)

    @pytest.mark.parametrize(
        "data,msg,orient",
        [
            ('{"key":b:a:d}', "Expected object or value", "columns"),
            # too few indices
            (
                '{"columns":["A","B"],'
                '"index":["2","3"],'
                '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}',
"|".join( [ r"Length of values \(3\) does not match length of index \(2\)", ] ), "split", ), # too many columns ( '{"columns":["A","B","C"],' '"index":["1","2","3"],' '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}', "3 columns passed, passed data had 2 columns", "split", ), # bad key ( '{"badkey":["A","B"],' '"index":["2","3"],' '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}', r"unexpected key\(s\): badkey", "split", ), ], ) def test_frame_from_json_bad_data_raises(self, data, msg, orient): with pytest.raises(ValueError, match=msg): read_json(StringIO(data), orient=orient) @pytest.mark.parametrize("dtype", [True, False]) @pytest.mark.parametrize("convert_axes", [True, False]) def test_frame_from_json_missing_data(self, orient, convert_axes, dtype): num_df = DataFrame([[1, 2], [4, 5, 6]]) result = read_json( num_df.to_json(orient=orient), orient=orient, convert_axes=convert_axes, dtype=dtype, ) assert np.isnan(result.iloc[0, 2]) obj_df = DataFrame([["1", "2"], ["4", "5", "6"]]) result = read_json( obj_df.to_json(orient=orient), orient=orient, convert_axes=convert_axes, dtype=dtype, ) assert np.isnan(result.iloc[0, 2]) @pytest.mark.parametrize("dtype", [True, False]) def test_frame_read_json_dtype_missing_value(self, dtype): # GH28501 Parse missing values using read_json with dtype=False # to NaN instead of None result = read_json("[null]", dtype=dtype) expected = DataFrame([np.nan]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("inf", [np.inf, np.NINF]) @pytest.mark.parametrize("dtype", [True, False]) def test_frame_infinity(self, inf, dtype): # infinities get mapped to nulls which get mapped to NaNs during # deserialisation df = DataFrame([[1, 2], [4, 5, 6]]) df.loc[0, 2] = inf result = read_json(df.to_json(), dtype=dtype) assert np.isnan(result.iloc[0, 2]) @pytest.mark.skipif(not IS64, reason="not compliant on 32-bit, xref #15865") @pytest.mark.parametrize( "value,precision,expected_val", [ (0.95, 1, 1.0), (1.95, 1, 2.0), (-1.95, 1, -2.0), (0.995, 2, 1.0), (0.9995, 3, 1.0), (0.99999999999999944, 15, 1.0), ], ) def test_frame_to_json_float_precision(self, value, precision, expected_val): df = DataFrame([{"a_float": value}]) encoded = df.to_json(double_precision=precision) assert encoded == f'{{"a_float":{{"0":{expected_val}}}}}' def test_frame_to_json_except(self): df = DataFrame([1, 2, 3]) msg = "Invalid value 'garbage' for option 'orient'" with pytest.raises(ValueError, match=msg): df.to_json(orient="garbage") def test_frame_empty(self): df = DataFrame(columns=["jim", "joe"]) assert not df._is_mixed_type tm.assert_frame_equal( read_json(df.to_json(), dtype=dict(df.dtypes)), df, check_index_type=False ) # GH 7445 result = DataFrame({"test": []}, index=[]).to_json(orient="columns") expected = '{"test":{}}' assert result == expected def test_frame_empty_mixedtype(self): # mixed type df = DataFrame(columns=["jim", "joe"]) df["joe"] = df["joe"].astype("i8") assert df._is_mixed_type tm.assert_frame_equal( read_json(df.to_json(), dtype=dict(df.dtypes)), df, check_index_type=False ) def test_frame_mixedtype_orient(self): # GH10289 vals = [ [10, 1, "foo", 0.1, 0.01], [20, 2, "bar", 0.2, 0.02], [30, 3, "baz", 0.3, 0.03], [40, 4, "qux", 0.4, 0.04], ] df = DataFrame( vals, index=list("abcd"), columns=["1st", "2nd", "3rd", "4th", "5th"] ) assert df._is_mixed_type right = df.copy() for orient in ["split", "index", "columns"]: inp = df.to_json(orient=orient) left = read_json(inp, orient=orient, convert_axes=False) tm.assert_frame_equal(left, right) right.index = pd.RangeIndex(len(df)) inp = 
df.to_json(orient="records") left = read_json(inp, orient="records", convert_axes=False) tm.assert_frame_equal(left, right) right.columns = pd.RangeIndex(df.shape[1]) inp = df.to_json(orient="values") left = read_json(inp, orient="values", convert_axes=False) tm.assert_frame_equal(left, right) def test_v12_compat(self, datapath): dti = pd.date_range("2000-01-03", "2000-01-07") # freq doesn't roundtrip dti = DatetimeIndex(np.asarray(dti), freq=None) df = DataFrame( [ [1.56808523, 0.65727391, 1.81021139, -0.17251653], [-0.2550111, -0.08072427, -0.03202878, -0.17581665], [1.51493992, 0.11805825, 1.629455, -1.31506612], [-0.02765498, 0.44679743, 0.33192641, -0.27885413], [0.05951614, -2.69652057, 1.28163262, 0.34703478], ], columns=["A", "B", "C", "D"], index=dti, ) df["date"] = Timestamp("19920106 18:21:32.12") df.iloc[3, df.columns.get_loc("date")] = Timestamp("20130101") df["modified"] = df["date"] df.iloc[1, df.columns.get_loc("modified")] = pd.NaT dirpath = datapath("io", "json", "data") v12_json = os.path.join(dirpath, "tsframe_v012.json") df_unser = read_json(v12_json) tm.assert_frame_equal(df, df_unser) df_iso = df.drop(["modified"], axis=1) v12_iso_json = os.path.join(dirpath, "tsframe_iso_v012.json") df_unser_iso = read_json(v12_iso_json) tm.assert_frame_equal(df_iso, df_unser_iso) def test_blocks_compat_GH9037(self): index = pd.date_range("20000101", periods=10, freq="H") # freq doesn't round-trip index = DatetimeIndex(list(index), freq=None) df_mixed = DataFrame( { "float_1": [ -0.92077639, 0.77434435, 1.25234727, 0.61485564, -0.60316077, 0.24653374, 0.28668979, -2.51969012, 0.95748401, -1.02970536, ], "int_1": [ 19680418, 75337055, 99973684, 65103179, 79373900, 40314334, 21290235, 4991321, 41903419, 16008365, ], "str_1": [ "78c608f1", "64a99743", "13d2ff52", "ca7f4af2", "97236474", "bde7e214", "1a6bde47", "b1190be5", "7a669144", "8d64d068", ], "float_2": [ -0.0428278, -1.80872357, 3.36042349, -0.7573685, -0.48217572, 0.86229683, 1.08935819, 0.93898739, -0.03030452, 1.43366348, ], "str_2": [ "14f04af9", "d085da90", "4bcfac83", "81504caf", "2ffef4a9", "08e2f5c4", "07e1af03", "addbd4a7", "1f6a09ba", "4bfc4d87", ], "int_2": [ 86967717, 98098830, 51927505, 20372254, 12601730, 20884027, 34193846, 10561746, 24867120, 76131025, ], }, index=index, ) # JSON deserialisation always creates unicode strings df_mixed.columns = df_mixed.columns.astype("unicode") df_roundtrip = read_json(df_mixed.to_json(orient="split"), orient="split") tm.assert_frame_equal( df_mixed, df_roundtrip, check_index_type=True, check_column_type=True, by_blocks=True, check_exact=True, ) def test_frame_nonprintable_bytes(self): # GH14256: failing column caused segfaults, if it is not the last one class BinaryThing: def __init__(self, hexed) -> None: self.hexed = hexed self.binary = bytes.fromhex(hexed) def __str__(self) -> str: return self.hexed hexed = "574b4454ba8c5eb4f98a8f45" binthing = BinaryThing(hexed) # verify the proper conversion of printable content df_printable = DataFrame({"A": [binthing.hexed]}) assert df_printable.to_json() == f'{{"A":{{"0":"{hexed}"}}}}' # check if non-printable content throws appropriate Exception df_nonprintable = DataFrame({"A": [binthing]}) msg = "Unsupported UTF-8 sequence length when encoding string" with pytest.raises(OverflowError, match=msg): df_nonprintable.to_json() # the same with multiple columns threw segfaults df_mixed = DataFrame({"A": [binthing], "B": [1]}, columns=["A", "B"]) with pytest.raises(OverflowError, match=msg): df_mixed.to_json() # default_handler should 
        # default_handler should resolve exceptions for non-string types
        result = df_nonprintable.to_json(default_handler=str)
        expected = f'{{"A":{{"0":"{hexed}"}}}}'
        assert result == expected
        assert (
            df_mixed.to_json(default_handler=str)
            == f'{{"A":{{"0":"{hexed}"}},"B":{{"0":1}}}}'
        )

    def test_label_overflow(self):
        # GH14256: buffer length not checked when writing label
        result = DataFrame({"bar" * 100000: [1], "foo": [1337]}).to_json()
        expected = f'{{"{"bar" * 100000}":{{"0":1}},"foo":{{"0":1337}}}}'
        assert result == expected

    def test_series_non_unique_index(self):
        s = Series(["a", "b"], index=[1, 1])

        msg = "Series index must be unique for orient='index'"
        with pytest.raises(ValueError, match=msg):
            s.to_json(orient="index")

        tm.assert_series_equal(
            s, read_json(s.to_json(orient="split"), orient="split", typ="series")
        )
        unserialized = read_json(
            s.to_json(orient="records"), orient="records", typ="series"
        )
        tm.assert_numpy_array_equal(s.values, unserialized.values)

    def test_series_default_orient(self, string_series):
        assert string_series.to_json() == string_series.to_json(orient="index")

    def test_series_roundtrip_simple(self, orient, string_series):
        data = string_series.to_json(orient=orient)
        result = read_json(data, typ="series", orient=orient)

        expected = string_series
        if orient in ("values", "records"):
            expected = expected.reset_index(drop=True)
        if orient != "split":
            expected.name = None

        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("dtype", [False, None])
    def test_series_roundtrip_object(self, orient, dtype, object_series):
        data = object_series.to_json(orient=orient)
        result = read_json(data, typ="series", orient=orient, dtype=dtype)

        expected = object_series
        if orient in ("values", "records"):
            expected = expected.reset_index(drop=True)
        if orient != "split":
            expected.name = None

        tm.assert_series_equal(result, expected)

    def test_series_roundtrip_empty(self, orient):
        empty_series = Series([], index=[], dtype=np.float64)
        data = empty_series.to_json(orient=orient)
        result = read_json(data, typ="series", orient=orient)

        expected = empty_series.reset_index(drop=True)
        if orient == "split":
            expected.index = expected.index.astype(np.float64)

        tm.assert_series_equal(result, expected)

    def test_series_roundtrip_timeseries(self, orient, datetime_series):
        data = datetime_series.to_json(orient=orient)
        result = read_json(data, typ="series", orient=orient)

        expected = datetime_series
        if orient in ("values", "records"):
            expected = expected.reset_index(drop=True)
        if orient != "split":
            expected.name = None

        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("dtype", [np.float64, int])
    def test_series_roundtrip_numeric(self, orient, dtype):
        s = Series(range(6), index=["a", "b", "c", "d", "e", "f"])
        data = s.to_json(orient=orient)
        result = read_json(data, typ="series", orient=orient)

        expected = s.copy()
        if orient in ("values", "records"):
            expected = expected.reset_index(drop=True)

        tm.assert_series_equal(result, expected)

    def test_series_to_json_except(self):
        s = Series([1, 2, 3])
        msg = "Invalid value 'garbage' for option 'orient'"
        with pytest.raises(ValueError, match=msg):
            s.to_json(orient="garbage")

    def test_series_from_json_precise_float(self):
        s = Series([4.56, 4.56, 4.56])
        result = read_json(s.to_json(), typ="series", precise_float=True)
        tm.assert_series_equal(result, s, check_index_type=False)

    def test_series_with_dtype(self):
        # GH 21986
        s = Series([4.56, 4.56, 4.56])
        result = read_json(s.to_json(), typ="series", dtype=np.int64)
        expected = Series([4] * 3)
        tm.assert_series_equal(result, expected)
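
    # with dtype=False the epoch milliseconds written by to_json are kept as
    # integers rather than parsed back to datetime64, per the False case below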
    @pytest.mark.parametrize(
        "dtype,expected",
        [
            (True, Series(["2000-01-01"], dtype="datetime64[ns]")),
            (False, Series([946684800000])),
        ],
    )
    def test_series_with_dtype_datetime(self, dtype, expected):
        s = Series(["2000-01-01"], dtype="datetime64[ns]")
        data = s.to_json()
        result = read_json(data, typ="series", dtype=dtype)
        tm.assert_series_equal(result, expected)

    def test_frame_from_json_precise_float(self):
        df = DataFrame([[4.56, 4.56, 4.56], [4.56, 4.56, 4.56]])
        result = read_json(df.to_json(), precise_float=True)
        tm.assert_frame_equal(result, df)

    def test_typ(self):
        s = Series(range(6), index=["a", "b", "c", "d", "e", "f"], dtype="int64")
        result = read_json(s.to_json(), typ=None)
        tm.assert_series_equal(result, s)

    def test_reconstruction_index(self):
        df = DataFrame([[1, 2, 3], [4, 5, 6]])
        result = read_json(df.to_json())
        tm.assert_frame_equal(result, df)

        df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=["A", "B", "C"])
        result = read_json(df.to_json())
        tm.assert_frame_equal(result, df)

    def test_path(self, float_frame, int_frame, datetime_frame):
        with tm.ensure_clean("test.json") as path:
            for df in [float_frame, int_frame, datetime_frame]:
                df.to_json(path)
                read_json(path)

    def test_axis_dates(self, datetime_series, datetime_frame):
        # frame
        json = datetime_frame.to_json()
        result = read_json(json)
        tm.assert_frame_equal(result, datetime_frame)

        # series
        json = datetime_series.to_json()
        result = read_json(json, typ="series")
        tm.assert_series_equal(result, datetime_series, check_names=False)
        assert result.name is None

    def test_convert_dates(self, datetime_series, datetime_frame):
        # frame
        df = datetime_frame
        df["date"] = Timestamp("20130101")

        json = df.to_json()
        result = read_json(json)
        tm.assert_frame_equal(result, df)

        df["foo"] = 1.0
        json = df.to_json(date_unit="ns")

        result = read_json(json, convert_dates=False)
        expected = df.copy()
        expected["date"] = expected["date"].values.view("i8")
        expected["foo"] = expected["foo"].astype("int64")
        tm.assert_frame_equal(result, expected)

        # series
        ts = Series(Timestamp("20130101"), index=datetime_series.index)
        json = ts.to_json()
        result = read_json(json, typ="series")
        tm.assert_series_equal(result, ts)

    @pytest.mark.parametrize("date_format", ["epoch", "iso"])
    @pytest.mark.parametrize("as_object", [True, False])
    @pytest.mark.parametrize("date_typ", [datetime.date, datetime.datetime, Timestamp])
    def test_date_index_and_values(self, date_format, as_object, date_typ):
        data = [date_typ(year=2020, month=1, day=1), pd.NaT]
        if as_object:
            data.append("a")

        ser = Series(data, index=data)
        result = ser.to_json(date_format=date_format)

        if date_format == "epoch":
            expected = '{"1577836800000":1577836800000,"null":null}'
        else:
            expected = (
                '{"2020-01-01T00:00:00.000":"2020-01-01T00:00:00.000","null":null}'
            )

        if as_object:
            expected = expected.replace("}", ',"a":"a"}')

        assert result == expected

    @pytest.mark.parametrize(
        "infer_word",
        [
            "trade_time",
            "date",
            "datetime",
            "sold_at",
            "modified",
            "timestamp",
            "timestamps",
        ],
    )
    def test_convert_dates_infer(self, infer_word):
        # GH10747
        from pandas.io.json import dumps

        data = [{"id": 1, infer_word: 1036713600000}, {"id": 2}]
        expected = DataFrame(
            [[1, Timestamp("2002-11-08")], [2, pd.NaT]], columns=["id", infer_word]
        )
        result = read_json(dumps(data))[["id", infer_word]]
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "date,date_unit",
        [
            ("20130101 20:43:42.123", None),
            ("20130101 20:43:42", "s"),
            ("20130101 20:43:42.123", "ms"),
            ("20130101 20:43:42.123456", "us"),
            ("20130101 20:43:42.123456789", "ns"),
        ],
    )
    def test_date_format_frame(self, date, date_unit, datetime_frame):
        df = datetime_frame

        df["date"] = Timestamp(date)
        df.iloc[1, df.columns.get_loc("date")] = pd.NaT
        df.iloc[5, df.columns.get_loc("date")] = pd.NaT
        if date_unit:
            json = df.to_json(date_format="iso", date_unit=date_unit)
        else:
            json = df.to_json(date_format="iso")

        result = read_json(json)
        expected = df.copy()
        tm.assert_frame_equal(result, expected)

    def test_date_format_frame_raises(self, datetime_frame):
        df = datetime_frame
        msg = "Invalid value 'foo' for option 'date_unit'"
        with pytest.raises(ValueError, match=msg):
            df.to_json(date_format="iso", date_unit="foo")

    @pytest.mark.parametrize(
        "date,date_unit",
        [
            ("20130101 20:43:42.123", None),
            ("20130101 20:43:42", "s"),
            ("20130101 20:43:42.123", "ms"),
            ("20130101 20:43:42.123456", "us"),
            ("20130101 20:43:42.123456789", "ns"),
        ],
    )
    def test_date_format_series(self, date, date_unit, datetime_series):
        ts = Series(Timestamp(date), index=datetime_series.index)
        ts.iloc[1] = pd.NaT
        ts.iloc[5] = pd.NaT
        if date_unit:
            json = ts.to_json(date_format="iso", date_unit=date_unit)
        else:
            json = ts.to_json(date_format="iso")

        result = read_json(json, typ="series")
        expected = ts.copy()
        tm.assert_series_equal(result, expected)

    def test_date_format_series_raises(self, datetime_series):
        ts = Series(Timestamp("20130101 20:43:42.123"), index=datetime_series.index)
        msg = "Invalid value 'foo' for option 'date_unit'"
        with pytest.raises(ValueError, match=msg):
            ts.to_json(date_format="iso", date_unit="foo")

    @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"])
    def test_date_unit(self, unit, datetime_frame):
        df = datetime_frame
        df["date"] = Timestamp("20130101 20:43:42")
        dl = df.columns.get_loc("date")
        df.iloc[1, dl] = Timestamp("19710101 20:43:42")
        df.iloc[2, dl] = Timestamp("21460101 20:43:42")
        df.iloc[4, dl] = pd.NaT

        json = df.to_json(date_format="epoch", date_unit=unit)

        # force date unit
        result = read_json(json, date_unit=unit)
        tm.assert_frame_equal(result, df)

        # detect date unit
        result = read_json(json, date_unit=None)
        tm.assert_frame_equal(result, df)

    def test_weird_nested_json(self):
        # this used to core dump the parser
        s = r"""{
        "status": "success",
        "data": {
            "posts": [
                {
                    "id": 1,
                    "title": "A blog post",
                    "body": "Some useful content"
                },
                {
                    "id": 2,
                    "title": "Another blog post",
                    "body": "More content"
                }
            ]
        }
        }"""
        read_json(s)

    def test_doc_example(self):
        dfj2 = DataFrame(np.random.randn(5, 2), columns=list("AB"))
        dfj2["date"] = Timestamp("20130101")
        dfj2["ints"] = range(5)
        dfj2["bools"] = True
        # index freq doesn't round-trip, see GH#33711
        dfj2.index = pd.date_range("20130101", periods=5)._with_freq(None)

        json = dfj2.to_json()
        result = read_json(json, dtype={"ints": np.int64, "bools": np.bool_})
        # compare against the original frame rather than result itself
        tm.assert_frame_equal(result, dfj2)

    def test_round_trip_exception(self, datapath):
        # GH 3867
        path = datapath("io", "json", "data", "teams.csv")
        df = pd.read_csv(path)
        s = df.to_json()
        result = read_json(s)
        tm.assert_frame_equal(result.reindex(index=df.index, columns=df.columns), df)

    @pytest.mark.network
    @tm.network(
        url="https://api.github.com/repos/pandas-dev/pandas/issues?per_page=5",
        check_before_test=True,
    )
    @pytest.mark.parametrize(
        "field,dtype",
        [
            ["created_at", pd.DatetimeTZDtype(tz="UTC")],
            ["closed_at", "datetime64[ns]"],
            ["updated_at", pd.DatetimeTZDtype(tz="UTC")],
        ],
    )
    def test_url(self, field, dtype):
        url = "https://api.github.com/repos/pandas-dev/pandas/issues?per_page=5"
        result = read_json(url, convert_dates=True)
        assert result[field].dtype == dtype

    def test_timedelta(self):
        converter = lambda x: pd.to_timedelta(x, unit="ms")
        ser = Series([timedelta(23), timedelta(seconds=5)])
        assert ser.dtype == "timedelta64[ns]"

        result = read_json(ser.to_json(), typ="series").apply(converter)
        tm.assert_series_equal(result, ser)

        ser = Series([timedelta(23), timedelta(seconds=5)], index=pd.Index([0, 1]))
        assert ser.dtype == "timedelta64[ns]"
        result = read_json(ser.to_json(), typ="series").apply(converter)
        tm.assert_series_equal(result, ser)

        frame = DataFrame([timedelta(23), timedelta(seconds=5)])
        assert frame[0].dtype == "timedelta64[ns]"
        tm.assert_frame_equal(frame, read_json(frame.to_json()).apply(converter))

    def test_timedelta2(self):
        frame = DataFrame(
            {
                "a": [timedelta(days=23), timedelta(seconds=5)],
                "b": [1, 2],
                "c": pd.date_range(start="20130101", periods=2),
            }
        )

        result = read_json(frame.to_json(date_unit="ns"))
        result["a"] = pd.to_timedelta(result.a, unit="ns")
        result["c"] = pd.to_datetime(result.c)
        tm.assert_frame_equal(frame, result)

    def test_mixed_timedelta_datetime(self):
        td = timedelta(23)
        ts = Timestamp("20130101")

        frame = DataFrame({"a": [td, ts]}, dtype=object)
        expected = DataFrame(
            {"a": [pd.Timedelta(td).as_unit("ns")._value, ts.as_unit("ns")._value]}
        )
        result = read_json(frame.to_json(date_unit="ns"), dtype={"a": "int64"})
        tm.assert_frame_equal(result, expected, check_index_type=False)

    @pytest.mark.parametrize("as_object", [True, False])
    @pytest.mark.parametrize("date_format", ["iso", "epoch"])
    @pytest.mark.parametrize("timedelta_typ", [pd.Timedelta, timedelta])
    def test_timedelta_to_json(self, as_object, date_format, timedelta_typ):
        # GH28156: to_json not correctly formatting Timedelta
        data = [timedelta_typ(days=1), timedelta_typ(days=2), pd.NaT]
        if as_object:
            data.append("a")

        ser = Series(data, index=data)
        if date_format == "iso":
            expected = (
                '{"P1DT0H0M0S":"P1DT0H0M0S","P2DT0H0M0S":"P2DT0H0M0S","null":null}'
            )
        else:
            expected = '{"86400000":86400000,"172800000":172800000,"null":null}'

        if as_object:
            expected = expected.replace("}", ',"a":"a"}')

        result = ser.to_json(date_format=date_format)
        assert result == expected

    def test_default_handler(self):
        value = object()
        frame = DataFrame({"a": [7, value]})
        expected = DataFrame({"a": [7, str(value)]})
        result = read_json(frame.to_json(default_handler=str))
        tm.assert_frame_equal(expected, result, check_index_type=False)

    def test_default_handler_indirect(self):
        from pandas.io.json import dumps

        def default(obj):
            if isinstance(obj, complex):
                return [("mathjs", "Complex"), ("re", obj.real), ("im", obj.imag)]
            return str(obj)

        df_list = [
            9,
            DataFrame(
                {"a": [1, "STR", complex(4, -5)], "b": [float("nan"), None, "N/A"]},
                columns=["a", "b"],
            ),
        ]
        expected = (
            '[9,[[1,null],["STR",null],[[["mathjs","Complex"],'
            '["re",4.0],["im",-5.0]],"N\\/A"]]]'
        )
        assert dumps(df_list, default_handler=default, orient="values") == expected

    def test_default_handler_numpy_unsupported_dtype(self):
        # GH12554 to_json raises 'Unhandled numpy dtype 15'
        df = DataFrame(
            {"a": [1, 2.3, complex(4, -5)], "b": [float("nan"), None, complex(1.2, 0)]},
            columns=["a", "b"],
        )
        expected = (
            '[["(1+0j)","(nan+0j)"],'
            '["(2.3+0j)","(nan+0j)"],'
            '["(4-5j)","(1.2+0j)"]]'
        )
        assert df.to_json(default_handler=str, orient="values") == expected

    def test_default_handler_raises(self):
        msg = "raisin"

        def my_handler_raises(obj):
            raise TypeError(msg)

        with pytest.raises(TypeError, match=msg):
            DataFrame({"a": [1, 2, object()]}).to_json(
                default_handler=my_handler_raises
            )
        with pytest.raises(TypeError, match=msg):
            DataFrame({"a": [1, 2, complex(4, -5)]}).to_json(
                default_handler=my_handler_raises
            )

    def test_categorical(self):
        # GH4377 df.to_json segfaults with non-ndarray blocks
        df = DataFrame({"A": ["a", "b", "c", "a", "b", "b", "a"]})
        df["B"] = df["A"]
        expected = df.to_json()

        df["B"] = df["A"].astype("category")
        assert expected == df.to_json()

        s = df["A"]
        sc = df["B"]
        assert s.to_json() == sc.to_json()

    def test_datetime_tz(self):
        # GH4377 df.to_json segfaults with non-ndarray blocks
        tz_range = pd.date_range("20130101", periods=3, tz="US/Eastern")
        tz_naive = tz_range.tz_convert("utc").tz_localize(None)

        df = DataFrame({"A": tz_range, "B": pd.date_range("20130101", periods=3)})

        df_naive = df.copy()
        df_naive["A"] = tz_naive
        expected = df_naive.to_json()
        assert expected == df.to_json()

        stz = Series(tz_range)
        s_naive = Series(tz_naive)
        assert stz.to_json() == s_naive.to_json()

    def test_sparse(self):
        # GH4377 df.to_json segfaults with non-ndarray blocks
        df = DataFrame(np.random.randn(10, 4))
        df.loc[:8] = np.nan
        sdf = df.astype("Sparse")

        expected = df.to_json()
        assert expected == sdf.to_json()

        s = Series(np.random.randn(10))
        s.loc[:8] = np.nan
        ss = s.astype("Sparse")

        expected = s.to_json()
        assert expected == ss.to_json()

    @pytest.mark.parametrize(
        "ts",
        [
            Timestamp("2013-01-10 05:00:00Z"),
            Timestamp("2013-01-10 00:00:00", tz="US/Eastern"),
            Timestamp("2013-01-10 00:00:00-0500"),
        ],
    )
    def test_tz_is_utc(self, ts):
        from pandas.io.json import dumps

        exp = '"2013-01-10T05:00:00.000Z"'

        assert dumps(ts, iso_dates=True) == exp
        dt = ts.to_pydatetime()
        assert dumps(dt, iso_dates=True) == exp

    def test_tz_is_naive(self):
        from pandas.io.json import dumps

        ts = Timestamp("2013-01-10 05:00:00")
        exp = '"2013-01-10T05:00:00.000"'

        assert dumps(ts, iso_dates=True) == exp
        dt = ts.to_pydatetime()
        assert dumps(dt, iso_dates=True) == exp

    @pytest.mark.parametrize(
        "tz_range",
        [
            pd.date_range("2013-01-01 05:00:00Z", periods=2),
            pd.date_range("2013-01-01 00:00:00", periods=2, tz="US/Eastern"),
            pd.date_range("2013-01-01 00:00:00-0500", periods=2),
        ],
    )
    def test_tz_range_is_utc(self, tz_range):
        from pandas.io.json import dumps

        exp = '["2013-01-01T05:00:00.000Z","2013-01-02T05:00:00.000Z"]'
        dfexp = (
            '{"DT":{'
            '"0":"2013-01-01T05:00:00.000Z",'
            '"1":"2013-01-02T05:00:00.000Z"}}'
        )

        assert dumps(tz_range, iso_dates=True) == exp
        dti = DatetimeIndex(tz_range)
        # Ensure datetimes in object array are serialized correctly
        # in addition to the normal DTI case
        assert dumps(dti, iso_dates=True) == exp
        assert dumps(dti.astype(object), iso_dates=True) == exp
        df = DataFrame({"DT": dti})
        result = dumps(df, iso_dates=True)
        assert result == dfexp
        assert dumps(df.astype({"DT": object}), iso_dates=True)

    def test_tz_range_is_naive(self):
        from pandas.io.json import dumps

        dti = pd.date_range("2013-01-01 05:00:00", periods=2)

        exp = '["2013-01-01T05:00:00.000","2013-01-02T05:00:00.000"]'
        dfexp = '{"DT":{"0":"2013-01-01T05:00:00.000","1":"2013-01-02T05:00:00.000"}}'

        # Ensure datetimes in object array are serialized correctly
        # in addition to the normal DTI case
        assert dumps(dti, iso_dates=True) == exp
        assert dumps(dti.astype(object), iso_dates=True) == exp
        df = DataFrame({"DT": dti})
        result = dumps(df, iso_dates=True)
        assert result == dfexp
        assert dumps(df.astype({"DT": object}), iso_dates=True)

    def test_read_inline_jsonl(self):
        # GH9180
        result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
        expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.single_cpu
    @td.skip_if_not_us_locale
    def test_read_s3_jsonl(self, s3_resource, s3so):
        # GH17200

        result = read_json(
            "s3n://pandas-test/items.jsonl", lines=True, storage_options=s3so
        )
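        # the items.jsonl object in the mock bucket is expected to hold the
        # same two records as the inline JSONL test above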
        expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
        tm.assert_frame_equal(result, expected)

    def test_read_local_jsonl(self):
        # GH17200
        with tm.ensure_clean("tmp_items.json") as path:
            with open(path, "w") as infile:
                infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n')
            result = read_json(path, lines=True)
            expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
            tm.assert_frame_equal(result, expected)

    def test_read_jsonl_unicode_chars(self):
        # GH15132: non-ascii unicode characters
        # \u201d == RIGHT DOUBLE QUOTATION MARK

        # simulate file handle
        json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
        json = StringIO(json)
        result = read_json(json, lines=True)
        expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
        tm.assert_frame_equal(result, expected)

        # simulate string
        json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
        result = read_json(json, lines=True)
        expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)])
    def test_to_json_large_numbers(self, bigNum):
        # GH34473
        series = Series(bigNum, dtype=object, index=["articleId"])
        json = series.to_json()
        expected = '{"articleId":' + str(bigNum) + "}"
        assert json == expected

        df = DataFrame(bigNum, dtype=object, index=["articleId"], columns=[0])
        json = df.to_json()
        expected = '{"0":{"articleId":' + str(bigNum) + "}}"
        assert json == expected

    @pytest.mark.parametrize("bigNum", [-(2**63) - 1, 2**64])
    def test_read_json_large_numbers(self, bigNum):
        # GH20599, 26068
        json = StringIO('{"articleId":' + str(bigNum) + "}")
        msg = r"Value is too small|Value is too big"
        with pytest.raises(ValueError, match=msg):
            read_json(json)

        json = StringIO('{"0":{"articleId":' + str(bigNum) + "}}")
        with pytest.raises(ValueError, match=msg):
            read_json(json)

    def test_read_json_large_numbers2(self):
        # GH18842
        json = '{"articleId": "1404366058080022500245"}'
        json = StringIO(json)
        result = read_json(json, typ="series")
        expected = Series(1.404366e21, index=["articleId"])
        tm.assert_series_equal(result, expected)

        json = '{"0": {"articleId": "1404366058080022500245"}}'
        json = StringIO(json)
        result = read_json(json)
        expected = DataFrame(1.404366e21, index=["articleId"], columns=[0])
        tm.assert_frame_equal(result, expected)

    def test_to_jsonl(self):
        # GH9180
        df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
        result = df.to_json(orient="records", lines=True)
        expected = '{"a":1,"b":2}\n{"a":1,"b":2}\n'
        assert result == expected

        df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=["a", "b"])
        result = df.to_json(orient="records", lines=True)
        expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n'
        assert result == expected
        tm.assert_frame_equal(read_json(result, lines=True), df)

        # GH15096: escaped characters in columns and data
        df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"])
        result = df.to_json(orient="records", lines=True)
        expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n'
        assert result == expected
        tm.assert_frame_equal(read_json(result, lines=True), df)

    # TODO: there is a near-identical test for pytables; can we share?
@pytest.mark.xfail(reason="GH#13774 encoding kwarg not supported", raises=TypeError) def test_latin_encoding(self): # GH 13774 values = [ [b"E\xc9, 17", b"", b"a", b"b", b"c"], [b"E\xc9, 17", b"a", b"b", b"c"], [b"EE, 17", b"", b"a", b"b", b"c"], [b"E\xc9, 17", b"\xf8\xfc", b"a", b"b", b"c"], [b"", b"a", b"b", b"c"], [b"\xf8\xfc", b"a", b"b", b"c"], [b"A\xf8\xfc", b"", b"a", b"b", b"c"], [np.nan, b"", b"b", b"c"], [b"A\xf8\xfc", np.nan, b"", b"b", b"c"], ] values = [ [x.decode("latin-1") if isinstance(x, bytes) else x for x in y] for y in values ] examples = [] for dtype in ["category", object]: for val in values: examples.append(Series(val, dtype=dtype)) def roundtrip(s, encoding="latin-1"): with tm.ensure_clean("test.json") as path: s.to_json(path, encoding=encoding) retr = read_json(path, encoding=encoding) tm.assert_series_equal(s, retr, check_categorical=False) for s in examples: roundtrip(s) def test_data_frame_size_after_to_json(self): # GH15344 df = DataFrame({"a": [str(1)]}) size_before = df.memory_usage(index=True, deep=True).sum() df.to_json() size_after = df.memory_usage(index=True, deep=True).sum() assert size_before == size_after @pytest.mark.parametrize( "index", [None, [1, 2], [1.0, 2.0], ["a", "b"], ["1", "2"], ["1.", "2."]] ) @pytest.mark.parametrize("columns", [["a", "b"], ["1", "2"], ["1.", "2."]]) def test_from_json_to_json_table_index_and_columns(self, index, columns): # GH25433 GH25435 expected = DataFrame([[1, 2], [3, 4]], index=index, columns=columns) dfjson = expected.to_json(orient="table") result = read_json(dfjson, orient="table") tm.assert_frame_equal(result, expected) def test_from_json_to_json_table_dtypes(self): # GH21345 expected = DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]}) dfjson = expected.to_json(orient="table") result = read_json(dfjson, orient="table") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("orient", ["split", "records", "index", "columns"]) def test_to_json_from_json_columns_dtypes(self, orient): # GH21892 GH33205 expected = DataFrame.from_dict( { "Integer": Series([1, 2, 3], dtype="int64"), "Float": Series([None, 2.0, 3.0], dtype="float64"), "Object": Series([None, "", "c"], dtype="object"), "Bool": Series([True, False, True], dtype="bool"), "Category": Series(["a", "b", None], dtype="category"), "Datetime": Series( ["2020-01-01", None, "2020-01-03"], dtype="datetime64[ns]" ), } ) dfjson = expected.to_json(orient=orient) result = read_json( dfjson, orient=orient, dtype={ "Integer": "int64", "Float": "float64", "Object": "object", "Bool": "bool", "Category": "category", "Datetime": "datetime64[ns]", }, ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", [True, {"b": int, "c": int}]) def test_read_json_table_dtype_raises(self, dtype): # GH21345 df = DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]}) dfjson = df.to_json(orient="table") msg = "cannot pass both dtype and orient='table'" with pytest.raises(ValueError, match=msg): read_json(dfjson, orient="table", dtype=dtype) def test_read_json_table_convert_axes_raises(self): # GH25433 GH25435 df = DataFrame([[1, 2], [3, 4]], index=[1.0, 2.0], columns=["1.", "2."]) dfjson = df.to_json(orient="table") msg = "cannot pass both convert_axes and orient='table'" with pytest.raises(ValueError, match=msg): read_json(dfjson, orient="table", convert_axes=True) @pytest.mark.parametrize( "data, expected", [ ( DataFrame([[1, 2], [4, 5]], columns=["a", "b"]), {"columns": ["a", "b"], "data": [[1, 2], [4, 5]]}, ), ( DataFrame([[1, 2], [4, 5]], 
columns=["a", "b"]).rename_axis("foo"), {"columns": ["a", "b"], "data": [[1, 2], [4, 5]]}, ), ( DataFrame( [[1, 2], [4, 5]], columns=["a", "b"], index=[["a", "b"], ["c", "d"]] ), {"columns": ["a", "b"], "data": [[1, 2], [4, 5]]}, ), (Series([1, 2, 3], name="A"), {"name": "A", "data": [1, 2, 3]}), ( Series([1, 2, 3], name="A").rename_axis("foo"), {"name": "A", "data": [1, 2, 3]}, ), ( Series([1, 2], name="A", index=[["a", "b"], ["c", "d"]]), {"name": "A", "data": [1, 2]}, ), ], ) def test_index_false_to_json_split(self, data, expected): # GH 17394 # Testing index=False in to_json with orient='split' result = data.to_json(orient="split", index=False) result = json.loads(result) assert result == expected @pytest.mark.parametrize( "data", [ (DataFrame([[1, 2], [4, 5]], columns=["a", "b"])), (DataFrame([[1, 2], [4, 5]], columns=["a", "b"]).rename_axis("foo")), ( DataFrame( [[1, 2], [4, 5]], columns=["a", "b"], index=[["a", "b"], ["c", "d"]] ) ), (Series([1, 2, 3], name="A")), (Series([1, 2, 3], name="A").rename_axis("foo")), (Series([1, 2], name="A", index=[["a", "b"], ["c", "d"]])), ], ) def test_index_false_to_json_table(self, data): # GH 17394 # Testing index=False in to_json with orient='table' result = data.to_json(orient="table", index=False) result = json.loads(result) expected = { "schema": pd.io.json.build_table_schema(data, index=False), "data": DataFrame(data).to_dict(orient="records"), } assert result == expected @pytest.mark.parametrize("orient", ["records", "index", "columns", "values"]) def test_index_false_error_to_json(self, orient): # GH 17394 # Testing error message from to_json with index=False df = DataFrame([[1, 2], [4, 5]], columns=["a", "b"]) msg = "'index=False' is only valid when 'orient' is 'split' or 'table'" with pytest.raises(ValueError, match=msg): df.to_json(orient=orient, index=False) @pytest.mark.parametrize("orient", ["split", "table"]) @pytest.mark.parametrize("index", [True, False]) def test_index_false_from_json_to_json(self, orient, index): # GH25170 # Test index=False in from_json to_json expected = DataFrame({"a": [1, 2], "b": [3, 4]}) dfjson = expected.to_json(orient=orient, index=index) result = read_json(dfjson, orient=orient) tm.assert_frame_equal(result, expected) def test_read_timezone_information(self): # GH 25546 result = read_json( '{"2019-01-01T11:00:00.000Z":88}', typ="series", orient="index" ) expected = Series([88], index=DatetimeIndex(["2019-01-01 11:00:00"], tz="UTC")) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "url", [ "s3://example-fsspec/", "gcs://another-fsspec/file.json", "https://example-site.com/data", "some-protocol://data.txt", ], ) def test_read_json_with_url_value(self, url): # GH 36271 result = read_json(f'{{"url":{{"0":"{url}"}}}}') expected = DataFrame({"url": [url]}) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "compression", ["", ".gz", ".bz2", ".tar"], ) def test_read_json_with_very_long_file_path(self, compression): # GH 46718 long_json_path = f'{"a" * 1000}.json{compression}' with pytest.raises( FileNotFoundError, match=f"File {long_json_path} does not exist" ): # path too long for Windows is handled in file_exists() but raises in # _get_data_from_filepath() read_json(long_json_path) @pytest.mark.parametrize( "date_format,key", [("epoch", 86400000), ("iso", "P1DT0H0M0S")] ) def test_timedelta_as_label(self, date_format, key): df = DataFrame([[1]], columns=[pd.Timedelta("1D")]) expected = f'{{"{key}":{{"0":1}}}}' result = df.to_json(date_format=date_format) assert result == 
        assert result == expected

    @pytest.mark.parametrize(
        "orient,expected",
        [
            ("index", "{\"('a', 'b')\":{\"('c', 'd')\":1}}"),
            ("columns", "{\"('c', 'd')\":{\"('a', 'b')\":1}}"),
            # TODO: the below have separate encoding procedures
            pytest.param(
                "split",
                "",
                marks=pytest.mark.xfail(
                    reason="Produces JSON but not in a consistent manner"
                ),
            ),
            pytest.param(
                "table",
                "",
                marks=pytest.mark.xfail(
                    reason="Produces JSON but not in a consistent manner"
                ),
            ),
        ],
    )
    def test_tuple_labels(self, orient, expected):
        # GH 20500
        df = DataFrame([[1]], index=[("a", "b")], columns=[("c", "d")])
        result = df.to_json(orient=orient)
        assert result == expected

    @pytest.mark.parametrize("indent", [1, 2, 4])
    def test_to_json_indent(self, indent):
        # GH 12004
        df = DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["a", "b"])

        result = df.to_json(indent=indent)
        spaces = " " * indent

        expected = f"""{{
{spaces}"a":{{
{spaces}{spaces}"0":"foo",
{spaces}{spaces}"1":"baz"
{spaces}}},
{spaces}"b":{{
{spaces}{spaces}"0":"bar",
{spaces}{spaces}"1":"qux"
{spaces}}}
}}"""

        assert result == expected

    @pytest.mark.parametrize(
        "orient,expected",
        [
            (
                "split",
                """{
    "columns":[
        "a",
        "b"
    ],
    "index":[
        0,
        1
    ],
    "data":[
        [
            "foo",
            "bar"
        ],
        [
            "baz",
            "qux"
        ]
    ]
}""",
            ),
            (
                "records",
                """[
    {
        "a":"foo",
        "b":"bar"
    },
    {
        "a":"baz",
        "b":"qux"
    }
]""",
            ),
            (
                "index",
                """{
    "0":{
        "a":"foo",
        "b":"bar"
    },
    "1":{
        "a":"baz",
        "b":"qux"
    }
}""",
            ),
            (
                "columns",
                """{
    "a":{
        "0":"foo",
        "1":"baz"
    },
    "b":{
        "0":"bar",
        "1":"qux"
    }
}""",
            ),
            (
                "values",
                """[
    [
        "foo",
        "bar"
    ],
    [
        "baz",
        "qux"
    ]
]""",
            ),
            (
                "table",
                """{
    "schema":{
        "fields":[
            {
                "name":"index",
                "type":"integer"
            },
            {
                "name":"a",
                "type":"string"
            },
            {
                "name":"b",
                "type":"string"
            }
        ],
        "primaryKey":[
            "index"
        ],
        "pandas_version":"1.4.0"
    },
    "data":[
        {
            "index":0,
            "a":"foo",
            "b":"bar"
        },
        {
            "index":1,
            "a":"baz",
            "b":"qux"
        }
    ]
}""",
            ),
        ],
    )
    def test_json_indent_all_orients(self, orient, expected):
        # GH 12004
        df = DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["a", "b"])
        result = df.to_json(orient=orient, indent=4)
        assert result == expected

    def test_json_negative_indent_raises(self):
        with pytest.raises(ValueError, match="must be a nonnegative integer"):
            DataFrame().to_json(indent=-1)

    def test_ecma_262_nan_inf_support(self):
        # GH 12213
        data = '["a", NaN, "NaN", Infinity, "Infinity", -Infinity, "-Infinity"]'
        result = read_json(data)
        expected = DataFrame(
            ["a", np.nan, "NaN", np.inf, "Infinity", -np.inf, "-Infinity"]
        )
        tm.assert_frame_equal(result, expected)

    def test_frame_int_overflow(self):
        # GH 30320
        encoded_json = json.dumps([{"col": "31900441201190696999"}, {"col": "Text"}])
        expected = DataFrame({"col": ["31900441201190696999", "Text"]})
        result = read_json(encoded_json)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dataframe,expected",
        [
            (
                DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]}),
                '{"(0, \'x\')":1,"(0, \'y\')":"a","(1, \'x\')":2,'
                '"(1, \'y\')":"b","(2, \'x\')":3,"(2, \'y\')":"c"}',
            )
        ],
    )
    def test_json_multiindex(self, dataframe, expected):
        series = dataframe.stack()
        result = series.to_json(orient="index")
        assert result == expected

    @pytest.mark.single_cpu
    def test_to_s3(self, s3_resource, s3so):
        # GH 28375
        mock_bucket_name, target_file = "pandas-test", "test.json"
        df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
        df.to_json(f"s3://{mock_bucket_name}/{target_file}", storage_options=s3so)
        timeout = 5
        while True:
            if target_file in (
                obj.key for obj in s3_resource.Bucket("pandas-test").objects.all()
            ):
                break
            time.sleep(0.1)
            timeout -= 0.1
            assert timeout > 0, "Timed out waiting for file to appear on moto"

    def test_json_pandas_nulls(self, nulls_fixture, request):
        # GH 31615
        if isinstance(nulls_fixture, Decimal):
            mark = pytest.mark.xfail(reason="not implemented")
            request.node.add_marker(mark)

        result = DataFrame([[nulls_fixture]]).to_json()
        assert result == '{"0":{"0":null}}'

    def test_readjson_bool_series(self):
        # GH31464
        result = read_json("[true, true, false]", typ="series")
        expected = Series([True, True, False])
        tm.assert_series_equal(result, expected)

    def test_to_json_multiindex_escape(self):
        # GH 15273
        df = DataFrame(
            True,
            index=pd.date_range("2017-01-20", "2017-01-23"),
            columns=["foo", "bar"],
        ).stack()
        result = df.to_json()
        expected = (
            "{\"(Timestamp('2017-01-20 00:00:00'), 'foo')\":true,"
            "\"(Timestamp('2017-01-20 00:00:00'), 'bar')\":true,"
            "\"(Timestamp('2017-01-21 00:00:00'), 'foo')\":true,"
            "\"(Timestamp('2017-01-21 00:00:00'), 'bar')\":true,"
            "\"(Timestamp('2017-01-22 00:00:00'), 'foo')\":true,"
            "\"(Timestamp('2017-01-22 00:00:00'), 'bar')\":true,"
            "\"(Timestamp('2017-01-23 00:00:00'), 'foo')\":true,"
            "\"(Timestamp('2017-01-23 00:00:00'), 'bar')\":true}"
        )
        assert result == expected

    def test_to_json_series_of_objects(self):
        class _TestObject:
            def __init__(self, a, b, _c, d) -> None:
                self.a = a
                self.b = b
                self._c = _c
                self.d = d

            def e(self):
                return 5

        # JSON keys should be all non-callable non-underscore attributes, see GH-42768
        series = Series([_TestObject(a=1, b=2, _c=3, d=4)])
        assert json.loads(series.to_json()) == {"0": {"a": 1, "b": 2, "d": 4}}

    @pytest.mark.parametrize(
        "data,expected",
        [
            (
                Series({0: -6 + 8j, 1: 0 + 1j, 2: 9 - 5j}),
                '{"0":{"imag":8.0,"real":-6.0},'
                '"1":{"imag":1.0,"real":0.0},'
                '"2":{"imag":-5.0,"real":9.0}}',
            ),
            (
                Series({0: -9.39 + 0.66j, 1: 3.95 + 9.32j, 2: 4.03 - 0.17j}),
                '{"0":{"imag":0.66,"real":-9.39},'
                '"1":{"imag":9.32,"real":3.95},'
                '"2":{"imag":-0.17,"real":4.03}}',
            ),
            (
                DataFrame([[-2 + 3j, -1 - 0j], [4 - 3j, -0 - 10j]]),
                '{"0":{"0":{"imag":3.0,"real":-2.0},'
                '"1":{"imag":-3.0,"real":4.0}},'
                '"1":{"0":{"imag":0.0,"real":-1.0},'
                '"1":{"imag":-10.0,"real":0.0}}}',
            ),
            (
                DataFrame(
                    [[-0.28 + 0.34j, -1.08 - 0.39j], [0.41 - 0.34j, -0.78 - 1.35j]]
                ),
                '{"0":{"0":{"imag":0.34,"real":-0.28},'
                '"1":{"imag":-0.34,"real":0.41}},'
                '"1":{"0":{"imag":-0.39,"real":-1.08},'
                '"1":{"imag":-1.35,"real":-0.78}}}',
            ),
        ],
    )
    def test_complex_data_tojson(self, data, expected):
        # GH41174
        result = data.to_json()
        assert result == expected

    def test_json_uint64(self):
        # GH21073
        expected = (
            '{"columns":["col1"],"index":[0,1],'
            '"data":[[13342205958987758245],[12388075603347835679]]}'
        )
        df = DataFrame(data={"col1": [13342205958987758245, 12388075603347835679]})
        result = df.to_json(orient="split")
        assert result == expected

    @pytest.mark.parametrize(
        "orient", ["split", "records", "values", "index", "columns"]
    )
    def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient):
        # GH#50750
        pa = pytest.importorskip("pyarrow")
        df = DataFrame(
            {
                "a": Series([1, np.nan, 3], dtype="Int64"),
                "b": Series([1, 2, 3], dtype="Int64"),
                "c": Series([1.5, np.nan, 2.5], dtype="Float64"),
                "d": Series([1.5, 2.0, 2.5], dtype="Float64"),
                "e": [True, False, None],
                "f": [True, False, True],
                "g": ["a", "b", "c"],
                "h": ["a", "b", None],
            }
        )

        if string_storage == "python":
            string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
            string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))

        else:
            string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
            string_array_na = ArrowStringArray(pa.array(["a", "b", None]))

        out = df.to_json(orient=orient)
pd.option_context("mode.string_storage", string_storage): result = read_json(out, dtype_backend=dtype_backend, orient=orient) expected = DataFrame( { "a": Series([1, np.nan, 3], dtype="Int64"), "b": Series([1, 2, 3], dtype="Int64"), "c": Series([1.5, np.nan, 2.5], dtype="Float64"), "d": Series([1.5, 2.0, 2.5], dtype="Float64"), "e": Series([True, False, NA], dtype="boolean"), "f": Series([True, False, True], dtype="boolean"), "g": string_array, "h": string_array_na, } ) if dtype_backend == "pyarrow": from pandas.arrays import ArrowExtensionArray expected = DataFrame( { col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True)) for col in expected.columns } ) if orient == "values": expected.columns = list(range(0, 8)) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("orient", ["split", "records", "index"]) def test_read_json_nullable_series(self, string_storage, dtype_backend, orient): # GH#50750 pa = pytest.importorskip("pyarrow") ser = Series([1, np.nan, 3], dtype="Int64") out = ser.to_json(orient=orient) with pd.option_context("mode.string_storage", string_storage): result = read_json( out, dtype_backend=dtype_backend, orient=orient, typ="series" ) expected = Series([1, np.nan, 3], dtype="Int64") if dtype_backend == "pyarrow": from pandas.arrays import ArrowExtensionArray expected = Series(ArrowExtensionArray(pa.array(expected, from_pandas=True))) tm.assert_series_equal(result, expected) def test_invalid_dtype_backend(self): msg = ( "dtype_backend numpy is invalid, only 'numpy_nullable' and " "'pyarrow' are allowed." ) with pytest.raises(ValueError, match=msg): read_json("test", dtype_backend="numpy") def test_invalid_engine(): # GH 48893 ser = Series(range(1)) out = ser.to_json() with pytest.raises(ValueError, match="The engine type foo"): read_json(out, engine="foo") def test_pyarrow_engine_lines_false(): # GH 48893 ser = Series(range(1)) out = ser.to_json() with pytest.raises(ValueError, match="currently pyarrow engine only supports"): read_json(out, engine="pyarrow", lines=False)