import datetime as dt
from datetime import datetime
import gzip
import io
import os
import struct
import warnings

import numpy as np
import pytest

from pandas.core.dtypes.common import is_categorical_dtype

import pandas as pd
import pandas._testing as tm
from pandas.core.frame import DataFrame, Series

from pandas.io.parsers import read_csv
from pandas.io.stata import (
    InvalidColumnName,
    PossiblePrecisionLoss,
    StataMissingValue,
    StataReader,
    StataWriterUTF8,
    read_stata,
)


@pytest.fixture()
def mixed_frame():
    return pd.DataFrame(
        {
            "a": [1, 2, 3, 4],
            "b": [1.0, 3.0, 27.0, 81.0],
            "c": ["Atlanta", "Birmingham", "Cincinnati", "Detroit"],
        }
    )


@pytest.fixture
def dirpath(datapath):
    return datapath("io", "data", "stata")


@pytest.fixture
def parsed_114(dirpath):
    dta14_114 = os.path.join(dirpath, "stata5_114.dta")
    parsed_114 = read_stata(dta14_114, convert_dates=True)
    parsed_114.index.name = "index"
    return parsed_114


class TestStata:
    @pytest.fixture(autouse=True)
    def setup_method(self, datapath):
        self.dirpath = datapath("io", "data", "stata")
        self.dta1_114 = os.path.join(self.dirpath, "stata1_114.dta")
        self.dta1_117 = os.path.join(self.dirpath, "stata1_117.dta")

        self.dta2_113 = os.path.join(self.dirpath, "stata2_113.dta")
        self.dta2_114 = os.path.join(self.dirpath, "stata2_114.dta")
        self.dta2_115 = os.path.join(self.dirpath, "stata2_115.dta")
        self.dta2_117 = os.path.join(self.dirpath, "stata2_117.dta")

        self.dta3_113 = os.path.join(self.dirpath, "stata3_113.dta")
        self.dta3_114 = os.path.join(self.dirpath, "stata3_114.dta")
        self.dta3_115 = os.path.join(self.dirpath, "stata3_115.dta")
        self.dta3_117 = os.path.join(self.dirpath, "stata3_117.dta")
        self.csv3 = os.path.join(self.dirpath, "stata3.csv")

        self.dta4_113 = os.path.join(self.dirpath, "stata4_113.dta")
        self.dta4_114 = os.path.join(self.dirpath, "stata4_114.dta")
        self.dta4_115 = os.path.join(self.dirpath, "stata4_115.dta")
        self.dta4_117 = os.path.join(self.dirpath, "stata4_117.dta")

        self.dta_encoding = os.path.join(self.dirpath, "stata1_encoding.dta")
        self.dta_encoding_118 = os.path.join(self.dirpath, "stata1_encoding_118.dta")

        self.csv14 = os.path.join(self.dirpath, "stata5.csv")
        self.dta14_113 = os.path.join(self.dirpath, "stata5_113.dta")
        self.dta14_114 = os.path.join(self.dirpath, "stata5_114.dta")
        self.dta14_115 = os.path.join(self.dirpath, "stata5_115.dta")
        self.dta14_117 = os.path.join(self.dirpath, "stata5_117.dta")

        self.csv15 = os.path.join(self.dirpath, "stata6.csv")
        self.dta15_113 = os.path.join(self.dirpath, "stata6_113.dta")
        self.dta15_114 = os.path.join(self.dirpath, "stata6_114.dta")
        self.dta15_115 = os.path.join(self.dirpath, "stata6_115.dta")
        self.dta15_117 = os.path.join(self.dirpath, "stata6_117.dta")

        self.dta16_115 = os.path.join(self.dirpath, "stata7_115.dta")
        self.dta16_117 = os.path.join(self.dirpath, "stata7_117.dta")

        self.dta17_113 = os.path.join(self.dirpath, "stata8_113.dta")
        self.dta17_115 = os.path.join(self.dirpath, "stata8_115.dta")
        self.dta17_117 = os.path.join(self.dirpath, "stata8_117.dta")

        self.dta18_115 = os.path.join(self.dirpath, "stata9_115.dta")
        self.dta18_117 = os.path.join(self.dirpath, "stata9_117.dta")

        self.dta19_115 = os.path.join(self.dirpath, "stata10_115.dta")
        self.dta19_117 = os.path.join(self.dirpath, "stata10_117.dta")

        self.dta20_115 = os.path.join(self.dirpath, "stata11_115.dta")
        self.dta20_117 = os.path.join(self.dirpath, "stata11_117.dta")

        self.dta21_117 = os.path.join(self.dirpath, "stata12_117.dta")

        self.dta22_118 = os.path.join(self.dirpath, "stata14_118.dta")
        self.dta23 = os.path.join(self.dirpath, "stata15.dta")
os.path.join(self.dirpath, "stata15.dta") self.dta24_111 = os.path.join(self.dirpath, "stata7_111.dta") self.dta25_118 = os.path.join(self.dirpath, "stata16_118.dta") self.dta26_119 = os.path.join(self.dirpath, "stata1_119.dta.gz") self.stata_dates = os.path.join(self.dirpath, "stata13_dates.dta") def read_dta(self, file): # Legacy default reader configuration return read_stata(file, convert_dates=True) def read_csv(self, file): return read_csv(file, parse_dates=True) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_read_empty_dta(self, version): empty_ds = DataFrame(columns=["unit"]) # GH 7369, make sure can read a 0-obs dta file with tm.ensure_clean() as path: empty_ds.to_stata(path, write_index=False, version=version) empty_ds2 = read_stata(path) tm.assert_frame_equal(empty_ds, empty_ds2) @pytest.mark.parametrize("file", ["dta1_114", "dta1_117"]) def test_read_dta1(self, file): file = getattr(self, file) parsed = self.read_dta(file) # Pandas uses np.nan as missing value. # Thus, all columns will be of type float, regardless of their name. expected = DataFrame( [(np.nan, np.nan, np.nan, np.nan, np.nan)], columns=["float_miss", "double_miss", "byte_miss", "int_miss", "long_miss"], ) # this is an oddity as really the nan should be float64, but # the casting doesn't fail so need to match stata here expected["float_miss"] = expected["float_miss"].astype(np.float32) tm.assert_frame_equal(parsed, expected) def test_read_dta2(self): expected = DataFrame.from_records( [ ( datetime(2006, 11, 19, 23, 13, 20), 1479596223000, datetime(2010, 1, 20), datetime(2010, 1, 8), datetime(2010, 1, 1), datetime(1974, 7, 1), datetime(2010, 1, 1), datetime(2010, 1, 1), ), ( datetime(1959, 12, 31, 20, 3, 20), -1479590, datetime(1953, 10, 2), datetime(1948, 6, 10), datetime(1955, 1, 1), datetime(1955, 7, 1), datetime(1955, 1, 1), datetime(2, 1, 1), ), (pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT), ], columns=[ "datetime_c", "datetime_big_c", "date", "weekly_date", "monthly_date", "quarterly_date", "half_yearly_date", "yearly_date", ], ) expected["yearly_date"] = expected["yearly_date"].astype("O") with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") parsed_114 = self.read_dta(self.dta2_114) parsed_115 = self.read_dta(self.dta2_115) parsed_117 = self.read_dta(self.dta2_117) # 113 is buggy due to limits of date format support in Stata # parsed_113 = self.read_dta(self.dta2_113) # Remove resource warnings w = [x for x in w if x.category is UserWarning] # should get warning for each call to read_dta assert len(w) == 3 # buggy test because of the NaT comparison on certain platforms # Format 113 test fails since it does not support tc and tC formats # tm.assert_frame_equal(parsed_113, expected) tm.assert_frame_equal(parsed_114, expected, check_datetimelike_compat=True) tm.assert_frame_equal(parsed_115, expected, check_datetimelike_compat=True) tm.assert_frame_equal(parsed_117, expected, check_datetimelike_compat=True) @pytest.mark.parametrize("file", ["dta3_113", "dta3_114", "dta3_115", "dta3_117"]) def test_read_dta3(self, file): file = getattr(self, file) parsed = self.read_dta(file) # match stata here expected = self.read_csv(self.csv3) expected = expected.astype(np.float32) expected["year"] = expected["year"].astype(np.int16) expected["quarter"] = expected["quarter"].astype(np.int8) tm.assert_frame_equal(parsed, expected) @pytest.mark.parametrize("file", ["dta4_113", "dta4_114", "dta4_115", "dta4_117"]) def test_read_dta4(self, file): file = 
        parsed = self.read_dta(file)

        expected = DataFrame.from_records(
            [
                ["one", "ten", "one", "one", "one"],
                ["two", "nine", "two", "two", "two"],
                ["three", "eight", "three", "three", "three"],
                ["four", "seven", 4, "four", "four"],
                ["five", "six", 5, np.nan, "five"],
                ["six", "five", 6, np.nan, "six"],
                ["seven", "four", 7, np.nan, "seven"],
                ["eight", "three", 8, np.nan, "eight"],
                ["nine", "two", 9, np.nan, "nine"],
                ["ten", "one", "ten", np.nan, "ten"],
            ],
            columns=[
                "fully_labeled",
                "fully_labeled2",
                "incompletely_labeled",
                "labeled_with_missings",
                "float_labelled",
            ],
        )

        # these are all categoricals
        expected = pd.concat(
            [expected[col].astype("category") for col in expected], axis=1
        )

        # stata doesn't save .category metadata
        tm.assert_frame_equal(parsed, expected, check_categorical=False)

    # File containing strls
    def test_read_dta12(self):
        parsed_117 = self.read_dta(self.dta21_117)
        expected = DataFrame.from_records(
            [
                [1, "abc", "abcdefghi"],
                [3, "cba", "qwertywertyqwerty"],
                [93, "", "strl"],
            ],
            columns=["x", "y", "z"],
        )

        tm.assert_frame_equal(parsed_117, expected, check_dtype=False)

    def test_read_dta18(self):
        parsed_118 = self.read_dta(self.dta22_118)
        parsed_118["Bytes"] = parsed_118["Bytes"].astype("O")
        expected = DataFrame.from_records(
            [
                ["Cat", "Bogota", "Bogotá", 1, 1.0, "option b Ünicode", 1.0],
                ["Dog", "Boston", "Uzunköprü", np.nan, np.nan, np.nan, np.nan],
                ["Plane", "Rome", "Tromsø", 0, 0.0, "option a", 0.0],
                ["Potato", "Tokyo", "Elâzığ", -4, 4.0, 4, 4],
                ["", "", "", 0, 0.3332999, "option a", 1 / 3.0],
            ],
            columns=[
                "Things",
                "Cities",
                "Unicode_Cities_Strl",
                "Ints",
                "Floats",
                "Bytes",
                "Longs",
            ],
        )
        expected["Floats"] = expected["Floats"].astype(np.float32)
        for col in parsed_118.columns:
            tm.assert_almost_equal(parsed_118[col], expected[col])

        with StataReader(self.dta22_118) as rdr:
            vl = rdr.variable_labels()
            vl_expected = {
                "Unicode_Cities_Strl": "Here are some strls with Ünicode chars",
                "Longs": "long data",
                "Things": "Here are some things",
                "Bytes": "byte data",
                "Ints": "int data",
                "Cities": "Here are some cities",
                "Floats": "float data",
            }
            tm.assert_dict_equal(vl, vl_expected)

            assert rdr.data_label == "This is a Ünicode data label"

    def test_read_write_dta5(self):
        original = DataFrame(
            [(np.nan, np.nan, np.nan, np.nan, np.nan)],
            columns=["float_miss", "double_miss", "byte_miss", "int_miss", "long_miss"],
        )
        original.index.name = "index"

        with tm.ensure_clean() as path:
            original.to_stata(path, None)
            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(written_and_read_again.set_index("index"), original)

    def test_write_dta6(self):
        original = self.read_csv(self.csv3)
        original.index.name = "index"
        original.index = original.index.astype(np.int32)
        original["year"] = original["year"].astype(np.int32)
        original["quarter"] = original["quarter"].astype(np.int32)

        with tm.ensure_clean() as path:
            original.to_stata(path, None)
            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(
                written_and_read_again.set_index("index"),
                original,
                check_index_type=False,
            )

    @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
    def test_read_write_dta10(self, version):
        original = DataFrame(
            data=[["string", "object", 1, 1.1, np.datetime64("2003-12-25")]],
            columns=["string", "object", "integer", "floating", "datetime"],
        )
        original["object"] = Series(original["object"], dtype=object)
        original.index.name = "index"
        original.index = original.index.astype(np.int32)
        original["integer"] = original["integer"].astype(np.int32)

        with tm.ensure_clean() as path:
original.to_stata(path, {"datetime": "tc"}, version=version) written_and_read_again = self.read_dta(path) # original.index is np.int32, read index is np.int64 tm.assert_frame_equal( written_and_read_again.set_index("index"), original, check_index_type=False, ) def test_stata_doc_examples(self): with tm.ensure_clean() as path: df = DataFrame(np.random.randn(10, 2), columns=list("AB")) df.to_stata(path) def test_write_preserves_original(self): # 9795 np.random.seed(423) df = pd.DataFrame(np.random.randn(5, 4), columns=list("abcd")) df.loc[2, "a":"c"] = np.nan df_copy = df.copy() with tm.ensure_clean() as path: df.to_stata(path, write_index=False) tm.assert_frame_equal(df, df_copy) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_encoding(self, version): # GH 4626, proper encoding handling raw = read_stata(self.dta_encoding) encoded = read_stata(self.dta_encoding) result = encoded.kreis1849[0] expected = raw.kreis1849[0] assert result == expected assert isinstance(result, str) with tm.ensure_clean() as path: encoded.to_stata(path, write_index=False, version=version) reread_encoded = read_stata(path) tm.assert_frame_equal(encoded, reread_encoded) def test_read_write_dta11(self): original = DataFrame( [(1, 2, 3, 4)], columns=[ "good", "b\u00E4d", "8number", "astringwithmorethan32characters______", ], ) formatted = DataFrame( [(1, 2, 3, 4)], columns=["good", "b_d", "_8number", "astringwithmorethan32characters_"], ) formatted.index.name = "index" formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with tm.assert_produces_warning(pd.io.stata.InvalidColumnName): original.to_stata(path, None) written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_read_write_dta12(self, version): original = DataFrame( [(1, 2, 3, 4, 5, 6)], columns=[ "astringwithmorethan32characters_1", "astringwithmorethan32characters_2", "+", "-", "short", "delete", ], ) formatted = DataFrame( [(1, 2, 3, 4, 5, 6)], columns=[ "astringwithmorethan32characters_", "_0astringwithmorethan32character", "_", "_1_", "_short", "_delete", ], ) formatted.index.name = "index" formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always", InvalidColumnName) original.to_stata(path, None, version=version) # should get a warning for that format. 
                assert len(w) == 1

            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted)

    def test_read_write_dta13(self):
        s1 = Series(2 ** 9, dtype=np.int16)
        s2 = Series(2 ** 17, dtype=np.int32)
        s3 = Series(2 ** 33, dtype=np.int64)

        original = DataFrame({"int16": s1, "int32": s2, "int64": s3})
        original.index.name = "index"

        formatted = original
        formatted["int64"] = formatted["int64"].astype(np.float64)

        with tm.ensure_clean() as path:
            original.to_stata(path)
            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted)

    @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
    @pytest.mark.parametrize(
        "file", ["dta14_113", "dta14_114", "dta14_115", "dta14_117"]
    )
    def test_read_write_reread_dta14(self, file, parsed_114, version):
        file = getattr(self, file)
        parsed = self.read_dta(file)
        parsed.index.name = "index"

        expected = self.read_csv(self.csv14)
        cols = ["byte_", "int_", "long_", "float_", "double_"]
        for col in cols:
            expected[col] = expected[col]._convert(datetime=True, numeric=True)
        expected["float_"] = expected["float_"].astype(np.float32)
        expected["date_td"] = pd.to_datetime(expected["date_td"], errors="coerce")

        tm.assert_frame_equal(parsed_114, parsed)

        with tm.ensure_clean() as path:
            parsed_114.to_stata(path, {"date_td": "td"}, version=version)
            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(
                written_and_read_again.set_index("index"), parsed_114
            )

    @pytest.mark.parametrize(
        "file", ["dta15_113", "dta15_114", "dta15_115", "dta15_117"]
    )
    def test_read_write_reread_dta15(self, file):
        expected = self.read_csv(self.csv15)
        expected["byte_"] = expected["byte_"].astype(np.int8)
        expected["int_"] = expected["int_"].astype(np.int16)
        expected["long_"] = expected["long_"].astype(np.int32)
        expected["float_"] = expected["float_"].astype(np.float32)
        expected["double_"] = expected["double_"].astype(np.float64)
        expected["date_td"] = expected["date_td"].apply(
            datetime.strptime, args=("%Y-%m-%d",)
        )

        file = getattr(self, file)
        parsed = self.read_dta(file)

        tm.assert_frame_equal(expected, parsed)

    @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
    def test_timestamp_and_label(self, version):
        original = DataFrame([(1,)], columns=["variable"])
        time_stamp = datetime(2000, 2, 29, 14, 21)
        data_label = "This is a data file."
        with tm.ensure_clean() as path:
            original.to_stata(
                path, time_stamp=time_stamp, data_label=data_label, version=version
            )

            with StataReader(path) as reader:
                assert reader.time_stamp == "29 Feb 2000 14:21"
                assert reader.data_label == data_label

    @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
    def test_invalid_timestamp(self, version):
        original = DataFrame([(1,)], columns=["variable"])
        time_stamp = "01 Jan 2000, 00:00:00"
        with tm.ensure_clean() as path:
            msg = "time_stamp should be datetime type"
            with pytest.raises(ValueError, match=msg):
                original.to_stata(path, time_stamp=time_stamp, version=version)

    def test_numeric_column_names(self):
        original = DataFrame(np.reshape(np.arange(25.0), (5, 5)))
        original.index.name = "index"
        with tm.ensure_clean() as path:
            # should get a warning for that format.
            with tm.assert_produces_warning(InvalidColumnName):
                original.to_stata(path)

            written_and_read_again = self.read_dta(path)
            written_and_read_again = written_and_read_again.set_index("index")
            columns = list(written_and_read_again.columns)
            convert_col_name = lambda x: int(x[1])
            written_and_read_again.columns = map(convert_col_name, columns)
            tm.assert_frame_equal(original, written_and_read_again)

    @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
    def test_nan_to_missing_value(self, version):
        s1 = Series(np.arange(4.0), dtype=np.float32)
        s2 = Series(np.arange(4.0), dtype=np.float64)
        s1[::2] = np.nan
        s2[1::2] = np.nan
        original = DataFrame({"s1": s1, "s2": s2})
        original.index.name = "index"

        with tm.ensure_clean() as path:
            original.to_stata(path, version=version)
            written_and_read_again = self.read_dta(path)
            written_and_read_again = written_and_read_again.set_index("index")
            tm.assert_frame_equal(written_and_read_again, original)

    def test_no_index(self):
        columns = ["x", "y"]
        original = DataFrame(np.reshape(np.arange(10.0), (5, 2)), columns=columns)
        original.index.name = "index_not_written"
        with tm.ensure_clean() as path:
            original.to_stata(path, write_index=False)
            written_and_read_again = self.read_dta(path)
            with pytest.raises(KeyError, match=original.index.name):
                written_and_read_again["index_not_written"]

    def test_string_no_dates(self):
        s1 = Series(["a", "A longer string"])
        s2 = Series([1.0, 2.0], dtype=np.float64)
        original = DataFrame({"s1": s1, "s2": s2})
        original.index.name = "index"
        with tm.ensure_clean() as path:
            original.to_stata(path)
            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(written_and_read_again.set_index("index"), original)

    def test_large_value_conversion(self):
        s0 = Series([1, 99], dtype=np.int8)
        s1 = Series([1, 127], dtype=np.int8)
        s2 = Series([1, 2 ** 15 - 1], dtype=np.int16)
        s3 = Series([1, 2 ** 63 - 1], dtype=np.int64)
        original = DataFrame({"s0": s0, "s1": s1, "s2": s2, "s3": s3})
        original.index.name = "index"
        with tm.ensure_clean() as path:
            with tm.assert_produces_warning(PossiblePrecisionLoss):
                original.to_stata(path)

            written_and_read_again = self.read_dta(path)
            modified = original.copy()
            modified["s1"] = Series(modified["s1"], dtype=np.int16)
            modified["s2"] = Series(modified["s2"], dtype=np.int32)
            modified["s3"] = Series(modified["s3"], dtype=np.float64)
            tm.assert_frame_equal(written_and_read_again.set_index("index"), modified)

    def test_dates_invalid_column(self):
        original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)])
        original.index.name = "index"
        with tm.ensure_clean() as path:
            with tm.assert_produces_warning(InvalidColumnName):
                original.to_stata(path, {0: "tc"})

            written_and_read_again = self.read_dta(path)
            modified = original.copy()
            modified.columns = ["_0"]
            tm.assert_frame_equal(written_and_read_again.set_index("index"), modified)

    def test_105(self):
        # Data obtained from:
        # http://go.worldbank.org/ZXY29PVJ21
        dpath = os.path.join(self.dirpath, "S4_EDUC1.dta")
        df = pd.read_stata(dpath)
        df0 = [[1, 1, 3, -2], [2, 1, 2, -2], [4, 1, 1, -2]]
        df0 = pd.DataFrame(df0)
        df0.columns = ["clustnum", "pri_schl", "psch_num", "psch_dis"]
        df0["clustnum"] = df0["clustnum"].astype(np.int16)
        df0["pri_schl"] = df0["pri_schl"].astype(np.int8)
        df0["psch_num"] = df0["psch_num"].astype(np.int8)
        df0["psch_dis"] = df0["psch_dis"].astype(np.float32)
        tm.assert_frame_equal(df.head(3), df0)

    def test_value_labels_old_format(self):
        # GH 19417
        #
        # Test that value_labels() returns an empty dict if the file format
        # predates supporting value labels.
dpath = os.path.join(self.dirpath, "S4_EDUC1.dta") reader = StataReader(dpath) assert reader.value_labels() == {} reader.close() def test_date_export_formats(self): columns = ["tc", "td", "tw", "tm", "tq", "th", "ty"] conversions = {c: c for c in columns} data = [datetime(2006, 11, 20, 23, 13, 20)] * len(columns) original = DataFrame([data], columns=columns) original.index.name = "index" expected_values = [ datetime(2006, 11, 20, 23, 13, 20), # Time datetime(2006, 11, 20), # Day datetime(2006, 11, 19), # Week datetime(2006, 11, 1), # Month datetime(2006, 10, 1), # Quarter year datetime(2006, 7, 1), # Half year datetime(2006, 1, 1), ] # Year expected = DataFrame([expected_values], columns=columns) expected.index.name = "index" with tm.ensure_clean() as path: original.to_stata(path, conversions) written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) def test_write_missing_strings(self): original = DataFrame([["1"], [None]], columns=["foo"]) expected = DataFrame([["1"], [""]], columns=["foo"]) expected.index.name = "index" with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) @pytest.mark.parametrize("byteorder", [">", "<"]) def test_bool_uint(self, byteorder, version): s0 = Series([0, 1, True], dtype=np.bool) s1 = Series([0, 1, 100], dtype=np.uint8) s2 = Series([0, 1, 255], dtype=np.uint8) s3 = Series([0, 1, 2 ** 15 - 100], dtype=np.uint16) s4 = Series([0, 1, 2 ** 16 - 1], dtype=np.uint16) s5 = Series([0, 1, 2 ** 31 - 100], dtype=np.uint32) s6 = Series([0, 1, 2 ** 32 - 1], dtype=np.uint32) original = DataFrame( {"s0": s0, "s1": s1, "s2": s2, "s3": s3, "s4": s4, "s5": s5, "s6": s6} ) original.index.name = "index" expected = original.copy() expected_types = ( np.int8, np.int8, np.int16, np.int16, np.int32, np.int32, np.float64, ) for c, t in zip(expected.columns, expected_types): expected[c] = expected[c].astype(t) with tm.ensure_clean() as path: original.to_stata(path, byteorder=byteorder, version=version) written_and_read_again = self.read_dta(path) written_and_read_again = written_and_read_again.set_index("index") tm.assert_frame_equal(written_and_read_again, expected) def test_variable_labels(self): with StataReader(self.dta16_115) as rdr: sr_115 = rdr.variable_labels() with StataReader(self.dta16_117) as rdr: sr_117 = rdr.variable_labels() keys = ("var1", "var2", "var3") labels = ("label1", "label2", "label3") for k, v in sr_115.items(): assert k in sr_117 assert v == sr_117[k] assert k in keys assert v in labels def test_minimal_size_col(self): str_lens = (1, 100, 244) s = {} for str_len in str_lens: s["s" + str(str_len)] = Series( ["a" * str_len, "b" * str_len, "c" * str_len] ) original = DataFrame(s) with tm.ensure_clean() as path: original.to_stata(path, write_index=False) with StataReader(path) as sr: typlist = sr.typlist variables = sr.varlist formats = sr.fmtlist for variable, fmt, typ in zip(variables, formats, typlist): assert int(variable[1:]) == int(fmt[1:-1]) assert int(variable[1:]) == typ def test_excessively_long_string(self): str_lens = (1, 244, 500) s = {} for str_len in str_lens: s["s" + str(str_len)] = Series( ["a" * str_len, "b" * str_len, "c" * str_len] ) original = DataFrame(s) msg = ( r"Fixed width strings in Stata \.dta files are limited to 244" r" \(or fewer\)\ncharacters\. 
            r" \(or fewer\)\ncharacters\. Column 's500' does not satisfy"
            r" this restriction\. Use the\n'version=117' parameter to write"
            r" the newer \(Stata 13 and later\) format\."
        )
        with pytest.raises(ValueError, match=msg):
            with tm.ensure_clean() as path:
                original.to_stata(path)

    def test_missing_value_generator(self):
        types = ("b", "h", "l")
        df = DataFrame([[0.0]], columns=["float_"])
        with tm.ensure_clean() as path:
            df.to_stata(path)
            with StataReader(path) as rdr:
                valid_range = rdr.VALID_RANGE
        expected_values = ["." + chr(97 + i) for i in range(26)]
        expected_values.insert(0, ".")
        for t in types:
            offset = valid_range[t][1]
            for i in range(0, 27):
                val = StataMissingValue(offset + 1 + i)
                assert val.string == expected_values[i]

        # Test extremes for floats
        val = StataMissingValue(struct.unpack("