""" Tests that apply specifically to the CParser. Unless specifically stated as a CParser-specific issue, the goal is to eventually move as many of these tests out of this module as soon as the Python parser can accept further arguments when parsing. """ from io import BytesIO, StringIO, TextIOWrapper import mmap import os import tarfile import numpy as np import pytest from pandas.compat import IS64 from pandas.errors import ParserError import pandas.util._test_decorators as td from pandas import DataFrame, concat import pandas._testing as tm @pytest.mark.parametrize( "malformed", ["1\r1\r1\r 1\r 1\r", "1\r1\r1\r 1\r 1\r11\r", "1\r1\r1\r 1\r 1\r11\r1\r"], ids=["words pointer", "stream pointer", "lines pointer"], ) def test_buffer_overflow(c_parser_only, malformed): # see gh-9205: test certain malformed input files that cause # buffer overflows in tokenizer.c msg = "Buffer overflow caught - possible malformed input file." parser = c_parser_only with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(malformed)) def test_buffer_rd_bytes(c_parser_only): # see gh-12098: src->buffer in the C parser can be freed twice leading # to a segfault if a corrupt gzip file is read with 'read_csv', and the # buffer is filled more than once before gzip raises an Exception. data = ( "\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09" "\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0" "\xA6\x4D" + "\x55" * 267 + "\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00" "\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO" ) parser = c_parser_only for _ in range(100): try: parser.read_csv(StringIO(data), compression="gzip", delim_whitespace=True) except Exception: pass def test_delim_whitespace_custom_terminator(c_parser_only): # See gh-12912 data = "a b c~1 2 3~4 5 6~7 8 9" parser = c_parser_only df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True) expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]) tm.assert_frame_equal(df, expected) def test_dtype_and_names_error(c_parser_only): # see gh-8833: passing both dtype and names # resulting in an error reporting issue parser = c_parser_only data = """ 1.0 1 2.0 2 3.0 3 """ # base cases result = parser.read_csv(StringIO(data), sep=r"\s+", header=None) expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]]) tm.assert_frame_equal(result, expected) result = parser.read_csv(StringIO(data), sep=r"\s+", header=None, names=["a", "b"]) expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) # fallback casting result = parser.read_csv( StringIO(data), sep=r"\s+", header=None, names=["a", "b"], dtype={"a": np.int32} ) expected = DataFrame([[1, 1], [2, 2], [3, 3]], columns=["a", "b"]) expected["a"] = expected["a"].astype(np.int32) tm.assert_frame_equal(result, expected) data = """ 1.0 1 nan 2 3.0 3 """ # fallback casting, but not castable with pytest.raises(ValueError, match="cannot safely convert"): parser.read_csv( StringIO(data), sep=r"\s+", header=None, names=["a", "b"], dtype={"a": np.int32}, ) @pytest.mark.parametrize( "match,kwargs", [ # For each of these cases, all of the dtypes are valid, just unsupported. ( ( "the dtype datetime64 is not supported for parsing, " "pass this column using parse_dates instead" ), {"dtype": {"A": "datetime64", "B": "float64"}}, ), ( ( "the dtype datetime64 is not supported for parsing, " "pass this column using parse_dates instead" ), {"dtype": {"A": "datetime64", "B": "float64"}, "parse_dates": ["B"]}, ), ( "the dtype timedelta64 is not supported for parsing", {"dtype": {"A": "timedelta64", "B": "float64"}}, ), ("the dtype 262144b) parser = c_parser_only header_narrow = "\t".join(["COL_HEADER_" + str(i) for i in range(10)]) + "\n" data_narrow = "\t".join(["somedatasomedatasomedata1" for _ in range(10)]) + "\n" header_wide = "\t".join(["COL_HEADER_" + str(i) for i in range(15)]) + "\n" data_wide = "\t".join(["somedatasomedatasomedata2" for _ in range(15)]) + "\n" test_input = header_narrow + data_narrow * 1050 + header_wide + data_wide * 2 df = parser.read_csv(StringIO(test_input), sep="\t", nrows=1010) assert df.size == 1010 * 10 def test_float_precision_round_trip_with_text(c_parser_only): # see gh-15140 parser = c_parser_only df = parser.read_csv(StringIO("a"), header=None, float_precision="round_trip") tm.assert_frame_equal(df, DataFrame({0: ["a"]})) def test_large_difference_in_columns(c_parser_only): # see gh-14125 parser = c_parser_only count = 10000 large_row = ("X," * count)[:-1] + "\n" normal_row = "XXXXXX XXXXXX,111111111111111\n" test_input = (large_row + normal_row * 6)[:-1] result = parser.read_csv(StringIO(test_input), header=None, usecols=[0]) rows = test_input.split("\n") expected = DataFrame([row.split(",")[0] for row in rows]) tm.assert_frame_equal(result, expected) def test_data_after_quote(c_parser_only): # see gh-15910 parser = c_parser_only data = 'a\n1\n"b"a' result = parser.read_csv(StringIO(data)) expected = DataFrame({"a": ["1", "ba"]}) tm.assert_frame_equal(result, expected) def test_comment_whitespace_delimited(c_parser_only, capsys): parser = c_parser_only test_input = """\ 1 2 2 2 3 3 2 3 # 3 fields 4 2 3# 3 fields 5 2 # 2 fields 6 2# 2 fields 7 # 1 field, NaN 8# 1 field, NaN 9 2 3 # skipped line # comment""" df = parser.read_csv( StringIO(test_input), comment="#", header=None, delimiter="\\s+", skiprows=0, error_bad_lines=False, ) captured = capsys.readouterr() # skipped lines 2, 3, 4, 9 for line_num in (2, 3, 4, 9): assert f"Skipping line {line_num}" in captured.err expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]]) tm.assert_frame_equal(df, expected) def test_file_like_no_next(c_parser_only): # gh-16530: the file-like need not have a "next" or "__next__" # attribute despite having an "__iter__" attribute. # # NOTE: This is only true for the C engine, not Python engine. class NoNextBuffer(StringIO): def __next__(self): raise AttributeError("No next method") next = __next__ parser = c_parser_only data = "a\n1" expected = DataFrame({"a": [1]}) result = parser.read_csv(NoNextBuffer(data)) tm.assert_frame_equal(result, expected) def test_buffer_rd_bytes_bad_unicode(c_parser_only): # see gh-22748 t = BytesIO(b"\xB0") t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape") msg = "'utf-8' codec can't encode character" with pytest.raises(UnicodeError, match=msg): c_parser_only.read_csv(t, encoding="UTF-8") @pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"]) def test_read_tarfile(c_parser_only, csv_dir_path, tar_suffix): # see gh-16530 # # Unfortunately, Python's CSV library can't handle # tarfile objects (expects string, not bytes when # iterating through a file-like). parser = c_parser_only tar_path = os.path.join(csv_dir_path, "tar_csv" + tar_suffix) with tarfile.open(tar_path, "r") as tar: data_file = tar.extractfile("tar_data.csv") out = parser.read_csv(data_file) expected = DataFrame({"a": [1]}) tm.assert_frame_equal(out, expected) @pytest.mark.high_memory def test_bytes_exceed_2gb(c_parser_only): # see gh-16798 # # Read from a "CSV" that has a column larger than 2GB. parser = c_parser_only if parser.low_memory: pytest.skip("not a high_memory test") csv = StringIO("strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)])) df = parser.read_csv(csv) assert not df.empty def test_chunk_whitespace_on_boundary(c_parser_only): # see gh-9735: this issue is C parser-specific (bug when # parsing whitespace and characters at chunk boundary) # # This test case has a field too large for the Python parser / CSV library. parser = c_parser_only chunk1 = "a" * (1024 * 256 - 2) + "\na" chunk2 = "\n a" result = parser.read_csv(StringIO(chunk1 + chunk2), header=None) expected = DataFrame(["a" * (1024 * 256 - 2), "a", " a"]) tm.assert_frame_equal(result, expected) def test_file_handles_mmap(c_parser_only, csv1): # gh-14418 # # Don't close user provided file handles. parser = c_parser_only with open(csv1) as f: m = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) parser.read_csv(m) assert not m.closed m.close() def test_file_binary_mode(c_parser_only): # see gh-23779 parser = c_parser_only expected = DataFrame([[1, 2, 3], [4, 5, 6]]) with tm.ensure_clean() as path: with open(path, "w") as f: f.write("1,2,3\n4,5,6") with open(path, "rb") as f: result = parser.read_csv(f, header=None) tm.assert_frame_equal(result, expected) def test_unix_style_breaks(c_parser_only): # GH 11020 parser = c_parser_only with tm.ensure_clean() as path: with open(path, "w", newline="\n") as f: f.write("blah\n\ncol_1,col_2,col_3\n\n") result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c") expected = DataFrame(columns=["col_1", "col_2", "col_3"]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"]) @pytest.mark.parametrize( "data,thousands,decimal", [ ( """A|B|C 1|2,334.01|5 10|13|10. """, ",", ".", ), ( """A|B|C 1|2.334,01|5 10|13|10, """, ".", ",", ), ], ) def test_1000_sep_with_decimal( c_parser_only, data, thousands, decimal, float_precision ): parser = c_parser_only expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]}) result = parser.read_csv( StringIO(data), sep="|", thousands=thousands, decimal=decimal, float_precision=float_precision, ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"]) @pytest.mark.parametrize( "value,expected", [ ("-1,0", -1.0), ("-1,2e0", -1.2), ("-1e0", -1.0), ("+1e0", 1.0), ("+1e+0", 1.0), ("+1e-1", 0.1), ("+,1e1", 1.0), ("+1,e0", 1.0), ("-,1e1", -1.0), ("-1,e0", -1.0), ("0,1", 0.1), ("1,", 1.0), (",1", 0.1), ("-,1", -0.1), ("1_,", 1.0), ("1_234,56", 1234.56), ("1_234,56e0", 1234.56), # negative cases; must not parse as float ("_", "_"), ("-_", "-_"), ("-_1", "-_1"), ("-_1e0", "-_1e0"), ("_1", "_1"), ("_1,", "_1,"), ("_1,_", "_1,_"), ("_1e0", "_1e0"), ("1,2e_1", "1,2e_1"), ("1,2e1_0", "1,2e1_0"), ("1,_2", "1,_2"), (",1__2", ",1__2"), (",1e", ",1e"), ("-,1e", "-,1e"), ("1_000,000_000", "1_000,000_000"), ("1,e1_2", "1,e1_2"), ], ) def test_1000_sep_decimal_float_precision( c_parser_only, value, expected, float_precision ): # test decimal and thousand sep handling in across 'float_precision' # parsers parser = c_parser_only df = parser.read_csv( StringIO(value), sep="|", thousands="_", decimal=",", header=None, float_precision=float_precision, ) val = df.iloc[0, 0] assert val == expected def test_float_precision_options(c_parser_only): # GH 17154, 36228 parser = c_parser_only s = "foo\n243.164\n" df = parser.read_csv(StringIO(s)) df2 = parser.read_csv(StringIO(s), float_precision="high") tm.assert_frame_equal(df, df2) df3 = parser.read_csv(StringIO(s), float_precision="legacy") if IS64: assert not df.iloc[0, 0] == df3.iloc[0, 0] else: assert df.iloc[0, 0] == df3.iloc[0, 0] msg = "Unrecognized float_precision option: junk" with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(s), float_precision="junk")