""" Tests that apply specifically to the Python parser. Unless specifically stated as a Python-specific issue, the goal is to eventually move as many of these tests out of this module as soon as the C parser can accept further arguments when parsing. """ import csv from io import BytesIO, StringIO import pytest from pandas.errors import ParserError from pandas import DataFrame, Index, MultiIndex import pandas._testing as tm def test_default_separator(python_parser_only): # see gh-17333 # # csv.Sniffer in Python treats "o" as separator. data = "aob\n1o2\n3o4" parser = python_parser_only expected = DataFrame({"a": [1, 3], "b": [2, 4]}) result = parser.read_csv(StringIO(data), sep=None) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("skipfooter", ["foo", 1.5, True]) def test_invalid_skipfooter_non_int(python_parser_only, skipfooter): # see gh-15925 (comment) data = "a\n1\n2" parser = python_parser_only msg = "skipfooter must be an integer" with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), skipfooter=skipfooter) def test_invalid_skipfooter_negative(python_parser_only): # see gh-15925 (comment) data = "a\n1\n2" parser = python_parser_only msg = "skipfooter cannot be negative" with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), skipfooter=-1) @pytest.mark.parametrize("kwargs", [dict(sep=None), dict(delimiter="|")]) def test_sniff_delimiter(python_parser_only, kwargs): data = """index|A|B|C foo|1|2|3 bar|4|5|6 baz|7|8|9 """ parser = python_parser_only result = parser.read_csv(StringIO(data), index_col=0, **kwargs) expected = DataFrame( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["A", "B", "C"], index=Index(["foo", "bar", "baz"], name="index"), ) tm.assert_frame_equal(result, expected) def test_sniff_delimiter_comment(python_parser_only): data = """# comment line index|A|B|C # comment line foo|1|2|3 # ignore | this bar|4|5|6 baz|7|8|9 """ parser = python_parser_only result = parser.read_csv(StringIO(data), index_col=0, sep=None, comment="#") expected = DataFrame( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["A", "B", "C"], index=Index(["foo", "bar", "baz"], name="index"), ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("encoding", [None, "utf-8"]) def test_sniff_delimiter_encoding(python_parser_only, encoding): parser = python_parser_only data = """ignore this ignore this too index|A|B|C foo|1|2|3 bar|4|5|6 baz|7|8|9 """ if encoding is not None: from io import TextIOWrapper data = data.encode(encoding) data = BytesIO(data) data = TextIOWrapper(data, encoding=encoding) else: data = StringIO(data) result = parser.read_csv(data, index_col=0, sep=None, skiprows=2, encoding=encoding) expected = DataFrame( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["A", "B", "C"], index=Index(["foo", "bar", "baz"], name="index"), ) tm.assert_frame_equal(result, expected) def test_single_line(python_parser_only): # see gh-6607: sniff separator parser = python_parser_only result = parser.read_csv(StringIO("1,2"), names=["a", "b"], header=None, sep=None) expected = DataFrame({"a": [1], "b": [2]}) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("kwargs", [dict(skipfooter=2), dict(nrows=3)]) def test_skipfooter(python_parser_only, kwargs): # see gh-6607 data = """A,B,C 1,2,3 4,5,6 7,8,9 want to skip this also also skip this """ parser = python_parser_only result = parser.read_csv(StringIO(data), **kwargs) expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["A", "B", "C"]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "compression,klass", [("gzip", "GzipFile"), ("bz2", "BZ2File")] ) def test_decompression_regex_sep(python_parser_only, csv1, compression, klass): # see gh-6607 parser = python_parser_only with open(csv1, "rb") as f: data = f.read() data = data.replace(b",", b"::") expected = parser.read_csv(csv1) module = pytest.importorskip(compression) klass = getattr(module, klass) with tm.ensure_clean() as path: tmp = klass(path, mode="wb") tmp.write(data) tmp.close() result = parser.read_csv(path, sep="::", compression=compression) tm.assert_frame_equal(result, expected) def test_read_csv_buglet_4x_multi_index(python_parser_only): # see gh-6607 data = """ A B C D E one two three four a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" parser = python_parser_only expected = DataFrame( [ [-0.5109, -2.3358, -0.4645, 0.05076, 0.3640], [0.4473, 1.4152, 0.2834, 1.00661, 0.1744], [-0.6662, -0.5243, -0.3580, 0.89145, 2.5838], ], columns=["A", "B", "C", "D", "E"], index=MultiIndex.from_tuples( [("a", "b", 10.0032, 5), ("a", "q", 20, 4), ("x", "q", 30, 3)], names=["one", "two", "three", "four"], ), ) result = parser.read_csv(StringIO(data), sep=r"\s+") tm.assert_frame_equal(result, expected) def test_read_csv_buglet_4x_multi_index2(python_parser_only): # see gh-6893 data = " A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9" parser = python_parser_only expected = DataFrame.from_records( [(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)], columns=list("abcABC"), index=list("abc"), ) result = parser.read_csv(StringIO(data), sep=r"\s+") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("add_footer", [True, False]) def test_skipfooter_with_decimal(python_parser_only, add_footer): # see gh-6971 data = "1#2\n3#4" parser = python_parser_only expected = DataFrame({"a": [1.2, 3.4]}) if add_footer: # The stray footer line should not mess with the # casting of the first two lines if we skip it. kwargs = dict(skipfooter=1) data += "\nFooter" else: kwargs = dict() result = parser.read_csv(StringIO(data), names=["a"], decimal="#", **kwargs) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "sep", ["::", "#####", "!!!", "123", "#1!c5", "%!c!d", "@@#4:2", "_!pd#_"] ) @pytest.mark.parametrize( "encoding", ["utf-16", "utf-16-be", "utf-16-le", "utf-32", "cp037"] ) def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding): # see gh-3404 expected = DataFrame({"a": [1], "b": [2]}) parser = python_parser_only data = "1" + sep + "2" encoded_data = data.encode(encoding) result = parser.read_csv( BytesIO(encoded_data), sep=sep, names=["a", "b"], encoding=encoding ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) def test_multi_char_sep_quotes(python_parser_only, quoting): # see gh-13374 kwargs = dict(sep=",,") parser = python_parser_only data = 'a,,b\n1,,a\n2,,"2,,b"' msg = "ignored when a multi-char delimiter is used" def fail_read(): with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), quoting=quoting, **kwargs) if quoting == csv.QUOTE_NONE: # We expect no match, so there should be an assertion # error out of the inner context manager. with pytest.raises(AssertionError): fail_read() else: fail_read() def test_none_delimiter(python_parser_only, capsys): # see gh-13374 and gh-17465 parser = python_parser_only data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9" expected = DataFrame({"a": [0, 7], "b": [1, 8], "c": [2, 9]}) # We expect the third line in the data to be # skipped because it is malformed, but we do # not expect any errors to occur. result = parser.read_csv( StringIO(data), header=0, sep=None, warn_bad_lines=True, error_bad_lines=False ) tm.assert_frame_equal(result, expected) captured = capsys.readouterr() assert "Skipping line 3" in captured.err @pytest.mark.parametrize("data", ['a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz']) @pytest.mark.parametrize("skipfooter", [0, 1]) def test_skipfooter_bad_row(python_parser_only, data, skipfooter): # see gh-13879 and gh-15910 msg = "parsing errors in the skipped footer rows" parser = python_parser_only def fail_read(): with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), skipfooter=skipfooter) if skipfooter: fail_read() else: # We expect no match, so there should be an assertion # error out of the inner context manager. with pytest.raises(AssertionError): fail_read() def test_malformed_skipfooter(python_parser_only): parser = python_parser_only data = """ignore A,B,C 1,2,3 # comment 1,2,3,4,5 2,3,4 footer """ msg = "Expected 3 fields in line 4, saw 5" with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1)